LCOV - differential code coverage report
Current view: top level - src/backend/storage/buffer - bufmgr.c (source / functions) Coverage Total Hit UNC LBC UIC UBC GBC GIC GNC CBC ECB DUB DCB
Current: bed3ffbf9d952be6c7d739d068cdce44c046dfb7 vs 574581b50ac9c63dd9e4abebb731a3b67e5b50f6 Lines: 92.0 % 2418 2225 68 6 1 118 14 1 723 1487 3 33 389
Current Date: 2026-05-05 10:23:31 +0900 Functions: 96.5 % 144 139 2 3 96 43 1 12
Baseline: lcov-20260505-025707-baseline Branches: 72.0 % 1817 1309 150 8 16 334 14 397 898 70 180
Baseline Date: 2026-05-05 10:27:06 +0900 Line coverage date bins:
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
(7,30] days: 100.0 % 2 2 2
(30,360] days: 91.9 % 792 728 63 1 2 1 708 17
(360..) days: 92.1 % 1624 1495 5 6 1 117 12 13 1470 3
Function coverage date bins:
(30,360] days: 94.9 % 39 37 2 37
(360..) days: 97.1 % 105 102 3 59 43
Branch coverage date bins:
(30,360] days: 73.2 % 541 396 143 2 1 392 3
(360..) days: 71.6 % 1276 913 7 8 16 332 13 5 895

 Age         Owner                    Branch data    TLA  Line data    Source code
                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  *
                                  3                 :                :  * bufmgr.c
                                  4                 :                :  *    buffer manager interface routines
                                  5                 :                :  *
                                  6                 :                :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
                                  7                 :                :  * Portions Copyright (c) 1994, Regents of the University of California
                                  8                 :                :  *
                                  9                 :                :  *
                                 10                 :                :  * IDENTIFICATION
                                 11                 :                :  *    src/backend/storage/buffer/bufmgr.c
                                 12                 :                :  *
                                 13                 :                :  *-------------------------------------------------------------------------
                                 14                 :                :  */
                                 15                 :                : /*
                                 16                 :                :  * Principal entry points:
                                 17                 :                :  *
                                 18                 :                :  * ReadBuffer() -- find or create a buffer holding the requested page,
                                 19                 :                :  *      and pin it so that no one can destroy it while this process
                                 20                 :                :  *      is using it.
                                 21                 :                :  *
                                 22                 :                :  * StartReadBuffer() -- as above, with separate wait step
                                 23                 :                :  * StartReadBuffers() -- multiple block version
                                 24                 :                :  * WaitReadBuffers() -- second step of above
                                 25                 :                :  *
                                 26                 :                :  * ReleaseBuffer() -- unpin a buffer
                                 27                 :                :  *
                                 28                 :                :  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
                                 29                 :                :  *      The disk write is delayed until buffer replacement or checkpoint.
                                 30                 :                :  *
                                 31                 :                :  * See also these files:
                                 32                 :                :  *      freelist.c -- chooses victim for buffer replacement
                                 33                 :                :  *      buf_table.c -- manages the buffer lookup table
                                 34                 :                :  */
                                 35                 :                : #include "postgres.h"
                                 36                 :                : 
                                 37                 :                : #include <sys/file.h>
                                 38                 :                : #include <unistd.h>
                                 39                 :                : 
                                 40                 :                : #include "access/tableam.h"
                                 41                 :                : #include "access/xloginsert.h"
                                 42                 :                : #include "access/xlogutils.h"
                                 43                 :                : #ifdef USE_ASSERT_CHECKING
                                 44                 :                : #include "catalog/pg_tablespace_d.h"
                                 45                 :                : #endif
                                 46                 :                : #include "catalog/storage.h"
                                 47                 :                : #include "catalog/storage_xlog.h"
                                 48                 :                : #include "common/hashfn.h"
                                 49                 :                : #include "executor/instrument.h"
                                 50                 :                : #include "lib/binaryheap.h"
                                 51                 :                : #include "miscadmin.h"
                                 52                 :                : #include "pg_trace.h"
                                 53                 :                : #include "pgstat.h"
                                 54                 :                : #include "postmaster/bgwriter.h"
                                 55                 :                : #include "storage/aio.h"
                                 56                 :                : #include "storage/buf_internals.h"
                                 57                 :                : #include "storage/bufmgr.h"
                                 58                 :                : #include "storage/fd.h"
                                 59                 :                : #include "storage/ipc.h"
                                 60                 :                : #include "storage/lmgr.h"
                                 61                 :                : #include "storage/proc.h"
                                 62                 :                : #include "storage/proclist.h"
                                 63                 :                : #include "storage/procsignal.h"
                                 64                 :                : #include "storage/read_stream.h"
                                 65                 :                : #include "storage/smgr.h"
                                 66                 :                : #include "storage/standby.h"
                                 67                 :                : #include "utils/memdebug.h"
                                 68                 :                : #include "utils/ps_status.h"
                                 69                 :                : #include "utils/rel.h"
                                 70                 :                : #include "utils/resowner.h"
                                 71                 :                : #include "utils/timestamp.h"
                                 72                 :                : #include "utils/wait_event.h"
                                 73                 :                : 
                                 74                 :                : 
                                 75                 :                : /* Note: these two macros only work on shared buffers, not local ones! */
                                 76                 :                : #define BufHdrGetBlock(bufHdr)  ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
                                 77                 :                : #define BufferGetLSN(bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))
                                 78                 :                : 
                                 79                 :                : /* Note: this macro only works on local buffers, not shared ones! */
                                 80                 :                : #define LocalBufHdrGetBlock(bufHdr) \
                                 81                 :                :     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
                                 82                 :                : 
                                 83                 :                : /* Bits in SyncOneBuffer's return value */
                                 84                 :                : #define BUF_WRITTEN             0x01
                                 85                 :                : #define BUF_REUSABLE            0x02
                                 86                 :                : 
                                 87                 :                : #define RELS_BSEARCH_THRESHOLD      20
                                 88                 :                : 
                                 89                 :                : /*
                                 90                 :                :  * This is the size (in number of blocks) above which we scan the entire
                                 91                 :                :  * buffer pool to remove the buffers for all the pages of the relation
                                 92                 :                :  * being dropped. For relations smaller than this threshold, we find
                                 93                 :                :  * the buffers by doing lookups in the BufMapping table.
                                 94                 :                :  */
                                 95                 :                : #define BUF_DROP_FULL_SCAN_THRESHOLD        (uint64) (NBuffers / 32)
                                 96                 :                : 
                                 97                 :                : /*
                                 98                 :                :  * This is separated out from PrivateRefCountEntry to allow for copying all
                                 99                 :                :  * the data members via struct assignment.
                                100                 :                :  */
                                101                 :                : typedef struct PrivateRefCountData
                                102                 :                : {
                                103                 :                :     /*
                                104                 :                :      * How many times has the buffer been pinned by this backend.
                                105                 :                :      */
                                106                 :                :     int32       refcount;
                                107                 :                : 
                                108                 :                :     /*
                                109                 :                :      * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
                                110                 :                :      * the buffer is not locked.
                                111                 :                :      */
                                112                 :                :     BufferLockMode lockmode;
                                113                 :                : } PrivateRefCountData;
                                114                 :                : 
                                115                 :                : typedef struct PrivateRefCountEntry
                                116                 :                : {
                                117                 :                :     /*
                                118                 :                :      * Note that this needs to be same as the entry's corresponding
                                119                 :                :      * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
                                120                 :                :      * store it in both places as this is used for the hashtable key and
                                121                 :                :      * because it is more convenient (passing around a PrivateRefCountEntry
                                122                 :                :      * suffices to identify the buffer) and faster (checking the keys array is
                                123                 :                :      * faster when checking many entries, checking the entry is faster if just
                                124                 :                :      * checking a single entry).
                                125                 :                :      */
                                126                 :                :     Buffer      buffer;
                                127                 :                : 
                                128                 :                :     char        status;
                                129                 :                : 
                                130                 :                :     PrivateRefCountData data;
                                131                 :                : } PrivateRefCountEntry;
                                132                 :                : 
                                133                 :                : #define SH_PREFIX refcount
                                134                 :                : #define SH_ELEMENT_TYPE PrivateRefCountEntry
                                135                 :                : #define SH_KEY_TYPE Buffer
                                136                 :                : #define SH_KEY buffer
                                137                 :                : #define SH_HASH_KEY(tb, key) murmurhash32((uint32) (key))
                                138                 :                : #define SH_EQUAL(tb, a, b) ((a) == (b))
                                139                 :                : #define SH_SCOPE static inline
                                140                 :                : #define SH_DECLARE
                                141                 :                : #define SH_DEFINE
                                142                 :                : #include "lib/simplehash.h"
                                143                 :                : 
                                144                 :                : /* 64 bytes, about the size of a cache line on common systems */
                                145                 :                : #define REFCOUNT_ARRAY_ENTRIES 8
                                146                 :                : 
                                147                 :                : /*
                                148                 :                :  * Status of buffers to checkpoint for a particular tablespace, used
                                149                 :                :  * internally in BufferSync.
                                150                 :                :  */
                                151                 :                : typedef struct CkptTsStatus
                                152                 :                : {
                                153                 :                :     /* oid of the tablespace */
                                154                 :                :     Oid         tsId;
                                155                 :                : 
                                156                 :                :     /*
                                157                 :                :      * Checkpoint progress for this tablespace. To make progress comparable
                                158                 :                :      * between tablespaces the progress is, for each tablespace, measured as a
                                159                 :                :      * number between 0 and the total number of to-be-checkpointed pages. Each
                                160                 :                :      * page checkpointed in this tablespace increments this space's progress
                                161                 :                :      * by progress_slice.
                                162                 :                :      */
                                163                 :                :     float8      progress;
                                164                 :                :     float8      progress_slice;
                                165                 :                : 
                                166                 :                :     /* number of to-be checkpointed pages in this tablespace */
                                167                 :                :     int         num_to_scan;
                                168                 :                :     /* already processed pages in this tablespace */
                                169                 :                :     int         num_scanned;
                                170                 :                : 
                                171                 :                :     /* current offset in CkptBufferIds for this tablespace */
                                172                 :                :     int         index;
                                173                 :                : } CkptTsStatus;
                                174                 :                : 
                                175                 :                : /*
                                176                 :                :  * Type for array used to sort SMgrRelations
                                177                 :                :  *
                                178                 :                :  * FlushRelationsAllBuffers shares the same comparator function with
                                179                 :                :  * DropRelationsAllBuffers. A pointer to this struct must therefore also
                                180                 :                :  * be usable as a pointer to RelFileLocator.
                                181                 :                :  */
                                182                 :                : typedef struct SMgrSortArray
                                183                 :                : {
                                184                 :                :     RelFileLocator rlocator;    /* This must be the first member */
                                185                 :                :     SMgrRelation srel;
                                186                 :                : } SMgrSortArray;
                                187                 :                : 
                                188                 :                : /* GUC variables */
                                189                 :                : bool        zero_damaged_pages = false;
                                190                 :                : int         bgwriter_lru_maxpages = 100;
                                191                 :                : double      bgwriter_lru_multiplier = 2.0;
                                192                 :                : bool        track_io_timing = false;
                                193                 :                : 
                                194                 :                : /*
                                195                 :                :  * How many buffers PrefetchBuffer callers should try to stay ahead of their
                                196                 :                :  * ReadBuffer calls by.  Zero means "never prefetch".  This value is only used
                                197                 :                :  * for buffers not belonging to tablespaces that have their
                                198                 :                :  * effective_io_concurrency parameter set.
                                199                 :                :  */
                                200                 :                : int         effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
                                201                 :                : 
                                202                 :                : /*
                                203                 :                :  * Like effective_io_concurrency, but used by maintenance code paths that might
                                204                 :                :  * benefit from a higher setting because they work on behalf of many sessions.
                                205                 :                :  * Overridden by the tablespace setting of the same name.
                                206                 :                :  */
                                207                 :                : int         maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
                                208                 :                : 
                                209                 :                : /*
                                210                 :                :  * Limit on how many blocks should be handled in single I/O operations.
                                211                 :                :  * StartReadBuffers() callers should respect it, as should other operations
                                212                 :                :  * that call smgr APIs directly.  It is computed as the minimum of underlying
                                213                 :                :  * GUCs io_combine_limit_guc and io_max_combine_limit.
                                214                 :                :  */
                                215                 :                : int         io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
                                216                 :                : int         io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
                                217                 :                : int         io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
                                218                 :                : 
                                219                 :                : /*
                                220                 :                :  * GUC variables about triggering kernel writeback for buffers written; OS
                                221                 :                :  * dependent defaults are set via the GUC mechanism.
                                222                 :                :  */
                                223                 :                : int         checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
                                224                 :                : int         bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
                                225                 :                : int         backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
                                226                 :                : 
                                227                 :                : /* local state for LockBufferForCleanup */
                                228                 :                : static BufferDesc *PinCountWaitBuf = NULL;
                                229                 :                : 
                                230                 :                : /*
                                231                 :                :  * Backend-Private refcount management:
                                232                 :                :  *
                                233                 :                :  * Each buffer also has a private refcount that keeps track of the number of
                                234                 :                :  * times the buffer is pinned in the current process.  This is so that the
                                235                 :                :  * shared refcount needs to be modified only once if a buffer is pinned more
                                236                 :                :  * than once by an individual backend.  It's also used to check that no
                                237                 :                :  * buffers are still pinned at the end of transactions and when exiting. We
                                238                 :                :  * also use this mechanism to track whether this backend has a buffer locked,
                                239                 :                :  * and, if so, in what mode.
                                240                 :                :  *
                                241                 :                :  *
                                242                 :                :  * To avoid - as we used to - requiring an array with NBuffers entries to keep
                                243                 :                :  * track of local buffers, we use a small sequentially searched array
                                244                 :                :  * (PrivateRefCountArrayKeys, with the corresponding data stored in
                                245                 :                :  * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
                                246                 :                :  * keep track of backend local pins.
                                247                 :                :  *
                                248                 :                :  * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
                                249                 :                :  * all refcounts are tracked in the array; beyond that, new array entries
                                250                 :                :  * displace old ones into the hash table. That way a frequently used entry
                                251                 :                :  * can't get "stuck" in the hash table while infrequent ones clog the array.
                                252                 :                :  *
                                253                 :                :  * Note that in most scenarios the number of pinned buffers will not exceed
                                254                 :                :  * REFCOUNT_ARRAY_ENTRIES.
                                255                 :                :  *
                                256                 :                :  *
                                257                 :                :  * To enter a buffer into the refcount tracking mechanism first reserve a free
                                258                 :                :  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
                                259                 :                :  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
                                260                 :                :  * memory allocations in NewPrivateRefCountEntry() which can be important
                                261                 :                :  * because in some scenarios it's called with a spinlock held...
                                262                 :                :  */
                                263                 :                : static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES];
                                264                 :                : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
                                265                 :                : static refcount_hash *PrivateRefCountHash = NULL;
                                266                 :                : static int32 PrivateRefCountOverflowed = 0;
                                267                 :                : static uint32 PrivateRefCountClock = 0;
                                268                 :                : static int  ReservedRefCountSlot = -1;
                                269                 :                : static int  PrivateRefCountEntryLast = -1;
                                270                 :                : 
                                271                 :                : static uint32 MaxProportionalPins;
                                272                 :                : 
                                273                 :                : static void ReservePrivateRefCountEntry(void);
                                274                 :                : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
                                275                 :                : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
                                276                 :                : static inline int32 GetPrivateRefCount(Buffer buffer);
                                277                 :                : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
                                278                 :                : 
                                279                 :                : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
                                280                 :                : static void ResOwnerReleaseBufferIO(Datum res);
                                281                 :                : static char *ResOwnerPrintBufferIO(Datum res);
                                282                 :                : static void ResOwnerReleaseBuffer(Datum res);
                                283                 :                : static char *ResOwnerPrintBuffer(Datum res);
                                284                 :                : 
                                285                 :                : const ResourceOwnerDesc buffer_io_resowner_desc =
                                286                 :                : {
                                287                 :                :     .name = "buffer io",
                                288                 :                :     .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
                                289                 :                :     .release_priority = RELEASE_PRIO_BUFFER_IOS,
                                290                 :                :     .ReleaseResource = ResOwnerReleaseBufferIO,
                                291                 :                :     .DebugPrint = ResOwnerPrintBufferIO
                                292                 :                : };
                                293                 :                : 
                                294                 :                : const ResourceOwnerDesc buffer_resowner_desc =
                                295                 :                : {
                                296                 :                :     .name = "buffer",
                                297                 :                :     .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
                                298                 :                :     .release_priority = RELEASE_PRIO_BUFFER_PINS,
                                299                 :                :     .ReleaseResource = ResOwnerReleaseBuffer,
                                300                 :                :     .DebugPrint = ResOwnerPrintBuffer
                                301                 :                : };
                                302                 :                : 
                                303                 :                : /*
                                304                 :                :  * Ensure that the PrivateRefCountArray has sufficient space to store one more
                                305                 :                :  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
                                306                 :                :  * a new entry - but it's perfectly fine to not use a reserved entry.
                                307                 :                :  */
                                308                 :                : static void
 4124 andres@anarazel.de        309                 :CBC    87715023 : ReservePrivateRefCountEntry(void)
                                310                 :                : {
                                311                 :                :     /* Already reserved (or freed), nothing to do */
  142 andres@anarazel.de        312         [ +  + ]:GNC    87715023 :     if (ReservedRefCountSlot != -1)
 4124 andres@anarazel.de        313                 :CBC    82719361 :         return;
                                314                 :                : 
                                315                 :                :     /*
                                316                 :                :      * First search for a free entry the array, that'll be sufficient in the
                                317                 :                :      * majority of cases.
                                318                 :                :      */
                                319                 :                :     {
                                320                 :                :         int         i;
                                321                 :                : 
                                322         [ +  + ]:       44960958 :         for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
                                323                 :                :         {
  142 andres@anarazel.de        324         [ +  + ]:GNC    39965296 :             if (PrivateRefCountArrayKeys[i] == InvalidBuffer)
                                325                 :                :             {
                                326                 :       29070527 :                 ReservedRefCountSlot = i;
                                327                 :                : 
                                328                 :                :                 /*
                                329                 :                :                  * We could return immediately, but iterating till the end of
                                330                 :                :                  * the array allows compiler-autovectorization.
                                331                 :                :                  */
                                332                 :                :             }
                                333                 :                :         }
                                334                 :                : 
                                335         [ +  + ]:        4995662 :         if (ReservedRefCountSlot != -1)
                                336                 :        4806727 :             return;
                                337                 :                :     }
                                338                 :                : 
                                339                 :                :     /*
                                340                 :                :      * No luck. All array entries are full. Move one array entry into the hash
                                341                 :                :      * table.
                                342                 :                :      */
                                343                 :                :     {
                                344                 :                :         /*
                                345                 :                :          * Move entry from the current clock position in the array into the
                                346                 :                :          * hashtable. Use that slot.
                                347                 :                :          */
                                348                 :                :         int         victim_slot;
                                349                 :                :         PrivateRefCountEntry *victim_entry;
                                350                 :                :         PrivateRefCountEntry *hashent;
                                351                 :                :         bool        found;
                                352                 :                : 
                                353                 :                :         /* select victim slot */
                                354                 :         188935 :         victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES;
                                355                 :         188935 :         victim_entry = &PrivateRefCountArray[victim_slot];
                                356                 :         188935 :         ReservedRefCountSlot = victim_slot;
                                357                 :                : 
                                358                 :                :         /* Better be used, otherwise we shouldn't get here. */
                                359         [ -  + ]:         188935 :         Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer);
                                360         [ -  + ]:         188935 :         Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer);
                                361         [ -  + ]:         188935 :         Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer);
                                362                 :                : 
                                363                 :                :         /* enter victim array entry into hashtable */
   54 pg@bowt.ie                364                 :         188935 :         hashent = refcount_insert(PrivateRefCountHash,
                                365                 :                :                                   PrivateRefCountArrayKeys[victim_slot],
                                366                 :                :                                   &found);
 4124 andres@anarazel.de        367         [ -  + ]:CBC      188935 :         Assert(!found);
                                368                 :                :         /* move data from the entry in the array to the hash entry */
  142 andres@anarazel.de        369                 :GNC      188935 :         hashent->data = victim_entry->data;
                                370                 :                : 
                                371                 :                :         /* clear the now free array slot */
                                372                 :         188935 :         PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer;
                                373                 :         188935 :         victim_entry->buffer = InvalidBuffer;
                                374                 :                : 
                                375                 :                :         /* clear the whole data member, just for future proofing */
                                376                 :         188935 :         memset(&victim_entry->data, 0, sizeof(victim_entry->data));
                                377                 :         188935 :         victim_entry->data.refcount = 0;
  110                           378                 :         188935 :         victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
                                379                 :                : 
 4124 andres@anarazel.de        380                 :CBC      188935 :         PrivateRefCountOverflowed++;
                                381                 :                :     }
                                382                 :                : }
                                383                 :                : 
                                384                 :                : /*
                                385                 :                :  * Fill a previously reserved refcount entry.
                                386                 :                :  */
                                387                 :                : static PrivateRefCountEntry *
                                388                 :       76120912 : NewPrivateRefCountEntry(Buffer buffer)
                                389                 :                : {
                                390                 :                :     PrivateRefCountEntry *res;
                                391                 :                : 
                                392                 :                :     /* only allowed to be called when a reservation has been made */
  142 andres@anarazel.de        393         [ -  + ]:GNC    76120912 :     Assert(ReservedRefCountSlot != -1);
                                394                 :                : 
                                395                 :                :     /* use up the reserved entry */
                                396                 :       76120912 :     res = &PrivateRefCountArray[ReservedRefCountSlot];
                                397                 :                : 
                                398                 :                :     /* and fill it */
                                399                 :       76120912 :     PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
 4124 andres@anarazel.de        400                 :CBC    76120912 :     res->buffer = buffer;
  142 andres@anarazel.de        401                 :GNC    76120912 :     res->data.refcount = 0;
  110                           402                 :       76120912 :     res->data.lockmode = BUFFER_LOCK_UNLOCK;
                                403                 :                : 
                                404                 :                :     /* update cache for the next lookup */
  142                           405                 :       76120912 :     PrivateRefCountEntryLast = ReservedRefCountSlot;
                                406                 :                : 
                                407                 :       76120912 :     ReservedRefCountSlot = -1;
                                408                 :                : 
 4124 andres@anarazel.de        409                 :CBC    76120912 :     return res;
                                410                 :                : }
                                411                 :                : 
                                412                 :                : /*
                                413                 :                :  * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
                                414                 :                :  * inlining. This particularly seems to be true if the compiler is capable of
                                415                 :                :  * auto-vectorizing the code, as that imposes additional stack-alignment
                                416                 :                :  * requirements etc.
                                417                 :                :  */
                                418                 :                : static pg_noinline PrivateRefCountEntry *
  142 andres@anarazel.de        419                 :GNC   126473464 : GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
                                420                 :                : {
                                421                 :                :     PrivateRefCountEntry *res;
                                422                 :      126473464 :     int         match = -1;
                                423                 :                :     int         i;
                                424                 :                : 
                                425                 :                :     /*
                                426                 :                :      * First search for references in the array, that'll be sufficient in the
                                427                 :                :      * majority of cases.
                                428                 :                :      */
 4266 andres@anarazel.de        429         [ +  + ]:CBC  1138261176 :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
                                430                 :                :     {
  142 andres@anarazel.de        431         [ +  + ]:GNC  1011787712 :         if (PrivateRefCountArrayKeys[i] == buffer)
                                432                 :                :         {
                                433                 :       51208336 :             match = i;
                                434                 :                :             /* see ReservePrivateRefCountEntry() for why we don't return */
                                435                 :                :         }
                                436                 :                :     }
                                437                 :                : 
                                438         [ +  + ]:      126473464 :     if (likely(match != -1))
                                439                 :                :     {
                                440                 :                :         /* update cache for the next lookup */
                                441                 :       51208336 :         PrivateRefCountEntryLast = match;
                                442                 :                : 
                                443                 :       51208336 :         return &PrivateRefCountArray[match];
                                444                 :                :     }
                                445                 :                : 
                                446                 :                :     /*
                                447                 :                :      * By here we know that the buffer, if already pinned, isn't residing in
                                448                 :                :      * the array.
                                449                 :                :      *
                                450                 :                :      * Only look up the buffer in the hashtable if we've previously overflowed
                                451                 :                :      * into it.
                                452                 :                :      */
 4124 andres@anarazel.de        453         [ +  + ]:CBC    75265128 :     if (PrivateRefCountOverflowed == 0)
                                454                 :       73452443 :         return NULL;
                                455                 :                : 
   54 pg@bowt.ie                456                 :GNC     1812685 :     res = refcount_lookup(PrivateRefCountHash, buffer);
                                457                 :                : 
 4124 andres@anarazel.de        458         [ +  + ]:CBC     1812685 :     if (res == NULL)
                                459                 :         403887 :         return NULL;
                                460         [ +  + ]:        1408798 :     else if (!do_move)
                                461                 :                :     {
                                462                 :                :         /* caller doesn't want us to move the hash entry into the array */
                                463                 :        1311987 :         return res;
                                464                 :                :     }
                                465                 :                :     else
                                466                 :                :     {
                                467                 :                :         /* move buffer from hashtable into the free array slot */
                                468                 :                :         PrivateRefCountEntry *free;
                                469                 :                :         PrivateRefCountData data;
                                470                 :                : 
                                471                 :                :         /* Save data and delete from hashtable while res is still valid */
   54 pg@bowt.ie                472                 :GNC       96811 :         data = res->data;
                                473                 :          96811 :         refcount_delete_item(PrivateRefCountHash, res);
                                474         [ -  + ]:          96811 :         Assert(PrivateRefCountOverflowed > 0);
                                475                 :          96811 :         PrivateRefCountOverflowed--;
                                476                 :                : 
                                477                 :                :         /* Ensure there's a free array slot */
 4124 andres@anarazel.de        478                 :CBC       96811 :         ReservePrivateRefCountEntry();
                                479                 :                : 
                                480                 :                :         /* Use up the reserved slot */
  142 andres@anarazel.de        481         [ -  + ]:GNC       96811 :         Assert(ReservedRefCountSlot != -1);
                                482                 :          96811 :         free = &PrivateRefCountArray[ReservedRefCountSlot];
                                483         [ -  + ]:          96811 :         Assert(PrivateRefCountArrayKeys[ReservedRefCountSlot] == free->buffer);
 4124 andres@anarazel.de        484         [ -  + ]:CBC       96811 :         Assert(free->buffer == InvalidBuffer);
                                485                 :                : 
                                486                 :                :         /* and fill it */
                                487                 :          96811 :         free->buffer = buffer;
   54 pg@bowt.ie                488                 :GNC       96811 :         free->data = data;
  142 andres@anarazel.de        489                 :          96811 :         PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
                                490                 :                :         /* update cache for the next lookup */
   55                           491                 :          96811 :         PrivateRefCountEntryLast = ReservedRefCountSlot;
                                492                 :                : 
  142                           493                 :          96811 :         ReservedRefCountSlot = -1;
                                494                 :                : 
 4124 andres@anarazel.de        495                 :CBC       96811 :         return free;
                                496                 :                :     }
                                497                 :                : }
                                498                 :                : 
                                499                 :                : /*
                                500                 :                :  * Return the PrivateRefCount entry for the passed buffer.
                                501                 :                :  *
                                502                 :                :  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
                                503                 :                :  * do_move is true, and the entry resides in the hashtable the entry is
                                504                 :                :  * optimized for frequent access by moving it to the array.
                                505                 :                :  */
                                506                 :                : static inline PrivateRefCountEntry *
  142 andres@anarazel.de        507                 :GNC  1323790495 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
                                508                 :                : {
                                509         [ -  + ]:     1323790495 :     Assert(BufferIsValid(buffer));
                                510         [ -  + ]:     1323790495 :     Assert(!BufferIsLocal(buffer));
                                511                 :                : 
                                512                 :                :     /*
                                513                 :                :      * It's very common to look up the same buffer repeatedly. To make that
                                514                 :                :      * fast, we have a one-entry cache.
                                515                 :                :      *
                                516                 :                :      * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it
                                517                 :                :      * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
                                518                 :                :      * fewer addresses are computed and fewer cachelines are accessed. Whereas
                                519                 :                :      * in GetPrivateRefCountEntrySlow()'s case, checking
                                520                 :                :      * PrivateRefCountArrayKeys saves a lot of memory accesses.
                                521                 :                :      */
                                522         [ +  + ]:     1323790495 :     if (likely(PrivateRefCountEntryLast != -1) &&
                                523         [ +  + ]:     1323775235 :         likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer))
                                524                 :                :     {
                                525                 :     1197317031 :         return &PrivateRefCountArray[PrivateRefCountEntryLast];
                                526                 :                :     }
                                527                 :                : 
                                528                 :                :     /*
                                529                 :                :      * The code for the cached lookup is small enough to be worth inlining
                                530                 :                :      * into the caller. In the miss case however, that empirically doesn't
                                531                 :                :      * seem worth it.
                                532                 :                :      */
                                533                 :      126473464 :     return GetPrivateRefCountEntrySlow(buffer, do_move);
                                534                 :                : }
                                535                 :                : 
                                536                 :                : /*
                                537                 :                :  * Returns how many times the passed buffer is pinned by this backend.
                                538                 :                :  *
                                539                 :                :  * Only works for shared memory buffers!
                                540                 :                :  */
                                541                 :                : static inline int32
 4266 andres@anarazel.de        542                 :CBC   721724984 : GetPrivateRefCount(Buffer buffer)
                                543                 :                : {
                                544                 :                :     PrivateRefCountEntry *ref;
                                545                 :                : 
                                546         [ -  + ]:      721724984 :     Assert(BufferIsValid(buffer));
                                547         [ -  + ]:      721724984 :     Assert(!BufferIsLocal(buffer));
                                548                 :                : 
                                549                 :                :     /*
                                550                 :                :      * Not moving the entry - that's ok for the current users, but we might
                                551                 :                :      * want to change this one day.
                                552                 :                :      */
 4124                           553                 :      721724984 :     ref = GetPrivateRefCountEntry(buffer, false);
                                554                 :                : 
 4266                           555         [ +  + ]:      721724984 :     if (ref == NULL)
                                556                 :             50 :         return 0;
  142 andres@anarazel.de        557                 :GNC   721724934 :     return ref->data.refcount;
                                558                 :                : }
                                559                 :                : 
                                560                 :                : /*
                                561                 :                :  * Release resources used to track the reference count of a buffer which we no
                                562                 :                :  * longer have pinned and don't want to pin again immediately.
                                563                 :                :  */
                                564                 :                : static void
 4266 andres@anarazel.de        565                 :CBC    76120912 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
                                566                 :                : {
  142 andres@anarazel.de        567         [ -  + ]:GNC    76120912 :     Assert(ref->data.refcount == 0);
  110                           568         [ -  + ]:       76120912 :     Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
                                569                 :                : 
 4266 andres@anarazel.de        570   [ +  -  +  + ]:CBC    76120912 :     if (ref >= &PrivateRefCountArray[0] &&
                                571                 :                :         ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
                                572                 :                :     {
                                573                 :       76028788 :         ref->buffer = InvalidBuffer;
  142 andres@anarazel.de        574                 :GNC    76028788 :         PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer;
                                575                 :                : 
                                576                 :                : 
                                577                 :                :         /*
                                578                 :                :          * Mark the just used entry as reserved - in many scenarios that
                                579                 :                :          * allows us to avoid ever having to search the array/hash for free
                                580                 :                :          * entries.
                                581                 :                :          */
                                582                 :       76028788 :         ReservedRefCountSlot = ref - PrivateRefCountArray;
                                583                 :                :     }
                                584                 :                :     else
                                585                 :                :     {
   54 pg@bowt.ie                586                 :          92124 :         refcount_delete_item(PrivateRefCountHash, ref);
 4266 andres@anarazel.de        587         [ -  + ]:CBC       92124 :         Assert(PrivateRefCountOverflowed > 0);
                                588                 :          92124 :         PrivateRefCountOverflowed--;
                                589                 :                :     }
                                590                 :       76120912 : }
                                591                 :                : 
                                592                 :                : /*
                                593                 :                :  * BufferIsPinned
                                594                 :                :  *      True iff the buffer is pinned (also checks for valid buffer number).
                                595                 :                :  *
                                596                 :                :  *      NOTE: what we check here is that *this* backend holds a pin on
                                597                 :                :  *      the buffer.  We do not care whether some other backend does.
                                598                 :                :  */
                                599                 :                : #define BufferIsPinned(bufnum) \
                                600                 :                : ( \
                                601                 :                :     !BufferIsValid(bufnum) ? \
                                602                 :                :         false \
                                603                 :                :     : \
                                604                 :                :         BufferIsLocal(bufnum) ? \
                                605                 :                :             (LocalRefCount[-(bufnum) - 1] > 0) \
                                606                 :                :         : \
                                607                 :                :     (GetPrivateRefCount(bufnum) > 0) \
                                608                 :                : )
                                609                 :                : 
                                610                 :                : 
                                611                 :                : static Buffer ReadBuffer_common(Relation rel,
                                612                 :                :                                 SMgrRelation smgr, char smgr_persistence,
                                613                 :                :                                 ForkNumber forkNum, BlockNumber blockNum,
                                614                 :                :                                 ReadBufferMode mode, BufferAccessStrategy strategy);
                                615                 :                : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
                                616                 :                :                                            ForkNumber fork,
                                617                 :                :                                            BufferAccessStrategy strategy,
                                618                 :                :                                            uint32 flags,
                                619                 :                :                                            uint32 extend_by,
                                620                 :                :                                            BlockNumber extend_upto,
                                621                 :                :                                            Buffer *buffers,
                                622                 :                :                                            uint32 *extended_by);
                                623                 :                : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
                                624                 :                :                                            ForkNumber fork,
                                625                 :                :                                            BufferAccessStrategy strategy,
                                626                 :                :                                            uint32 flags,
                                627                 :                :                                            uint32 extend_by,
                                628                 :                :                                            BlockNumber extend_upto,
                                629                 :                :                                            Buffer *buffers,
                                630                 :                :                                            uint32 *extended_by);
                                631                 :                : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
                                632                 :                :                       bool skip_if_not_valid);
                                633                 :                : static void PinBuffer_Locked(BufferDesc *buf);
                                634                 :                : static void UnpinBuffer(BufferDesc *buf);
                                635                 :                : static void UnpinBufferNoOwner(BufferDesc *buf);
                                636                 :                : static void BufferSync(int flags);
                                637                 :                : static int  SyncOneBuffer(int buf_id, bool skip_recently_used,
                                638                 :                :                           WritebackContext *wb_context);
                                639                 :                : static void WaitIO(BufferDesc *buf);
                                640                 :                : static void AbortBufferIO(Buffer buffer);
                                641                 :                : static void shared_buffer_write_error_callback(void *arg);
                                642                 :                : static void local_buffer_write_error_callback(void *arg);
                                643                 :                : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
                                644                 :                :                                       char relpersistence,
                                645                 :                :                                       ForkNumber forkNum,
                                646                 :                :                                       BlockNumber blockNum,
                                647                 :                :                                       BufferAccessStrategy strategy,
                                648                 :                :                                       bool *foundPtr, IOContext io_context);
                                649                 :                : static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
                                650                 :                : static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
                                651                 :                : 
                                652                 :                : static pg_attribute_always_inline void TrackBufferHit(IOObject io_object,
                                653                 :                :                                                       IOContext io_context,
                                654                 :                :                                                       Relation rel, char persistence, SMgrRelation smgr,
                                655                 :                :                                                       ForkNumber forknum, BlockNumber blocknum);
                                656                 :                : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
                                657                 :                : static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
                                658                 :                :                                 IOObject io_object, IOContext io_context);
                                659                 :                : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
                                660                 :                :                         IOObject io_object, IOContext io_context);
                                661                 :                : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
                                662                 :                :                                        ForkNumber forkNum,
                                663                 :                :                                        BlockNumber nForkBlock,
                                664                 :                :                                        BlockNumber firstDelBlock);
                                665                 :                : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
                                666                 :                :                                            RelFileLocator dstlocator,
                                667                 :                :                                            ForkNumber forkNum, bool permanent);
                                668                 :                : static void AtProcExit_Buffers(int code, Datum arg);
                                669                 :                : static void CheckForBufferLeaks(void);
                                670                 :                : #ifdef USE_ASSERT_CHECKING
                                671                 :                : static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode);
                                672                 :                : #endif
                                673                 :                : static int  rlocator_comparator(const void *p1, const void *p2);
                                674                 :                : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
                                675                 :                : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
                                676                 :                : static int  ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
                                677                 :                : 
                                678                 :                : static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
                                679                 :                : static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr);
                                680                 :                : static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
                                681                 :                : static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode);
                                682                 :                : static bool BufferLockHeldByMe(BufferDesc *buf_hdr);
                                683                 :                : static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
                                684                 :                : static inline int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr);
                                685                 :                : static inline bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode);
                                686                 :                : static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode);
                                687                 :                : static void BufferLockDequeueSelf(BufferDesc *buf_hdr);
                                688                 :                : static void BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive);
                                689                 :                : static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate);
                                690                 :                : static inline uint64 BufferLockReleaseSub(BufferLockMode mode);
                                691                 :                : 
                                692                 :                : 
                                693                 :                : /*
                                694                 :                :  * Implementation of PrefetchBuffer() for shared buffers.
                                695                 :                :  */
                                696                 :                : PrefetchBufferResult
 2218 tmunro@postgresql.or      697                 :          40401 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
                                698                 :                :                      ForkNumber forkNum,
                                699                 :                :                      BlockNumber blockNum)
                                700                 :                : {
                                701                 :          40401 :     PrefetchBufferResult result = {InvalidBuffer, false};
                                702                 :                :     BufferTag   newTag;         /* identity of requested block */
                                703                 :                :     uint32      newHash;        /* hash value for newTag */
                                704                 :                :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
                                705                 :                :     int         buf_id;
                                706                 :                : 
                                707         [ -  + ]:          40401 :     Assert(BlockNumberIsValid(blockNum));
                                708                 :                : 
                                709                 :                :     /* create a tag so we can lookup the buffer */
 1378 rhaas@postgresql.org      710                 :          40401 :     InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
                                711                 :                :                   forkNum, blockNum);
                                712                 :                : 
                                713                 :                :     /* determine its hash code and partition lock ID */
 2218 tmunro@postgresql.or      714                 :          40401 :     newHash = BufTableHashCode(&newTag);
                                715                 :          40401 :     newPartitionLock = BufMappingPartitionLock(newHash);
                                716                 :                : 
                                717                 :                :     /* see if the block is in the buffer pool already */
                                718                 :          40401 :     LWLockAcquire(newPartitionLock, LW_SHARED);
                                719                 :          40401 :     buf_id = BufTableLookup(&newTag, newHash);
                                720                 :          40401 :     LWLockRelease(newPartitionLock);
                                721                 :                : 
                                722                 :                :     /* If not in buffers, initiate prefetch */
                                723         [ +  + ]:          40401 :     if (buf_id < 0)
                                724                 :                :     {
                                725                 :                : #ifdef USE_PREFETCH
                                726                 :                :         /*
                                727                 :                :          * Try to initiate an asynchronous read.  This returns false in
                                728                 :                :          * recovery if the relation file doesn't exist.
                                729                 :                :          */
 1123                           730   [ +  +  +  - ]:          18483 :         if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
  871                           731                 :           9115 :             smgrprefetch(smgr_reln, forkNum, blockNum, 1))
                                732                 :                :         {
 2218                           733                 :           9115 :             result.initiated_io = true;
                                734                 :                :         }
                                735                 :                : #endif                          /* USE_PREFETCH */
                                736                 :                :     }
                                737                 :                :     else
                                738                 :                :     {
                                739                 :                :         /*
                                740                 :                :          * Report the buffer it was in at that time.  The caller may be able
                                741                 :                :          * to avoid a buffer table lookup, but it's not pinned and it must be
                                742                 :                :          * rechecked!
                                743                 :                :          */
                                744                 :          31033 :         result.recent_buffer = buf_id + 1;
                                745                 :                :     }
                                746                 :                : 
                                747                 :                :     /*
                                748                 :                :      * If the block *is* in buffers, we do nothing.  This is not really ideal:
                                749                 :                :      * the block might be just about to be evicted, which would be stupid
                                750                 :                :      * since we know we are going to need it soon.  But the only easy answer
                                751                 :                :      * is to bump the usage_count, which does not seem like a great solution:
                                752                 :                :      * when the caller does ultimately touch the block, usage_count would get
                                753                 :                :      * bumped again, resulting in too much favoritism for blocks that are
                                754                 :                :      * involved in a prefetch sequence. A real fix would involve some
                                755                 :                :      * additional per-buffer state, and it's not clear that there's enough of
                                756                 :                :      * a problem to justify that.
                                757                 :                :      */
                                758                 :                : 
                                759                 :          40401 :     return result;
                                760                 :                : }
                                761                 :                : 
                                762                 :                : /*
                                763                 :                :  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
                                764                 :                :  *
                                765                 :                :  * This is named by analogy to ReadBuffer but doesn't actually allocate a
                                766                 :                :  * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
                                767                 :                :  * block will not be delayed by the I/O.  Prefetching is optional.
                                768                 :                :  *
                                769                 :                :  * There are three possible outcomes:
                                770                 :                :  *
                                771                 :                :  * 1.  If the block is already cached, the result includes a valid buffer that
                                772                 :                :  * could be used by the caller to avoid the need for a later buffer lookup, but
                                773                 :                :  * it's not pinned, so the caller must recheck it.
                                774                 :                :  *
                                775                 :                :  * 2.  If the kernel has been asked to initiate I/O, the initiated_io member is
                                776                 :                :  * true.  Currently there is no way to know if the data was already cached by
                                777                 :                :  * the kernel and therefore didn't really initiate I/O, and no way to know when
                                778                 :                :  * the I/O completes other than using synchronous ReadBuffer().
                                779                 :                :  *
                                780                 :                :  * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and
                                781                 :                :  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
                                782                 :                :  * lack of a kernel facility), direct I/O is enabled, or the underlying
                                783                 :                :  * relation file wasn't found and we are in recovery.  (If the relation file
                                784                 :                :  * wasn't found and we are not in recovery, an error is raised).
                                785                 :                :  */
                                786                 :                : PrefetchBufferResult
 6322 tgl@sss.pgh.pa.us         787                 :          29833 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
                                788                 :                : {
                                789         [ -  + ]:          29833 :     Assert(RelationIsValid(reln));
                                790         [ -  + ]:          29833 :     Assert(BlockNumberIsValid(blockNum));
                                791                 :                : 
 5622 rhaas@postgresql.org      792         [ +  + ]:          29833 :     if (RelationUsesLocalBuffers(reln))
                                793                 :                :     {
                                794                 :                :         /* see comments in ReadBufferExtended */
 6244 tgl@sss.pgh.pa.us         795   [ +  -  -  + ]:           1535 :         if (RELATION_IS_OTHER_TEMP(reln))
 6244 tgl@sss.pgh.pa.us         796         [ #  # ]:UBC           0 :             ereport(ERROR,
                                797                 :                :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                798                 :                :                      errmsg("cannot access temporary tables of other sessions")));
                                799                 :                : 
                                800                 :                :         /* pass it off to localbuf.c */
 1758 tgl@sss.pgh.pa.us         801                 :CBC        1535 :         return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
                                802                 :                :     }
                                803                 :                :     else
                                804                 :                :     {
                                805                 :                :         /* pass it to the shared buffer version */
                                806                 :          28298 :         return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
                                807                 :                :     }
                                808                 :                : }
                                809                 :                : 
                                810                 :                : /*
                                811                 :                :  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
                                812                 :                :  *
                                813                 :                :  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
                                814                 :                :  * successful.  Return true if the buffer is valid and still has the expected
                                815                 :                :  * tag.  In that case, the buffer is pinned and the usage count is bumped.
                                816                 :                :  */
                                817                 :                : bool
 1399 rhaas@postgresql.org      818                 :           5045 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
                                819                 :                :                  Buffer recent_buffer)
                                820                 :                : {
                                821                 :                :     BufferDesc *bufHdr;
                                822                 :                :     BufferTag   tag;
                                823                 :                :     uint64      buf_state;
                                824                 :                : 
 1853 tmunro@postgresql.or      825         [ -  + ]:           5045 :     Assert(BufferIsValid(recent_buffer));
                                826                 :                : 
  909 heikki.linnakangas@i      827                 :           5045 :     ResourceOwnerEnlarge(CurrentResourceOwner);
 1853 tmunro@postgresql.or      828                 :           5045 :     ReservePrivateRefCountEntry();
 1378 rhaas@postgresql.org      829                 :           5045 :     InitBufferTag(&tag, &rlocator, forkNum, blockNum);
                                830                 :                : 
 1853 tmunro@postgresql.or      831         [ +  + ]:           5045 :     if (BufferIsLocal(recent_buffer))
                                832                 :                :     {
 1380 heikki.linnakangas@i      833                 :            210 :         int         b = -recent_buffer - 1;
                                834                 :                : 
                                835                 :            210 :         bufHdr = GetLocalBufferDescriptor(b);
  110 andres@anarazel.de        836                 :GNC         210 :         buf_state = pg_atomic_read_u64(&bufHdr->state);
                                837                 :                : 
                                838                 :                :         /* Is it still valid and holding the right tag? */
 1378 rhaas@postgresql.org      839   [ +  -  +  - ]:CBC         210 :         if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
                                840                 :                :         {
 1126 andres@anarazel.de        841                 :            210 :             PinLocalBuffer(bufHdr, true);
                                842                 :                : 
 1489 tmunro@postgresql.or      843                 :            210 :             pgBufferUsage.local_blks_hit++;
                                844                 :                : 
 1853                           845                 :            210 :             return true;
                                846                 :                :         }
                                847                 :                :     }
                                848                 :                :     else
                                849                 :                :     {
                                850                 :           4835 :         bufHdr = GetBufferDescriptor(recent_buffer - 1);
                                851                 :                : 
                                852                 :                :         /*
                                853                 :                :          * Is it still valid and holding the right tag?  We do an unlocked tag
                                854                 :                :          * comparison first, to make it unlikely that we'll increment the
                                855                 :                :          * usage counter of the wrong buffer, if someone calls us with a very
                                856                 :                :          * out of date recent_buffer.  Then we'll check it again if we get the
                                857                 :                :          * pin.
                                858                 :                :          */
  209 andres@anarazel.de        859   [ +  +  +  + ]:GNC        9622 :         if (BufferTagsEqual(&tag, &bufHdr->tag) &&
                                860                 :           4787 :             PinBuffer(bufHdr, NULL, true))
                                861                 :                :         {
                                862         [ +  - ]:           4778 :             if (BufferTagsEqual(&tag, &bufHdr->tag))
                                863                 :                :             {
                                864                 :           4778 :                 pgBufferUsage.shared_blks_hit++;
                                865                 :           4778 :                 return true;
                                866                 :                :             }
  209 andres@anarazel.de        867                 :UNC           0 :             UnpinBuffer(bufHdr);
                                868                 :                :         }
                                869                 :                :     }
                                870                 :                : 
 1853 tmunro@postgresql.or      871                 :CBC          57 :     return false;
                                872                 :                : }
                                873                 :                : 
                                874                 :                : /*
                                875                 :                :  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
                                876                 :                :  *      fork with RBM_NORMAL mode and default strategy.
                                877                 :                :  */
                                878                 :                : Buffer
 6395 heikki.linnakangas@i      879                 :       62975583 : ReadBuffer(Relation reln, BlockNumber blockNum)
                                880                 :                : {
                                881                 :       62975583 :     return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
                                882                 :                : }
                                883                 :                : 
                                884                 :                : /*
                                885                 :                :  * ReadBufferExtended -- returns a buffer containing the requested
                                886                 :                :  *      block of the requested relation.  If the blknum
                                887                 :                :  *      requested is P_NEW, extend the relation file and
                                888                 :                :  *      allocate a new block.  (Caller is responsible for
                                889                 :                :  *      ensuring that only one backend tries to extend a
                                890                 :                :  *      relation at the same time!)
                                891                 :                :  *
                                892                 :                :  * Returns: the buffer number for the buffer containing
                                893                 :                :  *      the block read.  The returned buffer has been pinned.
                                894                 :                :  *      Does not return on error --- elog's instead.
                                895                 :                :  *
                                896                 :                :  * Assume when this function is called, that reln has been opened already.
                                897                 :                :  *
                                898                 :                :  * In RBM_NORMAL mode, the page is read from disk, and the page header is
                                899                 :                :  * validated.  An error is thrown if the page header is not valid.  (But
                                900                 :                :  * note that an all-zero page is considered "valid"; see
                                901                 :                :  * PageIsVerified().)
                                902                 :                :  *
                                903                 :                :  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
                                904                 :                :  * valid, the page is zeroed instead of throwing an error. This is intended
                                905                 :                :  * for non-critical data, where the caller is prepared to repair errors.
                                906                 :                :  *
                                907                 :                :  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
                                908                 :                :  * filled with zeros instead of reading it from disk.  Useful when the caller
                                909                 :                :  * is going to fill the page from scratch, since this saves I/O and avoids
                                910                 :                :  * unnecessary failure if the page-on-disk has corrupt page headers.
                                911                 :                :  * The page is returned locked to ensure that the caller has a chance to
                                912                 :                :  * initialize the page before it's made visible to others.
                                913                 :                :  * Caution: do not use this mode to read a page that is beyond the relation's
                                914                 :                :  * current physical EOF; that is likely to cause problems in md.c when
                                915                 :                :  * the page is modified and written out. P_NEW is OK, though.
                                916                 :                :  *
                                917                 :                :  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
                                918                 :                :  * a cleanup-strength lock on the page.
                                919                 :                :  *
                                920                 :                :  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
                                921                 :                :  *
                                922                 :                :  * If strategy is not NULL, a nondefault buffer access strategy is used.
                                923                 :                :  * See buffer/README for details.
                                924                 :                :  */
                                925                 :                : inline Buffer
                                926                 :       74155377 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
                                927                 :                :                    ReadBufferMode mode, BufferAccessStrategy strategy)
                                928                 :                : {
                                929                 :                :     Buffer      buf;
                                930                 :                : 
                                931                 :                :     /*
                                932                 :                :      * Reject attempts to read non-local temporary relations; we would be
                                933                 :                :      * likely to get wrong data since we have no visibility into the owning
                                934                 :                :      * session's local buffers.
                                935                 :                :      */
 6244 tgl@sss.pgh.pa.us         936   [ +  +  -  + ]:       74155377 :     if (RELATION_IS_OTHER_TEMP(reln))
 6244 tgl@sss.pgh.pa.us         937         [ #  # ]:UBC           0 :         ereport(ERROR,
                                938                 :                :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                939                 :                :                  errmsg("cannot access temporary tables of other sessions")));
                                940                 :                : 
                                941                 :                :     /*
                                942                 :                :      * Read the buffer, and update pgstat counters to reflect a cache hit or
                                943                 :                :      * miss.
                                944                 :                :      */
  762 tmunro@postgresql.or      945                 :CBC    74155377 :     buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
                                946                 :                :                             forkNum, blockNum, mode, strategy);
                                947                 :                : 
 6536 heikki.linnakangas@i      948                 :       74155350 :     return buf;
                                949                 :                : }
                                950                 :                : 
                                951                 :                : 
                                952                 :                : /*
                                953                 :                :  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
                                954                 :                :  *      a relcache entry for the relation.
                                955                 :                :  *
                                956                 :                :  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
                                957                 :                :  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
                                958                 :                :  * cannot be used for temporary relations (and making that work might be
                                959                 :                :  * difficult, unless we only want to read temporary relations for our own
                                960                 :                :  * ProcNumber).
                                961                 :                :  */
                                962                 :                : Buffer
 1399 rhaas@postgresql.org      963                 :        5965385 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
                                964                 :                :                           BlockNumber blockNum, ReadBufferMode mode,
                                965                 :                :                           BufferAccessStrategy strategy, bool permanent)
                                966                 :                : {
  793 heikki.linnakangas@i      967                 :        5965385 :     SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
                                968                 :                : 
  762 tmunro@postgresql.or      969         [ +  - ]:        5965385 :     return ReadBuffer_common(NULL, smgr,
                                970                 :                :                              permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
                                971                 :                :                              forkNum, blockNum,
                                972                 :                :                              mode, strategy);
                                973                 :                : }
                                974                 :                : 
                                975                 :                : /*
                                976                 :                :  * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
                                977                 :                :  */
                                978                 :                : Buffer
  986                           979                 :          57426 : ExtendBufferedRel(BufferManagerRelation bmr,
                                980                 :                :                   ForkNumber forkNum,
                                981                 :                :                   BufferAccessStrategy strategy,
                                982                 :                :                   uint32 flags)
                                983                 :                : {
                                984                 :                :     Buffer      buf;
 1126 andres@anarazel.de        985                 :          57426 :     uint32      extend_by = 1;
                                986                 :                : 
  986 tmunro@postgresql.or      987                 :          57426 :     ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
                                988                 :                :                         &buf, &extend_by);
                                989                 :                : 
 1126 andres@anarazel.de        990                 :          57426 :     return buf;
                                991                 :                : }
                                992                 :                : 
                                993                 :                : /*
                                994                 :                :  * Extend relation by multiple blocks.
                                995                 :                :  *
                                996                 :                :  * Tries to extend the relation by extend_by blocks. Depending on the
                                997                 :                :  * availability of resources the relation may end up being extended by a
                                998                 :                :  * smaller number of pages (unless an error is thrown, always by at least one
                                999                 :                :  * page). *extended_by is updated to the number of pages the relation has been
                                1000                 :                :  * extended by.
                               1001                 :                :  *
                               1002                 :                :  * buffers needs to be an array that is at least extend_by long. Upon
                                1003                 :                :  * completion, the first *extended_by array elements will each point to a
                                1004                 :                :  * pinned buffer.
                               1005                 :                :  *
                               1006                 :                :  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
                               1007                 :                :  * locked. This is useful for callers that want a buffer that is guaranteed to
                               1008                 :                :  * be empty.
                               1009                 :                :  */
                               1010                 :                : BlockNumber
  986 tmunro@postgresql.or     1011                 :         210224 : ExtendBufferedRelBy(BufferManagerRelation bmr,
                               1012                 :                :                     ForkNumber fork,
                               1013                 :                :                     BufferAccessStrategy strategy,
                               1014                 :                :                     uint32 flags,
                               1015                 :                :                     uint32 extend_by,
                               1016                 :                :                     Buffer *buffers,
                               1017                 :                :                     uint32 *extended_by)
                               1018                 :                : {
                               1019         [ -  + ]:         210224 :     Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
  196 alvherre@kurilemu.de     1020   [ -  +  -  - ]:GNC      210224 :     Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
 1126 andres@anarazel.de       1021         [ -  + ]:CBC      210224 :     Assert(extend_by > 0);
                               1022                 :                : 
  196 alvherre@kurilemu.de     1023         [ +  - ]:GNC      210224 :     if (bmr.relpersistence == '\0')
  986 tmunro@postgresql.or     1024                 :CBC      210224 :         bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
                               1025                 :                : 
                               1026                 :         210224 :     return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
                               1027                 :                :                                    extend_by, InvalidBlockNumber,
                               1028                 :                :                                    buffers, extended_by);
                               1029                 :                : }
                               1030                 :                : 
                               1031                 :                : /*
                               1032                 :                :  * Extend the relation so it is at least extend_to blocks large, return buffer
                               1033                 :                :  * (extend_to - 1).
                               1034                 :                :  *
                               1035                 :                :  * This is useful for callers that want to write a specific page, regardless
                               1036                 :                :  * of the current size of the relation (e.g. useful for visibilitymap and for
                               1037                 :                :  * crash recovery).
                               1038                 :                :  */
                               1039                 :                : Buffer
                               1040                 :          55973 : ExtendBufferedRelTo(BufferManagerRelation bmr,
                               1041                 :                :                     ForkNumber fork,
                               1042                 :                :                     BufferAccessStrategy strategy,
                               1043                 :                :                     uint32 flags,
                               1044                 :                :                     BlockNumber extend_to,
                               1045                 :                :                     ReadBufferMode mode)
                               1046                 :                : {
                               1047                 :                :     BlockNumber current_size;
 1126 andres@anarazel.de       1048                 :          55973 :     uint32      extended_by = 0;
                               1049                 :          55973 :     Buffer      buffer = InvalidBuffer;
                               1050                 :                :     Buffer      buffers[64];
                               1051                 :                : 
  986 tmunro@postgresql.or     1052         [ -  + ]:          55973 :     Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
  196 alvherre@kurilemu.de     1053   [ +  +  -  + ]:GNC       55973 :     Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
 1126 andres@anarazel.de       1054   [ +  -  -  + ]:CBC       55973 :     Assert(extend_to != InvalidBlockNumber && extend_to > 0);
                               1055                 :                : 
  196 alvherre@kurilemu.de     1056         [ +  + ]:GNC       55973 :     if (bmr.relpersistence == '\0')
  986 tmunro@postgresql.or     1057                 :CBC        9452 :         bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
                               1058                 :                : 
                               1059                 :                :     /*
                               1060                 :                :      * If desired, create the file if it doesn't exist.  If
                               1061                 :                :      * smgr_cached_nblocks[fork] is positive then it must exist, no need for
                               1062                 :                :      * an smgrexists call.
                               1063                 :                :      */
 1126 andres@anarazel.de       1064         [ +  + ]:          55973 :     if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
  196 alvherre@kurilemu.de     1065   [ +  -  +  + ]:GNC        9452 :         (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
                               1066   [ +  -  -  + ]:             33 :          BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
                               1067   [ +  -  +  + ]:           9419 :         !smgrexists(BMR_GET_SMGR(bmr), fork))
                               1068                 :                :     {
  986 tmunro@postgresql.or     1069                 :CBC        9382 :         LockRelationForExtension(bmr.rel, ExclusiveLock);
                               1070                 :                : 
                               1071                 :                :         /* recheck, fork might have been created concurrently */
  196 alvherre@kurilemu.de     1072   [ +  -  +  + ]:GNC        9382 :         if (!smgrexists(BMR_GET_SMGR(bmr), fork))
                               1073         [ +  - ]:           9379 :             smgrcreate(BMR_GET_SMGR(bmr), fork, flags & EB_PERFORMING_RECOVERY);
                               1074                 :                : 
  986 tmunro@postgresql.or     1075                 :CBC        9382 :         UnlockRelationForExtension(bmr.rel, ExclusiveLock);
                               1076                 :                :     }
                               1077                 :                : 
                               1078                 :                :     /*
                               1079                 :                :      * If requested, invalidate size cache, so that smgrnblocks asks the
                               1080                 :                :      * kernel.
                               1081                 :                :      */
 1126 andres@anarazel.de       1082         [ +  + ]:          55973 :     if (flags & EB_CLEAR_SIZE_CACHE)
  196 alvherre@kurilemu.de     1083         [ +  - ]:GNC        9452 :         BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
                               1084                 :                : 
                               1085                 :                :     /*
                               1086                 :                :      * Estimate how many pages we'll need to extend by. This avoids acquiring
                               1087                 :                :      * unnecessarily many victim buffers.
                               1088                 :                :      */
                               1089         [ +  + ]:          55973 :     current_size = smgrnblocks(BMR_GET_SMGR(bmr), fork);
                               1090                 :                : 
                               1091                 :                :     /*
                               1092                 :                :      * Since no-one else can be looking at the page contents yet, there is no
                               1093                 :                :      * difference between an exclusive lock and a cleanup-strength lock. Note
                               1094                 :                :      * that we pass the original mode to ReadBuffer_common() below, when
                                1095                 :                :      * falling back to reading the buffer due to a concurrent relation extension.
                               1096                 :                :      */
 1117 andres@anarazel.de       1097   [ +  +  -  + ]:CBC       55973 :     if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
 1126                          1098                 :          46129 :         flags |= EB_LOCK_TARGET;
                               1099                 :                : 
                               1100         [ +  + ]:         114142 :     while (current_size < extend_to)
                               1101                 :                :     {
                               1102                 :          58169 :         uint32      num_pages = lengthof(buffers);
                               1103                 :                :         BlockNumber first_block;
                               1104                 :                : 
                               1105         [ +  + ]:          58169 :         if ((uint64) current_size + num_pages > extend_to)
                               1106                 :          58103 :             num_pages = extend_to - current_size;
                               1107                 :                : 
  986 tmunro@postgresql.or     1108                 :          58169 :         first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
                               1109                 :                :                                               num_pages, extend_to,
                               1110                 :                :                                               buffers, &extended_by);
                               1111                 :                : 
 1126 andres@anarazel.de       1112                 :          58169 :         current_size = first_block + extended_by;
                               1113   [ -  +  -  - ]:          58169 :         Assert(num_pages != 0 || current_size >= extend_to);
                               1114                 :                : 
  959 peter@eisentraut.org     1115         [ +  + ]:         125169 :         for (uint32 i = 0; i < extended_by; i++)
                               1116                 :                :         {
 1126 andres@anarazel.de       1117         [ +  + ]:          67000 :             if (first_block + i != extend_to - 1)
                               1118                 :          11036 :                 ReleaseBuffer(buffers[i]);
                               1119                 :                :             else
                               1120                 :          55964 :                 buffer = buffers[i];
                               1121                 :                :         }
                               1122                 :                :     }
                               1123                 :                : 
                               1124                 :                :     /*
                               1125                 :                :      * It's possible that another backend concurrently extended the relation.
                               1126                 :                :      * In that case read the buffer.
                               1127                 :                :      *
                               1128                 :                :      * XXX: Should we control this via a flag?
                               1129                 :                :      */
                               1130         [ +  + ]:          55973 :     if (buffer == InvalidBuffer)
                               1131                 :                :     {
                               1132         [ -  + ]:              9 :         Assert(extended_by == 0);
  196 alvherre@kurilemu.de     1133         [ +  - ]:GNC           9 :         buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
                               1134                 :                :                                    fork, extend_to - 1, mode, strategy);
                               1135                 :                :     }
                               1136                 :                : 
 1126 andres@anarazel.de       1137                 :CBC       55973 :     return buffer;
                               1138                 :                : }
                               1139                 :                : 
                               1140                 :                : /*
                               1141                 :                :  * Lock and optionally zero a buffer, as part of the implementation of
                               1142                 :                :  * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK.  The buffer must be already
                               1143                 :                :  * pinned.  If the buffer is not already valid, it is zeroed and made valid.
                               1144                 :                :  */
                               1145                 :                : static void
  694 tmunro@postgresql.or     1146                 :         358133 : ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
                               1147                 :                : {
                               1148                 :                :     BufferDesc *bufHdr;
                               1149                 :                :     bool        need_to_zero;
                               1150                 :         358133 :     bool        isLocalBuf = BufferIsLocal(buffer);
                               1151                 :                :     StartBufferIOResult sbres;
                               1152                 :                : 
  762                          1153   [ +  +  -  + ]:         358133 :     Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
                               1154                 :                : 
  694                          1155         [ +  + ]:         358133 :     if (already_valid)
                               1156                 :                :     {
                               1157                 :                :         /*
                               1158                 :                :          * If the caller already knew the buffer was valid, we can skip some
                               1159                 :                :          * header interaction.  The caller just wants to lock the buffer.
                               1160                 :                :          */
                               1161                 :          38290 :         need_to_zero = false;
                               1162                 :                :     }
                               1163                 :                :     else
                               1164                 :                :     {
   39 andres@anarazel.de       1165         [ +  + ]:GNC      319843 :         if (isLocalBuf)
                               1166                 :                :         {
                               1167                 :                :             /* Simple case for non-shared buffers. */
                               1168                 :             45 :             bufHdr = GetLocalBufferDescriptor(-buffer - 1);
                               1169                 :             45 :             sbres = StartLocalBufferIO(bufHdr, true, true, NULL);
                               1170                 :                :         }
                               1171                 :                :         else
                               1172                 :                :         {
                               1173                 :                :             /*
                               1174                 :                :              * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
                               1175                 :                :              * concurrently.  Even though we aren't doing I/O, that ensures
                               1176                 :                :              * that we don't zero a page that someone else has pinned.  An
                               1177                 :                :              * exclusive content lock wouldn't be enough, because readers are
                               1178                 :                :              * allowed to drop the content lock after determining that a tuple
                               1179                 :                :              * is visible (see buffer access rules in README).
                               1180                 :                :              */
                               1181                 :         319798 :             bufHdr = GetBufferDescriptor(buffer - 1);
                               1182                 :         319798 :             sbres = StartSharedBufferIO(bufHdr, true, true, NULL);
                               1183                 :                :         }
                               1184                 :                : 
                               1185         [ -  + ]:         319843 :         Assert(sbres != BUFFER_IO_IN_PROGRESS);
                               1186                 :         319843 :         need_to_zero = sbres == BUFFER_IO_READY_FOR_IO;
                               1187                 :                :     }
                               1188                 :                : 
  694 tmunro@postgresql.or     1189         [ +  + ]:CBC      358133 :     if (need_to_zero)
                               1190                 :                :     {
                               1191                 :         319843 :         memset(BufferGetPage(buffer), 0, BLCKSZ);
                               1192                 :                : 
                               1193                 :                :         /*
                               1194                 :                :          * Grab the buffer content lock before marking the page as valid, to
                               1195                 :                :          * make sure that no other backend sees the zeroed page before the
                               1196                 :                :          * caller has had a chance to initialize it.
                               1197                 :                :          *
                               1198                 :                :          * Since no-one else can be looking at the page contents yet, there is
                               1199                 :                :          * no difference between an exclusive lock and a cleanup-strength
                               1200                 :                :          * lock. (Note that we cannot use LockBufferForCleanup() here,
                               1201                 :                :          * because it asserts that the buffer is already valid; the plain
                               1202                 :                :          * LockBuffer() call below is made before BM_VALID gets set.)
                               1203                 :                :          */
                               1204         [ +  + ]:         319843 :         if (!isLocalBuf)
  209 andres@anarazel.de       1205                 :GNC      319798 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
                               1206                 :                : 
                               1207                 :                :         /* Set BM_VALID, terminate IO, and wake up any waiters */
  694 tmunro@postgresql.or     1208         [ +  + ]:CBC      319843 :         if (isLocalBuf)
  401 andres@anarazel.de       1209                 :             45 :             TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
                               1210                 :                :         else
                               1211                 :         319798 :             TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
                               1212                 :                :     }
  694 tmunro@postgresql.or     1213         [ +  + ]:          38290 :     else if (!isLocalBuf)
                               1214                 :                :     {
                               1215                 :                :         /*
                               1216                 :                :          * The buffer is valid, so we can't zero it.  The caller still expects
                               1217                 :                :          * the page to be locked on return.
                               1218                 :                :          */
                               1219         [ +  + ]:          38260 :         if (mode == RBM_ZERO_AND_LOCK)
                               1220                 :          38114 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
                               1221                 :                :         else
                               1222                 :            146 :             LockBufferForCleanup(buffer);
                               1223                 :                :     }
  762                          1224                 :         358133 : }
                               1225                 :                : 
                               1226                 :                : /*
                               1227                 :                :  * Pin a buffer for a given block.  *foundPtr is set to true if the block was
                               1228                 :                :  * already present, or false if more work is required to either read it in or
                               1229                 :                :  * zero it.
                                                    :                :  *
                                                    :                :  * Temporary relations (persistence == RELPERSISTENCE_TEMP) are handled by
                                                    :                :  * LocalBufferAlloc(), everything else by BufferAlloc(); strategy and
                                                    :                :  * io_context are only passed on to the latter.  blockNum must not be P_NEW.
                                                    :                :  *
                                                    :                :  * rel may be NULL, in which case the per-relation read counter is not
                                                    :                :  * bumped.  When the block was found, the hit is reported through
                                                    :                :  * TrackBufferHit() using io_object and io_context.
                                                    :                :  *
                                                    :                :  * Returns the Buffer corresponding to the descriptor handed back by the
                                                    :                :  * allocation routine.
                               1230                 :                :  */
                               1231                 :                : static pg_attribute_always_inline Buffer
                               1232                 :       84621197 : PinBufferForBlock(Relation rel,
                               1233                 :                :                   SMgrRelation smgr,
                               1234                 :                :                   char persistence,
                               1235                 :                :                   ForkNumber forkNum,
                               1236                 :                :                   BlockNumber blockNum,
                               1237                 :                :                   BufferAccessStrategy strategy,
                               1238                 :                :                   IOObject io_object,
                               1239                 :                :                   IOContext io_context,
                               1240                 :                :                   bool *foundPtr)
                               1241                 :                : {
                               1242                 :                :     BufferDesc *bufHdr;
                               1243                 :                : 
                               1244         [ -  + ]:       84621197 :     Assert(blockNum != P_NEW);
                               1245                 :                : 
                               1246                 :                :     /* The caller must already have resolved the persistence to a real value */
  654 noah@leadboat.com        1247   [ +  +  +  +  :       84621197 :     Assert((persistence == RELPERSISTENCE_TEMP ||
                                               -  + ]
                               1248                 :                :             persistence == RELPERSISTENCE_PERMANENT ||
                               1249                 :                :             persistence == RELPERSISTENCE_UNLOGGED));
                               1250                 :                : 
                               1251                 :                :     TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
                               1252                 :                :                                        smgr->smgr_rlocator.locator.spcOid,
                               1253                 :                :                                        smgr->smgr_rlocator.locator.dbOid,
                               1254                 :                :                                        smgr->smgr_rlocator.locator.relNumber,
                               1255                 :                :                                        smgr->smgr_rlocator.backend);
                               1256                 :                : 
  762 tmunro@postgresql.or     1257         [ +  + ]:       84621197 :     if (persistence == RELPERSISTENCE_TEMP)
                               1258                 :        1912003 :         bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
                               1259                 :                :     else
                               1260                 :       82709194 :         bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
                               1261                 :                :                              strategy, foundPtr, io_context);
                               1262                 :                : 
   40 andres@anarazel.de       1263         [ +  + ]:GNC    84621189 :     if (*foundPtr)
                               1264                 :       82645703 :         TrackBufferHit(io_object, io_context, rel, persistence, smgr, forkNum, blockNum);
                               1265                 :                : 
  762 tmunro@postgresql.or     1266         [ +  + ]:CBC    84621189 :     if (rel)
                               1267                 :                :     {
                               1268                 :                :         /*
                               1269                 :                :          * While pgBufferUsage's "read" counter isn't bumped unless we reach
                               1270                 :                :          * WaitReadBuffers() (so, not for hits, and not for buffers that are
                               1271                 :                :          * zeroed instead), the per-relation stats always count them.
                               1272                 :                :          */
                               1273   [ +  +  +  +  :       78389713 :         pgstat_count_buffer_read(rel);
                                               +  + ]
                               1274                 :                :     }
                               1275                 :                : 
                               1276                 :       84621189 :     return BufferDescriptorGetBuffer(bufHdr);
                               1277                 :                : }
                               1278                 :                : 
                               1279                 :                : /*
                               1280                 :                :  * ReadBuffer_common -- common logic for all ReadBuffer variants
                               1281                 :                :  *
                               1282                 :                :  * smgr is required, rel is optional unless using P_NEW.
                                                    :                :  *
                                                    :                :  * The relation's persistence is taken from rel->rd_rel->relpersistence when
                                                    :                :  * rel is given, and from smgr_persistence otherwise.
                                                    :                :  *
                                                    :                :  * Three paths exist:
                                                    :                :  * - blockNum == P_NEW is forwarded to ExtendBufferedRel() and returns its
                                                    :                :  *   result;
                                                    :                :  * - RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK pin the block with
                                                    :                :  *   PinBufferForBlock() and hand it to ZeroAndLockBuffer(), without going
                                                    :                :  *   through StartReadBuffer();
                                                    :                :  * - every other mode fills in a ReadBuffersOperation and performs a
                                                    :                :  *   single-block StartReadBuffer(), followed by WaitReadBuffers() if
                                                    :                :  *   StartReadBuffer() returned true.
                               1283                 :                :  */
                               1284                 :                : static pg_attribute_always_inline Buffer
                               1285                 :       80120771 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
                               1286                 :                :                   ForkNumber forkNum,
                               1287                 :                :                   BlockNumber blockNum, ReadBufferMode mode,
                               1288                 :                :                   BufferAccessStrategy strategy)
                               1289                 :                : {
                               1290                 :                :     ReadBuffersOperation operation;
                               1291                 :                :     Buffer      buffer;
                               1292                 :                :     int         flags;
                               1293                 :                :     char        persistence;
                               1294                 :                : 
                               1295                 :                :     /*
                               1296                 :                :      * Backward compatibility path.  Most code should use ExtendBufferedRel()
                               1297                 :                :      * instead, as acquiring the extension lock inside ExtendBufferedRel()
                               1298                 :                :      * scales a lot better.
                               1299                 :                :      */
                               1300         [ +  + ]:       80120771 :     if (unlikely(blockNum == P_NEW))
                               1301                 :                :     {
                               1302                 :            322 :         uint32      flags = EB_SKIP_EXTENSION_LOCK;
                               1303                 :                : 
                               1304                 :                :         /*
                               1305                 :                :          * Since no-one else can be looking at the page contents yet, there is
                               1306                 :                :          * no difference between an exclusive lock and a cleanup-strength
                               1307                 :                :          * lock.
                                                    :                :          *
                                                    :                :          * NOTE(review): the uint32 "flags" declared in this block shadows
                                                    :                :          * the function-level int "flags"; this is harmless because the
                                                    :                :          * block returns, but keep it in mind when editing.
                               1308                 :                :          */
                               1309   [ +  -  -  + ]:            322 :         if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
  762 tmunro@postgresql.or     1310                 :UBC           0 :             flags |= EB_LOCK_FIRST;
                               1311                 :                : 
  762 tmunro@postgresql.or     1312                 :CBC         322 :         return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
                               1313                 :                :     }
                               1314                 :                : 
  654 noah@leadboat.com        1315         [ +  + ]:       80120449 :     if (rel)
                               1316                 :       74155064 :         persistence = rel->rd_rel->relpersistence;
                               1317                 :                :     else
                               1318                 :        5965385 :         persistence = smgr_persistence;
                               1319                 :                : 
  762 tmunro@postgresql.or     1320   [ +  +  +  +  :       80120449 :     if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
                                               +  + ]
                               1321                 :                :                  mode == RBM_ZERO_AND_LOCK))
                               1322                 :                :     {
                               1323                 :                :         bool        found;
                               1324                 :                :         IOContext   io_context;
                               1325                 :                :         IOObject    io_object;
                               1326                 :                : 
   40 andres@anarazel.de       1327         [ +  + ]:GNC      358133 :         if (persistence == RELPERSISTENCE_TEMP)
                               1328                 :                :         {
                               1329                 :             75 :             io_context = IOCONTEXT_NORMAL;
                               1330                 :             75 :             io_object = IOOBJECT_TEMP_RELATION;
                               1331                 :                :         }
                               1332                 :                :         else
                               1333                 :                :         {
                               1334                 :         358058 :             io_context = IOContextForStrategy(strategy);
                               1335                 :         358058 :             io_object = IOOBJECT_RELATION;
                               1336                 :                :         }
                               1337                 :                : 
  654 noah@leadboat.com        1338                 :CBC      358133 :         buffer = PinBufferForBlock(rel, smgr, persistence,
                               1339                 :                :                                    forkNum, blockNum, strategy,
                               1340                 :                :                                    io_object, io_context, &found);
  694 tmunro@postgresql.or     1341                 :         358133 :         ZeroAndLockBuffer(buffer, mode, found);
  762                          1342                 :         358133 :         return buffer;
                               1343                 :                :     }
                               1344                 :                : 
                               1345                 :                :     /*
                               1346                 :                :      * Signal that we are going to immediately wait.  If we're immediately
                               1347                 :                :      * waiting, there is no benefit in actually executing the IO
                               1348                 :                :      * asynchronously; it would just add dispatch overhead.
                               1349                 :                :      */
  401 andres@anarazel.de       1350                 :       79762316 :     flags = READ_BUFFERS_SYNCHRONOUSLY;
  762 tmunro@postgresql.or     1351         [ +  + ]:       79762316 :     if (mode == RBM_ZERO_ON_ERROR)
  401 andres@anarazel.de       1352                 :        1394192 :         flags |= READ_BUFFERS_ZERO_ON_ERROR;
  762 tmunro@postgresql.or     1353                 :       79762316 :     operation.smgr = smgr;
                               1354                 :       79762316 :     operation.rel = rel;
  654 noah@leadboat.com        1355                 :       79762316 :     operation.persistence = persistence;
  762 tmunro@postgresql.or     1356                 :       79762316 :     operation.forknum = forkNum;
                               1357                 :       79762316 :     operation.strategy = strategy;
                               1358         [ +  + ]:       79762316 :     if (StartReadBuffer(&operation,
                               1359                 :                :                         &buffer,
                               1360                 :                :                         blockNum,
                               1361                 :                :                         flags))
                               1362                 :         776120 :         WaitReadBuffers(&operation);
                               1363                 :                : 
                               1364                 :       79762289 :     return buffer;
                               1365                 :                : }
                               1366                 :                : 
                               1367                 :                : static pg_attribute_always_inline bool
                               1368                 :       84082024 : StartReadBuffersImpl(ReadBuffersOperation *operation,
                               1369                 :                :                      Buffer *buffers,
                               1370                 :                :                      BlockNumber blockNum,
                               1371                 :                :                      int *nblocks,
                               1372                 :                :                      int flags,
                               1373                 :                :                      bool allow_forwarding)
                               1374                 :                : {
                               1375                 :       84082024 :     int         actual_nblocks = *nblocks;
  574 andres@anarazel.de       1376                 :       84082024 :     int         maxcombine = 0;
                               1377                 :                :     bool        did_start_io;
                               1378                 :                :     IOContext   io_context;
                               1379                 :                :     IOObject    io_object;
                               1380                 :                : 
  410 tmunro@postgresql.or     1381   [ +  +  -  + ]:       84082024 :     Assert(*nblocks == 1 || allow_forwarding);
  762                          1382         [ -  + ]:       84082024 :     Assert(*nblocks > 0);
                               1383         [ -  + ]:       84082024 :     Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
                               1384                 :                : 
   40 andres@anarazel.de       1385         [ +  + ]:GNC    84082024 :     if (operation->persistence == RELPERSISTENCE_TEMP)
                               1386                 :                :     {
                               1387                 :        1903712 :         io_context = IOCONTEXT_NORMAL;
                               1388                 :        1903712 :         io_object = IOOBJECT_TEMP_RELATION;
                               1389                 :                :     }
                               1390                 :                :     else
                               1391                 :                :     {
                               1392                 :       82178312 :         io_context = IOContextForStrategy(operation->strategy);
                               1393                 :       82178312 :         io_object = IOOBJECT_RELATION;
                               1394                 :                :     }
                               1395                 :                : 
  762 tmunro@postgresql.or     1396         [ +  + ]:CBC    85737707 :     for (int i = 0; i < actual_nblocks; ++i)
                               1397                 :                :     {
                               1398                 :                :         bool        found;
                               1399                 :                : 
  410                          1400   [ +  +  +  + ]:       84265307 :         if (allow_forwarding && buffers[i] != InvalidBuffer)
                               1401                 :           2243 :         {
                               1402                 :                :             BufferDesc *bufHdr;
                               1403                 :                : 
                               1404                 :                :             /*
                               1405                 :                :              * This is a buffer that was pinned by an earlier call to
                               1406                 :                :              * StartReadBuffers(), but couldn't be handled in one operation at
                               1407                 :                :              * that time.  The operation was split, and the caller has passed
                               1408                 :                :              * an already pinned buffer back to us to handle the rest of the
                               1409                 :                :              * operation.  It must continue at the expected block number.
                               1410                 :                :              */
                               1411         [ -  + ]:           2243 :             Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
                               1412                 :                : 
                               1413                 :                :             /*
                               1414                 :                :              * It might be an already valid buffer (a hit) that followed the
                               1415                 :                :              * final contiguous block of an earlier I/O (a miss) marking the
                               1416                 :                :              * end of it, or a buffer that some other backend has since made
                               1417                 :                :              * valid by performing the I/O for us, in which case we can handle
                               1418                 :                :              * it as a hit now.  It is safe to check for a BM_VALID flag with
                               1419                 :                :              * a relaxed load, because we got a fresh view of it while pinning
                               1420                 :                :              * it in the previous call.
                               1421                 :                :              *
                               1422                 :                :              * On the other hand if we don't see BM_VALID yet, it must be an
                               1423                 :                :              * I/O that was split by the previous call and we need to try to
                               1424                 :                :              * start a new I/O from this block.  We're also racing against any
                               1425                 :                :              * other backend that might start the I/O or even manage to mark
                               1426                 :                :              * it BM_VALID after this check, but StartBufferIO() will handle
                               1427                 :                :              * those cases.
                               1428                 :                :              */
                               1429         [ +  + ]:           2243 :             if (BufferIsLocal(buffers[i]))
                               1430                 :             26 :                 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
                               1431                 :                :             else
                               1432                 :           2217 :                 bufHdr = GetBufferDescriptor(buffers[i] - 1);
  110 andres@anarazel.de       1433         [ -  + ]:GNC        2243 :             Assert(pg_atomic_read_u64(&bufHdr->state) & BM_TAG_VALID);
                               1434                 :           2243 :             found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
                               1435                 :                :         }
                               1436                 :                :         else
                               1437                 :                :         {
  410 tmunro@postgresql.or     1438                 :CBC    84263056 :             buffers[i] = PinBufferForBlock(operation->rel,
  410 tmunro@postgresql.or     1439                 :ECB  (57353754) :                                            operation->smgr,
  410 tmunro@postgresql.or     1440                 :CBC    84263064 :                                            operation->persistence,
                               1441                 :                :                                            operation->forknum,
                               1442                 :                :                                            blockNum + i,
                               1443                 :                :                                            operation->strategy,
                               1444                 :                :                                            io_object, io_context,
                               1445                 :                :                                            &found);
                               1446                 :                :         }
                               1447                 :                : 
  762                          1448         [ +  + ]:       84265299 :         if (found)
                               1449                 :                :         {
                               1450                 :                :             /*
                               1451                 :                :              * We have a hit.  If it's the first block in the requested range,
                               1452                 :                :              * we can return it immediately and report that WaitReadBuffers()
                               1453                 :                :              * does not need to be called.  If the initial value of *nblocks
                               1454                 :                :              * was larger, the caller will have to call again for the rest.
                               1455                 :                :              */
  410                          1456         [ +  + ]:       82609616 :             if (i == 0)
                               1457                 :                :             {
                               1458                 :       82607410 :                 *nblocks = 1;
                               1459                 :                : 
                               1460                 :                : #ifdef USE_ASSERT_CHECKING
                               1461                 :                : 
                               1462                 :                :                 /*
                               1463                 :                :                  * Initialize enough of ReadBuffersOperation to make
                               1464                 :                :                  * CheckReadBuffersOperation() work. Outside of assertions
                               1465                 :                :                  * that's not necessary when no IO is issued.
                               1466                 :                :                  */
  401 andres@anarazel.de       1467                 :       82607410 :                 operation->buffers = buffers;
                               1468                 :       82607410 :                 operation->blocknum = blockNum;
                               1469                 :       82607410 :                 operation->nblocks = 1;
                               1470                 :       82607410 :                 operation->nblocks_done = 1;
                               1471                 :       82607410 :                 CheckReadBuffersOperation(operation, true);
                               1472                 :                : #endif
  410 tmunro@postgresql.or     1473                 :       82607410 :                 return false;
                               1474                 :                :             }
                               1475                 :                : 
                               1476                 :                :             /*
                               1477                 :                :              * Otherwise we already have an I/O to perform, but this block
                               1478                 :                :              * can't be included as it is already valid.  Split the I/O here.
                               1479                 :                :              * There may or may not be more blocks requiring I/O after this
                               1480                 :                :              * one, we haven't checked, but they can't be contiguous with this
                               1481                 :                :              * one in the way.  We'll leave this buffer pinned, forwarding it
                               1482                 :                :              * to the next call, avoiding the need to unpin it here and re-pin
                               1483                 :                :              * it in the next call.
                               1484                 :                :              */
                               1485                 :           2206 :             actual_nblocks = i;
  762                          1486                 :           2206 :             break;
                               1487                 :                :         }
                               1488                 :                :         else
                               1489                 :                :         {
                               1490                 :                :             /*
                               1491                 :                :              * Check how many blocks we can cover with the same IO. The smgr
                               1492                 :                :              * implementation might e.g. be limited due to a segment boundary.
                               1493                 :                :              */
  574 andres@anarazel.de       1494   [ +  +  +  + ]:        1655683 :             if (i == 0 && actual_nblocks > 1)
                               1495                 :                :             {
                               1496                 :          37118 :                 maxcombine = smgrmaxcombine(operation->smgr,
                               1497                 :                :                                             operation->forknum,
                               1498                 :                :                                             blockNum);
                               1499         [ -  + ]:          37118 :                 if (unlikely(maxcombine < actual_nblocks))
                               1500                 :                :                 {
  574 andres@anarazel.de       1501         [ #  # ]:UBC           0 :                     elog(DEBUG2, "limiting nblocks at %u from %u to %u",
                               1502                 :                :                          blockNum, actual_nblocks, maxcombine);
                               1503                 :              0 :                     actual_nblocks = maxcombine;
                               1504                 :                :                 }
                               1505                 :                :             }
                               1506                 :                :         }
                               1507                 :                :     }
  762 tmunro@postgresql.or     1508                 :CBC     1474606 :     *nblocks = actual_nblocks;
                               1509                 :                : 
                               1510                 :                :     /* Populate information needed for I/O. */
                               1511                 :        1474606 :     operation->buffers = buffers;
                               1512                 :        1474606 :     operation->blocknum = blockNum;
                               1513                 :        1474606 :     operation->flags = flags;
                               1514                 :        1474606 :     operation->nblocks = actual_nblocks;
  401 andres@anarazel.de       1515                 :        1474606 :     operation->nblocks_done = 0;
                               1516                 :        1474606 :     pgaio_wref_clear(&operation->io_wref);
                               1517                 :                : 
                               1518                 :                :     /*
                               1519                 :                :      * When using AIO, start the IO in the background. If not, issue prefetch
                               1520                 :                :      * requests if desired by the caller.
                               1521                 :                :      *
                               1522                 :                :      * The reason we have a dedicated path for IOMETHOD_SYNC here is to
                               1523                 :                :      * de-risk the introduction of AIO somewhat. It's a large architectural
                               1524                 :                :      * change, with lots of chances for unanticipated performance effects.
                               1525                 :                :      *
                               1526                 :                :      * Use of IOMETHOD_SYNC already leads to not actually performing IO
                               1527                 :                :      * asynchronously, but without the check here we'd execute IO earlier than
                               1528                 :                :      * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
                               1529                 :                :      */
                               1530         [ +  + ]:        1474606 :     if (io_method != IOMETHOD_SYNC)
                               1531                 :                :     {
                               1532                 :                :         /*
                               1533                 :                :          * Try to start IO asynchronously. It's possible that no IO needs to
                               1534                 :                :          * be started, if another backend already performed the IO.
                               1535                 :                :          *
                               1536                 :                :          * Note that if an IO is started, it might not cover the entire
                               1537                 :                :          * requested range, e.g. because an intermediary block has been read
                               1538                 :                :          * in by another backend.  In that case any "trailing" buffers we
                               1539                 :                :          * already pinned above will be "forwarded" by read_stream.c to the
                               1540                 :                :          * next call to StartReadBuffers().
                               1541                 :                :          *
                               1542                 :                :          * This is signalled to the caller by decrementing *nblocks *and*
                               1543                 :                :          * reducing operation->nblocks. The latter is done here, but not below
                               1544                 :                :          * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
                               1545                 :                :          * overall read size anymore, we need to retry until done in its
                               1546                 :                :          * entirety or until failed.
                               1547                 :                :          */
                               1548                 :        1473230 :         did_start_io = AsyncReadBuffers(operation, nblocks);
                               1549                 :                : 
                               1550                 :        1473215 :         operation->nblocks = *nblocks;
                               1551                 :                :     }
                               1552                 :                :     else
                               1553                 :                :     {
                               1554                 :           1376 :         operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
                               1555                 :                : 
                               1556         [ +  + ]:           1376 :         if (flags & READ_BUFFERS_ISSUE_ADVICE)
                               1557                 :                :         {
                               1558                 :                :             /*
                               1559                 :                :              * In theory we should only do this if PinBufferForBlock() had to
                               1560                 :                :              * allocate new buffers above.  That way, if two calls to
                               1561                 :                :              * StartReadBuffers() were made for the same blocks before
                               1562                 :                :              * WaitReadBuffers(), only the first would issue the advice.
                               1563                 :                :              * That'd be a better simulation of true asynchronous I/O, which
                               1564                 :                :              * would only start the I/O once, but isn't done here for
                               1565                 :                :              * simplicity.
                               1566                 :                :              */
                               1567                 :             19 :             smgrprefetch(operation->smgr,
                               1568                 :                :                          operation->forknum,
                               1569                 :                :                          blockNum,
                               1570                 :                :                          actual_nblocks);
                               1571                 :                :         }
                               1572                 :                : 
                               1573                 :                :         /*
                               1574                 :                :          * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
                               1575                 :                :          * will initiate the necessary IO.
                               1576                 :                :          */
                               1577                 :           1376 :         did_start_io = true;
                               1578                 :                :     }
                               1579                 :                : 
                               1580                 :        1474591 :     CheckReadBuffersOperation(operation, !did_start_io);
                               1581                 :                : 
                               1582                 :        1474591 :     return did_start_io;
                               1583                 :                : }
                               1584                 :                : 
                               1585                 :                : /*
                               1586                 :                :  * Begin reading a range of blocks beginning at blockNum and extending for
                               1587                 :                :  * *nblocks.  *nblocks and the buffers array are in/out parameters.  On entry,
                               1588                 :                :  * the buffers elements covered by *nblocks must hold either InvalidBuffer or
                               1589                 :                :  * buffers forwarded by an earlier call to StartReadBuffers() that was split
                               1590                 :                :  * and is now being continued.  On return, *nblocks holds the number of blocks
                               1591                 :                :  * accepted by this operation.  If it is less than the original number then
                               1592                 :                :  * this operation has been split, but buffer elements up to the original
                               1593                 :                :  * requested size may hold forwarded buffers to be used for a continuing
                               1594                 :                :  * operation.  The caller must either start a new I/O beginning at the block
                               1595                 :                :  * immediately following the blocks accepted by this call and pass those
                               1596                 :                :  * buffers back in, or release them if it chooses not to.  It shouldn't make
                               1597                 :                :  * any other use of or assumptions about forwarded buffers.
                               1598                 :                :  *
                               1599                 :                :  * If false is returned, no I/O is necessary and the buffers covered by
                               1600                 :                :  * *nblocks on exit are valid and ready to be accessed.  If true is returned,
                               1601                 :                :  * an I/O has been started, and WaitReadBuffers() must be called with the same
                               1602                 :                :  * operation object before the buffers covered by *nblocks on exit can be
                               1603                 :                :  * accessed.  Along with the operation object, the caller-supplied array of
                               1604                 :                :  * buffers must remain valid until WaitReadBuffers() is called, and any
                               1605                 :                :  * forwarded buffers must also be preserved for a continuing call unless
                               1606                 :                :  * they are explicitly released.
                               1607                 :                :  */
                               1608                 :                : bool
  762 tmunro@postgresql.or     1609                 :        1786469 : StartReadBuffers(ReadBuffersOperation *operation,
                               1610                 :                :                  Buffer *buffers,
                               1611                 :                :                  BlockNumber blockNum,
                               1612                 :                :                  int *nblocks,
                               1613                 :                :                  int flags)
                               1614                 :                : {
  410                          1615                 :        1786469 :     return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
                               1616                 :                :                                 true /* expect forwarded buffers */ );
                               1617                 :                : }
                               1618                 :                : 
                               1619                 :                : /*
                               1620                 :                :  * Single-block version of StartReadBuffers().  This might save a few
                               1621                 :                :  * instructions when called from another translation unit, because it is
                               1622                 :                :  * specialized for nblocks == 1.
                               1623                 :                :  *
                               1624                 :                :  * This version does not support "forwarded" buffers: they cannot be created
                               1625                 :                :  * by reading only one block and *buffer is ignored on entry.
                               1626                 :                :  */
                               1627                 :                : bool
  762                          1628                 :       82295555 : StartReadBuffer(ReadBuffersOperation *operation,
                               1629                 :                :                 Buffer *buffer,
                               1630                 :                :                 BlockNumber blocknum,
                               1631                 :                :                 int flags)
                               1632                 :                : {
                               1633                 :       82295555 :     int         nblocks = 1;
                               1634                 :                :     bool        result;
                               1635                 :                : 
  410                          1636                 :       82295555 :     result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
                               1637                 :                :                                   false /* single block, no forwarding */ );
  762                          1638         [ -  + ]:       82295540 :     Assert(nblocks == 1);       /* single block can't be short */
                               1639                 :                : 
                               1640                 :       82295540 :     return result;
                               1641                 :                : }
                               1642                 :                : 
                               1643                 :                : /*
                               1644                 :                :  * Perform sanity checks on the ReadBuffersOperation.
                               1645                 :                :  */
                               1646                 :                : static void
  401 andres@anarazel.de       1647                 :       87030838 : CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
                               1648                 :                : {
                               1649                 :                : #ifdef USE_ASSERT_CHECKING
                               1650         [ -  + ]:       87030838 :     Assert(operation->nblocks_done <= operation->nblocks);
                               1651   [ +  +  -  + ]:       87030838 :     Assert(!is_complete || operation->nblocks == operation->nblocks_done);
                               1652                 :                : 
                               1653         [ +  + ]:      174607919 :     for (int i = 0; i < operation->nblocks; i++)
                               1654                 :                :     {
                               1655                 :       87577081 :         Buffer      buffer = operation->buffers[i];
                               1656                 :       87577081 :         BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
                               1657         [ +  + ]:       87577081 :             GetLocalBufferDescriptor(-buffer - 1) :
                               1658                 :       85643789 :             GetBufferDescriptor(buffer - 1);
                               1659                 :                : 
                               1660         [ -  + ]:       87577081 :         Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
  110 andres@anarazel.de       1661         [ -  + ]:GNC    87577081 :         Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_TAG_VALID);
                               1662                 :                : 
  401 andres@anarazel.de       1663         [ +  + ]:CBC    87577081 :         if (i < operation->nblocks_done)
  110 andres@anarazel.de       1664         [ -  + ]:GNC    84263747 :             Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_VALID);
                               1665                 :                :     }
                               1666                 :                : #endif
  401 andres@anarazel.de       1667                 :CBC    87030838 : }
                               1668                 :                : 
                               1669                 :                : /*
                               1670                 :                :  * We track various stats related to buffer hits. Because this is done in a
                               1671                 :                :  * few separate places, this helper exists for convenience.
                               1672                 :                :  */
                               1673                 :                : static pg_attribute_always_inline void
   40 andres@anarazel.de       1674                 :GNC    82648374 : TrackBufferHit(IOObject io_object, IOContext io_context,
                               1675                 :                :                Relation rel, char persistence, SMgrRelation smgr,
                               1676                 :                :                ForkNumber forknum, BlockNumber blocknum)
                               1677                 :                : {
                               1678                 :                :     TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum,
                               1679                 :                :                                       blocknum,
                               1680                 :                :                                       smgr->smgr_rlocator.locator.spcOid,
                               1681                 :                :                                       smgr->smgr_rlocator.locator.dbOid,
                               1682                 :                :                                       smgr->smgr_rlocator.locator.relNumber,
                               1683                 :                :                                       smgr->smgr_rlocator.backend,
                               1684                 :                :                                       true);
                               1685                 :                : 
                               1686         [ +  + ]:       82648374 :     if (persistence == RELPERSISTENCE_TEMP)
                               1687                 :        1900892 :         pgBufferUsage.local_blks_hit += 1;
                               1688                 :                :     else
                               1689                 :       80747482 :         pgBufferUsage.shared_blks_hit += 1;
                               1690                 :                : 
                               1691                 :       82648374 :     pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
                               1692                 :                : 
                               1693         [ +  + ]:       82648374 :     if (VacuumCostActive)
                               1694                 :         187737 :         VacuumCostBalance += VacuumCostPageHit;
                               1695                 :                : 
                               1696         [ +  + ]:       82648374 :     if (rel)
                               1697   [ +  +  +  +  :       76938798 :         pgstat_count_buffer_hit(rel);
                                              +  + ]
   40 andres@anarazel.de       1698                 :GIC    82648374 : }
                               1699                 :                : 
                               1700                 :                : /*
                               1701                 :                :  * Helper for WaitReadBuffers() that processes the results of a readv
                               1702                 :                :  * operation, raising an error if necessary.
                               1703                 :                :  */
                               1704                 :                : static void
  401 andres@anarazel.de       1705                 :CBC     1471689 : ProcessReadBuffersResult(ReadBuffersOperation *operation)
                               1706                 :                : {
                               1707                 :        1471689 :     PgAioReturn *aio_ret = &operation->io_return;
                               1708                 :        1471689 :     PgAioResultStatus rs = aio_ret->result.status;
                               1709                 :        1471689 :     int         newly_read_blocks = 0;
                               1710                 :                : 
                               1711         [ -  + ]:        1471689 :     Assert(pgaio_wref_valid(&operation->io_wref));
                               1712         [ -  + ]:        1471689 :     Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
                               1713                 :                : 
                               1714                 :                :     /*
                               1715                 :                :      * SMGR reports the number of blocks successfully read as the result of
                               1716                 :                :      * the IO operation. Thus we can simply add that to ->nblocks_done.
                               1717                 :                :      */
                               1718                 :                : 
                               1719         [ +  + ]:        1471689 :     if (likely(rs != PGAIO_RS_ERROR))
                               1720                 :        1471646 :         newly_read_blocks = aio_ret->result.result;
                               1721                 :                : 
                               1722   [ +  +  +  + ]:        1471689 :     if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
                               1723         [ +  + ]:             67 :         pgaio_result_report(aio_ret->result, &aio_ret->target_data,
                               1724                 :                :                             rs == PGAIO_RS_ERROR ? ERROR : WARNING);
                               1725         [ +  + ]:        1471622 :     else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
                               1726                 :                :     {
                               1727                 :                :         /*
                               1728                 :                :          * We'll retry, so we just emit a debug message to the server log (or
                               1729                 :                :          * not even that in prod scenarios).
                               1730                 :                :          */
                               1731                 :            163 :         pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
                               1732         [ +  - ]:            163 :         elog(DEBUG3, "partial read, will retry");
                               1733                 :                :     }
                               1734                 :                : 
                               1735         [ -  + ]:        1471646 :     Assert(newly_read_blocks > 0);
                               1736         [ -  + ]:        1471646 :     Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
                               1737                 :                : 
                               1738                 :        1471646 :     operation->nblocks_done += newly_read_blocks;
                               1739                 :                : 
                               1740         [ -  + ]:        1471646 :     Assert(operation->nblocks_done <= operation->nblocks);
                               1741                 :        1471646 : }
                               1742                 :                : 
                               1743                 :                : /*
                               1744                 :                :  * Wait for the IO operation initiated by StartReadBuffers() et al to
                               1745                 :                :  * complete.
                               1746                 :                :  *
                               1747                 :                :  * Returns true if we needed to wait for the IO operation, false otherwise.
                               1748                 :                :  */
                               1749                 :                : bool
  762 tmunro@postgresql.or     1750                 :        1473667 : WaitReadBuffers(ReadBuffersOperation *operation)
                               1751                 :                : {
  401 andres@anarazel.de       1752                 :        1473667 :     PgAioReturn *aio_ret = &operation->io_return;
                               1753                 :                :     IOContext   io_context;
                               1754                 :                :     IOObject    io_object;
   34 andres@anarazel.de       1755                 :GNC     1473667 :     bool        needed_wait = false;
                               1756                 :                : 
  401 andres@anarazel.de       1757         [ +  + ]:CBC     1473667 :     if (operation->persistence == RELPERSISTENCE_TEMP)
                               1758                 :                :     {
  762 tmunro@postgresql.or     1759                 :           2455 :         io_context = IOCONTEXT_NORMAL;
                               1760                 :           2455 :         io_object = IOOBJECT_TEMP_RELATION;
                               1761                 :                :     }
                               1762                 :                :     else
                               1763                 :                :     {
                               1764                 :        1471212 :         io_context = IOContextForStrategy(operation->strategy);
                               1765                 :        1471212 :         io_object = IOOBJECT_RELATION;
                               1766                 :                :     }
                               1767                 :                : 
                               1768                 :                :     /*
                               1769                 :                :      * If we get here without an IO operation having been issued, the
                               1770                 :                :      * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
                               1771                 :                :      * caller should not have called WaitReadBuffers().
                               1772                 :                :      *
                               1773                 :                :      * In the case of IOMETHOD_SYNC, we start - as we used to before the
                               1774                 :                :      * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
                               1775                 :                :      * of the retry logic below, no extra code is required.
                               1776                 :                :      *
                               1777                 :                :      * This path is expected to eventually go away.
                               1778                 :                :      */
  401 andres@anarazel.de       1779   [ +  +  +  - ]:        1473667 :     if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
  401 andres@anarazel.de       1780         [ #  # ]:UBC           0 :         elog(ERROR, "waiting for read operation that didn't read");
                               1781                 :                : 
                               1782                 :                :     /*
                               1783                 :                :      * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
                               1784                 :                :      * done. We may need multiple retries, not just because we could get
                               1785                 :                :      * multiple partial reads, but also because some of the remaining
                               1786                 :                :      * to-be-read buffers may have been read in by other backends, limiting
                               1787                 :                :      * the IO size.
                               1788                 :                :      */
                               1789                 :                :     while (true)
  762 tmunro@postgresql.or     1790                 :CBC        1546 :     {
                               1791                 :                :         int         ignored_nblocks_progress;
                               1792                 :                : 
  401 andres@anarazel.de       1793                 :        1475213 :         CheckReadBuffersOperation(operation, false);
                               1794                 :                : 
                               1795                 :                :         /*
                               1796                 :                :          * If there is an IO associated with the operation, we may need to
                               1797                 :                :          * wait for it.
                               1798                 :                :          */
                               1799         [ +  + ]:        1475213 :         if (pgaio_wref_valid(&operation->io_wref))
                               1800                 :                :         {
                               1801                 :                :             /*
                               1802                 :                :              * Track the time spent waiting for the IO to complete. As
                               1803                 :                :              * tracking a wait even if we don't actually need to wait
                               1804                 :                :              *
                               1805                 :                :              * a) is not cheap, due to the timestamping overhead
                               1806                 :                :              *
                               1807                 :                :              * b) reports some time as waiting, even if we never waited
                               1808                 :                :              *
                               1809                 :                :              * we first check if we already know the IO is complete.
                               1810                 :                :              *
                               1811                 :                :              * Note that operation->io_return is uninitialized for foreign IO,
                               1812                 :                :              * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check.
                               1813                 :                :              */
   39 andres@anarazel.de       1814   [ +  +  +  + ]:GNC     1473828 :             if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
  401 andres@anarazel.de       1815         [ +  + ]:CBC      670198 :                 !pgaio_wref_check_done(&operation->io_wref))
                               1816                 :         301498 :             {
                               1817                 :         301498 :                 instr_time  io_start = pgstat_prepare_io_time(track_io_timing);
                               1818                 :                : 
                               1819                 :         301498 :                 pgaio_wref_wait(&operation->io_wref);
   34 andres@anarazel.de       1820                 :GNC      301498 :                 needed_wait = true;
                               1821                 :                : 
                               1822                 :                :                 /*
                               1823                 :                :                  * The IO operation itself was already counted earlier, in
                               1824                 :                :                  * AsyncReadBuffers(), this just accounts for the wait time.
                               1825                 :                :                  */
  401 andres@anarazel.de       1826                 :CBC      301498 :                 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
                               1827                 :                :                                         io_start, 0, 0);
                               1828                 :                :             }
                               1829                 :                :             else
                               1830                 :                :             {
                               1831         [ -  + ]:        1172330 :                 Assert(pgaio_wref_check_done(&operation->io_wref));
                               1832                 :                :             }
                               1833                 :                : 
   39 andres@anarazel.de       1834         [ +  + ]:GNC     1473828 :             if (unlikely(operation->foreign_io))
                               1835                 :                :             {
                               1836                 :           2139 :                 Buffer      buffer = operation->buffers[operation->nblocks_done];
                               1837                 :           2139 :                 BufferDesc *desc = BufferIsLocal(buffer) ?
                               1838         [ +  + ]:           2139 :                     GetLocalBufferDescriptor(-buffer - 1) :
                               1839                 :           2136 :                     GetBufferDescriptor(buffer - 1);
                               1840                 :           2139 :                 uint64      buf_state = pg_atomic_read_u64(&desc->state);
                               1841                 :                : 
                               1842         [ +  + ]:           2139 :                 if (buf_state & BM_VALID)
                               1843                 :                :                 {
                               1844                 :           2135 :                     BlockNumber blocknum = operation->blocknum + operation->nblocks_done;
                               1845                 :                : 
                               1846                 :           2135 :                     operation->nblocks_done += 1;
                               1847         [ -  + ]:           2135 :                     Assert(operation->nblocks_done <= operation->nblocks);
                               1848                 :                : 
                               1849                 :                :                     /*
                               1850                 :                :                      * Track this as a 'hit' for this backend. The backend
                               1851                 :                :                      * performing the IO will track it as a 'read'.
                               1852                 :                :                      */
                               1853                 :           2135 :                     TrackBufferHit(io_object, io_context,
                               1854                 :           2135 :                                    operation->rel, operation->persistence,
                               1855                 :                :                                    operation->smgr, operation->forknum,
                               1856                 :                :                                    blocknum);
                               1857                 :                :                 }
                               1858                 :                : 
                               1859                 :                :                 /*
                               1860                 :                :                  * If the foreign IO failed and left the buffer invalid,
                               1861                 :                :                  * nblocks_done is not incremented. The retry loop below will
                               1862                 :                :                  * call AsyncReadBuffers() which will attempt the IO itself.
                               1863                 :                :                  */
                               1864                 :                :             }
                               1865                 :                :             else
                               1866                 :                :             {
                               1867                 :                :                 /*
                               1868                 :                :                  * We now are sure the IO completed. Check the results. This
                               1869                 :                :                  * includes reporting on errors if there were any.
                               1870                 :                :                  */
                               1871                 :        1471689 :                 ProcessReadBuffersResult(operation);
                               1872                 :                :             }
                               1873                 :                :         }
                               1874                 :                : 
                               1875                 :                :         /*
                               1876                 :                :          * Most of the time, the one IO we already started will read in
                               1877                 :                :          * everything.  But we need to deal with partial reads and buffers not
                               1878                 :                :          * needing IO anymore.
                               1879                 :                :          */
  401 andres@anarazel.de       1880         [ +  + ]:CBC     1475170 :         if (operation->nblocks_done == operation->nblocks)
                               1881                 :        1473624 :             break;
                               1882                 :                : 
                               1883         [ -  + ]:           1546 :         CHECK_FOR_INTERRUPTS();
                               1884                 :                : 
                               1885                 :                :         /*
                               1886                 :                :          * If the IO completed only partially, we need to perform additional
                               1887                 :                :          * work, consider that a form of having had to wait.
                               1888                 :                :          */
   34 andres@anarazel.de       1889                 :GNC        1546 :         needed_wait = true;
                               1890                 :                : 
                               1891                 :                :         /*
                               1892                 :                :          * This may only complete the IO partially, either because some
                               1893                 :                :          * buffers were already valid, or because of a partial read.
                               1894                 :                :          *
                               1895                 :                :          * NB: In contrast to after the AsyncReadBuffers() call in
                               1896                 :                :          * StartReadBuffers(), we do *not* reduce
                               1897                 :                :          * ReadBuffersOperation->nblocks here, callers expect the full
                               1898                 :                :          * operation to be completed at this point (as more operations may
                               1899                 :                :          * have been queued).
                               1900                 :                :          */
  401 andres@anarazel.de       1901                 :CBC        1546 :         AsyncReadBuffers(operation, &ignored_nblocks_progress);
                               1902                 :                :     }
                               1903                 :                : 
                               1904                 :        1473624 :     CheckReadBuffersOperation(operation, true);
                               1905                 :                : 
                               1906                 :                :     /* NB: READ_DONE tracepoint was already executed in completion callback */
   34 andres@anarazel.de       1907                 :GNC     1473624 :     return needed_wait;
  401 andres@anarazel.de       1908                 :ECB   (1251753) : }
                               1909                 :                : 
                               1910                 :                : /*
                               1911                 :                :  * Initiate IO for the ReadBuffersOperation
                               1912                 :                :  *
                               1913                 :                :  * This function only starts a single IO at a time. The size of the IO may be
                               1914                 :                :  * limited to below the to-be-read blocks, if one of the buffers has
                               1915                 :                :  * concurrently been read in. If the first to-be-read buffer is already valid,
                               1916                 :                :  * no IO will be issued.
                               1917                 :                :  *
                               1918                 :                :  * To support retries after partial reads, the first operation->nblocks_done
                               1919                 :                :  * buffers are skipped.
                               1920                 :                :  *
                               1921                 :                :  * On return *nblocks_progress is updated to reflect the number of buffers
                               1922                 :                :  * affected by the call. If the first buffer is valid, *nblocks_progress is
                               1923                 :                :  * set to 1 and operation->nblocks_done is incremented.
                               1924                 :                :  *
                               1925                 :                :  * Returns true if IO was initiated or is already in progress (foreign IO),
                               1926                 :                :  * false if the buffer was already valid.
                               1927                 :                :  */
                               1928                 :                : static bool
  401 andres@anarazel.de       1929                 :CBC     1474776 : AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
                               1930                 :                : {
                               1931                 :        1474776 :     Buffer     *buffers = &operation->buffers[0];
                               1932                 :        1474776 :     int         flags = operation->flags;
                               1933                 :        1474776 :     ForkNumber  forknum = operation->forknum;
                               1934                 :        1474776 :     char        persistence = operation->persistence;
                               1935                 :        1474776 :     int16       nblocks_done = operation->nblocks_done;
   40                          1936                 :        1474776 :     BlockNumber blocknum = operation->blocknum + nblocks_done;
  401                          1937                 :        1474776 :     Buffer     *io_buffers = &operation->buffers[nblocks_done];
                               1938                 :        1474776 :     int         io_buffers_len = 0;
                               1939                 :                :     PgAioHandle *ioh;
                               1940                 :        1474776 :     uint32      ioh_flags = 0;
                               1941                 :                :     void       *io_pages[MAX_IO_COMBINE_LIMIT];
                               1942                 :                :     IOContext   io_context;
                               1943                 :                :     IOObject    io_object;
                               1944                 :                :     instr_time  io_start;
                               1945                 :                :     StartBufferIOResult status;
                               1946                 :                : 
                               1947         [ +  + ]:        1474776 :     if (persistence == RELPERSISTENCE_TEMP)
                               1948                 :                :     {
                               1949                 :           2847 :         io_context = IOCONTEXT_NORMAL;
                               1950                 :           2847 :         io_object = IOOBJECT_TEMP_RELATION;
                               1951                 :                :     }
                               1952                 :                :     else
                               1953                 :                :     {
                               1954                 :        1471929 :         io_context = IOContextForStrategy(operation->strategy);
                               1955                 :        1471929 :         io_object = IOOBJECT_RELATION;
                               1956                 :                :     }
                               1957                 :                : 
                               1958                 :                :     /*
                               1959                 :                :      * When this IO is executed synchronously, either because the caller will
                               1960                 :                :      * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
                               1961                 :                :      * the AIO subsystem needs to know.
                               1962                 :                :      */
   40 andres@anarazel.de       1963         [ +  + ]:GNC     1474776 :     if (flags & READ_BUFFERS_SYNCHRONOUSLY)
                               1964                 :         794219 :         ioh_flags |= PGAIO_HF_SYNCHRONOUS;
                               1965                 :                : 
                               1966         [ +  + ]:        1474776 :     if (persistence == RELPERSISTENCE_TEMP)
                               1967                 :           2847 :         ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
                               1968                 :                : 
                               1969                 :                :     /*
                               1970                 :                :      * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
                               1971                 :                :      * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
                               1972                 :                :      * set globally, but on a per-session basis. The completion callback,
                               1973                 :                :      * which may be run in other processes, e.g. in IO workers, may have a
                               1974                 :                :      * different value of the zero_damaged_pages GUC.
                               1975                 :                :      *
                               1976                 :                :      * XXX: We probably should eventually use a different flag for
                               1977                 :                :      * zero_damaged_pages, so we can report different log levels / error codes
                               1978                 :                :      * for zero_damaged_pages and ZERO_ON_ERROR.
                               1979                 :                :      */
  401 andres@anarazel.de       1980         [ +  + ]:CBC     1474776 :     if (zero_damaged_pages)
                               1981                 :             24 :         flags |= READ_BUFFERS_ZERO_ON_ERROR;
                               1982                 :                : 
                               1983                 :                :     /*
                               1984                 :                :      * For the same reason as with zero_damaged_pages we need to use this
                               1985                 :                :      * backend's ignore_checksum_failure value.
                               1986                 :                :      */
                               1987         [ +  + ]:        1474776 :     if (ignore_checksum_failure)
                               1988                 :             12 :         flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
                               1989                 :                : 
                               1990                 :                : 
                               1991                 :                :     /*
                               1992                 :                :      * To be allowed to report stats in the local completion callback we need
                               1993                 :                :      * to prepare to report stats now. This ensures we can safely report the
                               1994                 :                :      * checksum failure even in a critical section.
                               1995                 :                :      */
                               1996                 :        1474776 :     pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
                               1997                 :                : 
                               1998                 :                :     /*
                               1999                 :                :      * We must get an IO handle before StartBufferIO(), as pgaio_io_acquire()
                               2000                 :                :      * might block, which we don't want after setting IO_IN_PROGRESS. If we
                               2001                 :                :      * don't need to do the IO, we'll release the handle.
                               2002                 :                :      *
                               2003                 :                :      * If we need to wait for IO before we can get a handle, submit
                               2004                 :                :      * already-staged IO first, so that other backends don't need to wait.
                               2005                 :                :      * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
                               2006                 :                :      * wait for already submitted IO, which doesn't require additional locks,
                               2007                 :                :      * but it could still cause undesirable waits.
                               2008                 :                :      *
                               2009                 :                :      * A secondary benefit is that this would allow us to measure the time in
                               2010                 :                :      * pgaio_io_acquire() without causing undue timer overhead in the common,
                               2011                 :                :      * non-blocking, case.  However, currently the pgstats infrastructure
                               2012                 :                :      * doesn't really allow that, as it a) asserts that an operation can't
                               2013                 :                :      * have time without operations b) doesn't have an API to report
                               2014                 :                :      * "accumulated" time.
                               2015                 :                :      */
                               2016                 :        1474776 :     ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
                               2017         [ +  + ]:        1474776 :     if (unlikely(!ioh))
                               2018                 :                :     {
                               2019                 :           3441 :         pgaio_submit_staged();
                               2020                 :           3441 :         ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
                               2021                 :                :     }
                               2022                 :                : 
   39 andres@anarazel.de       2023                 :GNC     1474776 :     operation->foreign_io = false;
                               2024                 :        1474776 :     pgaio_wref_clear(&operation->io_wref);
                               2025                 :                : 
                               2026                 :                :     /*
                               2027                 :                :      * Try to start IO on the first buffer in a new run of blocks. If AIO is
                               2028                 :                :      * in progress, be it in this backend or another backend, we just
                               2029                 :                :      * associate the wait reference with the operation and wait in
                               2030                 :                :      * WaitReadBuffers(). This turns out to be important for performance in
                               2031                 :                :      * two workloads:
                               2032                 :                :      *
                               2033                 :                :      * 1) A read stream that has to read the same block multiple times within
                               2034                 :                :      * the readahead distance. This can happen e.g. for the table accesses of
                               2035                 :                :      * an index scan.
                               2036                 :                :      *
                               2037                 :                :      * 2) Concurrent scans by multiple backends on the same relation.
                               2038                 :                :      *
                               2039                 :                :      * If we were to synchronously wait for the in-progress IO, we'd not be
                               2040                 :                :      * able to keep enough I/O in flight.
                               2041                 :                :      *
                               2042                 :                :      * If we do find there is ongoing I/O for the buffer, we set up a 1-block
                               2043                 :                :      * ReadBuffersOperation that WaitReadBuffers then can wait on.
                               2044                 :                :      *
                               2045                 :                :      * It's possible that another backend has started IO on the buffer but not
                               2046                 :                :      * yet set its wait reference. In this case, we have no choice but to wait
                               2047                 :                :      * for either the wait reference to be valid or the IO to be done.
                               2048                 :                :      */
                               2049                 :        1474776 :     status = StartBufferIO(buffers[nblocks_done], true, true,
                               2050                 :                :                            &operation->io_wref);
                               2051         [ +  + ]:        1474776 :     if (status != BUFFER_IO_READY_FOR_IO)
                               2052                 :                :     {
   39 andres@anarazel.de       2053                 :CBC        2675 :         pgaio_io_release(ioh);
  401 andres@anarazel.de       2054                 :GNC        2675 :         *nblocks_progress = 1;
   39                          2055         [ +  + ]:           2675 :         if (status == BUFFER_IO_ALREADY_DONE)
                               2056                 :                :         {
                               2057                 :                :             /*
                               2058                 :                :              * Someone has already completed this block, we're done.
                               2059                 :                :              *
                               2060                 :                :              * When IO is necessary, ->nblocks_done is updated in
                               2061                 :                :              * ProcessReadBuffersResult(), but that is not called if no IO is
                               2062                 :                :              * necessary. Thus update here.
                               2063                 :                :              */
                               2064                 :            536 :             operation->nblocks_done += 1;
                               2065         [ -  + ]:            536 :             Assert(operation->nblocks_done <= operation->nblocks);
                               2066                 :                : 
                               2067         [ -  + ]:            536 :             Assert(!pgaio_wref_valid(&operation->io_wref));
                               2068                 :                : 
                               2069                 :                :             /*
                               2070                 :                :              * Report and track this as a 'hit' for this backend, even though
                               2071                 :                :              * it must have started out as a miss in PinBufferForBlock(). The
                               2072                 :                :              * other backend will track this as a 'read'.
                               2073                 :                :              */
                               2074                 :            536 :             TrackBufferHit(io_object, io_context,
                               2075                 :            536 :                            operation->rel, operation->persistence,
                               2076                 :                :                            operation->smgr, operation->forknum,
                               2077                 :                :                            blocknum);
                               2078                 :            536 :             return false;
                               2079                 :                :         }
                               2080                 :                : 
                               2081                 :                :         /* The IO is already in-progress */
                               2082         [ -  + ]:           2139 :         Assert(status == BUFFER_IO_IN_PROGRESS);
                               2083         [ -  + ]:           2139 :         Assert(pgaio_wref_valid(&operation->io_wref));
                               2084                 :           2139 :         operation->foreign_io = true;
                               2085                 :                : 
                               2086                 :           2139 :         return true;
                               2087                 :                :     }
                               2088                 :                : 
   40                          2089         [ -  + ]:        1472101 :     Assert(io_buffers[0] == buffers[nblocks_done]);
                               2090                 :        1472101 :     io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
                               2091                 :        1472101 :     io_buffers_len = 1;
                               2092                 :                : 
                               2093                 :                :     /*
                               2094                 :                :      * NB: As little code as possible should be added between the
                               2095                 :                :      * StartBufferIO() above, the further StartBufferIO()s below and the
                               2096                 :                :      * smgrstartreadv(), as some of the buffers are now marked as
                               2097                 :                :      * IO_IN_PROGRESS and will thus cause other backends to wait.
                               2098                 :                :      */
                               2099                 :                : 
                               2100                 :                :     /*
                               2101                 :                :      * How many neighboring-on-disk blocks can we scatter-read into other
                               2102                 :                :      * buffers at the same time?  In this case we don't wait if we see an I/O
                               2103                 :                :      * already in progress (see comment above).
                               2104                 :                :      */
                               2105         [ +  + ]:        1654122 :     for (int i = nblocks_done + 1; i < operation->nblocks; i++)
                               2106                 :                :     {
                               2107                 :                :         /* Must be consecutive block numbers. */
                               2108         [ -  + ]:         182032 :         Assert(BufferGetBlockNumber(buffers[i - 1]) ==
                               2109                 :                :                BufferGetBlockNumber(buffers[i]) - 1);
                               2110                 :                : 
   39                          2111                 :         182032 :         status = StartBufferIO(buffers[i], true, false, NULL);
                               2112         [ +  + ]:         182032 :         if (status != BUFFER_IO_READY_FOR_IO)
   40                          2113                 :             11 :             break;
                               2114                 :                : 
                               2115         [ -  + ]:         182021 :         Assert(io_buffers[io_buffers_len] == buffers[i]);
                               2116                 :                : 
                               2117                 :         182021 :         io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
                               2118                 :                :     }
                               2119                 :                : 
                               2120                 :                :     /* get a reference to wait for in WaitReadBuffers() */
                               2121                 :        1472101 :     pgaio_io_get_wref(ioh, &operation->io_wref);
                               2122                 :                : 
                               2123                 :                :     /* provide the list of buffers to the completion callbacks */
                               2124                 :        1472101 :     pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
                               2125                 :                : 
                               2126         [ +  + ]:        1472101 :     pgaio_io_register_callbacks(ioh,
                               2127                 :                :                                 persistence == RELPERSISTENCE_TEMP ?
                               2128                 :                :                                 PGAIO_HCB_LOCAL_BUFFER_READV :
                               2129                 :                :                                 PGAIO_HCB_SHARED_BUFFER_READV,
                               2130                 :                :                                 flags);
                               2131                 :                : 
                               2132                 :        1472101 :     pgaio_io_set_flag(ioh, ioh_flags);
                               2133                 :                : 
                               2134                 :                :     /* ---
                               2135                 :                :      * Even though we're trying to issue IO asynchronously, track the time
                               2136                 :                :      * in smgrstartreadv():
                               2137                 :                :      * - if io_method == IOMETHOD_SYNC, we will always perform the IO
                               2138                 :                :      *   immediately
                               2139                 :                :      * - the io method might not support the IO (e.g. worker IO for a temp
                               2140                 :                :      *   table)
                               2141                 :                :      * ---
                               2142                 :                :      */
                               2143                 :        1472101 :     io_start = pgstat_prepare_io_time(track_io_timing);
                               2144                 :        1472101 :     smgrstartreadv(ioh, operation->smgr, forknum,
                               2145                 :                :                    blocknum,
                               2146                 :                :                    io_pages, io_buffers_len);
                               2147                 :        1472086 :     pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
                               2148                 :        1472086 :                             io_start, 1, io_buffers_len * BLCKSZ);
                               2149                 :                : 
                               2150         [ +  + ]:        1472086 :     if (persistence == RELPERSISTENCE_TEMP)
                               2151                 :           2842 :         pgBufferUsage.local_blks_read += io_buffers_len;
                               2152                 :                :     else
                               2153                 :        1469244 :         pgBufferUsage.shared_blks_read += io_buffers_len;
                               2154                 :                : 
                               2155                 :                :     /*
                               2156                 :                :      * Track vacuum cost when issuing IO, not after waiting for it. Otherwise
                               2157                 :                :      * we could end up issuing a lot of IO in a short timespan, despite a low
                               2158                 :                :      * cost limit.
                               2159                 :                :      */
                               2160         [ +  + ]:        1472086 :     if (VacuumCostActive)
                               2161                 :          19139 :         VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
                               2162                 :                : 
                               2163                 :        1472086 :     *nblocks_progress = io_buffers_len;
                               2164                 :                : 
                               2165                 :        1472086 :     return true;
                               2166                 :                : }
                               2167                 :                : 
                               2168                 :                : /*
                               2169                 :                :  * BufferAlloc -- subroutine for PinBufferForBlock.  Handles lookup of a shared
                               2170                 :                :  *      buffer.  If no buffer exists already, selects a replacement victim and
                               2171                 :                :  *      evicts the old page, but does NOT read in new page.
                               2172                 :                :  *
                               2173                 :                :  * "strategy" can be a buffer replacement strategy object, or NULL for
                               2174                 :                :  * the default strategy.  The selected buffer's usage_count is advanced when
                               2175                 :                :  * using the default strategy, but otherwise possibly not (see PinBuffer).
                               2176                 :                :  *
                               2177                 :                :  * The returned buffer is pinned and is already marked as holding the
                               2178                 :                :  * desired page.  If it already did have the desired page, *foundPtr is
                               2179                 :                :  * set true.  Otherwise, *foundPtr is set false.
                               2180                 :                :  *
                               2181                 :                :  * io_context is an input parameter determined by the caller; BufferAlloc()
                               2182                 :                :  * does not call IOContextForStrategy() itself.  It is only used when a
                               2183                 :                :  * victim buffer has to be acquired, not when the initial lookup succeeds.
                               2184                 :                :  *
                               2185                 :                :  * No locks are held either at entry or exit.
                               2186                 :                :  */
                               2187                 :                : static pg_attribute_always_inline BufferDesc *
 5606 rhaas@postgresql.org     2188                 :CBC    82709194 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                               2189                 :                :             BlockNumber blockNum,
                               2190                 :                :             BufferAccessStrategy strategy,
                               2191                 :                :             bool *foundPtr, IOContext io_context)
                               2192                 :                : {
                               2193                 :                :     BufferTag   newTag;         /* identity of requested block */
                               2194                 :                :     uint32      newHash;        /* hash value for newTag */
                               2195                 :                :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
                               2196                 :                :     int         existing_buf_id;
                               2197                 :                :     Buffer      victim_buffer;
                               2198                 :                :     BufferDesc *victim_buf_hdr;
                               2199                 :                :     uint64      victim_buf_state;
  110 andres@anarazel.de       2200                 :GNC    82709194 :     uint64      set_bits = 0;
                               2201                 :                : 
                               2202                 :                :     /* Make sure we will have room to remember the buffer pin */
  909 heikki.linnakangas@i     2203                 :CBC    82709194 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               2204                 :       82709194 :     ReservePrivateRefCountEntry();
                               2205                 :                : 
                               2206                 :                :     /* create a tag so we can lookup the buffer */
 1378 rhaas@postgresql.org     2207                 :       82709194 :     InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
                               2208                 :                : 
                               2209                 :                :     /* determine its hash code and partition lock ID */
 7226 tgl@sss.pgh.pa.us        2210                 :       82709194 :     newHash = BufTableHashCode(&newTag);
                               2211                 :       82709194 :     newPartitionLock = BufMappingPartitionLock(newHash);
                               2212                 :                : 
                               2213                 :                :     /* see if the block is in the buffer pool already */
                               2214                 :       82709194 :     LWLockAcquire(newPartitionLock, LW_SHARED);
 1126 andres@anarazel.de       2215                 :       82709194 :     existing_buf_id = BufTableLookup(&newTag, newHash);
                               2216         [ +  + ]:       82709194 :     if (existing_buf_id >= 0)
                               2217                 :                :     {
                               2218                 :                :         BufferDesc *buf;
                               2219                 :                :         bool        valid;
                               2220                 :                : 
                               2221                 :                :         /*
                               2222                 :                :          * Found it.  Now, pin the buffer so no one can steal it from the
                               2223                 :                :          * buffer pool, and check to see if the correct data has been loaded
                               2224                 :                :          * into the buffer.
                               2225                 :                :          */
                               2226                 :       80746854 :         buf = GetBufferDescriptor(existing_buf_id);
                               2227                 :                : 
  209 andres@anarazel.de       2228                 :GNC    80746854 :         valid = PinBuffer(buf, strategy, false);
                               2229                 :                : 
                               2230                 :                :         /* Can release the mapping lock as soon as we've pinned it */
 7226 tgl@sss.pgh.pa.us        2231                 :CBC    80746854 :         LWLockRelease(newPartitionLock);
                               2232                 :                : 
 3184 peter_e@gmx.net          2233                 :       80746854 :         *foundPtr = true;
                               2234                 :                : 
 7732 tgl@sss.pgh.pa.us        2235         [ +  + ]:       80746854 :         if (!valid)
                               2236                 :                :         {
                               2237                 :                :             /*
                               2238                 :                :              * We can only get here if (a) someone else is still reading in
                               2239                 :                :              * the page, (b) a previous read attempt failed, or (c) someone
                               2240                 :                :              * called StartReadBuffers() but not yet WaitReadBuffers().
                               2241                 :                :              */
  762 tmunro@postgresql.or     2242                 :           2388 :             *foundPtr = false;
                               2243                 :                :         }
                               2244                 :                : 
10108 bruce@momjian.us         2245                 :       80746854 :         return buf;
                               2246                 :                :     }
                               2247                 :                : 
                               2248                 :                :     /*
                               2249                 :                :      * Didn't find it in the buffer pool.  We'll have to initialize a new
                               2250                 :                :      * buffer.  Remember to unlock the mapping lock while doing the work.
                               2251                 :                :      */
 7226 tgl@sss.pgh.pa.us        2252                 :        1962340 :     LWLockRelease(newPartitionLock);
                               2253                 :                : 
                               2254                 :                :     /*
                               2255                 :                :      * Acquire a victim buffer. Somebody else might try to do the same, as we
                               2256                 :                :      * don't hold any conflicting locks. If so, we'll have to undo our work
                               2257                 :                :      * later.
                               2258                 :                :      */
 1126 andres@anarazel.de       2259                 :        1962340 :     victim_buffer = GetVictimBuffer(strategy, io_context);
                               2260                 :        1962340 :     victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
                               2261                 :                : 
                               2262                 :                :     /*
                               2263                 :                :      * Try to make a hashtable entry for the buffer under its new tag. If
                               2264                 :                :      * somebody else inserted another buffer for the tag, we'll release the
                               2265                 :                :      * victim buffer we acquired and use the already inserted one.
                               2266                 :                :      */
                               2267                 :        1962340 :     LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
                               2268                 :        1962340 :     existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
                               2269         [ +  + ]:        1962340 :     if (existing_buf_id >= 0)
                               2270                 :                :     {
                               2271                 :                :         BufferDesc *existing_buf_hdr;
                               2272                 :                :         bool        valid;
                               2273                 :                : 
                               2274                 :                :         /*
                               2275                 :                :          * Got a collision. Someone has already done what we were about to do.
                               2276                 :                :          * We'll just handle this as if it were found in the buffer pool in
                               2277                 :                :          * the first place.  First, give up the buffer we were planning to
                               2278                 :                :          * use.
                               2279                 :                :          *
                               2280                 :                :          * We could do this after releasing the partition lock, but then we'd
                               2281                 :                :          * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
                               2282                 :                :          * before acquiring the lock, for the rare case of such a collision.
                               2283                 :                :          */
                               2284                 :            701 :         UnpinBuffer(victim_buf_hdr);
                               2285                 :                : 
                               2286                 :                :         /* remaining code should match code at top of routine */
                               2287                 :                : 
                               2288                 :            701 :         existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
                               2289                 :                : 
  209 andres@anarazel.de       2290                 :GNC         701 :         valid = PinBuffer(existing_buf_hdr, strategy, false);
                               2291                 :                : 
                               2292                 :                :         /* Can release the mapping lock as soon as we've pinned it */
 1126 andres@anarazel.de       2293                 :CBC         701 :         LWLockRelease(newPartitionLock);
                               2294                 :                : 
                               2295                 :            701 :         *foundPtr = true;
                               2296                 :                : 
                               2297         [ +  + ]:            701 :         if (!valid)
                               2298                 :                :         {
                               2299                 :                :             /*
                               2300                 :                :              * We can only get here if (a) someone else is still reading in
                               2301                 :                :              * the page, (b) a previous read attempt failed, or (c) someone
                               2302                 :                :              * called StartReadBuffers() but not yet WaitReadBuffers().
                               2303                 :                :              */
  762 tmunro@postgresql.or     2304                 :            351 :             *foundPtr = false;
                               2305                 :                :         }
                               2306                 :                : 
 1126 andres@anarazel.de       2307                 :            701 :         return existing_buf_hdr;
                               2308                 :                :     }
                               2309                 :                : 
                               2310                 :                :     /*
                               2311                 :                :      * Need to lock the buffer header too in order to change its tag.
                               2312                 :                :      */
                               2313                 :        1961639 :     victim_buf_state = LockBufHdr(victim_buf_hdr);
                               2314                 :                : 
                               2315                 :                :     /* some sanity checks while we hold the buffer header lock */
                               2316         [ -  + ]:        1961639 :     Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
                               2317         [ -  + ]:        1961639 :     Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
                               2318                 :                : 
                               2319                 :        1961639 :     victim_buf_hdr->tag = newTag;
                               2320                 :                : 
                               2321                 :                :     /*
                               2322                 :                :      * Make sure BM_PERMANENT is set for buffers that must be written at every
                               2323                 :                :      * checkpoint.  Unlogged buffers only need to be written at shutdown
                               2324                 :                :      * checkpoints, except for their "init" forks, which need to be treated
                               2325                 :                :      * just like permanent relations.
                               2326                 :                :      */
  180 andres@anarazel.de       2327                 :GNC     1961639 :     set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
 3339 rhaas@postgresql.org     2328   [ +  +  -  + ]:CBC     1961639 :     if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
  180 andres@anarazel.de       2329                 :GNC     1961279 :         set_bits |= BM_PERMANENT;
                               2330                 :                : 
                               2331                 :        1961639 :     UnlockBufHdrExt(victim_buf_hdr, victim_buf_state,
                               2332                 :                :                     set_bits, 0, 0);
                               2333                 :                : 
 7226 tgl@sss.pgh.pa.us        2334                 :CBC     1961639 :     LWLockRelease(newPartitionLock);
                               2335                 :                : 
                               2336                 :                :     /*
                               2337                 :                :      * Buffer contents are currently invalid.
                               2338                 :                :      */
  762 tmunro@postgresql.or     2339                 :        1961639 :     *foundPtr = false;
                               2340                 :                : 
 1126 andres@anarazel.de       2341                 :        1961639 :     return victim_buf_hdr;
                               2342                 :                : }
                               2343                 :                : 
                               2344                 :                : /*
                               2345                 :                :  * InvalidateBuffer -- mark a shared buffer invalid.
                               2346                 :                :  *
                               2347                 :                :  * The buffer header spinlock must be held at entry.  We drop it before
                               2348                 :                :  * returning.  (This is sane because the caller must have locked the
                               2349                 :                :  * buffer in order to be sure it should be dropped.)
                               2350                 :                :  *
                               2351                 :                :  * This is used only in contexts such as dropping a relation.  We assume
                               2352                 :                :  * that no other backend could possibly be interested in using the page,
                               2353                 :                :  * so the only reason the buffer might be pinned is if someone else is
                               2354                 :                :  * trying to write it out.  We have to let them finish before we can
                               2355                 :                :  * reclaim the buffer.
                               2356                 :                :  *
                               2357                 :                :  * The buffer could get reclaimed by someone else while we are waiting
                               2358                 :                :  * to acquire the necessary locks; if so, don't mess it up.
                               2359                 :                :  */
                               2360                 :                : static void
 3823 rhaas@postgresql.org     2361                 :         127888 : InvalidateBuffer(BufferDesc *buf)
                               2362                 :                : {
                               2363                 :                :     BufferTag   oldTag;
                               2364                 :                :     uint32      oldHash;        /* hash value for oldTag */
                               2365                 :                :     LWLock     *oldPartitionLock;   /* buffer partition lock for it */
                               2366                 :                :     uint32      oldFlags;
                               2367                 :                :     uint64      buf_state;
                               2368                 :                : 
                               2369                 :                :     /* Save the original buffer tag before dropping the spinlock */
 7732 tgl@sss.pgh.pa.us        2370                 :         127888 :     oldTag = buf->tag;
                               2371                 :                : 
  180 andres@anarazel.de       2372                 :GNC      127888 :     UnlockBufHdr(buf);
                               2373                 :                : 
                               2374                 :                :     /*
                               2375                 :                :      * Need to compute the old tag's hashcode and partition lock ID. XXX is it
                               2376                 :                :      * worth storing the hashcode in BufferDesc so we need not recompute it
                               2377                 :                :      * here?  Probably not.
                               2378                 :                :      */
 7226 tgl@sss.pgh.pa.us        2379                 :CBC      127888 :     oldHash = BufTableHashCode(&oldTag);
                               2380                 :         127888 :     oldPartitionLock = BufMappingPartitionLock(oldHash);
                               2381                 :                : 
 7732                          2382                 :         127890 : retry:
                               2383                 :                : 
                               2384                 :                :     /*
                               2385                 :                :      * Acquire exclusive mapping lock in preparation for changing the buffer's
                               2386                 :                :      * association.
                               2387                 :                :      */
 7226                          2388                 :         127890 :     LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
                               2389                 :                : 
                               2390                 :                :     /* Re-lock the buffer header */
 3677 andres@anarazel.de       2391                 :         127890 :     buf_state = LockBufHdr(buf);
                               2392                 :                : 
                               2393                 :                :     /* If it's changed while we were waiting for lock, do nothing */
 1378 rhaas@postgresql.org     2394         [ +  + ]:         127890 :     if (!BufferTagsEqual(&buf->tag, &oldTag))
                               2395                 :                :     {
  180 andres@anarazel.de       2396                 :GNC           1 :         UnlockBufHdr(buf);
 7226 tgl@sss.pgh.pa.us        2397                 :CBC           1 :         LWLockRelease(oldPartitionLock);
 7732                          2398                 :              1 :         return;
                               2399                 :                :     }
                               2400                 :                : 
                               2401                 :                :     /*
                               2402                 :                :      * We assume the reason for it to be pinned is that either we were
                               2403                 :                :      * asynchronously reading the page in before erroring out or someone else
                               2404                 :                :      * is flushing the page out.  Wait for the IO to finish.  (This could be
                               2405                 :                :      * an infinite loop if the refcount is messed up... it would be nice to
                               2406                 :                :      * time out after a while, but there seems no way to be sure how many
                               2407                 :                :      * loops may be needed.  Note that if the other guy has pinned the buffer
                               2408                 :                :      * but not yet done StartBufferIO, WaitIO will fall through and we'll
                               2409                 :                :      * effectively be busy-looping here.)
                               2410                 :                :      */
 3677 andres@anarazel.de       2411         [ +  + ]:         127889 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
                               2412                 :                :     {
  180 andres@anarazel.de       2413                 :GNC           2 :         UnlockBufHdr(buf);
 7226 tgl@sss.pgh.pa.us        2414                 :CBC           2 :         LWLockRelease(oldPartitionLock);
                               2415                 :                :         /* safety check: should definitely not be our *own* pin */
 3919 andres@anarazel.de       2416         [ -  + ]:              2 :         if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
 7718 tgl@sss.pgh.pa.us        2417         [ #  # ]:UBC           0 :             elog(ERROR, "buffer is pinned in InvalidateBuffer");
 7732 tgl@sss.pgh.pa.us        2418                 :CBC           2 :         WaitIO(buf);
                               2419                 :              2 :         goto retry;
                               2420                 :                :     }
                               2421                 :                : 
                               2422                 :                :     /*
                               2423                 :                :      * An invalidated buffer should not have any backends waiting to lock the
                               2424                 :                :      * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
                               2425                 :                :      */
  110 andres@anarazel.de       2426         [ -  + ]:GNC      127887 :     Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
                               2427                 :                : 
                               2428                 :                :     /*
                               2429                 :                :      * Clear out the buffer's tag and flags.  We must do this to ensure that
                               2430                 :                :      * linear scans of the buffer array don't think the buffer is valid.
                               2431                 :                :      */
 3677 andres@anarazel.de       2432                 :CBC      127887 :     oldFlags = buf_state & BUF_FLAG_MASK;
 1378 rhaas@postgresql.org     2433                 :         127887 :     ClearBufferTag(&buf->tag);
                               2434                 :                : 
  180 andres@anarazel.de       2435                 :GNC      127887 :     UnlockBufHdrExt(buf, buf_state,
                               2436                 :                :                     0,
                               2437                 :                :                     BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
                               2438                 :                :                     0);
                               2439                 :                : 
                               2440                 :                :     /*
                               2441                 :                :      * Remove the buffer from the lookup hashtable, if it was in there.
                               2442                 :                :      */
 7732 tgl@sss.pgh.pa.us        2443         [ +  - ]:CBC      127887 :     if (oldFlags & BM_TAG_VALID)
 7226                          2444                 :         127887 :         BufTableDelete(&oldTag, oldHash);
                               2445                 :                : 
                               2446                 :                :     /*
                               2447                 :                :      * Done with mapping lock.
                               2448                 :                :      */
                               2449                 :         127887 :     LWLockRelease(oldPartitionLock);
                               2450                 :                : }
                               2451                 :                : 
                               2452                 :                : /*
                               2453                 :                :  * Helper routine for GetVictimBuffer()
                               2454                 :                :  *
                               2455                 :                :  * Needs to be called on a buffer with a valid tag, pinned, but without the
                               2456                 :                :  * buffer header spinlock held.
                               2457                 :                :  *
                               2458                 :                :  * Returns true if the buffer can be reused, in which case the buffer is only
                               2459                 :                :  * pinned by this backend and marked as invalid, false otherwise.
                               2460                 :                :  */
                               2461                 :                : static bool
 1126 andres@anarazel.de       2462                 :        1407780 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
                               2463                 :                : {
                               2464                 :                :     uint64      buf_state;
                               2465                 :                :     uint32      hash;
                               2466                 :                :     LWLock     *partition_lock;
                               2467                 :                :     BufferTag   tag;
                               2468                 :                : 
                               2469         [ -  + ]:        1407780 :     Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
                               2470                 :                : 
                               2471                 :                :     /* have buffer pinned, so it's safe to read tag without lock */
                               2472                 :        1407780 :     tag = buf_hdr->tag;
                               2473                 :                : 
                               2474                 :        1407780 :     hash = BufTableHashCode(&tag);
                               2475                 :        1407780 :     partition_lock = BufMappingPartitionLock(hash);
                               2476                 :                : 
                               2477                 :        1407780 :     LWLockAcquire(partition_lock, LW_EXCLUSIVE);
                               2478                 :                : 
                               2479                 :                :     /* lock the buffer header */
                               2480                 :        1407780 :     buf_state = LockBufHdr(buf_hdr);
                               2481                 :                : 
                               2482                 :                :     /*
                               2483                 :                :      * We have the buffer pinned, so nobody else should have been able to
                               2484                 :                :      * unset this concurrently.
                               2485                 :                :      */
                               2486         [ -  + ]:        1407780 :     Assert(buf_state & BM_TAG_VALID);
                               2487         [ -  + ]:        1407780 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               2488         [ -  + ]:        1407780 :     Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
                               2489                 :                : 
                               2490                 :                :     /*
                               2491                 :                :      * If somebody else pinned the buffer since, or even worse, dirtied it,
                               2492                 :                :      * give up on this buffer: It's clearly in use.
                               2493                 :                :      */
                               2494   [ +  +  +  + ]:        1407780 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
                               2495                 :                :     {
                               2496         [ -  + ]:            535 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               2497                 :                : 
  180 andres@anarazel.de       2498                 :GNC         535 :         UnlockBufHdr(buf_hdr);
 1126 andres@anarazel.de       2499                 :CBC         535 :         LWLockRelease(partition_lock);
                               2500                 :                : 
                               2501                 :            535 :         return false;
                               2502                 :                :     }
                               2503                 :                : 
                               2504                 :                :     /*
                               2505                 :                :      * An invalidated buffer should not have any backends waiting to lock the
                               2506                 :                :      * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
                               2507                 :                :      */
  110 andres@anarazel.de       2508         [ -  + ]:GNC     1407245 :     Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
                               2509                 :                : 
                               2510                 :                :     /*
                               2511                 :                :      * Clear out the buffer's tag and flags and usagecount.  This is not
                               2512                 :                :      * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
                               2513                 :                :      * doing anything with the buffer. But currently it's beneficial, as the
                               2514                 :                :      * cheaper pre-checks in several linear scans of shared buffers use the
                               2515                 :                :      * tag (see e.g. FlushDatabaseBuffers()).
                               2516                 :                :      */
 1126 andres@anarazel.de       2517                 :CBC     1407245 :     ClearBufferTag(&buf_hdr->tag);
  180 andres@anarazel.de       2518                 :GNC     1407245 :     UnlockBufHdrExt(buf_hdr, buf_state,
                               2519                 :                :                     0,
                               2520                 :                :                     BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
                               2521                 :                :                     0);
                               2522                 :                : 
 1126 andres@anarazel.de       2523         [ -  + ]:CBC     1407245 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               2524                 :                : 
                               2525                 :                :     /* finally delete buffer from the buffer mapping table */
                               2526                 :        1407245 :     BufTableDelete(&tag, hash);
                               2527                 :                : 
                               2528                 :        1407245 :     LWLockRelease(partition_lock);
                               2529                 :                : 
  110 andres@anarazel.de       2530                 :GNC     1407245 :     buf_state = pg_atomic_read_u64(&buf_hdr->state);
 1126 andres@anarazel.de       2531         [ -  + ]:CBC     1407245 :     Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
                               2532         [ -  + ]:        1407245 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
  110 andres@anarazel.de       2533         [ -  + ]:GNC     1407245 :     Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u64(&buf_hdr->state)) > 0);
                               2534                 :                : 
 1126 andres@anarazel.de       2535                 :CBC     1407245 :     return true;
                               2536                 :                : }
                               2537                 :                : 
                               2538                 :                : static Buffer
                               2539                 :        2241606 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
                               2540                 :                : {
                               2541                 :                :     BufferDesc *buf_hdr;
                               2542                 :                :     Buffer      buf;
                               2543                 :                :     uint64      buf_state;
                               2544                 :                :     bool        from_ring;
                               2545                 :                : 
                               2546                 :                :     /*
                               2547                 :                :      * Ensure, before we pin a victim buffer, that there's a free refcount
                               2548                 :                :      * entry and resource owner slot for the pin.
                               2549                 :                :      */
                               2550                 :        2241606 :     ReservePrivateRefCountEntry();
  909 heikki.linnakangas@i     2551                 :        2241606 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               2552                 :                : 
                               2553                 :                :     /* we return here if a prospective victim buffer gets used concurrently */
 1126 andres@anarazel.de       2554                 :          23035 : again:
                               2555                 :                : 
                               2556                 :                :     /*
                               2557                 :                :      * Select a victim buffer.  The buffer is returned pinned and owned by
                               2558                 :                :      * this backend.
                               2559                 :                :      */
                               2560                 :        2264641 :     buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
                               2561                 :        2264641 :     buf = BufferDescriptorGetBuffer(buf_hdr);
                               2562                 :                : 
                               2563                 :                :     /*
                               2564                 :                :      * We shouldn't have any other pins for this buffer.
                               2565                 :                :      */
                               2566                 :        2264641 :     CheckBufferIsPinnedOnce(buf);
                               2567                 :                : 
                               2568                 :                :     /*
                               2569                 :                :      * If the buffer was dirty, try to write it out.  There is a race
                               2570                 :                :      * condition here: another backend could dirty the buffer between
                               2571                 :                :      * StrategyGetBuffer() checking that it is not in use and invalidating the
                               2572                 :                :      * buffer below. That's addressed by InvalidateVictimBuffer() verifying
                               2573                 :                :      * that the buffer is not dirty.
                               2574                 :                :      */
                               2575         [ +  + ]:        2264641 :     if (buf_state & BM_DIRTY)
                               2576                 :                :     {
                               2577         [ -  + ]:         371125 :         Assert(buf_state & BM_TAG_VALID);
                               2578         [ -  + ]:         371125 :         Assert(buf_state & BM_VALID);
                               2579                 :                : 
                               2580                 :                :         /*
                               2581                 :                :          * We need a share-exclusive lock on the buffer contents to write it
                               2582                 :                :          * out (else we might write invalid data, eg because someone else is
                               2583                 :                :          * compacting the page contents while we write).  We must use a
                               2584                 :                :          * conditional lock acquisition here to avoid deadlock.  Even though
                               2585                 :                :          * the buffer was not pinned (and therefore surely not locked) when
                               2586                 :                :          * StrategyGetBuffer returned it, someone else could have pinned and
                               2587                 :                :          * (share-)exclusive-locked it by the time we get here. If we try to
                               2588                 :                :          * get the lock unconditionally, we'd block waiting for them; if they
                               2589                 :                :          * later block waiting for us, deadlock ensues. (This has been
                               2590                 :                :          * observed to happen when two backends are both trying to split btree
                               2591                 :                :          * index pages, and the second one just happens to be trying to split
                               2592                 :                :          * the page the first one got from StrategyGetBuffer.)
                               2593                 :                :          */
   56 andres@anarazel.de       2594         [ -  + ]:GNC      371125 :         if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE))
                               2595                 :                :         {
                               2596                 :                :             /*
                               2597                 :                :              * Someone else has locked the buffer, so give it up and loop back
                               2598                 :                :              * to get another one.
                               2599                 :                :              */
 1126 andres@anarazel.de       2600                 :UBC           0 :             UnpinBuffer(buf_hdr);
                               2601                 :              0 :             goto again;
                               2602                 :                :         }
                               2603                 :                : 
                               2604                 :                :         /*
                               2605                 :                :          * If using a nondefault strategy, and this victim came from the
                               2606                 :                :          * strategy ring, let the strategy decide whether to reject it when
                               2607                 :                :          * reusing it would require a WAL flush.  This only applies to
                               2608                 :                :          * permanent buffers; unlogged buffers can have fake LSNs, so
                               2609                 :                :          * XLogNeedsFlush() is not meaningful for them.
                               2610                 :                :          *
                               2611                 :                :          * We need to hold the content lock in at least share-exclusive mode
                               2612                 :                :          * to safely inspect the page LSN, so this couldn't have been done
                               2613                 :                :          * inside StrategyGetBuffer().
                               2614                 :                :          */
   55 melanieplageman@gmai     2615   [ +  +  +  + ]:GNC      371125 :         if (strategy && from_ring &&
                               2616   [ +  -  +  + ]:         191998 :             buf_state & BM_PERMANENT &&
                               2617         [ +  + ]:         122826 :             XLogNeedsFlush(BufferGetLSN(buf_hdr)) &&
                               2618                 :          26827 :             StrategyRejectBuffer(strategy, buf_hdr, from_ring))
                               2619                 :                :         {
   39 andres@anarazel.de       2620                 :          22500 :             UnlockReleaseBuffer(buf);
   55 melanieplageman@gmai     2621                 :          22500 :             goto again;
                               2622                 :                :         }
                               2623                 :                : 
                               2624                 :                :         /* OK, do the I/O */
 1126 andres@anarazel.de       2625                 :CBC      348625 :         FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
  110 andres@anarazel.de       2626                 :GNC      348625 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                               2627                 :                : 
 1084 andres@anarazel.de       2628                 :CBC      348625 :         ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
                               2629                 :                :                                       &buf_hdr->tag);
                               2630                 :                :     }
                               2631                 :                : 
                               2632                 :                : 
 1126                          2633         [ +  + ]:        2242141 :     if (buf_state & BM_VALID)
                               2634                 :                :     {
                               2635                 :                :         /*
                               2636                 :                :          * When a BufferAccessStrategy is in use, blocks evicted from shared
                               2637                 :                :          * buffers are counted as IOOP_EVICT in the corresponding context
                               2638                 :                :          * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
                               2639                 :                :          * strategy in two cases: 1) while initially claiming buffers for the
                               2640                 :                :          * strategy ring 2) to replace an existing strategy ring buffer
                               2641                 :                :          * because it is pinned or in use and cannot be reused.
                               2642                 :                :          *
                               2643                 :                :          * Blocks evicted from buffers already in the strategy ring are
                               2644                 :                :          * counted as IOOP_REUSE in the corresponding strategy context.
                               2645                 :                :          *
                               2646                 :                :          * At this point, we can accurately count evictions and reuses,
                               2647                 :                :          * because we have successfully claimed the valid buffer. Previously,
                               2648                 :                :          * we may have been forced to release the buffer due to concurrent
                               2649                 :                :          * pinners or erroring out.
                               2650                 :                :          */
                               2651                 :        1404984 :         pgstat_count_io_op(IOOBJECT_RELATION, io_context,
  476 michael@paquier.xyz      2652         [ +  + ]:        1404984 :                            from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
                               2653                 :                :     }
                               2654                 :                : 
                               2655                 :                :     /*
                               2656                 :                :      * If the buffer has an entry in the buffer mapping table, delete it. This
                               2657                 :                :      * can fail because another backend could have pinned or dirtied the
                               2658                 :                :      * buffer.
                               2659                 :                :      */
 1126 andres@anarazel.de       2660   [ +  +  +  + ]:        2242141 :     if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
                               2661                 :                :     {
                               2662                 :            535 :         UnpinBuffer(buf_hdr);
                               2663                 :            535 :         goto again;
                               2664                 :                :     }
                               2665                 :                : 
                               2666                 :                :     /* a final set of sanity checks */
                               2667                 :                : #ifdef USE_ASSERT_CHECKING
  110 andres@anarazel.de       2668                 :GNC     2241606 :     buf_state = pg_atomic_read_u64(&buf_hdr->state);
                               2669                 :                : 
 1126 andres@anarazel.de       2670         [ -  + ]:CBC     2241606 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
                               2671         [ -  + ]:        2241606 :     Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
                               2672                 :                : 
                               2673                 :        2241606 :     CheckBufferIsPinnedOnce(buf);
                               2674                 :                : #endif
                               2675                 :                : 
                               2676                 :        2241606 :     return buf;
                               2677                 :                : }
                               2678                 :                : 
                               2679                 :                : /*
                               2680                 :                :  * Return the maximum number of buffers that a backend should try to pin at once,
                               2681                 :                :  * to avoid exceeding its fair share.  This is the highest value that
                               2682                 :                :  * GetAdditionalPinLimit() could ever return.  Note that it may be zero on a
                               2683                 :                :  * system with a very small buffer pool relative to max_connections.
                               2684                 :                :  */
                               2685                 :                : uint32
  417 tmunro@postgresql.or     2686                 :         656230 : GetPinLimit(void)
                               2687                 :                : {
                               2688                 :         656230 :     return MaxProportionalPins;
                               2689                 :                : }
                               2690                 :                : 
                               2691                 :                : /*
                               2692                 :                :  * Return the maximum number of additional buffers that this backend should
                               2693                 :                :  * pin if it wants to stay under the per-backend limit, considering the number
                               2694                 :                :  * of buffers it has already pinned.  Unlike LimitAdditionalPins(), the limit
                               2695                 :                :  * returned by this function can be zero.
                               2696                 :                :  */
                               2697                 :                : uint32
                               2698                 :        3555326 : GetAdditionalPinLimit(void)
                               2699                 :                : {
                               2700                 :                :     uint32      estimated_pins_held;
                               2701                 :                : 
                               2702                 :                :     /*
                               2703                 :                :      * We get the number of "overflowed" pins for free, but don't know the
                               2704                 :                :      * number of pins in PrivateRefCountArray.  The cost of calculating that
                               2705                 :                :      * exactly doesn't seem worth it, so just assume the max.
                               2706                 :                :      */
                               2707                 :        3555326 :     estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
                               2708                 :                : 
                               2709                 :                :     /* Is this backend already holding more than its fair share? */
                               2710         [ +  + ]:        3555326 :     if (estimated_pins_held > MaxProportionalPins)
                               2711                 :        1489615 :         return 0;
                               2712                 :                : 
                               2713                 :        2065711 :     return MaxProportionalPins - estimated_pins_held;
                               2714                 :                : }
                               2715                 :                : 
                               2716                 :                : /*
                               2717                 :                :  * Limit the number of pins a batch operation may additionally acquire, to
                               2718                 :                :  * avoid running out of pinnable buffers.
                               2719                 :                :  *
                               2720                 :                :  * One additional pin is always allowed, on the assumption that the operation
                               2721                 :                :  * requires at least one to make progress.
                               2722                 :                :  */
                               2723                 :                : void
 1126 andres@anarazel.de       2724                 :         252381 : LimitAdditionalPins(uint32 *additional_pins)
                               2725                 :                : {
                               2726                 :                :     uint32      limit;
                               2727                 :                : 
                               2728         [ +  + ]:         252381 :     if (*additional_pins <= 1)
                               2729                 :         236863 :         return;
                               2730                 :                : 
  417 tmunro@postgresql.or     2731                 :          15518 :     limit = GetAdditionalPinLimit();
                               2732                 :          15518 :     limit = Max(limit, 1);
                               2733         [ +  + ]:          15518 :     if (limit < *additional_pins)
                               2734                 :          10037 :         *additional_pins = limit;
                               2735                 :                : }
                               2736                 :                : 
                               2737                 :                : /*
                               2738                 :                :  * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
                               2739                 :                :  * avoid duplicating the tracing and relpersistence related logic.
                               2740                 :                :  */
                               2741                 :                : static BlockNumber
  986                          2742                 :         268393 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
                               2743                 :                :                         ForkNumber fork,
                               2744                 :                :                         BufferAccessStrategy strategy,
                               2745                 :                :                         uint32 flags,
                               2746                 :                :                         uint32 extend_by,
                               2747                 :                :                         BlockNumber extend_upto,
                               2748                 :                :                         Buffer *buffers,
                               2749                 :                :                         uint32 *extended_by)
                               2750                 :                : {
                               2751                 :                :     BlockNumber first_block;
                               2752                 :                : 
                               2753                 :                :     TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
                               2754                 :                :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
                               2755                 :                :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
                               2756                 :                :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
                               2757                 :                :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
                               2758                 :                :                                          extend_by);
                               2759                 :                : 
                               2760         [ +  + ]:         268393 :     if (bmr.relpersistence == RELPERSISTENCE_TEMP)
                               2761                 :          16012 :         first_block = ExtendBufferedRelLocal(bmr, fork, flags,
                               2762                 :                :                                              extend_by, extend_upto,
                               2763                 :                :                                              buffers, &extend_by);
                               2764                 :                :     else
                               2765                 :         252381 :         first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
                               2766                 :                :                                               extend_by, extend_upto,
                               2767                 :                :                                               buffers, &extend_by);
 1126 andres@anarazel.de       2768                 :         268393 :     *extended_by = extend_by;
                               2769                 :                : 
                               2770                 :                :     TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
                               2771                 :                :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
                               2772                 :                :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
                               2773                 :                :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
                               2774                 :                :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
                               2775                 :                :                                         *extended_by,
                               2776                 :                :                                         first_block);
                               2777                 :                : 
                               2778                 :         268393 :     return first_block;
                               2779                 :                : }
                               2780                 :                : 
                               2781                 :                : /*
                               2782                 :                :  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
                               2783                 :                :  * shared buffers.
                               2784                 :                :  */
                               2785                 :                : static BlockNumber
  986 tmunro@postgresql.or     2786                 :         252381 : ExtendBufferedRelShared(BufferManagerRelation bmr,
                               2787                 :                :                         ForkNumber fork,
                               2788                 :                :                         BufferAccessStrategy strategy,
                               2789                 :                :                         uint32 flags,
                               2790                 :                :                         uint32 extend_by,
                               2791                 :                :                         BlockNumber extend_upto,
                               2792                 :                :                         Buffer *buffers,
                               2793                 :                :                         uint32 *extended_by)
                               2794                 :                : {
                               2795                 :                :     BlockNumber first_block;
 1126 andres@anarazel.de       2796                 :         252381 :     IOContext   io_context = IOContextForStrategy(strategy);
                               2797                 :                :     instr_time  io_start;
                               2798                 :                : 
                               2799                 :         252381 :     LimitAdditionalPins(&extend_by);
                               2800                 :                : 
                               2801                 :                :     /*
                               2802                 :                :      * Acquire victim buffers for extension without holding extension lock.
                               2803                 :                :      * Writing out victim buffers is the most expensive part of extending the
                               2804                 :                :      * relation, particularly when doing so requires WAL flushes. Zeroing out
                               2805                 :                :      * the buffers is also quite expensive, so do that before holding the
                               2806                 :                :      * extension lock as well.
                               2807                 :                :      *
                               2808                 :                :      * These pages are pinned by us and not valid. While we hold the pin they
                               2809                 :                :      * can't be acquired as victim buffers by another backend.
                               2810                 :                :      */
                               2811         [ +  + ]:         531647 :     for (uint32 i = 0; i < extend_by; i++)
                               2812                 :                :     {
                               2813                 :                :         Block       buf_block;
                               2814                 :                : 
                               2815                 :         279266 :         buffers[i] = GetVictimBuffer(strategy, io_context);
                               2816                 :         279266 :         buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
                               2817                 :                : 
                               2818                 :                :         /* new buffers are zero-filled */
  447 peter@eisentraut.org     2819   [ +  -  +  -  :         279266 :         MemSet(buf_block, 0, BLCKSZ);
                                     +  -  -  +  -  
                                                 - ]
                               2820                 :                :     }
                               2821                 :                : 
                               2822                 :                :     /*
                               2823                 :                :      * Lock relation against concurrent extensions, unless requested not to.
                               2824                 :                :      *
                               2825                 :                :      * We use the same extension lock for all forks. That's unnecessarily
                               2826                 :                :      * restrictive, but currently extensions for forks don't happen often
                               2827                 :                :      * enough to make it worth locking more granularly.
                               2828                 :                :      *
                               2829                 :                :      * Note that another backend might have extended the relation by the time
                               2830                 :                :      * we get the lock.
                               2831                 :                :      */
 1126 andres@anarazel.de       2832         [ +  + ]:         252381 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
  986 tmunro@postgresql.or     2833                 :         196754 :         LockRelationForExtension(bmr.rel, ExclusiveLock);
                               2834                 :                : 
                               2835                 :                :     /*
                               2836                 :                :      * If requested, invalidate size cache, so that smgrnblocks asks the
                               2837                 :                :      * kernel.
                               2838                 :                :      */
 1126 andres@anarazel.de       2839         [ +  + ]:         252381 :     if (flags & EB_CLEAR_SIZE_CACHE)
  196 alvherre@kurilemu.de     2840         [ +  - ]:GNC        9955 :         BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
                               2841                 :                : 
                               2842         [ +  + ]:         252381 :     first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork);
                               2843                 :                : 
                               2844                 :                :     /*
                               2845                 :                :      * Now that we have the accurate relation size, check if the caller wants
                               2846                 :                :      * us to extend to only up to a specific size. If there were concurrent
                               2847                 :                :      * extensions, we might have acquired too many buffers and need to release
                               2848                 :                :      * them.
                               2849                 :                :      */
 1126 andres@anarazel.de       2850         [ +  + ]:CBC      252381 :     if (extend_upto != InvalidBlockNumber)
                               2851                 :                :     {
                               2852                 :          57785 :         uint32      orig_extend_by = extend_by;
                               2853                 :                : 
                               2854         [ -  + ]:          57785 :         if (first_block > extend_upto)
 1126 andres@anarazel.de       2855                 :UBC           0 :             extend_by = 0;
 1126 andres@anarazel.de       2856         [ +  + ]:CBC       57785 :         else if ((uint64) first_block + extend_by > extend_upto)
                               2857                 :              9 :             extend_by = extend_upto - first_block;
                               2858                 :                : 
                               2859         [ +  + ]:          57804 :         for (uint32 i = extend_by; i < orig_extend_by; i++)
                               2860                 :                :         {
                               2861                 :             19 :             BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
                               2862                 :                : 
                               2863                 :             19 :             UnpinBuffer(buf_hdr);
                               2864                 :                :         }
                               2865                 :                : 
                               2866         [ +  + ]:          57785 :         if (extend_by == 0)
                               2867                 :                :         {
                               2868         [ +  - ]:              9 :             if (!(flags & EB_SKIP_EXTENSION_LOCK))
  986 tmunro@postgresql.or     2869                 :              9 :                 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
 1126 andres@anarazel.de       2870                 :              9 :             *extended_by = extend_by;
                               2871                 :              9 :             return first_block;
                               2872                 :                :         }
                               2873                 :                :     }
                               2874                 :                : 
                               2875                 :                :     /* Fail if relation is already at maximum possible length */
                               2876         [ -  + ]:         252372 :     if ((uint64) first_block + extend_by >= MaxBlockNumber)
 1126 andres@anarazel.de       2877   [ #  #  #  #  :UBC           0 :         ereport(ERROR,
                                     #  #  #  #  #  
                                                 # ]
                               2878                 :                :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                               2879                 :                :                  errmsg("cannot extend relation %s beyond %u blocks",
                               2880                 :                :                         relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
                               2881                 :                :                         MaxBlockNumber)));
                               2882                 :                : 
                               2883                 :                :     /*
                               2884                 :                :      * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
                               2885                 :                :      *
                               2886                 :                :      * This needs to happen before we extend the relation, because as soon as
                               2887                 :                :      * we do, other backends can start to read in those pages.
                               2888                 :                :      */
  959 peter@eisentraut.org     2889         [ +  + ]:CBC      531619 :     for (uint32 i = 0; i < extend_by; i++)
                               2890                 :                :     {
 1126 andres@anarazel.de       2891                 :         279247 :         Buffer      victim_buf = buffers[i];
                               2892                 :         279247 :         BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
                               2893                 :                :         BufferTag   tag;
                               2894                 :                :         uint32      hash;
                               2895                 :                :         LWLock     *partition_lock;
                               2896                 :                :         int         existing_id;
                               2897                 :                : 
                               2898                 :                :         /* in case we need to pin an existing buffer below */
  909 heikki.linnakangas@i     2899                 :         279247 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               2900                 :         279247 :         ReservePrivateRefCountEntry();
                               2901                 :                : 
  196 alvherre@kurilemu.de     2902         [ +  + ]:GNC      279247 :         InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
                               2903                 :                :                       first_block + i);
 1126 andres@anarazel.de       2904                 :CBC      279247 :         hash = BufTableHashCode(&tag);
                               2905                 :         279247 :         partition_lock = BufMappingPartitionLock(hash);
                               2906                 :                : 
                               2907                 :         279247 :         LWLockAcquire(partition_lock, LW_EXCLUSIVE);
                               2908                 :                : 
                               2909                 :         279247 :         existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
                               2910                 :                : 
                               2911                 :                :         /*
                               2912                 :                :          * We get here only in the corner case where we are trying to extend
                               2913                 :                :          * the relation but we found a pre-existing buffer. This can happen
                               2914                 :                :          * because a prior attempt at extending the relation failed, and
                               2915                 :                :          * because mdread doesn't complain about reads beyond EOF (when
                               2916                 :                :          * zero_damaged_pages is ON) and so a previous attempt to read a block
                               2917                 :                :          * beyond EOF could have left a "valid" zero-filled buffer.
                               2918                 :                :          *
                               2919                 :                :          * This has also been observed when the relation was overwritten by an
                               2920                 :                :          * external process. Since the legitimate cases should always have
                               2921                 :                :          * left a zero-filled buffer, complain if not PageIsNew.
                               2922                 :                :          */
                               2923         [ -  + ]:         279247 :         if (existing_id >= 0)
                               2924                 :                :         {
 1126 andres@anarazel.de       2925                 :UBC           0 :             BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
                               2926                 :                :             Block       buf_block;
                               2927                 :                :             bool        valid;
                               2928                 :                : 
                               2929                 :                :             /*
                               2930                 :                :              * Pin the existing buffer before releasing the partition lock,
                               2931                 :                :              * preventing it from being evicted.
                               2932                 :                :              */
  209 andres@anarazel.de       2933                 :UNC           0 :             valid = PinBuffer(existing_hdr, strategy, false);
                               2934                 :                : 
 1126 andres@anarazel.de       2935                 :UBC           0 :             LWLockRelease(partition_lock);
                               2936                 :              0 :             UnpinBuffer(victim_buf_hdr);
                               2937                 :                : 
                               2938                 :              0 :             buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
                               2939                 :              0 :             buf_block = BufHdrGetBlock(existing_hdr);
                               2940                 :                : 
                               2941   [ #  #  #  # ]:              0 :             if (valid && !PageIsNew((Page) buf_block))
                               2942   [ #  #  #  #  :              0 :                 ereport(ERROR,
                                     #  #  #  #  #  
                                                 # ]
                               2943                 :                :                         (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
                               2944                 :                :                                 existing_hdr->tag.blockNum,
                               2945                 :                :                                 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
                               2946                 :                : 
                               2947                 :                :             /*
                               2948                 :                :              * We *must* do smgr[zero]extend before succeeding, else the page
                               2949                 :                :              * will not be reserved by the kernel, and the next P_NEW call
                               2950                 :                :              * will decide to return the same page.  Clear the BM_VALID bit,
                               2951                 :                :              * do StartSharedBufferIO() and proceed.
                               2952                 :                :              *
                               2953                 :                :              * Loop to handle the very small possibility that someone re-sets
                               2954                 :                :              * BM_VALID between our clearing it and StartSharedBufferIO
                               2955                 :                :              * inspecting it.
                               2956                 :                :              */
                               2957                 :                :             while (true)
 1126 andres@anarazel.de       2958                 :UIC           0 :             {
                               2959                 :                :                 StartBufferIOResult sbres;
                               2960                 :                : 
  110 andres@anarazel.de       2961                 :UNC           0 :                 pg_atomic_fetch_and_u64(&existing_hdr->state, ~BM_VALID);
                               2962                 :                : 
   39                          2963                 :              0 :                 sbres = StartSharedBufferIO(existing_hdr, true, true, NULL);
                               2964                 :                : 
                               2965         [ #  # ]:              0 :                 if (sbres != BUFFER_IO_ALREADY_DONE)
                               2966                 :              0 :                     break;
                               2967                 :                :             }
                               2968                 :                :         }
                               2969                 :                :         else
                               2970                 :                :         {
                               2971                 :                :             uint64      buf_state;
  110 andres@anarazel.de       2972                 :GNC      279247 :             uint64      set_bits = 0;
                               2973                 :                : 
 1126 andres@anarazel.de       2974                 :CBC      279247 :             buf_state = LockBufHdr(victim_buf_hdr);
                               2975                 :                : 
                               2976                 :                :             /* some sanity checks while we hold the buffer header lock */
   55 andres@anarazel.de       2977         [ -  + ]:GNC      279247 :             Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY)));
 1126 andres@anarazel.de       2978         [ -  + ]:CBC      279247 :             Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
                               2979                 :                : 
                               2980                 :         279247 :             victim_buf_hdr->tag = tag;
                               2981                 :                : 
  180 andres@anarazel.de       2982                 :GNC      279247 :             set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
  986 tmunro@postgresql.or     2983   [ +  +  +  + ]:CBC      279247 :             if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
  180 andres@anarazel.de       2984                 :GNC      273267 :                 set_bits |= BM_PERMANENT;
                               2985                 :                : 
                               2986                 :         279247 :             UnlockBufHdrExt(victim_buf_hdr, buf_state,
                               2987                 :                :                             set_bits, 0,
                               2988                 :                :                             0);
                               2989                 :                : 
 1126 andres@anarazel.de       2990                 :CBC      279247 :             LWLockRelease(partition_lock);
                               2991                 :                : 
                               2992                 :                :             /* XXX: could combine the locked operations in it with the above */
   39 andres@anarazel.de       2993                 :GNC      279247 :             StartSharedBufferIO(victim_buf_hdr, true, true, NULL);
                               2994                 :                :         }
                               2995                 :                :     }
                               2996                 :                : 
  433 michael@paquier.xyz      2997                 :CBC      252372 :     io_start = pgstat_prepare_io_time(track_io_timing);
                               2998                 :                : 
                               2999                 :                :     /*
                               3000                 :                :      * Note: if smgrzeroextend fails, we will end up with buffers that are
                               3001                 :                :      * allocated but not marked BM_VALID.  The next relation extension will
                               3002                 :                :      * still select the same block number (because the relation didn't get any
                               3003                 :                :      * longer on disk) and so future attempts to extend the relation will find
                               3004                 :                :      * the same buffers (if they have not been recycled) but come right back
                               3005                 :                :      * here to try smgrzeroextend again.
                               3006                 :                :      *
                               3007                 :                :      * We don't need to set checksum for all-zero pages.
                               3008                 :                :      */
  196 alvherre@kurilemu.de     3009         [ +  + ]:GNC      252372 :     smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);
                               3010                 :                : 
                               3011                 :                :     /*
                               3012                 :                :      * Release the file-extension lock; it's now OK for someone else to extend
                               3013                 :                :      * the relation some more.
                               3014                 :                :      *
                               3015                 :                :      * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
                               3016                 :                :      * take noticeable time.
                               3017                 :                :      */
 1126 andres@anarazel.de       3018         [ +  + ]:CBC      252372 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
  986 tmunro@postgresql.or     3019                 :         196745 :         UnlockRelationForExtension(bmr.rel, ExclusiveLock);
                               3020                 :                : 
 1124 andres@anarazel.de       3021                 :         252372 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
  476 michael@paquier.xyz      3022                 :         252372 :                             io_start, 1, extend_by * BLCKSZ);
                               3023                 :                : 
                               3024                 :                :     /* Set BM_VALID, terminate IO, and wake up any waiters */
  959 peter@eisentraut.org     3025         [ +  + ]:         531619 :     for (uint32 i = 0; i < extend_by; i++)
                               3026                 :                :     {
 1126 andres@anarazel.de       3027                 :         279247 :         Buffer      buf = buffers[i];
                               3028                 :         279247 :         BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
                               3029                 :         279247 :         bool        lock = false;
                               3030                 :                : 
                               3031   [ +  +  +  + ]:         279247 :         if (flags & EB_LOCK_FIRST && i == 0)
                               3032                 :         194265 :             lock = true;
                               3033         [ +  + ]:          84982 :         else if (flags & EB_LOCK_TARGET)
                               3034                 :                :         {
                               3035         [ -  + ]:          46698 :             Assert(extend_upto != InvalidBlockNumber);
                               3036         [ +  + ]:          46698 :             if (first_block + i + 1 == extend_upto)
                               3037                 :          46129 :                 lock = true;
                               3038                 :                :         }
                               3039                 :                : 
                               3040         [ +  + ]:         279247 :         if (lock)
  209 andres@anarazel.de       3041                 :GNC      240394 :             LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                               3042                 :                : 
  401 andres@anarazel.de       3043                 :CBC      279247 :         TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
                               3044                 :                :     }
                               3045                 :                : 
 1126                          3046                 :         252372 :     pgBufferUsage.shared_blks_written += extend_by;
                               3047                 :                : 
                               3048                 :         252372 :     *extended_by = extend_by;
                               3049                 :                : 
                               3050                 :         252372 :     return first_block;
                               3051                 :                : }
                               3052                 :                : 
                               3053                 :                : /*
                               3054                 :                :  * BufferIsLockedByMe
                               3055                 :                :  *
                               3056                 :                :  *      Checks if this backend has the buffer locked in any mode.
                               3057                 :                :  *
                               3058                 :                :  * Buffer must be pinned.
                               3059                 :                :  */
                               3060                 :                : bool
  209 andres@anarazel.de       3061                 :GNC         107 : BufferIsLockedByMe(Buffer buffer)
                               3062                 :                : {
                               3063                 :                :     BufferDesc *bufHdr;
                               3064                 :                : 
                               3065   [ -  +  -  +  :            107 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               3066                 :                : 
                               3067         [ -  + ]:            107 :     if (BufferIsLocal(buffer))
                               3068                 :                :     {
                               3069                 :                :         /* Content locks are not maintained for local buffers. */
  209 andres@anarazel.de       3070                 :UNC           0 :         return true;
                               3071                 :                :     }
                               3072                 :                :     else
                               3073                 :                :     {
  209 andres@anarazel.de       3074                 :GNC         107 :         bufHdr = GetBufferDescriptor(buffer - 1);
  110                          3075                 :            107 :         return BufferLockHeldByMe(bufHdr);
                               3076                 :                :     }
                               3077                 :                : }
                               3078                 :                : 
                               3079                 :                : /*
                               3080                 :                :  * BufferIsLockedByMeInMode
                               3081                 :                :  *
                               3082                 :                :  *      Checks if this backend has the buffer locked in the specified mode.
                               3083                 :                :  *
                               3084                 :                :  * Buffer must be pinned.
                               3085                 :                :  */
                               3086                 :                : bool
  153                          3087                 :      116984158 : BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
                               3088                 :                : {
                               3089                 :                :     BufferDesc *bufHdr;
                               3090                 :                : 
  461 tgl@sss.pgh.pa.us        3091   [ -  +  +  +  :CBC   116984158 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               3092                 :                : 
  925 jdavis@postgresql.or     3093         [ +  + ]:      116984158 :     if (BufferIsLocal(buffer))
                               3094                 :                :     {
                               3095                 :                :         /* Content locks are not maintained for local buffers. */
  461 tgl@sss.pgh.pa.us        3096                 :GBC        1019 :         return true;
                               3097                 :                :     }
                               3098                 :                :     else
                               3099                 :                :     {
  925 jdavis@postgresql.or     3100                 :CBC   116983139 :         bufHdr = GetBufferDescriptor(buffer - 1);
  110 andres@anarazel.de       3101                 :GNC   116983139 :         return BufferLockHeldByMeInMode(bufHdr, mode);
                               3102                 :                :     }
                               3103                 :                : }
                               3104                 :                : 
                               3105                 :                : /*
                               3106                 :                :  * BufferIsDirty
                               3107                 :                :  *
                               3108                 :                :  *      Checks if buffer is already dirty.
                               3109                 :                :  *
                               3110                 :                :  * Buffer must be pinned and [share-]exclusive-locked.  (Without such a lock,
                               3111                 :                :  * the result may be stale before it's returned.)
                               3112                 :                :  */
                               3113                 :                : bool
  925 jdavis@postgresql.or     3114                 :CBC    27900320 : BufferIsDirty(Buffer buffer)
                               3115                 :                : {
                               3116                 :                :     BufferDesc *bufHdr;
                               3117                 :                : 
  461 tgl@sss.pgh.pa.us        3118   [ -  +  +  +  :       27900320 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               3119                 :                : 
  925 jdavis@postgresql.or     3120         [ +  + ]:       27900320 :     if (BufferIsLocal(buffer))
                               3121                 :                :     {
  925 jdavis@postgresql.or     3122                 :GBC       10005 :         int         bufid = -buffer - 1;
                               3123                 :                : 
                               3124                 :          10005 :         bufHdr = GetLocalBufferDescriptor(bufid);
                               3125                 :                :         /* Content locks are not maintained for local buffers. */
                               3126                 :                :     }
                               3127                 :                :     else
                               3128                 :                :     {
  925 jdavis@postgresql.or     3129                 :CBC    27890315 :         bufHdr = GetBufferDescriptor(buffer - 1);
   56 andres@anarazel.de       3130   [ +  +  -  + ]:GNC    27890315 :         Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) ||
                               3131                 :                :                BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
                               3132                 :                :     }
                               3133                 :                : 
  110                          3134                 :       27900320 :     return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
                               3135                 :                : }
                               3136                 :                : 
                               3137                 :                : /*
                               3138                 :                :  * MarkBufferDirty
                               3139                 :                :  *
                               3140                 :                :  *      Marks buffer contents as dirty (actual write happens later).
                               3141                 :                :  *
                               3142                 :                :  * Buffer must be pinned and exclusive-locked.  (If caller does not hold
                               3143                 :                :  * exclusive lock, then somebody could be in process of writing the buffer,
                               3144                 :                :  * leading to risk of bad data written to disk.)
                               3145                 :                :  */
                               3146                 :                : void
 7340 tgl@sss.pgh.pa.us        3147                 :CBC    34972274 : MarkBufferDirty(Buffer buffer)
                               3148                 :                : {
                               3149                 :                :     BufferDesc *bufHdr;
                               3150                 :                :     uint64      buf_state;
                               3151                 :                :     uint64      old_buf_state;
                               3152                 :                : 
 7872                          3153         [ -  + ]:       34972274 :     if (!BufferIsValid(buffer))
 5434 peter_e@gmx.net          3154         [ #  # ]:UBC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
                               3155                 :                : 
 9531 tgl@sss.pgh.pa.us        3156         [ +  + ]:CBC    34972274 :     if (BufferIsLocal(buffer))
                               3157                 :                :     {
 7340                          3158                 :        1742551 :         MarkLocalBufferDirty(buffer);
 8725 bruce@momjian.us         3159                 :        1742551 :         return;
                               3160                 :                :     }
                               3161                 :                : 
 4114 andres@anarazel.de       3162                 :       33229723 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               3163                 :                : 
 4266                          3164   [ -  +  -  +  :       33229723 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
  209 andres@anarazel.de       3165         [ -  + ]:GNC    33229723 :     Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
                               3166                 :                : 
                               3167                 :                :     /*
                               3168                 :                :      * NB: We have to wait for the buffer header spinlock to be not held, as
                               3169                 :                :      * TerminateBufferIO() relies on the spinlock.
                               3170                 :                :      */
  110                          3171                 :       33229723 :     old_buf_state = pg_atomic_read_u64(&bufHdr->state);
                               3172                 :                :     for (;;)
                               3173                 :                :     {
 3677 andres@anarazel.de       3174         [ +  + ]:CBC    33229863 :         if (old_buf_state & BM_LOCKED)
                               3175                 :            295 :             old_buf_state = WaitBufHdrUnlocked(bufHdr);
                               3176                 :                : 
                               3177                 :       33229863 :         buf_state = old_buf_state;
                               3178                 :                : 
                               3179         [ -  + ]:       33229863 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
   55 andres@anarazel.de       3180                 :GNC    33229863 :         buf_state |= BM_DIRTY;
                               3181                 :                : 
  110                          3182         [ +  + ]:       33229863 :         if (pg_atomic_compare_exchange_u64(&bufHdr->state, &old_buf_state,
                               3183                 :                :                                            buf_state))
 3677 andres@anarazel.de       3184                 :CBC    33229723 :             break;
                               3185                 :                :     }
                               3186                 :                : 
                               3187                 :                :     /*
                               3188                 :                :      * If the buffer was not dirty already, do vacuum accounting.
                               3189                 :                :      */
                               3190         [ +  + ]:       33229723 :     if (!(old_buf_state & BM_DIRTY))
                               3191                 :                :     {
 5186 rhaas@postgresql.org     3192                 :         810367 :         pgBufferUsage.shared_blks_dirtied++;
 5275 alvherre@alvh.no-ip.     3193         [ +  + ]:         810367 :         if (VacuumCostActive)
                               3194                 :          11530 :             VacuumCostBalance += VacuumCostPageDirty;
                               3195                 :                :     }
                               3196                 :                : }
                               3197                 :                : 
                               3198                 :                : /*
                               3199                 :                :  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
                               3200                 :                :  *
                               3201                 :                :  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
                               3202                 :                :  * compared to calling the two routines separately.  Now it's mainly just
                               3203                 :                :  * a convenience function.  However, if the passed buffer is valid and
                               3204                 :                :  * already contains the desired block, we just return it as-is; and that
                               3205                 :                :  * does save considerable work compared to a full release and reacquire.
                               3206                 :                :  *
                               3207                 :                :  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
                               3208                 :                :  * buffer actually needs to be released.  This case is the same as ReadBuffer,
                               3209                 :                :  * but can save some tests in the caller.
                               3210                 :                :  */
                               3211                 :                : Buffer
10892 scrappy@hub.org          3212                 :           3646 : ReleaseAndReadBuffer(Buffer buffer,
                               3213                 :                :                      Relation relation,
                               3214                 :                :                      BlockNumber blockNum)
                               3215                 :                : {
 6172 bruce@momjian.us         3216                 :           3646 :     ForkNumber  forkNum = MAIN_FORKNUM;
                               3217                 :                :     BufferDesc *bufHdr;
                               3218                 :                : 
 9124 tgl@sss.pgh.pa.us        3219         [ +  - ]:           3646 :     if (BufferIsValid(buffer))
                               3220                 :                :     {
 4266 andres@anarazel.de       3221   [ -  +  +  +  :           3646 :         Assert(BufferIsPinned(buffer));
                                              -  + ]
 9124 tgl@sss.pgh.pa.us        3222         [ +  + ]:           3646 :         if (BufferIsLocal(buffer))
                               3223                 :                :         {
 4114 andres@anarazel.de       3224                 :             50 :             bufHdr = GetLocalBufferDescriptor(-buffer - 1);
 9096 tgl@sss.pgh.pa.us        3225   [ -  +  -  - ]:             50 :             if (bufHdr->tag.blockNum == blockNum &&
 1350 rhaas@postgresql.org     3226         [ #  # ]:LBC      (7044) :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
                               3227                 :         (3522) :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
 9096 tgl@sss.pgh.pa.us        3228                 :         (3522) :                 return buffer;
 1126 andres@anarazel.de       3229                 :CBC          50 :             UnpinLocalBuffer(buffer);
                               3230                 :                :         }
                               3231                 :                :         else
                               3232                 :                :         {
 4114                          3233                 :           3596 :             bufHdr = GetBufferDescriptor(buffer - 1);
                               3234                 :                :             /* we have pin, so it's ok to examine tag without spinlock */
 9096 tgl@sss.pgh.pa.us        3235   [ -  +  -  - ]:           3596 :             if (bufHdr->tag.blockNum == blockNum &&
 1350 rhaas@postgresql.org     3236         [ #  # ]:LBC  (10961034) :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
                               3237                 :      (5480517) :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
 9096 tgl@sss.pgh.pa.us        3238                 :      (5480517) :                 return buffer;
 1313 michael@paquier.xyz      3239                 :CBC        3596 :             UnpinBuffer(bufHdr);
                               3240                 :                :         }
                               3241                 :                :     }
                               3242                 :                : 
 7732 tgl@sss.pgh.pa.us        3243                 :           3646 :     return ReadBuffer(relation, blockNum);
                               3244                 :                : }
                               3245                 :                : 
                               3246                 :                : /*
                               3247                 :                :  * PinBuffer -- make buffer unavailable for replacement.
                               3248                 :                :  *
                               3249                 :                :  * For the default access strategy, the buffer's usage_count is incremented
                               3250                 :                :  * when we first pin it; for other strategies we just make sure the usage_count
                               3251                 :                :  * isn't zero.  (The idea of the latter is that we don't want synchronized
                               3252                 :                :  * heap scans to inflate the count, but we need it to not be zero to discourage
                               3253                 :                :  * other backends from stealing buffers from our ring.  As long as we cycle
                               3254                 :                :  * through the ring faster than the global clock-sweep cycles, buffers in
                               3255                 :                :  * our ring won't be chosen as victims for replacement by other backends.)
                               3256                 :                :  *
                               3257                 :                :  * This should be applied only to shared buffers, never local ones.
                               3258                 :                :  *
                               3259                 :                :  * Since buffers are pinned/unpinned very frequently, pin buffers without
                               3260                 :                :  * taking the buffer header lock; instead update the state variable in loop of
                               3261                 :                :  * CAS operations. Hopefully it's just a single CAS.
                               3262                 :                :  *
                               3263                 :                :  * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
                               3264                 :                :  * must have been done already.
                               3265                 :                :  *
                               3266                 :                :  * Returns true if buffer is BM_VALID, else false.  This provision allows
                               3267                 :                :  * some callers to avoid an extra spinlock cycle.  If skip_if_not_valid is
                               3268                 :                :  * true, then a false return value also indicates that the buffer was
                               3269                 :                :  * (recently) invalid and has not been pinned.
                               3270                 :                :  */
                               3271                 :                : static bool
  209 andres@anarazel.de       3272                 :GNC    80752342 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
                               3273                 :                :           bool skip_if_not_valid)
                               3274                 :                : {
 3919 andres@anarazel.de       3275                 :CBC    80752342 :     Buffer      b = BufferDescriptorGetBuffer(buf);
                               3276                 :                :     bool        result;
                               3277                 :                :     PrivateRefCountEntry *ref;
                               3278                 :                : 
 1126                          3279         [ -  + ]:       80752342 :     Assert(!BufferIsLocal(b));
  142 andres@anarazel.de       3280         [ -  + ]:GNC    80752342 :     Assert(ReservedRefCountSlot != -1);
                               3281                 :                : 
 3919 andres@anarazel.de       3282                 :CBC    80752342 :     ref = GetPrivateRefCountEntry(b, true);
                               3283                 :                : 
 4124                          3284         [ +  + ]:       80752342 :     if (ref == NULL)
                               3285                 :                :     {
                               3286                 :                :         uint64      buf_state;
                               3287                 :                :         uint64      old_buf_state;
                               3288                 :                : 
  110 andres@anarazel.de       3289                 :GNC    73491569 :         old_buf_state = pg_atomic_read_u64(&buf->state);
                               3290                 :                :         for (;;)
                               3291                 :                :         {
  209                          3292   [ +  +  +  +  :       73496599 :             if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
                                              +  + ]
                               3293                 :              9 :                 return false;
                               3294                 :                : 
                               3295                 :                :             /*
                               3296                 :                :              * We're not allowed to increase the refcount while the buffer
                               3297                 :                :              * header spinlock is held. Wait for the lock to be released.
                               3298                 :                :              */
   35                          3299         [ +  + ]:       73496590 :             if (unlikely(old_buf_state & BM_LOCKED))
                               3300                 :                :             {
 3677 andres@anarazel.de       3301                 :CBC          89 :                 old_buf_state = WaitBufHdrUnlocked(buf);
                               3302                 :                : 
                               3303                 :                :                 /* perform checks at the top of the loop again */
   35 andres@anarazel.de       3304                 :GNC          89 :                 continue;
                               3305                 :                :             }
                               3306                 :                : 
 3677 andres@anarazel.de       3307                 :CBC    73496501 :             buf_state = old_buf_state;
                               3308                 :                : 
                               3309                 :                :             /* increase refcount */
                               3310                 :       73496501 :             buf_state += BUF_REFCOUNT_ONE;
                               3311                 :                : 
 3333 teodor@sigaev.ru         3312         [ +  + ]:       73496501 :             if (strategy == NULL)
                               3313                 :                :             {
                               3314                 :                :                 /* Default case: increase usagecount unless already max. */
                               3315         [ +  + ]:       72902725 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
                               3316                 :        3807635 :                     buf_state += BUF_USAGECOUNT_ONE;
                               3317                 :                :             }
                               3318                 :                :             else
                               3319                 :                :             {
                               3320                 :                :                 /*
                               3321                 :                :                  * Ring buffers shouldn't evict others from pool.  Thus we
                               3322                 :                :                  * don't make usagecount more than 1.
                               3323                 :                :                  */
                               3324         [ +  + ]:         593776 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
                               3325                 :          39959 :                     buf_state += BUF_USAGECOUNT_ONE;
                               3326                 :                :             }
                               3327                 :                : 
  110 andres@anarazel.de       3328         [ +  + ]:GNC    73496501 :             if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
                               3329                 :                :                                                buf_state))
                               3330                 :                :             {
 3677 andres@anarazel.de       3331                 :CBC    73491560 :                 result = (buf_state & BM_VALID) != 0;
                               3332                 :                : 
  209 andres@anarazel.de       3333                 :GNC    73491560 :                 TrackNewBufferPin(b);
 3677 andres@anarazel.de       3334                 :CBC    73491560 :                 break;
                               3335                 :                :             }
                               3336                 :                :         }
                               3337                 :                :     }
                               3338                 :                :     else
                               3339                 :                :     {
                               3340                 :                :         /*
                               3341                 :                :          * If we previously pinned the buffer, it is likely to be valid, but
                               3342                 :                :          * it may not be if StartReadBuffers() was called and
                               3343                 :                :          * WaitReadBuffers() hasn't been called yet.  We'll check by loading
                               3344                 :                :          * the flags without locking.  This is racy, but it's OK to return
                               3345                 :                :          * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
                               3346                 :                :          * it'll see that it's now valid.
                               3347                 :                :          *
                               3348                 :                :          * Note: We deliberately avoid a Valgrind client request here.
                               3349                 :                :          * Individual access methods can optionally superimpose buffer page
                               3350                 :                :          * client requests on top of our client requests to enforce that
                               3351                 :                :          * buffers are only accessed while locked (and pinned).  It's possible
                               3352                 :                :          * that the buffer page is legitimately non-accessible here.  We
                               3353                 :                :          * cannot meddle with that.
                               3354                 :                :          */
  110 andres@anarazel.de       3355                 :GNC     7260773 :         result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
                               3356                 :                : 
  142                          3357         [ -  + ]:        7260773 :         Assert(ref->data.refcount > 0);
                               3358                 :        7260773 :         ref->data.refcount++;
  209                          3359                 :        7260773 :         ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
                               3360                 :                :     }
                               3361                 :                : 
 7732 tgl@sss.pgh.pa.us        3362                 :CBC    80752333 :     return result;
                               3363                 :                : }
                               3364                 :                : 
                               3365                 :                : /*
                               3366                 :                :  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
                               3367                 :                :  * The spinlock is released before return.
                               3368                 :                :  *
                               3369                 :                :  * As this function is called with the spinlock held, the caller must have
                               3370                 :                :  * previously called ReservePrivateRefCountEntry() and
                               3371                 :                :  * ResourceOwnerEnlarge(CurrentResourceOwner).
                               3372                 :                :  *
                               3373                 :                :  * Currently, no callers of this function want to modify the buffer's
                               3374                 :                :  * usage_count at all, so there's no need for a strategy parameter.
                               3375                 :                :  * Also we don't bother with a BM_VALID test (the caller could check that for
                               3376                 :                :  * itself).
                               3377                 :                :  *
                               3378                 :                :  * Also all callers only ever use this function when it's known that the
                               3379                 :                :  * buffer can't have a preexisting pin by this backend. That allows us to skip
                               3380                 :                :  * searching the private refcount array & hash, which is a boon, because the
                               3381                 :                :  * spinlock is still held.
                               3382                 :                :  *
                               3383                 :                :  * Note: use of this routine is frequently mandatory, not just an optimization
                               3384                 :                :  * to save a spin lock/unlock cycle, because we need to pin a buffer before
                               3385                 :                :  * its state can change under us.
                               3386                 :                :  */
                               3387                 :                : static void
 3823 rhaas@postgresql.org     3388                 :         364711 : PinBuffer_Locked(BufferDesc *buf)
                               3389                 :                : {
                               3390                 :                :     uint64      old_buf_state;
                               3391                 :                : 
                               3392                 :                :     /*
                               3393                 :                :      * As explained, we don't expect any preexisting pins. That allows us to
                               3394                 :                :      * manipulate the PrivateRefCount after releasing the spinlock.
                               3395                 :                :      */
 3919 andres@anarazel.de       3396         [ -  + ]:         364711 :     Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
                               3397                 :                : 
                               3398                 :                :     /*
                               3399                 :                :      * Since we hold the buffer spinlock, we can update the buffer state and
                               3400                 :                :      * release the lock in one operation.
                               3401                 :                :      */
  110 andres@anarazel.de       3402                 :GNC      364711 :     old_buf_state = pg_atomic_read_u64(&buf->state);
                               3403                 :                : 
  180                          3404                 :         364711 :     UnlockBufHdrExt(buf, old_buf_state,
                               3405                 :                :                     0, 0, 1);
                               3406                 :                : 
  209                          3407                 :         364711 :     TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
 8051 tgl@sss.pgh.pa.us        3408                 :CBC      364711 : }
                               3409                 :                : 
                               3410                 :                : /*
                               3411                 :                :  * Support for waking up another backend that is waiting for the cleanup lock
                               3412                 :                :  * to be released using BM_PIN_COUNT_WAITER.
                               3413                 :                :  *
                               3414                 :                :  * See LockBufferForCleanup().
                               3415                 :                :  *
                               3416                 :                :  * Expected to be called just after releasing a buffer pin (in a BufferDesc,
                               3417                 :                :  * not just reducing the backend-local pincount for the buffer).
                               3418                 :                :  */
                               3419                 :                : static void
  401 andres@anarazel.de       3420                 :             63 : WakePinCountWaiter(BufferDesc *buf)
                               3421                 :                : {
                               3422                 :                :     /*
                               3423                 :                :      * Acquire the buffer header lock, re-check that there's a waiter. Another
                               3424                 :                :      * backend could have unpinned this buffer, and already woken up the
                               3425                 :                :      * waiter.
                               3426                 :                :      *
                               3427                 :                :      * There's no danger of the buffer being replaced after our caller
                               3428                 :                :      * unpinned it, as it's pinned by the waiter. The waiter removes
                               3429                 :                :      * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
                               3430                 :                :      * backend waking it up.
                               3431                 :                :      */
  110 andres@anarazel.de       3432                 :GNC          63 :     uint64      buf_state = LockBufHdr(buf);
                               3433                 :                : 
  401 andres@anarazel.de       3434         [ +  - ]:CBC          63 :     if ((buf_state & BM_PIN_COUNT_WAITER) &&
                               3435         [ +  + ]:             63 :         BUF_STATE_GET_REFCOUNT(buf_state) == 1)
                               3436                 :             59 :     {
                               3437                 :                :         /* we just released the last pin other than the waiter's */
                               3438                 :             59 :         int         wait_backend_pgprocno = buf->wait_backend_pgprocno;
                               3439                 :                : 
  180 andres@anarazel.de       3440                 :GNC          59 :         UnlockBufHdrExt(buf, buf_state,
                               3441                 :                :                         0, BM_PIN_COUNT_WAITER,
                               3442                 :                :                         0);
  401 andres@anarazel.de       3443                 :CBC          59 :         ProcSendSignal(wait_backend_pgprocno);
                               3444                 :                :     }
                               3445                 :                :     else
  180 andres@anarazel.de       3446                 :GNC           4 :         UnlockBufHdr(buf);
  401 andres@anarazel.de       3447                 :CBC          63 : }
                               3448                 :                : 
                               3449                 :                : /*
                               3450                 :                :  * UnpinBuffer -- make buffer available for replacement.
                               3451                 :                :  *
                               3452                 :                :  * This should be applied only to shared buffers, never local ones.  This
                               3453                 :                :  * always adjusts CurrentResourceOwner.
                               3454                 :                :  */
                               3455                 :                : static void
 1313 michael@paquier.xyz      3456                 :       48210631 : UnpinBuffer(BufferDesc *buf)
                               3457                 :                : {
  909 heikki.linnakangas@i     3458                 :       48210631 :     Buffer      b = BufferDescriptorGetBuffer(buf);
                               3459                 :                : 
                               3460                 :       48210631 :     ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
                               3461                 :       48210631 :     UnpinBufferNoOwner(buf);
                               3462                 :       48210631 : }
                               3463                 :                : 
                               3464                 :                : static void
                               3465                 :       48217242 : UnpinBufferNoOwner(BufferDesc *buf)
                               3466                 :                : {
                               3467                 :                :     PrivateRefCountEntry *ref;
 3919 andres@anarazel.de       3468                 :       48217242 :     Buffer      b = BufferDescriptorGetBuffer(buf);
                               3469                 :                : 
 1126                          3470         [ -  + ]:       48217242 :     Assert(!BufferIsLocal(b));
                               3471                 :                : 
                               3472                 :                :     /* not moving as we're likely deleting it soon anyway */
 3919                          3473                 :       48217242 :     ref = GetPrivateRefCountEntry(b, false);
 4266                          3474         [ -  + ]:       48217242 :     Assert(ref != NULL);
  142 andres@anarazel.de       3475         [ -  + ]:GNC    48217242 :     Assert(ref->data.refcount > 0);
                               3476                 :       48217242 :     ref->data.refcount--;
                               3477         [ +  + ]:       48217242 :     if (ref->data.refcount == 0)
                               3478                 :                :     {
                               3479                 :                :         uint64      old_buf_state;
                               3480                 :                : 
                               3481                 :                :         /*
                               3482                 :                :          * Mark buffer non-accessible to Valgrind.
                               3483                 :                :          *
                               3484                 :                :          * Note that the buffer may have already been marked non-accessible
                               3485                 :                :          * within access method code that enforces that buffers are only
                               3486                 :                :          * accessed while a buffer lock is held.
                               3487                 :                :          */
                               3488                 :                :         VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
                               3489                 :                : 
                               3490                 :                :         /*
                               3491                 :                :          * I'd better not still hold the buffer content lock. Can't use
                               3492                 :                :          * BufferIsLockedByMe(), as that asserts the buffer is pinned.
                               3493                 :                :          */
  110                          3494         [ -  + ]:       29258394 :         Assert(!BufferLockHeldByMe(buf));
                               3495                 :                : 
                               3496                 :                :         /* decrement the shared reference count */
                               3497                 :       29258394 :         old_buf_state = pg_atomic_fetch_sub_u64(&buf->state, BUF_REFCOUNT_ONE);
                               3498                 :                : 
                               3499                 :                :         /* Support LockBufferForCleanup() */
  180                          3500         [ +  + ]:       29258394 :         if (old_buf_state & BM_PIN_COUNT_WAITER)
  401 andres@anarazel.de       3501                 :CBC          58 :             WakePinCountWaiter(buf);
                               3502                 :                : 
 4266                          3503                 :       29258394 :         ForgetPrivateRefCountEntry(ref);
                               3504                 :                :     }
 8051 tgl@sss.pgh.pa.us        3505                 :       48217242 : }
                               3506                 :                : 
                               3507                 :                : /*
                               3508                 :                :  * Set up backend-local tracking of a buffer pinned the first time by this
                               3509                 :                :  * backend.
                               3510                 :                :  */
                               3511                 :                : inline void
  209 andres@anarazel.de       3512                 :GNC    76120912 : TrackNewBufferPin(Buffer buf)
                               3513                 :                : {
                               3514                 :                :     PrivateRefCountEntry *ref;
                               3515                 :                : 
                               3516                 :       76120912 :     ref = NewPrivateRefCountEntry(buf);
  142                          3517                 :       76120912 :     ref->data.refcount++;
                               3518                 :                : 
  209                          3519                 :       76120912 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
                               3520                 :                : 
                               3521                 :                :     /*
                               3522                 :                :      * This is the first pin for this page by this backend, so mark the page
                               3523                 :                :      * as defined to Valgrind. While the page contents might not actually be
                               3524                 :                :      * valid yet, we don't currently guarantee that such pages are marked
                               3525                 :                :      * undefined or non-accessible.
                               3526                 :                :      *
                               3527                 :                :      * It's not necessarily the prettiest to do this here, but otherwise we'd
                               3528                 :                :      * need this block of code in multiple places.
                               3529                 :                :      */
                               3530                 :                :     VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(GetBufferDescriptor(buf - 1)),
                               3531                 :                :                               BLCKSZ);
                               3532                 :       76120912 : }
                               3533                 :                : 
                               3534                 :                : #define ST_SORT sort_checkpoint_bufferids
                               3535                 :                : #define ST_ELEMENT_TYPE CkptSortItem
                               3536                 :                : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
                               3537                 :                : #define ST_SCOPE static
                               3538                 :                : #define ST_DEFINE
                               3539                 :                : #include "lib/sort_template.h"
                               3540                 :                : 
                               3541                 :                : /*
                               3542                 :                :  * BufferSync -- Write out all dirty buffers in the pool.
                               3543                 :                :  *
                               3544                 :                :  * This is called at checkpoint time to write out all dirty shared buffers.
                               3545                 :                :  * The checkpoint request flags should be passed in.  If CHECKPOINT_FAST is
                               3546                 :                :  * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
                               3547                 :                :  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
                               3548                 :                :  * even unlogged buffers, which are otherwise skipped.  The remaining flags
                               3549                 :                :  * currently have no effect here.
                               3550                 :                :  */
                               3551                 :                : static void
 6886 tgl@sss.pgh.pa.us        3552                 :CBC        1944 : BufferSync(int flags)
                               3553                 :                : {
                               3554                 :                :     uint64      buf_state;
                               3555                 :                :     int         buf_id;
                               3556                 :                :     int         num_to_scan;
                               3557                 :                :     int         num_spaces;
                               3558                 :                :     int         num_processed;
                               3559                 :                :     int         num_written;
 3728 andres@anarazel.de       3560                 :           1944 :     CkptTsStatus *per_ts_stat = NULL;
                               3561                 :                :     Oid         last_tsid;
                               3562                 :                :     binaryheap *ts_heap;
                               3563                 :                :     int         i;
  110 andres@anarazel.de       3564                 :GNC        1944 :     uint64      mask = BM_DIRTY;
                               3565                 :                :     WritebackContext wb_context;
                               3566                 :                : 
                               3567                 :                :     /*
                               3568                 :                :      * Unless this is a shutdown checkpoint or we have been explicitly told,
                               3569                 :                :      * we write only permanent, dirty buffers.  But at shutdown or end of
                               3570                 :                :      * recovery, we write all dirty buffers.
                               3571                 :                :      */
 4215 andres@anarazel.de       3572         [ +  + ]:CBC        1944 :     if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
                               3573                 :                :                     CHECKPOINT_FLUSH_UNLOGGED))))
 5233 rhaas@postgresql.org     3574                 :           1063 :         mask |= BM_PERMANENT;
                               3575                 :                : 
                               3576                 :                :     /*
                               3577                 :                :      * Loop over all buffers, and mark the ones that need to be written with
                               3578                 :                :      * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_scan), so that we
                               3579                 :                :      * can estimate how much work needs to be done.
                               3580                 :                :      *
                               3581                 :                :      * This allows us to write only those pages that were dirty when the
                               3582                 :                :      * checkpoint began, and not those that get dirtied while it proceeds.
                               3583                 :                :      * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
                               3584                 :                :      * later in this function, or by normal backends or the bgwriter cleaning
                               3585                 :                :      * scan, the flag is cleared.  Any buffer dirtied after this point won't
                               3586                 :                :      * have the flag set.
                               3587                 :                :      *
                               3588                 :                :      * Note that if we fail to write some buffer, we may leave buffers with
                               3589                 :                :      * BM_CHECKPOINT_NEEDED still set.  This is OK since any such buffer would
                               3590                 :                :      * certainly need to be written for the next checkpoint attempt, too.
                               3591                 :                :      */
 3728 andres@anarazel.de       3592                 :           1944 :     num_to_scan = 0;
 6886 tgl@sss.pgh.pa.us        3593         [ +  + ]:       14003048 :     for (buf_id = 0; buf_id < NBuffers; buf_id++)
                               3594                 :                :     {
 3823 rhaas@postgresql.org     3595                 :       14001104 :         BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  110 andres@anarazel.de       3596                 :GNC    14001104 :         uint64      set_bits = 0;
                               3597                 :                : 
                               3598                 :                :         /*
                               3599                 :                :          * Header spinlock is enough to examine BM_DIRTY, see comment in
                               3600                 :                :          * SyncOneBuffer.
                               3601                 :                :          */
 3677 andres@anarazel.de       3602                 :CBC    14001104 :         buf_state = LockBufHdr(bufHdr);
                               3603                 :                : 
                               3604         [ +  + ]:       14001104 :         if ((buf_state & mask) == mask)
                               3605                 :                :         {
                               3606                 :                :             CkptSortItem *item;
                               3607                 :                : 
  180 andres@anarazel.de       3608                 :GNC      344186 :             set_bits = BM_CHECKPOINT_NEEDED;
                               3609                 :                : 
 3728 andres@anarazel.de       3610                 :CBC      344186 :             item = &CkptBufferIds[num_to_scan++];
                               3611                 :         344186 :             item->buf_id = buf_id;
 1350 rhaas@postgresql.org     3612                 :         344186 :             item->tsId = bufHdr->tag.spcOid;
                               3613                 :         344186 :             item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
                               3614                 :         344186 :             item->forkNum = BufTagGetForkNum(&bufHdr->tag);
 3728 andres@anarazel.de       3615                 :         344186 :             item->blockNum = bufHdr->tag.blockNum;
                               3616                 :                :         }
                               3617                 :                : 
  180 andres@anarazel.de       3618                 :GNC    14001104 :         UnlockBufHdrExt(bufHdr, buf_state,
                               3619                 :                :                         set_bits, 0,
                               3620                 :                :                         0);
                               3621                 :                : 
                               3622                 :                :         /* Check for barrier events in case NBuffers is large. */
 2329 rhaas@postgresql.org     3623         [ -  + ]:CBC    14001104 :         if (ProcSignalBarrierPending)
 2329 rhaas@postgresql.org     3624                 :UBC           0 :             ProcessProcSignalBarrier();
                               3625                 :                :     }
                               3626                 :                : 
 3728 andres@anarazel.de       3627         [ +  + ]:CBC        1944 :     if (num_to_scan == 0)
 6886 tgl@sss.pgh.pa.us        3628                 :            738 :         return;                 /* nothing to do */
                               3629                 :                : 
 3728 andres@anarazel.de       3630                 :           1206 :     WritebackContextInit(&wb_context, &checkpoint_flush_after);
                               3631                 :                : 
                               3632                 :                :     TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
                               3633                 :                : 
                               3634                 :                :     /*
                               3635                 :                :      * Sort buffers that need to be written to reduce the likelihood of random
                               3636                 :                :      * IO. The sorting is also important for the implementation of balancing
                               3637                 :                :      * writes between tablespaces. Without balancing writes we'd potentially
                               3638                 :                :      * end up writing to the tablespaces one-by-one; possibly overloading the
                               3639                 :                :      * underlying system.
                               3640                 :                :      */
 1880 tmunro@postgresql.or     3641                 :           1206 :     sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
                               3642                 :                : 
 3728 andres@anarazel.de       3643                 :           1206 :     num_spaces = 0;
                               3644                 :                : 
                               3645                 :                :     /*
                               3646                 :                :      * Allocate progress status for each tablespace with buffers that need to
                               3647                 :                :      * be flushed. This requires the to-be-flushed array to be sorted.
                               3648                 :                :      */
                               3649                 :           1206 :     last_tsid = InvalidOid;
                               3650         [ +  + ]:         345392 :     for (i = 0; i < num_to_scan; i++)
                               3651                 :                :     {
                               3652                 :                :         CkptTsStatus *s;
                               3653                 :                :         Oid         cur_tsid;
                               3654                 :                : 
                               3655                 :         344186 :         cur_tsid = CkptBufferIds[i].tsId;
                               3656                 :                : 
                               3657                 :                :         /*
                               3658                 :                :          * Grow array of per-tablespace status structs, every time a new
                               3659                 :                :          * tablespace is found.
                               3660                 :                :          */
                               3661   [ +  +  +  + ]:         344186 :         if (last_tsid == InvalidOid || last_tsid != cur_tsid)
                               3662                 :           1807 :         {
                               3663                 :                :             Size        sz;
                               3664                 :                : 
                               3665                 :           1807 :             num_spaces++;
                               3666                 :                : 
                               3667                 :                :             /*
                               3668                 :                :              * Not worth adding grow-by-power-of-2 logic here - even with a
                               3669                 :                :              * few hundred tablespaces this should be fine.
                               3670                 :                :              */
                               3671                 :           1807 :             sz = sizeof(CkptTsStatus) * num_spaces;
                               3672                 :                : 
                               3673         [ +  + ]:           1807 :             if (per_ts_stat == NULL)
                               3674                 :           1206 :                 per_ts_stat = (CkptTsStatus *) palloc(sz);
                               3675                 :                :             else
                               3676                 :            601 :                 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
                               3677                 :                : 
                               3678                 :           1807 :             s = &per_ts_stat[num_spaces - 1];
                               3679                 :           1807 :             memset(s, 0, sizeof(*s));
                               3680                 :           1807 :             s->tsId = cur_tsid;
                               3681                 :                : 
                               3682                 :                :             /*
                               3683                 :                :              * The first buffer in this tablespace. As CkptBufferIds is sorted
                               3684                 :                :              * by tablespace all (s->num_to_scan) buffers in this tablespace
                               3685                 :                :              * will follow afterwards.
                               3686                 :                :              */
                               3687                 :           1807 :             s->index = i;
                               3688                 :                : 
                               3689                 :                :             /*
                               3690                 :                :              * progress_slice will be determined once we know how many buffers
                               3691                 :                :              * are in each tablespace, i.e. after this loop.
                               3692                 :                :              */
                               3693                 :                : 
                               3694                 :           1807 :             last_tsid = cur_tsid;
                               3695                 :                :         }
                               3696                 :                :         else
                               3697                 :                :         {
                               3698                 :         342379 :             s = &per_ts_stat[num_spaces - 1];
                               3699                 :                :         }
                               3700                 :                : 
                               3701                 :         344186 :         s->num_to_scan++;
                               3702                 :                : 
                               3703                 :                :         /* Check for barrier events. */
 2329 rhaas@postgresql.org     3704         [ -  + ]:         344186 :         if (ProcSignalBarrierPending)
 2329 rhaas@postgresql.org     3705                 :UBC           0 :             ProcessProcSignalBarrier();
                               3706                 :                :     }
                               3707                 :                : 
 3728 andres@anarazel.de       3708         [ -  + ]:CBC        1206 :     Assert(num_spaces > 0);
                               3709                 :                : 
                               3710                 :                :     /*
                               3711                 :                :      * Build a min-heap over the write-progress in the individual tablespaces,
                               3712                 :                :      * and compute how large a portion of the total progress a single
                               3713                 :                :      * processed buffer is.
                               3714                 :                :      */
                               3715                 :           1206 :     ts_heap = binaryheap_allocate(num_spaces,
                               3716                 :                :                                   ts_ckpt_progress_comparator,
                               3717                 :                :                                   NULL);
                               3718                 :                : 
                               3719         [ +  + ]:           3013 :     for (i = 0; i < num_spaces; i++)
                               3720                 :                :     {
                               3721                 :           1807 :         CkptTsStatus *ts_stat = &per_ts_stat[i];
                               3722                 :                : 
                               3723                 :           1807 :         ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
                               3724                 :                : 
                               3725                 :           1807 :         binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
                               3726                 :                :     }
                               3727                 :                : 
                               3728                 :           1206 :     binaryheap_build(ts_heap);
                               3729                 :                : 
                               3730                 :                :     /*
                               3731                 :                :      * Iterate through to-be-checkpointed buffers and write the ones (still)
                               3732                 :                :      * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
                               3733                 :                :      * tablespaces; otherwise the sorting would lead to only one tablespace
                               3734                 :                :      * receiving writes at a time, making inefficient use of the hardware.
                               3735                 :                :      */
                               3736                 :           1206 :     num_processed = 0;
 6886 tgl@sss.pgh.pa.us        3737                 :           1206 :     num_written = 0;
 3728 andres@anarazel.de       3738         [ +  + ]:         345392 :     while (!binaryheap_empty(ts_heap))
                               3739                 :                :     {
                               3740                 :         344186 :         BufferDesc *bufHdr = NULL;
                               3741                 :                :         CkptTsStatus *ts_stat = (CkptTsStatus *)
 1082 tgl@sss.pgh.pa.us        3742                 :         344186 :             DatumGetPointer(binaryheap_first(ts_heap));
                               3743                 :                : 
 3728 andres@anarazel.de       3744                 :         344186 :         buf_id = CkptBufferIds[ts_stat->index].buf_id;
                               3745         [ -  + ]:         344186 :         Assert(buf_id != -1);
                               3746                 :                : 
                               3747                 :         344186 :         bufHdr = GetBufferDescriptor(buf_id);
                               3748                 :                : 
                               3749                 :         344186 :         num_processed++;
                               3750                 :                : 
                               3751                 :                :         /*
                               3752                 :                :          * We don't need to acquire the lock here, because we're only looking
                               3753                 :                :          * at a single bit. It's possible that someone else writes the buffer
                               3754                 :                :          * and clears the flag right after we check, but that doesn't matter
                               3755                 :                :          * since SyncOneBuffer will then do nothing.  However, there is a
                               3756                 :                :          * further race condition: it's conceivable that between the time we
                               3757                 :                :          * examine the bit here and the time SyncOneBuffer acquires the lock,
                               3758                 :                :          * someone else not only wrote the buffer but replaced it with another
                               3759                 :                :          * page and dirtied it.  In that improbable case, SyncOneBuffer will
                               3760                 :                :          * write the buffer though we didn't need to.  It doesn't seem worth
                               3761                 :                :          * guarding against this, though.
                               3762                 :                :          */
  110 andres@anarazel.de       3763         [ +  + ]:GNC      344186 :         if (pg_atomic_read_u64(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
                               3764                 :                :         {
 3728 andres@anarazel.de       3765         [ +  + ]:CBC      321605 :             if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
                               3766                 :                :             {
                               3767                 :                :                 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
  918 michael@paquier.xyz      3768                 :         321604 :                 PendingCheckpointerStats.buffers_written++;
 6886 tgl@sss.pgh.pa.us        3769                 :         321604 :                 num_written++;
                               3770                 :                :             }
                               3771                 :                :         }
                               3772                 :                : 
                               3773                 :                :         /*
                               3774                 :                :          * Measure progress independent of actually having to flush the buffer
                               3775                 :                :          * - otherwise writing becomes unbalanced.
                               3776                 :                :          */
 3728 andres@anarazel.de       3777                 :         344186 :         ts_stat->progress += ts_stat->progress_slice;
                               3778                 :         344186 :         ts_stat->num_scanned++;
                               3779                 :         344186 :         ts_stat->index++;
                               3780                 :                : 
                               3781                 :                :         /* Have all the buffers from the tablespace been processed? */
                               3782         [ +  + ]:         344186 :         if (ts_stat->num_scanned == ts_stat->num_to_scan)
                               3783                 :                :         {
                               3784                 :           1807 :             binaryheap_remove_first(ts_heap);
                               3785                 :                :         }
                               3786                 :                :         else
                               3787                 :                :         {
                               3788                 :                :             /* update heap with the new progress */
                               3789                 :         342379 :             binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
                               3790                 :                :         }
                               3791                 :                : 
                               3792                 :                :         /*
                               3793                 :                :          * Sleep to throttle our I/O rate.
                               3794                 :                :          *
                               3795                 :                :          * (This will check for barrier events even if it doesn't sleep.)
                               3796                 :                :          */
                               3797                 :         344186 :         CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
                               3798                 :                :     }
                               3799                 :                : 
                               3800                 :                :     /*
                               3801                 :                :      * Issue all pending flushes. Only checkpointer calls BufferSync(), so
                               3802                 :                :      * IOContext will always be IOCONTEXT_NORMAL.
                               3803                 :                :      */
 1084                          3804                 :           1206 :     IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
                               3805                 :                : 
 3728                          3806                 :           1206 :     pfree(per_ts_stat);
                               3807                 :           1206 :     per_ts_stat = NULL;
                               3808                 :           1206 :     binaryheap_free(ts_heap);
                               3809                 :                : 
                               3810                 :                :     /*
                               3811                 :                :      * Update checkpoint statistics. As noted above, this doesn't include
                               3812                 :                :      * buffers written by other backends or bgwriter scan.
                               3813                 :                :      */
 6884 tgl@sss.pgh.pa.us        3814                 :           1206 :     CheckpointStats.ckpt_bufs_written += num_written;
                               3815                 :                : 
                               3816                 :                :     TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
                               3817                 :                : }
                               3818                 :                : 
                               3819                 :                : /*
                               3820                 :                :  * BgBufferSync -- Write out some dirty buffers in the pool.
                               3821                 :                :  *
                               3822                 :                :  * This is called periodically by the background writer process.
                               3823                 :                :  *
                               3824                 :                :  * Returns true if it's appropriate for the bgwriter process to go into
                               3825                 :                :  * low-power hibernation mode.  (This happens if the strategy clock-sweep
                               3826                 :                :  * has been "lapped" and no buffer allocations have occurred recently,
                               3827                 :                :  * or if the bgwriter has been effectively disabled by setting
                               3828                 :                :  * bgwriter_lru_maxpages to 0.)
                               3829                 :                :  */
                               3830                 :                : bool
 3728 andres@anarazel.de       3831                 :          13948 : BgBufferSync(WritebackContext *wb_context)
                               3832                 :                : {
                               3833                 :                :     /* info obtained from freelist.c */
                               3834                 :                :     int         strategy_buf_id;
                               3835                 :                :     uint32      strategy_passes;
                               3836                 :                :     uint32      recent_alloc;
                               3837                 :                : 
                               3838                 :                :     /*
                               3839                 :                :      * Information saved between calls so we can determine the strategy
                               3840                 :                :      * point's advance rate and avoid scanning already-cleaned buffers.
                               3841                 :                :      */
                               3842                 :                :     static bool saved_info_valid = false;
                               3843                 :                :     static int  prev_strategy_buf_id;
                               3844                 :                :     static uint32 prev_strategy_passes;
                               3845                 :                :     static int  next_to_clean;
                               3846                 :                :     static uint32 next_passes;
                               3847                 :                : 
                               3848                 :                :     /* Moving averages of allocation rate and clean-buffer density */
                               3849                 :                :     static float smoothed_alloc = 0;
                               3850                 :                :     static float smoothed_density = 10.0;
                               3851                 :                : 
                               3852                 :                :     /* Potentially these could be tunables, but for now, not */
 6797 tgl@sss.pgh.pa.us        3853                 :          13948 :     float       smoothing_samples = 16;
                               3854                 :          13948 :     float       scan_whole_pool_milliseconds = 120000.0;
                               3855                 :                : 
                               3856                 :                :     /* Used to compute how far we scan ahead */
                               3857                 :                :     long        strategy_delta;
                               3858                 :                :     int         bufs_to_lap;
                               3859                 :                :     int         bufs_ahead;
                               3860                 :                :     float       scans_per_alloc;
                               3861                 :                :     int         reusable_buffers_est;
                               3862                 :                :     int         upcoming_alloc_est;
                               3863                 :                :     int         min_scan_buffers;
                               3864                 :                : 
                               3865                 :                :     /* Variables for the scanning loop proper */
                               3866                 :                :     int         num_to_scan;
                               3867                 :                :     int         num_written;
                               3868                 :                :     int         reusable_buffers;
                               3869                 :                : 
                               3870                 :                :     /* Variables for final smoothed_density update */
                               3871                 :                :     long        new_strategy_delta;
                               3872                 :                :     uint32      new_recent_alloc;
                               3873                 :                : 
                               3874                 :                :     /*
                               3875                 :                :      * Find out where the clock-sweep currently is, and how many buffer
                               3876                 :                :      * allocations have happened since our last call.
                               3877                 :                :      */
                               3878                 :          13948 :     strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
                               3879                 :                : 
                               3880                 :                :     /* Report buffer alloc counts to pgstat */
 1490 andres@anarazel.de       3881                 :          13948 :     PendingBgWriterStats.buf_alloc += recent_alloc;
                               3882                 :                : 
                               3883                 :                :     /*
                               3884                 :                :      * If we're not running the LRU scan, just stop after doing the stats
                               3885                 :                :      * stuff.  We mark the saved state invalid so that we can recover sanely
                               3886                 :                :      * if LRU scan is turned back on later.
                               3887                 :                :      */
 6797 tgl@sss.pgh.pa.us        3888         [ +  + ]:          13948 :     if (bgwriter_lru_maxpages <= 0)
                               3889                 :                :     {
                               3890                 :             50 :         saved_info_valid = false;
 5213 heikki.linnakangas@i     3891                 :             50 :         return true;
                               3892                 :                :     }
                               3893                 :                : 
                               3894                 :                :     /*
                               3895                 :                :      * Compute strategy_delta = how many buffers have been scanned by the
                               3896                 :                :      * clock-sweep since last time.  If first time through, assume none. Then
                               3897                 :                :      * see if we are still ahead of the clock-sweep, and if so, how many
                               3898                 :                :      * buffers we could scan before we'd catch up with it and "lap" it. Note:
                               3899                 :                :      * weird-looking coding of xxx_passes comparisons is to avoid bogus
                               3900                 :                :      * behavior when the passes counts wrap around.
                               3901                 :                :      */
 6797 tgl@sss.pgh.pa.us        3902         [ +  + ]:          13898 :     if (saved_info_valid)
                               3903                 :                :     {
 6746 bruce@momjian.us         3904                 :          13266 :         int32       passes_delta = strategy_passes - prev_strategy_passes;
                               3905                 :                : 
 6797 tgl@sss.pgh.pa.us        3906                 :          13266 :         strategy_delta = strategy_buf_id - prev_strategy_buf_id;
 3240                          3907                 :          13266 :         strategy_delta += (long) passes_delta * NBuffers;
                               3908                 :                : 
 6797                          3909         [ -  + ]:          13266 :         Assert(strategy_delta >= 0);
                               3910                 :                : 
                               3911         [ +  + ]:          13266 :         if ((int32) (next_passes - strategy_passes) > 0)
                               3912                 :                :         {
                               3913                 :                :             /* we're one pass ahead of the strategy point */
                               3914                 :           3117 :             bufs_to_lap = strategy_buf_id - next_to_clean;
                               3915                 :                : #ifdef BGW_DEBUG
                               3916                 :                :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
                               3917                 :                :                  next_passes, next_to_clean,
                               3918                 :                :                  strategy_passes, strategy_buf_id,
                               3919                 :                :                  strategy_delta, bufs_to_lap);
                               3920                 :                : #endif
                               3921                 :                :         }
                               3922         [ +  + ]:          10149 :         else if (next_passes == strategy_passes &&
                               3923         [ +  + ]:           7068 :                  next_to_clean >= strategy_buf_id)
                               3924                 :                :         {
                               3925                 :                :             /* on same pass, but ahead or at least not behind */
                               3926                 :           6080 :             bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
                               3927                 :                : #ifdef BGW_DEBUG
                               3928                 :                :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
                               3929                 :                :                  next_passes, next_to_clean,
                               3930                 :                :                  strategy_passes, strategy_buf_id,
                               3931                 :                :                  strategy_delta, bufs_to_lap);
                               3932                 :                : #endif
                               3933                 :                :         }
                               3934                 :                :         else
                               3935                 :                :         {
                               3936                 :                :             /*
                               3937                 :                :              * We're behind, so skip forward to the strategy point and start
                               3938                 :                :              * cleaning from there.
                               3939                 :                :              */
                               3940                 :                : #ifdef BGW_DEBUG
                               3941                 :                :             elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
                               3942                 :                :                  next_passes, next_to_clean,
                               3943                 :                :                  strategy_passes, strategy_buf_id,
                               3944                 :                :                  strategy_delta);
                               3945                 :                : #endif
                               3946                 :           4069 :             next_to_clean = strategy_buf_id;
                               3947                 :           4069 :             next_passes = strategy_passes;
                               3948                 :           4069 :             bufs_to_lap = NBuffers;
                               3949                 :                :         }
                               3950                 :                :     }
                               3951                 :                :     else
                               3952                 :                :     {
                               3953                 :                :         /*
                               3954                 :                :          * Initializing at startup or after LRU scanning had been off. Always
                               3955                 :                :          * start at the strategy point.
                               3956                 :                :          */
                               3957                 :                : #ifdef BGW_DEBUG
                               3958                 :                :         elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
                               3959                 :                :              strategy_passes, strategy_buf_id);
                               3960                 :                : #endif
                               3961                 :            632 :         strategy_delta = 0;
                               3962                 :            632 :         next_to_clean = strategy_buf_id;
                               3963                 :            632 :         next_passes = strategy_passes;
                               3964                 :            632 :         bufs_to_lap = NBuffers;
                               3965                 :                :     }
                               3966                 :                : 
                               3967                 :                :     /* Update saved info for next time */
                               3968                 :          13898 :     prev_strategy_buf_id = strategy_buf_id;
                               3969                 :          13898 :     prev_strategy_passes = strategy_passes;
                               3970                 :          13898 :     saved_info_valid = true;
                               3971                 :                : 
                               3972                 :                :     /*
                               3973                 :                :      * Compute how many buffers had to be scanned for each new allocation, ie,
                               3974                 :                :      * 1/density of reusable buffers, and track a moving average of that.
                               3975                 :                :      *
                               3976                 :                :      * If the strategy point didn't move, we don't update the density estimate
                               3977                 :                :      */
                               3978   [ +  +  +  - ]:          13898 :     if (strategy_delta > 0 && recent_alloc > 0)
                               3979                 :                :     {
                               3980                 :           9469 :         scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
                               3981                 :           9469 :         smoothed_density += (scans_per_alloc - smoothed_density) /
                               3982                 :                :             smoothing_samples;
                               3983                 :                :     }
                               3984                 :                : 
                               3985                 :                :     /*
                               3986                 :                :      * Estimate how many reusable buffers there are between the current
                               3987                 :                :      * strategy point and where we've scanned ahead to, based on the smoothed
                               3988                 :                :      * density estimate.
                               3989                 :                :      */
                               3990                 :          13898 :     bufs_ahead = NBuffers - bufs_to_lap;
                               3991                 :          13898 :     reusable_buffers_est = (float) bufs_ahead / smoothed_density;
                               3992                 :                : 
                               3993                 :                :     /*
                               3994                 :                :      * Track a moving average of recent buffer allocations.  Here, rather than
                               3995                 :                :      * a true average we want a fast-attack, slow-decline behavior: we
                               3996                 :                :      * immediately follow any increase.
                               3997                 :                :      */
                               3998         [ +  + ]:          13898 :     if (smoothed_alloc <= (float) recent_alloc)
                               3999                 :           2857 :         smoothed_alloc = recent_alloc;
                               4000                 :                :     else
                               4001                 :          11041 :         smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
                               4002                 :                :             smoothing_samples;
                               4003                 :                : 
                               4004                 :                :     /* Scale the estimate by a GUC to allow more aggressive tuning. */
 5281                          4005                 :          13898 :     upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
                               4006                 :                : 
                               4007                 :                :     /*
                               4008                 :                :      * If recent_alloc remains at zero for many cycles, smoothed_alloc will
                               4009                 :                :      * eventually underflow to zero, and the underflows produce annoying
                               4010                 :                :      * kernel warnings on some platforms.  Once upcoming_alloc_est has gone to
                               4011                 :                :      * zero, there's no point in tracking smaller and smaller values of
                               4012                 :                :      * smoothed_alloc, so just reset it to exactly zero to avoid this
                               4013                 :                :      * syndrome.  It will pop back up as soon as recent_alloc increases.
                               4014                 :                :      */
                               4015         [ +  + ]:          13898 :     if (upcoming_alloc_est == 0)
                               4016                 :           1081 :         smoothed_alloc = 0;
                               4017                 :                : 
                               4018                 :                :     /*
                               4019                 :                :      * Even in cases where there's been little or no buffer allocation
                               4020                 :                :      * activity, we want to make a small amount of progress through the buffer
                               4021                 :                :      * cache so that as many reusable buffers as possible are clean after an
                               4022                 :                :      * idle period.
                               4023                 :                :      *
                               4024                 :                :      * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
                               4025                 :                :      * the BGW will be called during the scan_whole_pool time; slice the
                               4026                 :                :      * buffer pool into that many sections.
                               4027                 :                :      */
 6797                          4028                 :          13898 :     min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
                               4029                 :                : 
                               4030         [ +  + ]:          13898 :     if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
                               4031                 :                :     {
                               4032                 :                : #ifdef BGW_DEBUG
                               4033                 :                :         elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
                               4034                 :                :              upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
                               4035                 :                : #endif
                               4036                 :           6362 :         upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
                               4037                 :                :     }
                               4038                 :                : 
                               4039                 :                :     /*
                               4040                 :                :      * Now write out dirty reusable buffers, working forward from the
                               4041                 :                :      * next_to_clean point, until we have lapped the strategy scan, or cleaned
                               4042                 :                :      * enough buffers to match our estimate of the next cycle's allocation
                               4043                 :                :      * requirements, or hit the bgwriter_lru_maxpages limit.
                               4044                 :                :      */
                               4045                 :                : 
                               4046                 :          13898 :     num_to_scan = bufs_to_lap;
                               4047                 :          13898 :     num_written = 0;
                               4048                 :          13898 :     reusable_buffers = reusable_buffers_est;
                               4049                 :                : 
                               4050                 :                :     /* Execute the LRU scan */
                               4051   [ +  +  +  + ]:        2069212 :     while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
                               4052                 :                :     {
 3677 andres@anarazel.de       4053                 :        2055316 :         int         sync_state = SyncOneBuffer(next_to_clean, true,
                               4054                 :                :                                                wb_context);
                               4055                 :                : 
 6797 tgl@sss.pgh.pa.us        4056         [ +  + ]:        2055316 :         if (++next_to_clean >= NBuffers)
                               4057                 :                :         {
                               4058                 :           3867 :             next_to_clean = 0;
                               4059                 :           3867 :             next_passes++;
                               4060                 :                :         }
                               4061                 :        2055316 :         num_to_scan--;
                               4062                 :                : 
 3677 andres@anarazel.de       4063         [ +  + ]:        2055316 :         if (sync_state & BUF_WRITTEN)
                               4064                 :                :         {
 6797 tgl@sss.pgh.pa.us        4065                 :          37521 :             reusable_buffers++;
                               4066         [ +  + ]:          37521 :             if (++num_written >= bgwriter_lru_maxpages)
                               4067                 :                :             {
 1490 andres@anarazel.de       4068                 :              2 :                 PendingBgWriterStats.maxwritten_clean++;
 6797 tgl@sss.pgh.pa.us        4069                 :              2 :                 break;
                               4070                 :                :             }
                               4071                 :                :         }
 3677 andres@anarazel.de       4072         [ +  + ]:        2017795 :         else if (sync_state & BUF_REUSABLE)
 6797 tgl@sss.pgh.pa.us        4073                 :        1507796 :             reusable_buffers++;
                               4074                 :                :     }
                               4075                 :                : 
 1490 andres@anarazel.de       4076                 :          13898 :     PendingBgWriterStats.buf_written_clean += num_written;
                               4077                 :                : 
                               4078                 :                : #ifdef BGW_DEBUG
                               4079                 :                :     elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
                               4080                 :                :          recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
                               4081                 :                :          smoothed_density, reusable_buffers_est, upcoming_alloc_est,
                               4082                 :                :          bufs_to_lap - num_to_scan,
                               4083                 :                :          num_written,
                               4084                 :                :          reusable_buffers - reusable_buffers_est);
                               4085                 :                : #endif
                               4086                 :                : 
                               4087                 :                :     /*
                               4088                 :                :      * Consider the above scan as being like a new allocation scan.
                               4089                 :                :      * Characterize its density and update the smoothed one based on it. This
                               4090                 :                :      * effectively halves the moving average period in cases where both the
                               4091                 :                :      * strategy and the background writer are doing some useful scanning,
                               4092                 :                :      * which is helpful because a long memory isn't as desirable on the
                               4093                 :                :      * density estimates.
                               4094                 :                :      */
 5109 tgl@sss.pgh.pa.us        4095                 :          13898 :     new_strategy_delta = bufs_to_lap - num_to_scan;
                               4096                 :          13898 :     new_recent_alloc = reusable_buffers - reusable_buffers_est;
                               4097   [ +  +  +  + ]:          13898 :     if (new_strategy_delta > 0 && new_recent_alloc > 0)
                               4098                 :                :     {
                               4099                 :          11972 :         scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
 6797                          4100                 :          11972 :         smoothed_density += (scans_per_alloc - smoothed_density) /
                               4101                 :                :             smoothing_samples;
                               4102                 :                : 
                               4103                 :                : #ifdef BGW_DEBUG
                               4104                 :                :         elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
                               4105                 :                :              new_recent_alloc, new_strategy_delta,
                               4106                 :                :              scans_per_alloc, smoothed_density);
                               4107                 :                : #endif
                               4108                 :                :     }
                               4109                 :                : 
                               4110                 :                :     /* Return true if OK to hibernate */
 5109                          4111   [ +  +  +  - ]:          13898 :     return (bufs_to_lap == 0 && recent_alloc == 0);
                               4112                 :                : }
                               4113                 :                : 
                               4114                 :                : /*
                               4115                 :                :  * SyncOneBuffer -- process a single buffer during syncing.
                               4116                 :                :  *
                               4117                 :                :  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
                               4118                 :                :  * buffers marked recently used, as these are not replacement candidates.
                               4119                 :                :  *
                               4120                 :                :  * Returns a bitmask containing the following flag bits:
                               4121                 :                :  *  BUF_WRITTEN: we wrote the buffer.
                               4122                 :                :  *  BUF_REUSABLE: buffer is available for replacement, ie, it has
                               4123                 :                :  *      pin count 0 and usage count 0.
                               4124                 :                :  *
                               4125                 :                :  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
                               4126                 :                :  * after locking it, but we don't care all that much.)
                               4127                 :                :  */
                               4128                 :                : static int
 3728 andres@anarazel.de       4129                 :        2376921 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
                               4130                 :                : {
 3823 rhaas@postgresql.org     4131                 :        2376921 :     BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
 6746 bruce@momjian.us         4132                 :        2376921 :     int         result = 0;
                               4133                 :                :     uint64      buf_state;
                               4134                 :                :     BufferTag   tag;
                               4135                 :                : 
                               4136                 :                :     /* Make sure we can handle the pin */
 4124 andres@anarazel.de       4137                 :        2376921 :     ReservePrivateRefCountEntry();
  909 heikki.linnakangas@i     4138                 :        2376921 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               4139                 :                : 
                               4140                 :                :     /*
                               4141                 :                :      * Check whether buffer needs writing.
                               4142                 :                :      *
                               4143                 :                :      * We can make this check without taking the buffer content lock so long
                               4144                 :                :      * as we mark pages dirty in access methods *before* logging changes with
                               4145                 :                :      * XLogInsert(): if someone marks the buffer dirty just after our check we
                               4146                 :                :      * don't worry because our checkpoint.redo points before log record for
                               4147                 :                :      * upcoming changes and so we are not required to write such dirty buffer.
                               4148                 :                :      */
 3677 andres@anarazel.de       4149                 :        2376921 :     buf_state = LockBufHdr(bufHdr);
                               4150                 :                : 
                               4151         [ +  + ]:        2376921 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
                               4152         [ +  + ]:        2370701 :         BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
                               4153                 :                :     {
 6797 tgl@sss.pgh.pa.us        4154                 :        1547342 :         result |= BUF_REUSABLE;
                               4155                 :                :     }
                               4156         [ +  + ]:         829579 :     else if (skip_recently_used)
                               4157                 :                :     {
                               4158                 :                :         /* Caller told us not to write recently-used buffers */
  180 andres@anarazel.de       4159                 :GNC      509999 :         UnlockBufHdr(bufHdr);
 6797 tgl@sss.pgh.pa.us        4160                 :CBC      509999 :         return result;
                               4161                 :                :     }
                               4162                 :                : 
 3677 andres@anarazel.de       4163   [ +  +  +  + ]:        1866922 :     if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
                               4164                 :                :     {
                               4165                 :                :         /* It's clean, so nothing to do */
  180 andres@anarazel.de       4166                 :GNC     1507797 :         UnlockBufHdr(bufHdr);
 6797 tgl@sss.pgh.pa.us        4167                 :CBC     1507797 :         return result;
                               4168                 :                :     }
                               4169                 :                : 
                               4170                 :                :     /*
                               4171                 :                :      * Pin it, share-exclusive-lock it, write it.  (FlushBuffer will do
                               4172                 :                :      * nothing if the buffer is clean by the time we've locked it.)
                               4173                 :                :      */
 7732                          4174                 :         359125 :     PinBuffer_Locked(bufHdr);
                               4175                 :                : 
  209 andres@anarazel.de       4176                 :GNC      359125 :     FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
                               4177                 :                : 
 3728 andres@anarazel.de       4178                 :CBC      359125 :     tag = bufHdr->tag;
                               4179                 :                : 
 1313 michael@paquier.xyz      4180                 :         359125 :     UnpinBuffer(bufHdr);
                               4181                 :                : 
                               4182                 :                :     /*
                               4183                 :                :      * SyncOneBuffer() is only called by checkpointer and bgwriter, so
                               4184                 :                :      * IOContext will always be IOCONTEXT_NORMAL.
                               4185                 :                :      */
 1084 andres@anarazel.de       4186                 :         359125 :     ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
                               4187                 :                : 
 6797 tgl@sss.pgh.pa.us        4188                 :         359125 :     return result | BUF_WRITTEN;
                               4189                 :                : }
                               4190                 :                : 
                               4191                 :                : /*
                               4192                 :                :  *      AtEOXact_Buffers - clean up at end of transaction.
                               4193                 :                :  *
                               4194                 :                :  *      As of PostgreSQL 8.0, buffer pins should get released by the
                               4195                 :                :  *      ResourceOwner mechanism.  This routine is just a debugging
                               4196                 :                :  *      cross-check that no pins remain.
                               4197                 :                :  */
                               4198                 :                : void
 8673                          4199                 :         423297 : AtEOXact_Buffers(bool isCommit)
                               4200                 :                : {
 4337 andres@anarazel.de       4201                 :         423297 :     CheckForBufferLeaks();
                               4202                 :                : 
 7871 tgl@sss.pgh.pa.us        4203                 :         423297 :     AtEOXact_LocalBuffers(isCommit);
                               4204                 :                : 
 4266 andres@anarazel.de       4205         [ -  + ]:         423297 :     Assert(PrivateRefCountOverflowed == 0);
                               4206                 :         423297 : }
                               4207                 :                : 
                               4208                 :                : /*
                               4209                 :                :  * Initialize access to shared buffer pool
                               4210                 :                :  *
                               4211                 :                :  * This is called during backend startup (whether standalone or under the
                               4212                 :                :  * postmaster).  It sets up for this backend's access to the already-existing
                               4213                 :                :  * buffer pool.
                               4214                 :                :  */
                               4215                 :                : void
  614 heikki.linnakangas@i     4216                 :          22963 : InitBufferManagerAccess(void)
                               4217                 :                : {
                               4218                 :                :     /*
                               4219                 :                :      * An advisory limit on the number of pins each backend should hold, based
                               4220                 :                :      * on shared_buffers and the maximum number of connections possible.
                               4221                 :                :      * That's very pessimistic, but outside toy-sized shared_buffers it should
                               4222                 :                :      * allow plenty of pins.  LimitAdditionalPins() and
                               4223                 :                :      * GetAdditionalPinLimit() can be used to check the remaining balance.
                               4224                 :                :      */
  417 tmunro@postgresql.or     4225                 :          22963 :     MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
                               4226                 :                : 
 4266 andres@anarazel.de       4227                 :          22963 :     memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
  142 andres@anarazel.de       4228                 :GNC       22963 :     memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys));
                               4229                 :                : 
   54 pg@bowt.ie               4230                 :          22963 :     PrivateRefCountHash = refcount_create(CurrentMemoryContext, 100, NULL);
                               4231                 :                : 
                               4232                 :                :     /*
                               4233                 :                :      * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
                               4234                 :                :      * the corresponding phase of backend shutdown.
                               4235                 :                :      */
 1734 andres@anarazel.de       4236         [ -  + ]:CBC       22963 :     Assert(MyProc != NULL);
 7575 tgl@sss.pgh.pa.us        4237                 :          22963 :     on_shmem_exit(AtProcExit_Buffers, 0);
                               4238                 :          22963 : }
                               4239                 :                : 
                               4240                 :                : /*
                               4241                 :                :  * During backend exit, ensure that we released all shared-buffer locks and
                               4242                 :                :  * assert that we have no remaining pins.
                               4243                 :                :  */
                               4244                 :                : static void
                               4245                 :          22963 : AtProcExit_Buffers(int code, Datum arg)
                               4246                 :                : {
 7871                          4247                 :          22963 :     UnlockBuffers();
                               4248                 :                : 
 4337 andres@anarazel.de       4249                 :          22963 :     CheckForBufferLeaks();
                               4250                 :                : 
                               4251                 :                :     /* localbuf.c needs a chance too */
                               4252                 :          22963 :     AtProcExit_LocalBuffers();
                               4253                 :          22963 : }
                               4254                 :                : 
                               4255                 :                : /*
                               4256                 :                :  *      CheckForBufferLeaks - ensure this backend holds no buffer pins
                               4257                 :                :  *
                               4258                 :                :  *      As of PostgreSQL 8.0, buffer pins should get released by the
                               4259                 :                :  *      ResourceOwner mechanism.  This routine is just a debugging
                               4260                 :                :  *      cross-check that no pins remain.
                               4261                 :                :  */
                               4262                 :                : static void
                               4263                 :         446260 : CheckForBufferLeaks(void)
                               4264                 :                : {
                               4265                 :                : #ifdef USE_ASSERT_CHECKING
                               4266                 :         446260 :     int         RefCountErrors = 0;
                               4267                 :                :     PrivateRefCountEntry *res;
                               4268                 :                :     int         i;
                               4269                 :                :     char       *s;
                               4270                 :                : 
                               4271                 :                :     /* check the array */
 4266                          4272         [ +  + ]:        4016340 :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
                               4273                 :                :     {
  142 andres@anarazel.de       4274         [ -  + ]:GNC     3570080 :         if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
                               4275                 :                :         {
  142 andres@anarazel.de       4276                 :UNC           0 :             res = &PrivateRefCountArray[i];
                               4277                 :                : 
  909 heikki.linnakangas@i     4278                 :UBC           0 :             s = DebugPrintBufferRefcount(res->buffer);
                               4279         [ #  # ]:              0 :             elog(WARNING, "buffer refcount leak: %s", s);
                               4280                 :              0 :             pfree(s);
                               4281                 :                : 
 4266 andres@anarazel.de       4282                 :              0 :             RefCountErrors++;
                               4283                 :                :         }
                               4284                 :                :     }
                               4285                 :                : 
                               4286                 :                :     /* if necessary search the hash */
 4266 andres@anarazel.de       4287         [ -  + ]:CBC      446260 :     if (PrivateRefCountOverflowed)
                               4288                 :                :     {
                               4289                 :                :         refcount_iterator iter;
                               4290                 :                : 
   54 pg@bowt.ie               4291                 :UNC           0 :         refcount_start_iterate(PrivateRefCountHash, &iter);
                               4292         [ #  # ]:              0 :         while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
                               4293                 :                :         {
  909 heikki.linnakangas@i     4294                 :UBC           0 :             s = DebugPrintBufferRefcount(res->buffer);
                               4295         [ #  # ]:              0 :             elog(WARNING, "buffer refcount leak: %s", s);
                               4296                 :              0 :             pfree(s);
 4337 andres@anarazel.de       4297                 :              0 :             RefCountErrors++;
                               4298                 :                :         }
                               4299                 :                :     }
                               4300                 :                : 
 4337 andres@anarazel.de       4301         [ -  + ]:CBC      446260 :     Assert(RefCountErrors == 0);
                               4302                 :                : #endif
 7871 tgl@sss.pgh.pa.us        4303                 :         446260 : }
                               4304                 :                : 
                               4305                 :                : #ifdef USE_ASSERT_CHECKING
                               4306                 :                : /*
                               4307                 :                :  * Check for exclusive-locked catalog buffers.  This is the core of
                               4308                 :                :  * AssertCouldGetRelation().
                               4309                 :                :  *
                               4310                 :                :  * A backend would self-deadlock on the content lock if the catalog scan read
                               4311                 :                :  * the exclusive-locked buffer.  The main threat is exclusive-locked buffers
                               4312                 :                :  * of catalogs used in relcache, because a catcache search on any catalog may
                               4313                 :                :  * build that catalog's relcache entry.  We don't have an inventory of
                               4314                 :                :  * catalogs relcache uses, so just check buffers of most catalogs.
                               4315                 :                :  *
                               4316                 :                :  * It's better to minimize waits while holding an exclusive buffer lock, so it
                               4317                 :                :  * would be nice to broaden this check not to be catalog-specific.  However,
                               4318                 :                :  * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
                               4319                 :                :  * read tables.  That is deadlock-free as long as there's no loop in the
                               4320                 :                :  * dependency graph: modifying table A may cause an opclass to read table B,
                               4321                 :                :  * but it must not cause a read of table A.
                               4322                 :                :  */
                               4323                 :                : void
  383 noah@leadboat.com        4324                 :      143281422 : AssertBufferLocksPermitCatalogRead(void)
                               4325                 :                : {
                               4326                 :                :     PrivateRefCountEntry *res;
                               4327                 :                : 
                               4328                 :                :     /* check the array */
  110 andres@anarazel.de       4329         [ +  + ]:GNC  1289532798 :     for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
                               4330                 :                :     {
                               4331         [ +  + ]:     1146251376 :         if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
                               4332                 :                :         {
                               4333                 :       44807195 :             res = &PrivateRefCountArray[i];
                               4334                 :                : 
                               4335         [ -  + ]:       44807195 :             if (res->buffer == InvalidBuffer)
  110 andres@anarazel.de       4336                 :UNC           0 :                 continue;
                               4337                 :                : 
  110 andres@anarazel.de       4338                 :GNC    44807195 :             AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
                               4339                 :                :         }
                               4340                 :                :     }
                               4341                 :                : 
                               4342                 :                :     /* if necessary search the hash */
                               4343         [ +  + ]:      143281422 :     if (PrivateRefCountOverflowed)
                               4344                 :                :     {
                               4345                 :                :         refcount_iterator iter;
                               4346                 :                : 
   54 pg@bowt.ie               4347                 :         189537 :         refcount_start_iterate(PrivateRefCountHash, &iter);
                               4348         [ +  + ]:         446849 :         while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
                               4349                 :                :         {
  110 andres@anarazel.de       4350                 :         257312 :             AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
                               4351                 :                :         }
                               4352                 :                :     }
  383 noah@leadboat.com        4353                 :CBC   143281422 : }
                               4354                 :                : 
                               4355                 :                : static void
  110 andres@anarazel.de       4356                 :GNC    45064507 : AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode)
                               4357                 :                : {
                               4358                 :       45064507 :     BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
                               4359                 :                :     BufferTag   tag;
                               4360                 :                :     Oid         relid;
                               4361                 :                : 
                               4362         [ +  + ]:       45064507 :     if (mode != BUFFER_LOCK_EXCLUSIVE)
  383 noah@leadboat.com        4363                 :CBC    45053182 :         return;
                               4364                 :                : 
                               4365                 :          11325 :     tag = bufHdr->tag;
                               4366                 :                : 
                               4367                 :                :     /*
                               4368                 :                :      * This relNumber==relid assumption holds until a catalog experiences
                               4369                 :                :      * VACUUM FULL or similar.  After a command like that, relNumber will be
                               4370                 :                :      * in the normal (non-catalog) range, and we lose the ability to detect
                               4371                 :                :      * hazardous access to that catalog.  Calling RelidByRelfilenumber() would
                               4372                 :                :      * close that gap, but RelidByRelfilenumber() might then deadlock with a
                               4373                 :                :      * held lock.
                               4374                 :                :      */
                               4375                 :          11325 :     relid = tag.relNumber;
                               4376                 :                : 
                               4377         [ -  + ]:          11325 :     if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
  383 noah@leadboat.com        4378                 :UBC           0 :         return;
                               4379                 :                : 
  383 noah@leadboat.com        4380         [ -  + ]:CBC       11325 :     Assert(!IsCatalogRelationOid(relid));
                               4381                 :                : }
                               4382                 :                : #endif
                               4383                 :                : 
                               4384                 :                : 
                               4385                 :                : /*
                               4386                 :                :  * Helper routine to issue warnings when a buffer is unexpectedly pinned
                               4387                 :                :  */
                               4388                 :                : char *
  909 heikki.linnakangas@i     4389                 :             72 : DebugPrintBufferRefcount(Buffer buffer)
                               4390                 :                : {
                               4391                 :                :     BufferDesc *buf;
                               4392                 :                :     int32       loccount;
                               4393                 :                :     char       *result;
                               4394                 :                :     ProcNumber  backend;
                               4395                 :                :     uint64      buf_state;
                               4396                 :                : 
 7871 tgl@sss.pgh.pa.us        4397         [ -  + ]:             72 :     Assert(BufferIsValid(buffer));
                               4398         [ +  + ]:             72 :     if (BufferIsLocal(buffer))
                               4399                 :                :     {
 4114 andres@anarazel.de       4400                 :             24 :         buf = GetLocalBufferDescriptor(-buffer - 1);
 7871 tgl@sss.pgh.pa.us        4401                 :             24 :         loccount = LocalRefCount[-buffer - 1];
  793 heikki.linnakangas@i     4402                 :             24 :         backend = MyProcNumber;
                               4403                 :                :     }
                               4404                 :                :     else
                               4405                 :                :     {
 4114 andres@anarazel.de       4406                 :             48 :         buf = GetBufferDescriptor(buffer - 1);
 4266                          4407                 :             48 :         loccount = GetPrivateRefCount(buffer);
  793 heikki.linnakangas@i     4408                 :             48 :         backend = INVALID_PROC_NUMBER;
                               4409                 :                :     }
                               4410                 :                : 
                               4411                 :                :     /* theoretically we should lock the bufHdr here */
  110 andres@anarazel.de       4412                 :GNC          72 :     buf_state = pg_atomic_read_u64(&buf->state);
                               4413                 :                : 
                               4414                 :             72 :     result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
                               4415                 :                :                       buffer,
  434 andres@anarazel.de       4416                 :CBC          72 :                       relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
                               4417                 :                :                                      BufTagGetForkNum(&buf->tag)).str,
                               4418                 :                :                       buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
                               4419                 :                :                       BUF_STATE_GET_REFCOUNT(buf_state), loccount);
  909 heikki.linnakangas@i     4420                 :             72 :     return result;
                               4421                 :                : }
                               4422                 :                : 
                               4423                 :                : /*
                               4424                 :                :  * CheckPointBuffers
                               4425                 :                :  *
                               4426                 :                :  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
                               4427                 :                :  *
                               4428                 :                :  * Note: temporary relations do not participate in checkpoints, so they don't
                               4429                 :                :  * need to be flushed.
                               4430                 :                :  */
                               4431                 :                : void
 6886 tgl@sss.pgh.pa.us        4432                 :           1944 : CheckPointBuffers(int flags)
                               4433                 :                : {
                               4434                 :           1944 :     BufferSync(flags);
 9287 vadim4o@yahoo.com        4435                 :           1944 : }
                               4436                 :                : 
                               4437                 :                : /*
                               4438                 :                :  * BufferGetBlockNumber
                               4439                 :                :  *      Returns the block number associated with a buffer.
                               4440                 :                :  *
                               4441                 :                :  * Note:
                               4442                 :                :  *      Assumes that the buffer is valid and pinned, else the
                               4443                 :                :  *      value may be obsolete immediately...
                               4444                 :                :  */
                               4445                 :                : BlockNumber
10892 scrappy@hub.org          4446                 :      236345017 : BufferGetBlockNumber(Buffer buffer)
                               4447                 :                : {
                               4448                 :                :     BufferDesc *bufHdr;
                               4449                 :                : 
 8786 bruce@momjian.us         4450   [ -  +  +  +  :      236345017 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               4451                 :                : 
10467                          4452         [ +  + ]:      236345017 :     if (BufferIsLocal(buffer))
 4114 andres@anarazel.de       4453                 :        5507359 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
                               4454                 :                :     else
                               4455                 :      230837658 :         bufHdr = GetBufferDescriptor(buffer - 1);
                               4456                 :                : 
                               4457                 :                :     /* pinned, so OK to read tag without spinlock */
 7732 tgl@sss.pgh.pa.us        4458                 :      236345017 :     return bufHdr->tag.blockNum;
                               4459                 :                : }
                               4460                 :                : 
                               4461                 :                : /*
                               4462                 :                :  * BufferGetTag
                               4463                 :                :  *      Returns the relfilelocator, fork number and block number associated with
                               4464                 :                :  *      a buffer.
                               4465                 :                :  */
                               4466                 :                : void
 1399 rhaas@postgresql.org     4467                 :       27813979 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
                               4468                 :                :              BlockNumber *blknum)
                               4469                 :                : {
                               4470                 :                :     BufferDesc *bufHdr;
                               4471                 :                : 
                               4472                 :                :     /* Do the same checks as BufferGetBlockNumber. */
 6476 heikki.linnakangas@i     4473   [ -  +  -  +  :       27813979 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               4474                 :                : 
 8051 tgl@sss.pgh.pa.us        4475         [ -  + ]:       27813979 :     if (BufferIsLocal(buffer))
 4114 andres@anarazel.de       4476                 :UBC           0 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
                               4477                 :                :     else
 4114 andres@anarazel.de       4478                 :CBC    27813979 :         bufHdr = GetBufferDescriptor(buffer - 1);
                               4479                 :                : 
                               4480                 :                :     /* pinned, so OK to read tag without spinlock */
 1350 rhaas@postgresql.org     4481                 :       27813979 :     *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
                               4482                 :       27813979 :     *forknum = BufTagGetForkNum(&bufHdr->tag);
 6476 heikki.linnakangas@i     4483                 :       27813979 :     *blknum = bufHdr->tag.blockNum;
 8051 tgl@sss.pgh.pa.us        4484                 :       27813979 : }
                               4485                 :                : 
                               4486                 :                : /*
                               4487                 :                :  * FlushBuffer
                               4488                 :                :  *      Physically write out a shared buffer.
                               4489                 :                :  *
                               4490                 :                :  * NOTE: this actually just passes the buffer contents to the kernel; the
                               4491                 :                :  * real write to disk won't happen until the kernel feels like it.  This
                               4492                 :                :  * is okay from our point of view since we can redo the changes from WAL.
                               4493                 :                :  * However, we will need to force the changes to disk via fsync before
                               4494                 :                :  * we can checkpoint WAL.
                               4495                 :                :  *
                               4496                 :                :  * The caller must hold a pin on the buffer and have
                               4497                 :                :  * (share-)exclusively-locked the buffer contents.
                               4498                 :                :  *
                               4499                 :                :  * If the caller has an smgr reference for the buffer's relation, pass it
                               4500                 :                :  * as the second parameter.  If not, pass NULL.
                               4501                 :                :  */
                               4502                 :                : static void
 1181 andres@anarazel.de       4503                 :         711755 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
                               4504                 :                :             IOContext io_context)
                               4505                 :                : {
                               4506                 :                :     XLogRecPtr  recptr;
                               4507                 :                :     ErrorContextCallback errcallback;
                               4508                 :                :     instr_time  io_start;
                               4509                 :                :     Block       bufBlock;
                               4510                 :                : 
   56 andres@anarazel.de       4511   [ +  +  -  + ]:GNC      711755 :     Assert(BufferLockHeldByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE) ||
                               4512                 :                :            BufferLockHeldByMeInMode(buf, BUFFER_LOCK_SHARE_EXCLUSIVE));
                               4513                 :                : 
                               4514                 :                :     /*
                               4515                 :                :      * Try to start an I/O operation.  If StartBufferIO returns false, then
                               4516                 :                :      * someone else flushed the buffer before we could, so we need not do
                               4517                 :                :      * anything.
                               4518                 :                :      */
   39                          4519         [ +  + ]:         711755 :     if (StartSharedBufferIO(buf, false, true, NULL) == BUFFER_IO_ALREADY_DONE)
 7732 tgl@sss.pgh.pa.us        4520                 :CBC           7 :         return;
                               4521                 :                : 
                               4522                 :                :     /* Setup error traceback support for ereport() */
 4922 heikki.linnakangas@i     4523                 :         711748 :     errcallback.callback = shared_buffer_write_error_callback;
  523 peter@eisentraut.org     4524                 :         711748 :     errcallback.arg = buf;
 4922 heikki.linnakangas@i     4525                 :         711748 :     errcallback.previous = error_context_stack;
                               4526                 :         711748 :     error_context_stack = &errcallback;
                               4527                 :                : 
                               4528                 :                :     /* Find smgr relation for buffer */
 8049 tgl@sss.pgh.pa.us        4529         [ +  + ]:         711748 :     if (reln == NULL)
  793 heikki.linnakangas@i     4530                 :         709018 :         reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
                               4531                 :                : 
                               4532                 :                :     TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
                               4533                 :                :                                         buf->tag.blockNum,
                               4534                 :                :                                         reln->smgr_rlocator.locator.spcOid,
                               4535                 :                :                                         reln->smgr_rlocator.locator.dbOid,
                               4536                 :                :                                         reln->smgr_rlocator.locator.relNumber);
                               4537                 :                : 
                               4538                 :                :     /*
                               4539                 :                :      * As we hold at least a share-exclusive lock on the buffer, the LSN
                               4540                 :                :      * cannot change during the flush (and thus can't be torn).
                               4541                 :                :      */
 4792 simon@2ndQuadrant.co     4542                 :         711748 :     recptr = BufferGetLSN(buf);
                               4543                 :                : 
                               4544                 :                :     /*
                               4545                 :                :      * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
                               4546                 :                :      * rule that log updates must hit disk before any of the data-file changes
                               4547                 :                :      * they describe do.
                               4548                 :                :      *
                               4549                 :                :      * However, this rule does not apply to unlogged relations, which will be
                               4550                 :                :      * lost after a crash anyway.  Most unlogged relation pages do not bear
                               4551                 :                :      * LSNs since we never emit WAL records for them, and therefore flushing
                               4552                 :                :      * up through the buffer LSN would be useless, but harmless.  However,
                               4553                 :                :      * some index AMs use LSNs internally to detect concurrent page
                               4554                 :                :      * modifications, and therefore unlogged index pages bear "fake" LSNs
                               4555                 :                :      * generated by XLogGetFakeLSN.  It is unlikely but possible that the fake
                               4556                 :                :      * LSN counter could advance past the WAL insertion point; and if it did
                               4557                 :                :      * happen, attempting to flush WAL through that location would fail, with
                               4558                 :                :      * disastrous system-wide consequences.  To make sure that can't happen,
                               4559                 :                :      * skip the flush if the buffer isn't permanent.
                               4560                 :                :      */
   55 andres@anarazel.de       4561         [ +  + ]:GNC      711748 :     if (pg_atomic_read_u64(&buf->state) & BM_PERMANENT)
 4831 heikki.linnakangas@i     4562                 :CBC      709928 :         XLogFlush(recptr);
                               4563                 :                : 
                               4564                 :                :     /*
                               4565                 :                :      * Now it's safe to write the buffer to disk. Note that no one else should
                               4566                 :                :      * have been able to write it, while we were busy with log flushing,
                               4567                 :                :      * because we got the exclusive right to perform I/O by setting the
                               4568                 :                :      * BM_IO_IN_PROGRESS bit.
                               4569                 :                :      */
 4792 simon@2ndQuadrant.co     4570                 :         711748 :     bufBlock = BufHdrGetBlock(buf);
                               4571                 :                : 
                               4572                 :                :     /* Update page checksum if desired. */
   39 andres@anarazel.de       4573                 :GNC      711748 :     PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
                               4574                 :                : 
  433 michael@paquier.xyz      4575                 :CBC      711748 :     io_start = pgstat_prepare_io_time(track_io_timing);
                               4576                 :                : 
 8120 tgl@sss.pgh.pa.us        4577                 :         711748 :     smgrwrite(reln,
 1350 rhaas@postgresql.org     4578                 :         711748 :               BufTagGetForkNum(&buf->tag),
                               4579                 :                :               buf->tag.blockNum,
                               4580                 :                :               bufBlock,
                               4581                 :                :               false);
                               4582                 :                : 
                               4583                 :                :     /*
                               4584                 :                :      * When a strategy is in use, only flushes of dirty buffers already in the
                               4585                 :                :      * strategy ring are counted as strategy writes (IOCONTEXT
                               4586                 :                :      * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
                               4587                 :                :      * statistics tracking.
                               4588                 :                :      *
                               4589                 :                :      * If a shared buffer initially added to the ring must be flushed before
                               4590                 :                :      * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
                               4591                 :                :      *
                               4592                 :                :      * If a shared buffer which was added to the ring later because the
                               4593                 :                :      * current strategy buffer is pinned or in use or because all strategy
                               4594                 :                :      * buffers were dirty and rejected (for BAS_BULKREAD operations only)
                               4595                 :                :      * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
                               4596                 :                :      * (from_ring will be false).
                               4597                 :                :      *
                               4598                 :                :      * When a strategy is not in use, the write can only be a "regular" write
                               4599                 :                :      * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
                               4600                 :                :      */
   14 melanieplageman@gmai     4601                 :GNC      711748 :     pgstat_count_io_op_time(io_object, io_context,
                               4602                 :                :                             IOOP_WRITE, io_start, 1, BLCKSZ);
                               4603                 :                : 
 5985 rhaas@postgresql.org     4604                 :CBC      711748 :     pgBufferUsage.shared_blks_written++;
                               4605                 :                : 
                               4606                 :                :     /*
                               4607                 :                :      * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
                               4608                 :                :      */
  401 andres@anarazel.de       4609                 :         711748 :     TerminateBufferIO(buf, true, 0, true, false);
                               4610                 :                : 
                               4611                 :                :     TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
                               4612                 :                :                                        buf->tag.blockNum,
                               4613                 :                :                                        reln->smgr_rlocator.locator.spcOid,
                               4614                 :                :                                        reln->smgr_rlocator.locator.dbOid,
                               4615                 :                :                                        reln->smgr_rlocator.locator.relNumber);
                               4616                 :                : 
                               4617                 :                :     /* Pop the error context stack */
 4922 heikki.linnakangas@i     4618                 :         711748 :     error_context_stack = errcallback.previous;
                               4619                 :                : }
                               4620                 :                : 
                               4621                 :                : /*
                               4622                 :                :  * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
                               4623                 :                :  * before/after calling FlushBuffer().
                               4624                 :                :  */
                               4625                 :                : static void
  209 andres@anarazel.de       4626                 :GNC      363023 : FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
                               4627                 :                :                     IOObject io_object, IOContext io_context)
                               4628                 :                : {
  110                          4629                 :         363023 :     Buffer      buffer = BufferDescriptorGetBuffer(buf);
                               4630                 :                : 
   56                          4631                 :         363023 :     BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE_EXCLUSIVE);
   14 melanieplageman@gmai     4632                 :         363023 :     FlushBuffer(buf, reln, io_object, io_context);
  110 andres@anarazel.de       4633                 :         363023 :     BufferLockUnlock(buffer, buf);
  209                          4634                 :         363023 : }
                               4635                 :                : 
                               4636                 :                : /*
                               4637                 :                :  * RelationGetNumberOfBlocksInFork
                               4638                 :                :  *      Determines the current number of pages in the specified relation fork.
                               4639                 :                :  *
                               4640                 :                :  * Note that the accuracy of the result will depend on the details of the
                               4641                 :                :  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
                               4642                 :                :  * it might not be.
                               4643                 :                :  */
                               4644                 :                : BlockNumber
 5606 rhaas@postgresql.org     4645                 :CBC     2507276 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
                               4646                 :                : {
 1614 peter@eisentraut.org     4647   [ +  +  +  +  :        2507276 :     if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
                                              +  + ]
                               4648                 :                :     {
                               4649                 :                :         /*
                               4650                 :                :          * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
                               4651                 :                :          * tableam returns the size in bytes - but for the purpose of this
                               4652                 :                :          * routine, we want the number of blocks. Therefore divide, rounding
                               4653                 :                :          * up.
                               4654                 :                :          */
                               4655                 :                :         uint64      szbytes;
                               4656                 :                : 
                               4657                 :        1819422 :         szbytes = table_relation_size(relation, forkNum);
                               4658                 :                : 
                               4659                 :        1819403 :         return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
                               4660                 :                :     }
                               4661   [ +  -  +  +  :         687854 :     else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
                                     -  +  -  -  -  
                                                 - ]
                               4662                 :                :     {
 1454 tgl@sss.pgh.pa.us        4663                 :         687854 :         return smgrnblocks(RelationGetSmgr(relation), forkNum);
                               4664                 :                :     }
                               4665                 :                :     else
 1614 peter@eisentraut.org     4666                 :UBC           0 :         Assert(false);
                               4667                 :                : 
                               4668                 :                :     return 0;                   /* keep compiler quiet */
                               4669                 :                : }
                               4670                 :                : 
                               4671                 :                : /*
                               4672                 :                :  * BufferIsPermanent
                               4673                 :                :  *      Determines whether a buffer will potentially still be around after
                               4674                 :                :  *      a crash.  Caller must hold a buffer pin.
                               4675                 :                :  */
                               4676                 :                : bool
 5303 rhaas@postgresql.org     4677                 :CBC    17654831 : BufferIsPermanent(Buffer buffer)
                               4678                 :                : {
                               4679                 :                :     BufferDesc *bufHdr;
                               4680                 :                : 
                               4681                 :                :     /* Local buffers are used only for temp relations. */
                               4682         [ +  + ]:       17654831 :     if (BufferIsLocal(buffer))
                               4683                 :         937354 :         return false;
                               4684                 :                : 
                               4685                 :                :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
                               4686         [ -  + ]:       16717477 :     Assert(BufferIsValid(buffer));
                               4687   [ -  +  -  +  :       16717477 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               4688                 :                : 
                               4689                 :                :     /*
                               4690                 :                :      * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
                               4691                 :                :      * need not bother with the buffer header spinlock.  Even if someone else
                               4692                 :                :      * changes the buffer header state while we're doing this, the state is
                               4693                 :                :      * changed atomically, so we'll read the old value or the new value, but
                               4694                 :                :      * not random garbage.
                               4695                 :                :      */
 4114 andres@anarazel.de       4696                 :       16717477 :     bufHdr = GetBufferDescriptor(buffer - 1);
  110 andres@anarazel.de       4697                 :GNC    16717477 :     return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
                               4698                 :                : }
                               4699                 :                : 
                               4700                 :                : /*
                               4701                 :                :  * BufferGetLSNAtomic
                               4702                 :                :  *      Retrieves the LSN of the buffer atomically.
                               4703                 :                :  *
                               4704                 :                :  * This is necessary for some callers who may only hold a share lock on
                               4705                 :                :  * the buffer. A share lock allows a concurrent backend to set hint bits
                               4706                 :                :  * on the page, which in turn may require a WAL record to be emitted.
                               4707                 :                :  *
                               4708                 :                :  * On platforms with 8 byte atomic reads/writes, we don't need to do any
                               4709                 :                :  * additional locking. On platforms not supporting such 8 byte atomic
                               4710                 :                :  * reads/writes, we need to actually take the header lock.
                               4711                 :                :  */
                               4712                 :                : XLogRecPtr
 4792 simon@2ndQuadrant.co     4713                 :CBC     8106325 : BufferGetLSNAtomic(Buffer buffer)
                               4714                 :                : {
                               4715                 :                :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
                               4716         [ -  + ]:        8106325 :     Assert(BufferIsValid(buffer));
                               4717   [ -  +  +  +  :        8106325 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               4718                 :                : 
                               4719                 :                : #ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
   55 tomas.vondra@postgre     4720                 :GNC     8106325 :     return PageGetLSN(BufferGetPage(buffer));
                               4721                 :                : #else
                               4722                 :                :     {
                               4723                 :                :         char       *page = BufferGetPage(buffer);
                               4724                 :                :         BufferDesc *bufHdr;
                               4725                 :                :         XLogRecPtr  lsn;
                               4726                 :                : 
                               4727                 :                :         /*
                               4728                 :                :          * If we don't need locking for correctness, fastpath out.
                               4729                 :                :          */
                               4730                 :                :         if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
                               4731                 :                :             return PageGetLSN(page);
                               4732                 :                : 
                               4733                 :                :         bufHdr = GetBufferDescriptor(buffer - 1);
                               4734                 :                :         LockBufHdr(bufHdr);
                               4735                 :                :         lsn = PageGetLSN(page);
                               4736                 :                :         UnlockBufHdr(bufHdr);
                               4737                 :                : 
                               4738                 :                :         return lsn;
                               4739                 :                :     }
                               4740                 :                : #endif
                               4741                 :                : }
                               4742                 :                : 
                               4743                 :                : /* ---------------------------------------------------------------------
                               4744                 :                :  *      DropRelationBuffers
                               4745                 :                :  *
                               4746                 :                :  *      This function removes from the buffer pool all the pages of the
                               4747                 :                :  *      specified relation forks that have block numbers >= firstDelBlock.
                               4748                 :                :  *      (In particular, with firstDelBlock = 0, all pages are removed.)
                               4749                 :                :  *      Dirty pages are simply dropped, without bothering to write them
                               4750                 :                :  *      out first.  Therefore, this is NOT rollback-able, and so should be
                               4751                 :                :  *      used only with extreme caution!
                               4752                 :                :  *
                               4753                 :                :  *      Currently, this is called only from smgr.c when the underlying file
                               4754                 :                :  *      is about to be deleted or truncated (firstDelBlock is needed for
                               4755                 :                :  *      the truncation case).  The data in the affected pages would therefore
                               4756                 :                :  *      be deleted momentarily anyway, and there is no point in writing it.
                               4757                 :                :  *      It is the responsibility of higher-level code to ensure that the
                               4758                 :                :  *      deletion or truncation does not lose any data that could be needed
                               4759                 :                :  *      later.  It is also the responsibility of higher-level code to ensure
                               4760                 :                :  *      that no other process could be trying to load more pages of the
                               4761                 :                :  *      relation into buffers.
                               4762                 :                :  * --------------------------------------------------------------------
                               4763                 :                :  */
                               4764                 :                : void
 1393 rhaas@postgresql.org     4765                 :CBC         804 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
                               4766                 :                :                     int nforks, BlockNumber *firstDelBlock)
                               4767                 :                : {
                               4768                 :                :     int         i;
                               4769                 :                :     int         j;
                               4770                 :                :     RelFileLocatorBackend rlocator;
                               4771                 :                :     BlockNumber nForkBlock[MAX_FORKNUM];
 1819 tgl@sss.pgh.pa.us        4772                 :            804 :     uint64      nBlocksToInvalidate = 0;
                               4773                 :                : 
 1399 rhaas@postgresql.org     4774                 :            804 :     rlocator = smgr_reln->smgr_rlocator;
                               4775                 :                : 
                               4776                 :                :     /* If it's a local relation, it's localbuf.c's problem. */
                               4777         [ +  + ]:            804 :     if (RelFileLocatorBackendIsTemp(rlocator))
                               4778                 :                :     {
  793 heikki.linnakangas@i     4779         [ +  - ]:            498 :         if (rlocator.backend == MyProcNumber)
  305 fujii@postgresql.org     4780                 :GNC         498 :             DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
                               4781                 :                :                                      firstDelBlock);
                               4782                 :                : 
 8673 tgl@sss.pgh.pa.us        4783                 :CBC         541 :         return;
                               4784                 :                :     }
                               4785                 :                : 
                               4786                 :                :     /*
                               4787                 :                :      * To remove all the pages of the specified relation forks from the buffer
                               4788                 :                :      * pool, we need to scan the entire buffer pool but we can optimize it by
                               4789                 :                :      * finding the buffers from BufMapping table provided we know the exact
                               4790                 :                :      * size of each fork of the relation. The exact size is required to ensure
                               4791                 :                :      * that we don't leave any buffer for the relation being dropped as
                               4792                 :                :      * otherwise the background writer or checkpointer can lead to a PANIC
                               4793                 :                :      * error while flushing buffers corresponding to files that don't exist.
                               4794                 :                :      *
                               4795                 :                :      * To know the exact size, we rely on the size cached for each fork by us
                               4796                 :                :      * during recovery which limits the optimization to recovery and on
                               4797                 :                :      * standbys but we can easily extend it once we have shared cache for
                               4798                 :                :      * relation size.
                               4799                 :                :      *
                               4800                 :                :      * In recovery, we cache the value returned by the first lseek(SEEK_END)
                               4801                 :                :      * and the future writes keeps the cached value up-to-date. See
                               4802                 :                :      * smgrextend. It is possible that the value of the first lseek is smaller
                               4803                 :                :      * than the actual number of existing blocks in the file due to buggy
                               4804                 :                :      * Linux kernels that might not have accounted for the recent write. But
                               4805                 :                :      * that should be fine because there must not be any buffers after that
                               4806                 :                :      * file size.
                               4807                 :                :      */
 1939 akapila@postgresql.o     4808         [ +  + ]:            416 :     for (i = 0; i < nforks; i++)
                               4809                 :                :     {
                               4810                 :                :         /* Get the number of blocks for a relation's fork */
                               4811                 :            360 :         nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
                               4812                 :                : 
                               4813         [ +  + ]:            360 :         if (nForkBlock[i] == InvalidBlockNumber)
                               4814                 :                :         {
                               4815                 :            250 :             nBlocksToInvalidate = InvalidBlockNumber;
                               4816                 :            250 :             break;
                               4817                 :                :         }
                               4818                 :                : 
                               4819                 :                :         /* calculate the number of blocks to be invalidated */
                               4820                 :            110 :         nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
                               4821                 :                :     }
                               4822                 :                : 
                               4823                 :                :     /*
                               4824                 :                :      * We apply the optimization iff the total number of blocks to invalidate
                               4825                 :                :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
                               4826                 :                :      */
                               4827         [ +  + ]:            306 :     if (BlockNumberIsValid(nBlocksToInvalidate) &&
                               4828         [ +  + ]:             56 :         nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
                               4829                 :                :     {
                               4830         [ +  + ]:            121 :         for (j = 0; j < nforks; j++)
 1393 rhaas@postgresql.org     4831                 :             78 :             FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
                               4832                 :             78 :                                        nForkBlock[j], firstDelBlock[j]);
 1939 akapila@postgresql.o     4833                 :             43 :         return;
                               4834                 :                :     }
                               4835                 :                : 
 7732 tgl@sss.pgh.pa.us        4836         [ +  + ]:        3480199 :     for (i = 0; i < NBuffers; i++)
                               4837                 :                :     {
 3823 rhaas@postgresql.org     4838                 :        3479936 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
                               4839                 :                : 
                               4840                 :                :         /*
                               4841                 :                :          * We can make this a tad faster by prechecking the buffer tag before
                               4842                 :                :          * we attempt to lock the buffer; this saves a lot of lock
                               4843                 :                :          * acquisitions in typical cases.  It should be safe because the
                               4844                 :                :          * caller must have AccessExclusiveLock on the relation, or some other
                               4845                 :                :          * reason to be certain that no one is loading new pages of the rel
                               4846                 :                :          * into the buffer pool.  (Otherwise we might well miss such pages
                               4847                 :                :          * entirely.)  Therefore, while the tag might be changing while we
                               4848                 :                :          * look at it, it can't be changing *to* a value we care about, only
                               4849                 :                :          * *away* from such a value.  So false negatives are impossible, and
                               4850                 :                :          * false positives are safe because we'll recheck after getting the
                               4851                 :                :          * buffer lock.
                               4852                 :                :          *
                               4853                 :                :          * We could check forkNum and blockNum as well as the rlocator, but
                               4854                 :                :          * the incremental win from doing so seems small.
                               4855                 :                :          */
 1350                          4856         [ +  + ]:        3479936 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
 5080 tgl@sss.pgh.pa.us        4857                 :        3467796 :             continue;
                               4858                 :                : 
  180 andres@anarazel.de       4859                 :GNC       12140 :         LockBufHdr(bufHdr);
                               4860                 :                : 
 2415 fujii@postgresql.org     4861         [ +  + ]:CBC       30027 :         for (j = 0; j < nforks; j++)
                               4862                 :                :         {
 1350 rhaas@postgresql.org     4863         [ +  - ]:          21277 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
                               4864         [ +  + ]:          21277 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
 2415 fujii@postgresql.org     4865         [ +  + ]:          11996 :                 bufHdr->tag.blockNum >= firstDelBlock[j])
                               4866                 :                :             {
 2182 tgl@sss.pgh.pa.us        4867                 :           3390 :                 InvalidateBuffer(bufHdr);   /* releases spinlock */
 2415 fujii@postgresql.org     4868                 :           3390 :                 break;
                               4869                 :                :             }
                               4870                 :                :         }
                               4871         [ +  + ]:          12140 :         if (j >= nforks)
  180 andres@anarazel.de       4872                 :GNC        8750 :             UnlockBufHdr(bufHdr);
                               4873                 :                :     }
                               4874                 :                : }
                               4875                 :                : 
                               4876                 :                : /* ---------------------------------------------------------------------
                               4877                 :                :  *      DropRelationsAllBuffers
                               4878                 :                :  *
                               4879                 :                :  *      This function removes from the buffer pool all the pages of all
                               4880                 :                :  *      forks of the specified relations.  It's equivalent to calling
                               4881                 :                :  *      DropRelationBuffers once per fork per relation with firstDelBlock = 0.
                               4882                 :                :  *      --------------------------------------------------------------------
                               4883                 :                :  */
                               4884                 :                : void
 1393 rhaas@postgresql.org     4885                 :CBC       17832 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
                               4886                 :                : {
                               4887                 :                :     int         i;
 1938 akapila@postgresql.o     4888                 :          17832 :     int         n = 0;
                               4889                 :                :     SMgrRelation *rels;
                               4890                 :                :     BlockNumber (*block)[MAX_FORKNUM + 1];
 1819 tgl@sss.pgh.pa.us        4891                 :          17832 :     uint64      nBlocksToInvalidate = 0;
                               4892                 :                :     RelFileLocator *locators;
 1938 akapila@postgresql.o     4893                 :          17832 :     bool        cached = true;
                               4894                 :                :     bool        use_bsearch;
                               4895                 :                : 
 1399 rhaas@postgresql.org     4896         [ -  + ]:          17832 :     if (nlocators == 0)
 4856 alvherre@alvh.no-ip.     4897                 :UBC           0 :         return;
                               4898                 :                : 
  145 michael@paquier.xyz      4899                 :GNC       17832 :     rels = palloc_array(SMgrRelation, nlocators);   /* non-local relations */
                               4900                 :                : 
                               4901                 :                :     /* If it's a local relation, it's localbuf.c's problem. */
 1399 rhaas@postgresql.org     4902         [ +  + ]:CBC       78346 :     for (i = 0; i < nlocators; i++)
                               4903                 :                :     {
                               4904         [ +  + ]:          60514 :         if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
                               4905                 :                :         {
  793 heikki.linnakangas@i     4906         [ +  - ]:           4407 :             if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
 1393 rhaas@postgresql.org     4907                 :           4407 :                 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
                               4908                 :                :         }
                               4909                 :                :         else
 1938 akapila@postgresql.o     4910                 :          56107 :             rels[n++] = smgr_reln[i];
                               4911                 :                :     }
                               4912                 :                : 
                               4913                 :                :     /*
                               4914                 :                :      * If there are no non-local relations, then we're done. Release the
                               4915                 :                :      * memory and return.
                               4916                 :                :      */
 4856 alvherre@alvh.no-ip.     4917         [ +  + ]:          17832 :     if (n == 0)
                               4918                 :                :     {
 1938 akapila@postgresql.o     4919                 :           1183 :         pfree(rels);
 5080 tgl@sss.pgh.pa.us        4920                 :           1183 :         return;
                               4921                 :                :     }
                               4922                 :                : 
                               4923                 :                :     /*
                               4924                 :                :      * block[i][j] remembers what smgrnblocks_cached() reported for fork j of
                               4925                 :                :      * rels[i].
                               4926                 :                :      */
                               4927                 :                :     block = (BlockNumber (*)[MAX_FORKNUM + 1])
 1938 akapila@postgresql.o     4928                 :          16649 :         palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
                               4929                 :                : 
                               4930                 :                :     /*
                               4931                 :                :      * We can avoid scanning the entire buffer pool if we know the exact size
                               4932                 :                :      * of each of the given relation forks. See DropRelationBuffers.
                               4933                 :                :      */
                               4934   [ +  +  +  + ]:          34681 :     for (i = 0; i < n && cached; i++)
                               4935                 :                :     {
 1350 drowley@postgresql.o     4936         [ +  + ]:          27072 :         for (int j = 0; j <= MAX_FORKNUM; j++)
                               4937                 :                :         {
                               4938                 :                :             /* Get the number of blocks for a relation's fork. */
 1938 akapila@postgresql.o     4939                 :          24831 :             block[i][j] = smgrnblocks_cached(rels[i], j);
                               4940                 :                : 
                               4941                 :                :             /* We only need to consider the relation forks that exist. */
                               4942         [ +  + ]:          24831 :             if (block[i][j] == InvalidBlockNumber)
                               4943                 :                :             {
                               4944         [ +  + ]:          22411 :                 if (!smgrexists(rels[i], j))
                               4945                 :           6620 :                     continue;
                               4946                 :          15791 :                 cached = false;
                               4947                 :          15791 :                 break;
                               4948                 :                :             }
                               4949                 :                : 
                               4950                 :                :             /* calculate the total number of blocks to be invalidated */
                               4951                 :           2420 :             nBlocksToInvalidate += block[i][j];
                               4952                 :                :         }
                               4953                 :                :     }
                               4954                 :                : 
                               4955                 :                :     /*
                               4956                 :                :      * We apply the optimization iff the total number of blocks to invalidate
                               4957                 :                :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
                               4958                 :                :      */
                               4959   [ +  +  +  + ]:          16649 :     if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
                               4960                 :                :     {
                               4961         [ +  + ]:           1418 :         for (i = 0; i < n; i++)
                               4962                 :                :         {
 1350 drowley@postgresql.o     4963         [ +  + ]:           3915 :             for (int j = 0; j <= MAX_FORKNUM; j++)
                               4964                 :                :             {
                               4965                 :                :                 /* ignore relation forks that don't exist */
 1938 akapila@postgresql.o     4966         [ +  + ]:           3132 :                 if (!BlockNumberIsValid(block[i][j]))
                               4967                 :           2338 :                     continue;
                               4968                 :                : 
                               4969                 :                :                 /* drop all the buffers for a particular relation fork */
 1393 rhaas@postgresql.org     4970                 :            794 :                 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
                               4971                 :            794 :                                            j, block[i][j], 0);
                               4972                 :                :             }
                               4973                 :                :         }
                               4974                 :                : 
 1938 akapila@postgresql.o     4975                 :            635 :         pfree(block);
                               4976                 :            635 :         pfree(rels);
                               4977                 :            635 :         return;
                               4978                 :                :     }
                               4979                 :                : 
                               4980                 :          16014 :     pfree(block);
  145 michael@paquier.xyz      4981                 :GNC       16014 :     locators = palloc_array(RelFileLocator, n); /* non-local relations */
 1938 akapila@postgresql.o     4982         [ +  + ]:CBC       71338 :     for (i = 0; i < n; i++)
 1399 rhaas@postgresql.org     4983                 :          55324 :         locators[i] = rels[i]->smgr_rlocator.locator;
                               4984                 :                : 
                               4985                 :                :     /*
                               4986                 :                :      * For a low number of relations to drop, just use a simple walk-through to
                               4987                 :                :      * save the bsearch overhead. The threshold to use is rather a guess than
                               4988                 :                :      * an exactly determined value, as it depends on many factors (CPU and RAM
                               4989                 :                :      * speeds, amount of shared buffers etc.).
                               4990                 :                :      */
 2222 noah@leadboat.com        4991                 :          16014 :     use_bsearch = n > RELS_BSEARCH_THRESHOLD;
                               4992                 :                : 
                               4993                 :                :     /* sort the list of rlocators if necessary */
 4856 alvherre@alvh.no-ip.     4994         [ +  + ]:          16014 :     if (use_bsearch)
  809 nathan@postgresql.or     4995                 :            218 :         qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
                               4996                 :                : 
 5080 tgl@sss.pgh.pa.us        4997         [ +  + ]:      186815246 :     for (i = 0; i < NBuffers; i++)
                               4998                 :                :     {
 1399 rhaas@postgresql.org     4999                 :      186799232 :         RelFileLocator *rlocator = NULL;
 3823                          5000                 :      186799232 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
                               5001                 :                : 
                               5002                 :                :         /*
                               5003                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               5004                 :                :          * saves some cycles.
                               5005                 :                :          */
                               5006                 :                : 
 4856 alvherre@alvh.no-ip.     5007         [ +  + ]:      186799232 :         if (!use_bsearch)
                               5008                 :                :         {
                               5009                 :                :             int         j;
                               5010                 :                : 
                               5011         [ +  + ]:      742489842 :             for (j = 0; j < n; j++)
                               5012                 :                :             {
 1350 rhaas@postgresql.org     5013         [ +  + ]:      558296172 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
                               5014                 :                :                 {
 1399                          5015                 :         106746 :                     rlocator = &locators[j];
 4856 alvherre@alvh.no-ip.     5016                 :         106746 :                     break;
                               5017                 :                :                 }
                               5018                 :                :             }
                               5019                 :                :         }
                               5020                 :                :         else
                               5021                 :                :         {
                               5022                 :                :             RelFileLocator locator;
                               5023                 :                : 
 1350 rhaas@postgresql.org     5024                 :        2498816 :             locator = BufTagGetRelFileLocator(&bufHdr->tag);
  515 peter@eisentraut.org     5025                 :        2498816 :             rlocator = bsearch(&locator,
                               5026                 :                :                                locators, n, sizeof(RelFileLocator),
                               5027                 :                :                                rlocator_comparator);
                               5028                 :                :         }
                               5029                 :                : 
                               5030                 :                :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
 1399 rhaas@postgresql.org     5031         [ +  + ]:      186799232 :         if (rlocator == NULL)
 5080 tgl@sss.pgh.pa.us        5032                 :      186690717 :             continue;
                               5033                 :                : 
  180 andres@anarazel.de       5034                 :GNC      108515 :         LockBufHdr(bufHdr);
 1350 rhaas@postgresql.org     5035         [ +  - ]:CBC      108515 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
 5080 tgl@sss.pgh.pa.us        5036                 :         108515 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
                               5037                 :                :         else
  180 andres@anarazel.de       5038                 :UNC           0 :             UnlockBufHdr(bufHdr);
                               5039                 :                :     }
                               5040                 :                : 
 1399 rhaas@postgresql.org     5041                 :CBC       16014 :     pfree(locators);
 1938 akapila@postgresql.o     5042                 :          16014 :     pfree(rels);
                               5043                 :                : }
                               5044                 :                : 
                               5045                 :                : /* ---------------------------------------------------------------------
                               5046                 :                :  *      FindAndDropRelationBuffers
                               5047                 :                :  *
                               5048                 :                :  *      This function looks up the BufMapping table for each block number in
                               5049                 :                :  *      [firstDelBlock, nForkBlock) of the specified relation fork and removes
                               5050                 :                :  *      the matching pages from the buffer pool. (In particular, with
                               5051                 :                :  *      firstDelBlock = 0, all pages below nForkBlock are removed.)
                               5052                 :                :  * --------------------------------------------------------------------
                               5053                 :                :  */
                               5054                 :                : static void
 1393 rhaas@postgresql.org     5055                 :            872 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
                               5056                 :                :                            BlockNumber nForkBlock,
                               5057                 :                :                            BlockNumber firstDelBlock)
                               5058                 :                : {
                               5059                 :                :     BlockNumber curBlock;
                               5060                 :                : 
 1939 akapila@postgresql.o     5061         [ +  + ]:           2087 :     for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
                               5062                 :                :     {
                               5063                 :                :         uint32      bufHash;    /* hash value for tag */
                               5064                 :                :         BufferTag   bufTag;     /* identity of requested block */
                               5065                 :                :         LWLock     *bufPartitionLock;   /* buffer partition lock for it */
                               5066                 :                :         int         buf_id;
                               5067                 :                :         BufferDesc *bufHdr;
                               5068                 :                : 
                               5069                 :                :         /* create a tag so we can look up the buffer */
 1378 rhaas@postgresql.org     5070                 :           1215 :         InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
                               5071                 :                : 
                               5072                 :                :         /* determine its hash code and partition lock ID */
 1939 akapila@postgresql.o     5073                 :           1215 :         bufHash = BufTableHashCode(&bufTag);
                               5074                 :           1215 :         bufPartitionLock = BufMappingPartitionLock(bufHash);
                               5075                 :                : 
                               5076                 :                :         /* Check that it is in the buffer pool. If not, do nothing. */
                               5077                 :           1215 :         LWLockAcquire(bufPartitionLock, LW_SHARED);
                               5078                 :           1215 :         buf_id = BufTableLookup(&bufTag, bufHash);
                               5079                 :           1215 :         LWLockRelease(bufPartitionLock);
                               5080                 :                : 
                               5081         [ +  + ]:           1215 :         if (buf_id < 0)
                               5082                 :            108 :             continue;
                               5083                 :                : 
                               5084                 :           1107 :         bufHdr = GetBufferDescriptor(buf_id);
                               5085                 :                : 
                               5086                 :                :         /*
                               5087                 :                :          * We need to lock the buffer header and recheck if the buffer is
                               5088                 :                :          * still associated with the same block because the buffer could be
                               5089                 :                :          * evicted by some other backend loading blocks for a different
                               5090                 :                :          * relation after we release the lock on the BufMapping table.
                               5091                 :                :          */
  180 andres@anarazel.de       5092                 :GNC        1107 :         LockBufHdr(bufHdr);
                               5093                 :                : 
 1350 rhaas@postgresql.org     5094   [ +  -  +  - ]:CBC        2214 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
                               5095                 :           1107 :             BufTagGetForkNum(&bufHdr->tag) == forkNum &&
 1939 akapila@postgresql.o     5096         [ +  - ]:           1107 :             bufHdr->tag.blockNum >= firstDelBlock)
                               5097                 :           1107 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
                               5098                 :                :         else
  180 andres@anarazel.de       5099                 :UNC           0 :             UnlockBufHdr(bufHdr);
                               5100                 :                :     }
 1939 akapila@postgresql.o     5101                 :CBC         872 : }
                               5102                 :                : 
                               5103                 :                : /* ---------------------------------------------------------------------
                               5104                 :                :  *      DropDatabaseBuffers
                               5105                 :                :  *
                               5106                 :                :  *      This function removes all the buffers in the buffer cache for a
                               5107                 :                :  *      particular database.  Dirty pages are simply dropped, without
                               5108                 :                :  *      bothering to write them out first.  This is used when we destroy a
                               5109                 :                :  *      database, to avoid trying to flush data to disk when the directory
                               5110                 :                :  *      tree no longer exists.  Implementation is pretty similar to
                               5111                 :                :  *      DropRelationBuffers() which is for destroying just one relation.
                               5112                 :                :  * --------------------------------------------------------------------
                               5113                 :                :  */
                               5114                 :                : void
 7342 tgl@sss.pgh.pa.us        5115                 :             81 : DropDatabaseBuffers(Oid dbid)
                               5116                 :                : {
                               5117                 :                :     int         i;
                               5118                 :                : 
                               5119                 :                :     /*
                               5120                 :                :      * We needn't consider local buffers, since by assumption the target
                               5121                 :                :      * database isn't our own.
                               5122                 :                :      */
                               5123                 :                : 
 7732                          5124         [ +  + ]:         628177 :     for (i = 0; i < NBuffers; i++)
                               5125                 :                :     {
 3823 rhaas@postgresql.org     5126                 :         628096 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
                               5127                 :                : 
                               5128                 :                :         /*
                               5129                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               5130                 :                :          * saves some cycles.
                               5131                 :                :          */
 1350                          5132         [ +  + ]:         628096 :         if (bufHdr->tag.dbOid != dbid)
 5080 tgl@sss.pgh.pa.us        5133                 :         613220 :             continue;
                               5134                 :                : 
  180 andres@anarazel.de       5135                 :GNC       14876 :         LockBufHdr(bufHdr);
 1350 rhaas@postgresql.org     5136         [ +  - ]:CBC       14876 :         if (bufHdr->tag.dbOid == dbid)
 7507 bruce@momjian.us         5137                 :          14876 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
                               5138                 :                :         else
  180 andres@anarazel.de       5139                 :UNC           0 :             UnlockBufHdr(bufHdr);
                               5140                 :                :     }
10892 scrappy@hub.org          5141                 :CBC          81 : }
                               5142                 :                : 
                               5143                 :                : /* ---------------------------------------------------------------------
                               5144                 :                :  *      FlushRelationBuffers
                               5145                 :                :  *
                               5146                 :                :  *      This function writes all dirty pages of a relation out to disk
                               5147                 :                :  *      (or more accurately, out to kernel disk buffers), ensuring that the
                               5148                 :                :  *      kernel has an up-to-date view of the relation.
                               5149                 :                :  *
                               5150                 :                :  *      Generally, the caller should be holding AccessExclusiveLock on the
                               5151                 :                :  *      target relation to ensure that no other backend is busy dirtying
                               5152                 :                :  *      more blocks of the relation; the effects can't be expected to last
                               5153                 :                :  *      after the lock is released.
                               5154                 :                :  *
                               5155                 :                :  *      XXX currently it sequentially searches the buffer pool, should be
                               5156                 :                :  *      changed to more clever ways of searching.  This routine is not
                               5157                 :                :  *      used in any performance-critical code paths, so it's not worth
                               5158                 :                :  *      adding additional overhead to normal paths to make it go faster.
                               5159                 :                :  * --------------------------------------------------------------------
                               5160                 :                :  */
                               5161                 :                : void
 7716 tgl@sss.pgh.pa.us        5162                 :            167 : FlushRelationBuffers(Relation rel)
                               5163                 :                : {
                               5164                 :                :     int         i;
                               5165                 :                :     BufferDesc *bufHdr;
  825 heikki.linnakangas@i     5166                 :            167 :     SMgrRelation srel = RelationGetSmgr(rel);
                               5167                 :                : 
 5622 rhaas@postgresql.org     5168         [ +  + ]:            167 :     if (RelationUsesLocalBuffers(rel))
                               5169                 :                :     {
10452 vadim4o@yahoo.com        5170         [ +  + ]:           1212 :         for (i = 0; i < NLocBuffer; i++)
                               5171                 :                :         {
                               5172                 :                :             uint64      buf_state;
                               5173                 :                : 
 4114 andres@anarazel.de       5174                 :           1200 :             bufHdr = GetLocalBufferDescriptor(i);
 1350 rhaas@postgresql.org     5175         [ +  + ]:           1200 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
  110 andres@anarazel.de       5176         [ +  + ]:GNC         400 :                 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
                               5177                 :                :                  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
                               5178                 :                :             {
                               5179                 :                :                 ErrorContextCallback errcallback;
                               5180                 :                : 
                               5181                 :                :                 /* Setup error traceback support for ereport() */
 4922 heikki.linnakangas@i     5182                 :CBC         392 :                 errcallback.callback = local_buffer_write_error_callback;
  523 peter@eisentraut.org     5183                 :            392 :                 errcallback.arg = bufHdr;
 4922 heikki.linnakangas@i     5184                 :            392 :                 errcallback.previous = error_context_stack;
                               5185                 :            392 :                 error_context_stack = &errcallback;
                               5186                 :                : 
                               5187                 :                :                 /* Make sure we can handle the pin */
  393 andres@anarazel.de       5188                 :            392 :                 ReservePrivateRefCountEntry();
                               5189                 :            392 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
                               5190                 :                : 
                               5191                 :                :                 /*
                               5192                 :                :                  * Pin/unpin mostly to make valgrind work, but it also seems
                               5193                 :                :                  * like the right thing to do.
                               5194                 :                :                  */
                               5195                 :            392 :                 PinLocalBuffer(bufHdr, false);
                               5196                 :                : 
                               5197                 :                : 
  416                          5198                 :            392 :                 FlushLocalBuffer(bufHdr, srel);
                               5199                 :                : 
  393                          5200                 :            392 :                 UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr));
                               5201                 :                : 
                               5202                 :                :                 /* Pop the error context stack */
 4922 heikki.linnakangas@i     5203                 :            392 :                 error_context_stack = errcallback.previous;
                               5204                 :                :             }
                               5205                 :                :         }
                               5206                 :                : 
 8009 tgl@sss.pgh.pa.us        5207                 :             12 :         return;
                               5208                 :                :     }
                               5209                 :                : 
10452 vadim4o@yahoo.com        5210         [ +  + ]:        1938203 :     for (i = 0; i < NBuffers; i++)
                               5211                 :                :     {
                               5212                 :                :         uint64      buf_state;
                               5213                 :                : 
 4114 andres@anarazel.de       5214                 :        1938048 :         bufHdr = GetBufferDescriptor(i);
                               5215                 :                : 
                               5216                 :                :         /*
                               5217                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               5218                 :                :          * saves some cycles.
                               5219                 :                :          */
 1350 rhaas@postgresql.org     5220         [ +  + ]:        1938048 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
 5080 tgl@sss.pgh.pa.us        5221                 :        1937802 :             continue;
                               5222                 :                : 
                               5223                 :                :         /* Make sure we can handle the pin */
 4124 andres@anarazel.de       5224                 :            246 :         ReservePrivateRefCountEntry();
  909 heikki.linnakangas@i     5225                 :            246 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               5226                 :                : 
 3677 andres@anarazel.de       5227                 :            246 :         buf_state = LockBufHdr(bufHdr);
 1350 rhaas@postgresql.org     5228         [ +  - ]:            246 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
 3677 andres@anarazel.de       5229         [ +  + ]:            246 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
                               5230                 :                :         {
 7716 tgl@sss.pgh.pa.us        5231                 :            204 :             PinBuffer_Locked(bufHdr);
  209 andres@anarazel.de       5232                 :GNC         204 :             FlushUnlockedBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 1313 michael@paquier.xyz      5233                 :CBC         204 :             UnpinBuffer(bufHdr);
                               5234                 :                :         }
                               5235                 :                :         else
  180 andres@anarazel.de       5236                 :GNC          42 :             UnlockBufHdr(bufHdr);
                               5237                 :                :     }
                               5238                 :                : }
                               5239                 :                : 
                               5240                 :                : /* ---------------------------------------------------------------------
                               5241                 :                :  *      FlushRelationsAllBuffers
                               5242                 :                :  *
                               5243                 :                :  *      This function flushes out of the buffer pool all the pages of all
                               5244                 :                :  *      forks of the specified smgr relations.  It's equivalent to calling
                               5245                 :                :  *      FlushRelationBuffers once per relation.  The relations are assumed not
                               5246                 :                :  *      to use local buffers.
                               5247                 :                :  * --------------------------------------------------------------------
                               5248                 :                :  */
                               5249                 :                : void
 2222 noah@leadboat.com        5250                 :CBC           5 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
                               5251                 :                : {
                               5252                 :                :     int         i;
                               5253                 :                :     SMgrSortArray *srels;
                               5254                 :                :     bool        use_bsearch;
                               5255                 :                : 
                               5256         [ -  + ]:              5 :     if (nrels == 0)
 2222 noah@leadboat.com        5257                 :UBC           0 :         return;
                               5258                 :                : 
                               5259                 :                :     /* fill-in array for qsort */
  145 michael@paquier.xyz      5260                 :GNC           5 :     srels = palloc_array(SMgrSortArray, nrels);
                               5261                 :                : 
 2222 noah@leadboat.com        5262         [ +  + ]:CBC          10 :     for (i = 0; i < nrels; i++)
                               5263                 :                :     {
 1399 rhaas@postgresql.org     5264         [ -  + ]:              5 :         Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
                               5265                 :                : 
                               5266                 :              5 :         srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
 2222 noah@leadboat.com        5267                 :              5 :         srels[i].srel = smgrs[i];
                               5268                 :                :     }
                               5269                 :                : 
                               5270                 :                :     /*
                               5271                 :                :      * Save the bsearch overhead for low number of relations to sync. See
                               5272                 :                :      * DropRelationsAllBuffers for details.
                               5273                 :                :      */
                               5274                 :              5 :     use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
                               5275                 :                : 
                               5276                 :                :     /* sort the list of SMgrRelations if necessary */
                               5277         [ -  + ]:              5 :     if (use_bsearch)
  809 nathan@postgresql.or     5278                 :UBC           0 :         qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
                               5279                 :                : 
 2222 noah@leadboat.com        5280         [ +  + ]:CBC       81925 :     for (i = 0; i < NBuffers; i++)
                               5281                 :                :     {
                               5282                 :          81920 :         SMgrSortArray *srelent = NULL;
                               5283                 :          81920 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
                               5284                 :                :         uint64      buf_state;
                               5285                 :                : 
                               5286                 :                :         /*
                               5287                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               5288                 :                :          * saves some cycles.
                               5289                 :                :          */
                               5290                 :                : 
                               5291         [ +  - ]:          81920 :         if (!use_bsearch)
                               5292                 :                :         {
                               5293                 :                :             int         j;
                               5294                 :                : 
                               5295         [ +  + ]:         161277 :             for (j = 0; j < nrels; j++)
                               5296                 :                :             {
 1350 rhaas@postgresql.org     5297         [ +  + ]:          81920 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
                               5298                 :                :                 {
 2222 noah@leadboat.com        5299                 :           2563 :                     srelent = &srels[j];
                               5300                 :           2563 :                     break;
                               5301                 :                :                 }
                               5302                 :                :             }
                               5303                 :                :         }
                               5304                 :                :         else
                               5305                 :                :         {
                               5306                 :                :             RelFileLocator rlocator;
                               5307                 :                : 
 1350 rhaas@postgresql.org     5308                 :UBC           0 :             rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
  515 peter@eisentraut.org     5309                 :              0 :             srelent = bsearch(&rlocator,
                               5310                 :                :                               srels, nrels, sizeof(SMgrSortArray),
                               5311                 :                :                               rlocator_comparator);
                               5312                 :                :         }
                               5313                 :                : 
                               5314                 :                :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
 2222 noah@leadboat.com        5315         [ +  + ]:CBC       81920 :         if (srelent == NULL)
                               5316                 :          79357 :             continue;
                               5317                 :                : 
                               5318                 :                :         /* Make sure we can handle the pin */
                               5319                 :           2563 :         ReservePrivateRefCountEntry();
  909 heikki.linnakangas@i     5320                 :           2563 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               5321                 :                : 
 2222 noah@leadboat.com        5322                 :           2563 :         buf_state = LockBufHdr(bufHdr);
 1350 rhaas@postgresql.org     5323         [ +  - ]:           2563 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
 2222 noah@leadboat.com        5324         [ +  + ]:           2563 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
                               5325                 :                :         {
                               5326                 :           2526 :             PinBuffer_Locked(bufHdr);
  209 andres@anarazel.de       5327                 :GNC        2526 :             FlushUnlockedBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 1313 michael@paquier.xyz      5328                 :CBC        2526 :             UnpinBuffer(bufHdr);
                               5329                 :                :         }
                               5330                 :                :         else
  180 andres@anarazel.de       5331                 :GNC          37 :             UnlockBufHdr(bufHdr);
                               5332                 :                :     }
                               5333                 :                : 
 2222 noah@leadboat.com        5334                 :CBC           5 :     pfree(srels);
                               5335                 :                : }
                               5336                 :                : 
                               5337                 :                : /* ---------------------------------------------------------------------
                               5338                 :                :  *      RelationCopyStorageUsingBuffer
                               5339                 :                :  *
                               5340                 :                :  *      Copy fork's data using bufmgr.  Same as RelationCopyStorage but instead
                               5341                 :                :  *      of using smgrread and smgrextend this will copy using bufmgr APIs.
                               5342                 :                :  *
                               5343                 :                :  *      Refer comments atop CreateAndCopyRelationData() for details about
                               5344                 :                :  *      'permanent' parameter.
                               5345                 :                :  * --------------------------------------------------------------------
                               5346                 :                :  */
                               5347                 :                : static void
 1362 rhaas@postgresql.org     5348                 :          85895 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
                               5349                 :                :                                RelFileLocator dstlocator,
                               5350                 :                :                                ForkNumber forkNum, bool permanent)
                               5351                 :                : {
                               5352                 :                :     Buffer      srcBuf;
                               5353                 :                :     Buffer      dstBuf;
                               5354                 :                :     Page        srcPage;
                               5355                 :                :     Page        dstPage;
                               5356                 :                :     bool        use_wal;
                               5357                 :                :     BlockNumber nblocks;
                               5358                 :                :     BlockNumber blkno;
                               5359                 :                :     PGIOAlignedBlock buf;
                               5360                 :                :     BufferAccessStrategy bstrategy_src;
                               5361                 :                :     BufferAccessStrategy bstrategy_dst;
                               5362                 :                :     BlockRangeReadStreamPrivate p;
                               5363                 :                :     ReadStream *src_stream;
                               5364                 :                :     SMgrRelation src_smgr;
                               5365                 :                : 
                               5366                 :                :     /*
                               5367                 :                :      * In general, we want to write WAL whenever wal_level > 'minimal', but we
                               5368                 :                :      * can skip it when copying any fork of an unlogged relation other than
                               5369                 :                :      * the init fork.
                               5370                 :                :      */
 1498                          5371   [ +  +  -  +  :          85895 :     use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
                                              -  - ]
                               5372                 :                : 
                               5373                 :                :     /* Get number of blocks in the source relation. */
  793 heikki.linnakangas@i     5374                 :          85895 :     nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
                               5375                 :                :                           forkNum);
                               5376                 :                : 
                               5377                 :                :     /* Nothing to copy; just return. */
 1498 rhaas@postgresql.org     5378         [ +  + ]:          85895 :     if (nblocks == 0)
                               5379                 :          15933 :         return;
                               5380                 :                : 
                               5381                 :                :     /*
                               5382                 :                :      * Bulk extend the destination relation of the same size as the source
                               5383                 :                :      * relation before starting to copy block by block.
                               5384                 :                :      */
 1356                          5385                 :          69962 :     memset(buf.data, 0, BLCKSZ);
  793 heikki.linnakangas@i     5386                 :          69962 :     smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
                               5387                 :                :                buf.data, true);
                               5388                 :                : 
                               5389                 :                :     /* This is a bulk operation, so use buffer access strategies. */
 1498 rhaas@postgresql.org     5390                 :          69962 :     bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
                               5391                 :          69962 :     bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
                               5392                 :                : 
                               5393                 :                :     /* Initialize streaming read */
  609 noah@leadboat.com        5394                 :          69962 :     p.current_blocknum = 0;
                               5395                 :          69962 :     p.last_exclusive = nblocks;
  654                          5396                 :          69962 :     src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
                               5397                 :                : 
                               5398                 :                :     /*
                               5399                 :                :      * It is safe to use batchmode as block_range_read_stream_cb takes no
                               5400                 :                :      * locks.
                               5401                 :                :      */
  401 andres@anarazel.de       5402         [ +  - ]:          69962 :     src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
                               5403                 :                :                                                  READ_STREAM_USE_BATCHING,
                               5404                 :                :                                                  bstrategy_src,
                               5405                 :                :                                                  src_smgr,
                               5406                 :                :                                                  permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
                               5407                 :                :                                                  forkNum,
                               5408                 :                :                                                  block_range_read_stream_cb,
                               5409                 :                :                                                  &p,
                               5410                 :                :                                                  0);
                               5411                 :                : 
                               5412                 :                :     /* Iterate over each block of the source relation file. */
 1498 rhaas@postgresql.org     5413         [ +  + ]:         336047 :     for (blkno = 0; blkno < nblocks; blkno++)
                               5414                 :                :     {
                               5415         [ -  + ]:         266088 :         CHECK_FOR_INTERRUPTS();
                               5416                 :                : 
                               5417                 :                :         /* Read block from source relation. */
  654 noah@leadboat.com        5418                 :         266088 :         srcBuf = read_stream_next_buffer(src_stream, NULL);
 1368 tgl@sss.pgh.pa.us        5419                 :         266085 :         LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
 1498 rhaas@postgresql.org     5420                 :         266085 :         srcPage = BufferGetPage(srcBuf);
                               5421                 :                : 
  654 noah@leadboat.com        5422                 :         266085 :         dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
                               5423                 :                :                                            BufferGetBlockNumber(srcBuf),
                               5424                 :                :                                            RBM_ZERO_AND_LOCK, bstrategy_dst,
                               5425                 :                :                                            permanent);
 1368 tgl@sss.pgh.pa.us        5426                 :         266085 :         dstPage = BufferGetPage(dstBuf);
                               5427                 :                : 
 1498 rhaas@postgresql.org     5428                 :         266085 :         START_CRIT_SECTION();
                               5429                 :                : 
                               5430                 :                :         /* Copy page data from the source to the destination. */
                               5431                 :         266085 :         memcpy(dstPage, srcPage, BLCKSZ);
                               5432                 :         266085 :         MarkBufferDirty(dstBuf);
                               5433                 :                : 
                               5434                 :                :         /* WAL-log the copied page. */
                               5435         [ +  + ]:         266085 :         if (use_wal)
                               5436                 :         147845 :             log_newpage_buffer(dstBuf, true);
                               5437                 :                : 
                               5438         [ -  + ]:         266085 :         END_CRIT_SECTION();
                               5439                 :                : 
                               5440                 :         266085 :         UnlockReleaseBuffer(dstBuf);
 1368 tgl@sss.pgh.pa.us        5441                 :         266085 :         UnlockReleaseBuffer(srcBuf);
                               5442                 :                :     }
  654 noah@leadboat.com        5443         [ -  + ]:          69959 :     Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
                               5444                 :          69959 :     read_stream_end(src_stream);
                               5445                 :                : 
 1142 andres@anarazel.de       5446                 :          69959 :     FreeAccessStrategy(bstrategy_src);
                               5447                 :          69959 :     FreeAccessStrategy(bstrategy_dst);
                               5448                 :                : }
                               5449                 :                : 
                               5450                 :                : /* ---------------------------------------------------------------------
                               5451                 :                :  *      CreateAndCopyRelationData
                               5452                 :                :  *
                               5453                 :                :  *      Create the destination relation's storage, then copy the main fork
                               5454                 :                :  *      and every other fork that exists in the source to the destination.
                               5455                 :                :  *
                               5456                 :                :  *      Pass permanent as true for permanent relations and false for
                               5457                 :                :  *      unlogged relations.  Currently this API is not supported for
                               5458                 :                :  *      temporary relations.
                               5459                 :                :  * --------------------------------------------------------------------
                               5460                 :                :  */
                               5461                 :                : void
 1399 rhaas@postgresql.org     5462                 :          65910 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
                               5463                 :                :                           RelFileLocator dst_rlocator, bool permanent)
                               5464                 :                : {
                               5465                 :                :     char        relpersistence;
                               5466                 :                :     SMgrRelation src_rel;
                               5467                 :                :     SMgrRelation dst_rel;
                               5468                 :                : 
                               5469                 :                :     /* Translate the permanent flag into a relpersistence code. */
 1498                          5470         [ +  - ]:          65910 :     relpersistence = permanent ?
                               5471                 :                :         RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
                               5472                 :                : 
  793 heikki.linnakangas@i     5473                 :          65910 :     src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
                               5474                 :          65910 :     dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
                               5475                 :                : 
                               5476                 :                :     /*
                               5477                 :                :      * Create and copy all forks of the relation.  During create database we
                               5478                 :                :      * have a separate cleanup mechanism which deletes complete database
                               5479                 :                :      * directory.  Therefore, each individual relation doesn't need to be
                               5480                 :                :      * registered for cleanup.
                               5481                 :                :      */
 1399 rhaas@postgresql.org     5482                 :          65910 :     RelationCreateStorage(dst_rlocator, relpersistence, false);
                               5483                 :                : 
                               5484                 :                :     /* Copy the main fork; this is done unconditionally. */
 1362                          5485                 :          65910 :     RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
                               5486                 :                :                                    permanent);
                               5487                 :                : 
                               5488                 :                :     /* Copy each remaining fork that exists in the source. */
 1498                          5489                 :          65907 :     for (ForkNumber forkNum = MAIN_FORKNUM + 1;
                               5490         [ +  + ]:         263628 :          forkNum <= MAX_FORKNUM; forkNum++)
                               5491                 :                :     {
  825 heikki.linnakangas@i     5492         [ +  + ]:         197721 :         if (smgrexists(src_rel, forkNum))
                               5493                 :                :         {
                               5494                 :          19985 :             smgrcreate(dst_rel, forkNum, false);
                               5495                 :                : 
                               5496                 :                :             /*
                               5497                 :                :              * WAL log creation if the relation is persistent, or this is the
                               5498                 :                :              * init fork of an unlogged relation.
                               5499                 :                :              */
 1498 rhaas@postgresql.org     5500   [ -  +  -  - ]:          19985 :             if (permanent || forkNum == INIT_FORKNUM)
 1399                          5501                 :          19985 :                 log_smgrcreate(&dst_rlocator, forkNum);
                               5502                 :                : 
                               5503                 :                :             /* Copy this fork's data, block by block. */
 1362                          5504                 :          19985 :             RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
                               5505                 :                :                                            permanent);
                               5506                 :                :         }
                               5507                 :                :     }
 1498                          5508                 :          65907 : }
                               5509                 :                : 
                               5510                 :                : /* ---------------------------------------------------------------------
                               5511                 :                :  *      FlushDatabaseBuffers
                               5512                 :                :  *
                               5513                 :                :  *      This function writes all dirty pages of a database out to disk
                               5514                 :                :  *      (or more accurately, out to kernel disk buffers), ensuring that the
                               5515                 :                :  *      kernel has an up-to-date view of the database.
                               5516                 :                :  *
                               5517                 :                :  *      Generally, the caller should be holding an appropriate lock to ensure
                               5518                 :                :  *      no other backend is active in the target database; otherwise more
                               5519                 :                :  *      pages could get dirtied.
                               5520                 :                :  *
                               5521                 :                :  *      Note we don't worry about flushing any pages of temporary relations.
                               5522                 :                :  *      It's assumed these wouldn't be interesting.
                               5523                 :                :  * --------------------------------------------------------------------
                               5524                 :                :  */
                               5525                 :                : void
 6886 tgl@sss.pgh.pa.us        5526                 :              5 : FlushDatabaseBuffers(Oid dbid)
                               5527                 :                : {
                               5528                 :                :     int         i;
                               5529                 :                :     BufferDesc *bufHdr;
                               5530                 :                : 
                               5531         [ +  + ]:            645 :     for (i = 0; i < NBuffers; i++)
                               5532                 :                :     {
                               5533                 :                :         uint64      buf_state;
                               5534                 :                : 
 4114 andres@anarazel.de       5535                 :            640 :         bufHdr = GetBufferDescriptor(i);
                               5536                 :                : 
                               5537                 :                :         /*
                               5538                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               5539                 :                :          * saves some cycles; the tag is rechecked under the header lock below.
                               5540                 :                :          */
 1350 rhaas@postgresql.org     5541         [ +  + ]:            640 :         if (bufHdr->tag.dbOid != dbid)
 5080 tgl@sss.pgh.pa.us        5542                 :            474 :             continue;
                               5543                 :                : 
                               5544                 :                :         /* Make sure we can handle the pin before taking the buffer header lock */
 4124 andres@anarazel.de       5545                 :            166 :         ReservePrivateRefCountEntry();
  909 heikki.linnakangas@i     5546                 :            166 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               5547                 :                : 
 3677 andres@anarazel.de       5548                 :            166 :         buf_state = LockBufHdr(bufHdr);
 1350 rhaas@postgresql.org     5549         [ +  - ]:            166 :         if (bufHdr->tag.dbOid == dbid &&
 3677 andres@anarazel.de       5550         [ +  + ]:            166 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
                               5551                 :                :         {
 6886 tgl@sss.pgh.pa.us        5552                 :             25 :             PinBuffer_Locked(bufHdr);
  209 andres@anarazel.de       5553                 :GNC          25 :             FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 1313 michael@paquier.xyz      5554                 :CBC          25 :             UnpinBuffer(bufHdr);
                               5555                 :                :         }
                               5556                 :                :         else
  180 andres@anarazel.de       5557                 :GNC         141 :             UnlockBufHdr(bufHdr);
                               5558                 :                :     }
 6886 tgl@sss.pgh.pa.us        5559                 :CBC           5 : }
                               5560                 :                : 
                               5561                 :                : /*
                               5562                 :                :  * Flush a shared buffer to the OS.  The caller must already hold a pin and a
                               5563                 :                :  * share-exclusive or exclusive lock on the buffer.
                               5564                 :                :  */
                               5565                 :                : void
 3799 andres@anarazel.de       5566                 :            107 : FlushOneBuffer(Buffer buffer)
                               5567                 :                : {
                               5568                 :                :     BufferDesc *bufHdr;
                               5569                 :                : 
                               5570                 :                :     /* local buffers: currently not needed, but no fundamental reason not to support */
                               5571         [ -  + ]:            107 :     Assert(!BufferIsLocal(buffer));
                               5572                 :                : 
                               5573   [ -  +  -  +  :            107 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               5574                 :                : 
                               5575                 :            107 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               5576                 :                : 
  209 andres@anarazel.de       5577         [ -  + ]:GNC         107 :     Assert(BufferIsLockedByMe(buffer));
                               5578                 :                : 
 1181 andres@anarazel.de       5579                 :CBC         107 :     FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 3799                          5580                 :            107 : }
                               5581                 :                : 
                               5582                 :                : /*
                               5583                 :                :  * ReleaseBuffer -- release the pin on a buffer
                               5584                 :                :  */
                               5585                 :                : void
 8673 tgl@sss.pgh.pa.us        5586                 :       48595076 : ReleaseBuffer(Buffer buffer)
                               5587                 :                : {
 7872                          5588         [ -  + ]:       48595076 :     if (!BufferIsValid(buffer))
 5434 peter_e@gmx.net          5589         [ #  # ]:UBC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
                               5590                 :                : 
10467 bruce@momjian.us         5591         [ +  + ]:CBC    48595076 :     if (BufferIsLocal(buffer))
 1126 andres@anarazel.de       5592                 :         754007 :         UnpinLocalBuffer(buffer);
                               5593                 :                :     else
                               5594                 :       47841069 :         UnpinBuffer(GetBufferDescriptor(buffer - 1));
10892 scrappy@hub.org          5595                 :       48595076 : }
                               5596                 :                : 
                               5597                 :                : /*
                               5598                 :                :  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
                               5599                 :                :  *
                               5600                 :                :  * This is just a more efficient shorthand for a common combination.
                               5601                 :                :  */
                               5602                 :                : void
 7340 tgl@sss.pgh.pa.us        5603                 :       50896038 : UnlockReleaseBuffer(Buffer buffer)
                               5604                 :                : {
                               5605                 :                :     int         mode;
                               5606                 :                :     BufferDesc *buf;
                               5607                 :                :     PrivateRefCountEntry *ref;
                               5608                 :                :     uint64      sub;
                               5609                 :                :     uint64      lockstate;
                               5610                 :                : 
   39 andres@anarazel.de       5611   [ -  +  +  +  :GNC    50896038 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               5612                 :                : 
                               5613         [ +  + ]:       50896038 :     if (BufferIsLocal(buffer))
                               5614                 :                :     {
                               5615                 :        1646201 :         UnpinLocalBuffer(buffer);
                               5616                 :        1646201 :         return;
                               5617                 :                :     }
                               5618                 :                : 
                               5619                 :       49249837 :     ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
                               5620                 :                : 
                               5621                 :       49249837 :     buf = GetBufferDescriptor(buffer - 1);
                               5622                 :                : 
                               5623                 :       49249837 :     mode = BufferLockDisownInternal(buffer, buf);
                               5624                 :                : 
                               5625                 :                :     /* compute state modification for lock release */
                               5626                 :       49249837 :     sub = BufferLockReleaseSub(mode);
                               5627                 :                : 
                               5628                 :                :     /* compute state modification for pin release */
                               5629                 :       49249837 :     ref = GetPrivateRefCountEntry(buffer, false);
                               5630         [ -  + ]:       49249837 :     Assert(ref != NULL);
                               5631         [ -  + ]:       49249837 :     Assert(ref->data.refcount > 0);
                               5632                 :       49249837 :     ref->data.refcount--;
                               5633                 :                : 
                               5634                 :                :     /* last backend-local pin is gone, so also reduce the shared pin count */
                               5635         [ +  + ]:       49249837 :     if (likely(ref->data.refcount == 0))
                               5636                 :                :     {
                               5637                 :                :         /* See comment in UnpinBufferNoOwner() */
                               5638                 :                :         VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
                               5639                 :                : 
                               5640                 :       46862518 :         sub |= BUF_REFCOUNT_ONE;
                               5641                 :       46862518 :         ForgetPrivateRefCountEntry(ref);
                               5642                 :                :     }
                               5643                 :                : 
                               5644                 :                :     /* perform the lock and pin release in one atomic op */
                               5645                 :       49249837 :     lockstate = pg_atomic_sub_fetch_u64(&buf->state, sub);
                               5646                 :                : 
                               5647                 :                :     /* wake up waiters for the lock */
                               5648                 :       49249837 :     BufferLockProcessRelease(buf, mode, lockstate);
                               5649                 :                : 
                               5650                 :                :     /* wake up the waiter for the pin release, if one is flagged */
                               5651         [ +  + ]:       49249837 :     if (lockstate & BM_PIN_COUNT_WAITER)
                               5652                 :              5 :         WakePinCountWaiter(buf);
                               5653                 :                : 
                               5654                 :                :     /*
                               5655                 :                :      * Now okay to allow cancel/die interrupts again, which were held when the
                               5656                 :                :      * lock was acquired.
                               5657                 :                :      */
                               5658         [ -  + ]:       49249837 :     RESUME_INTERRUPTS();
 7340 tgl@sss.pgh.pa.us        5659                 :ECB  (17962458) : }
                               5660                 :                : 
                               5661                 :                : /*
                               5662                 :                :  * IncrBufferRefCount
                               5663                 :                :  *      Increment the pin count on a buffer that we have *already* pinned
                               5664                 :                :  *      at least once.
                               5665                 :                :  *
                               5666                 :                :  *      This function cannot be used on a buffer we do not have pinned: it
                               5667                 :                :  *      bumps our own reference count but doesn't change the shared buffer state.
                               5668                 :                :  */
                               5669                 :                : void
 7962 tgl@sss.pgh.pa.us        5670                 :CBC    14556593 : IncrBufferRefCount(Buffer buffer)
                               5671                 :                : {
 7832 neilc@samurai.com        5672   [ -  +  +  +  :       14556593 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
  909 heikki.linnakangas@i     5673                 :       14556593 :     ResourceOwnerEnlarge(CurrentResourceOwner);
 7962 tgl@sss.pgh.pa.us        5674         [ +  + ]:       14556593 :     if (BufferIsLocal(buffer))
                               5675                 :         471199 :         LocalRefCount[-buffer - 1]++;
                               5676                 :                :     else
                               5677                 :                :     {
                               5678                 :                :         PrivateRefCountEntry *ref;
                               5679                 :                : 
 4124 andres@anarazel.de       5680                 :       14085394 :         ref = GetPrivateRefCountEntry(buffer, true);
 4266                          5681         [ -  + ]:       14085394 :         Assert(ref != NULL);
  142 andres@anarazel.de       5682                 :GNC    14085394 :         ref->data.refcount++;
                               5683                 :                :     }
 3100 tgl@sss.pgh.pa.us        5684                 :CBC    14556593 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
 7962                          5685                 :       14556593 : }
                               5686                 :                : 
                               5687                 :                : /*
                               5688                 :                :  * Shared-buffer only helper for MarkBufferDirtyHint() and
                               5689                 :                :  * BufferSetHintBits16().
                               5690                 :                :  *
                               5691                 :                :  * This is separated out because it turns out that the repeated checks for
                               5692                 :                :  * local buffers, repeated GetBufferDescriptor() and repeated reading of the
                               5693                 :                :  * buffer's state sufficiently hurts the performance of BufferSetHintBits16().
                               5694                 :                :  */
                               5695                 :                : static inline void
   56 andres@anarazel.de       5696                 :GNC    15320964 : MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate,
                               5697                 :                :                           bool buffer_std)
                               5698                 :                : {
 3667 kgrittn@postgresql.o     5699                 :CBC    15320964 :     Page        page = BufferGetPage(buffer);
                               5700                 :                : 
 4266 andres@anarazel.de       5701         [ -  + ]:       15320964 :     Assert(GetPrivateRefCount(buffer) > 0);
                               5702                 :                : 
                               5703                 :                :     /* here, either share-exclusive or exclusive lock is OK */
   56 andres@anarazel.de       5704   [ +  +  -  + ]:GNC    15320964 :     Assert(BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_EXCLUSIVE) ||
                               5705                 :                :            BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_SHARE_EXCLUSIVE));
                               5706                 :                : 
                               5707                 :                :     /*
                               5708                 :                :      * This routine might get called many times on the same page, if we are
                               5709                 :                :      * making the first scan after commit of an xact that added/deleted many
                               5710                 :                :      * tuples. So, be as quick as we can if the buffer is already dirty.
                               5711                 :                :      *
                               5712                 :                :      * As we are holding (at least) a share-exclusive lock, nobody could have
                               5713                 :                :      * cleaned or dirtied the page concurrently, so we can just rely on the
                               5714                 :                :      * previously fetched value here without any danger of races.
                               5715                 :                :      */
   55                          5716         [ +  + ]:       15320964 :     if (unlikely(!(lockstate & BM_DIRTY)))
                               5717                 :                :     {
 4792 simon@2ndQuadrant.co     5718                 :CBC      428407 :         XLogRecPtr  lsn = InvalidXLogRecPtr;
   55 andres@anarazel.de       5719                 :GNC      428407 :         bool        wal_log = false;
                               5720                 :                :         uint64      buf_state;
                               5721                 :                : 
                               5722                 :                :         /*
                               5723                 :                :          * If we need to protect hint bit updates from torn writes, WAL-log a
                               5724                 :                :          * full page image of the page. This full page image is only necessary
                               5725                 :                :          * if the hint bit update is the first change to the page since the
                               5726                 :                :          * last checkpoint.
                               5727                 :                :          *
                               5728                 :                :          * We don't check full_page_writes here because that logic is included
                               5729                 :                :          * when we call XLogInsert() since the value changes dynamically.
                               5730                 :                :          */
   56                          5731   [ +  +  +  +  :         428407 :         if (XLogHintBitIsNeeded() && (lockstate & BM_PERMANENT))
                                              +  + ]
                               5732                 :                :         {
                               5733                 :                :             /*
                               5734                 :                :              * If we must not write WAL, due to a relfilelocator-specific
                               5735                 :                :              * condition or being in recovery, don't dirty the page.  We can
                               5736                 :                :              * set the hint, just not dirty the page as a result so the hint
                               5737                 :                :              * is lost when we evict the page or shutdown.
                               5738                 :                :              *
                               5739                 :                :              * See src/backend/storage/page/README for longer discussion.
                               5740                 :                :              */
 2222 noah@leadboat.com        5741   [ +  +  +  + ]:CBC      514412 :             if (RecoveryInProgress() ||
 1350 rhaas@postgresql.org     5742                 :          87989 :                 RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
 4792 simon@2ndQuadrant.co     5743                 :         340270 :                 return;
                               5744                 :                : 
   55 andres@anarazel.de       5745                 :GNC       86153 :             wal_log = true;
                               5746                 :                :         }
                               5747                 :                : 
                               5748                 :                :         /*
                               5749                 :                :          * We must mark the page dirty before we emit the WAL record, as per
                               5750                 :                :          * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
                               5751                 :                :          * flush the buffer, even if we haven't inserted the WAL record yet.
                               5752                 :                :          * As we hold at least a share-exclusive lock, checkpoints will wait
                               5753                 :                :          * for this backend to be done with the buffer before continuing. If
                               5754                 :                :          * we did it the other way round, a checkpoint could start between
                               5755                 :                :          * writing the WAL record and marking the buffer dirty.
                               5756                 :                :          */
 3677 andres@anarazel.de       5757                 :CBC       88137 :         buf_state = LockBufHdr(bufHdr);
                               5758                 :                : 
                               5759                 :                :         /*
                               5760                 :                :          * It should not be possible for the buffer to already be dirty, see
                               5761                 :                :          * comment above.
                               5762                 :                :          */
   56 andres@anarazel.de       5763         [ -  + ]:GNC       88137 :         Assert(!(buf_state & BM_DIRTY));
 3677 andres@anarazel.de       5764         [ -  + ]:CBC       88137 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
   55 andres@anarazel.de       5765                 :GNC       88137 :         UnlockBufHdrExt(bufHdr, buf_state,
                               5766                 :                :                         BM_DIRTY,
                               5767                 :                :                         0, 0);
                               5768                 :                : 
                               5769                 :                :         /*
                               5770                 :                :          * If the block is already dirty because we either made a change or
                               5771                 :                :          * set a hint already, then we don't need to write a full page image.
                               5772                 :                :          * Note that aggressive cleaning of blocks dirtied by hint bit setting
                               5773                 :                :          * would increase the call rate. Bulk setting of hint bits would
                               5774                 :                :          * reduce the call rate...
                               5775                 :                :          */
                               5776         [ +  + ]:          88137 :         if (wal_log)
                               5777                 :          86153 :             lsn = XLogSaveBufferForHint(buffer, buffer_std);
                               5778                 :                : 
   56                          5779         [ +  + ]:          88137 :         if (XLogRecPtrIsValid(lsn))
                               5780                 :                :         {
                               5781                 :                :             /*
                               5782                 :                :              * Set the page LSN if we wrote a backup block. To allow backends
                               5783                 :                :              * that only hold a share lock on the buffer to read the LSN in a
                               5784                 :                :              * tear-free manner, we set the page LSN while holding the buffer
                               5785                 :                :              * header lock. This allows any reader of an LSN who holds only a
                               5786                 :                :              * share lock to also obtain a buffer header lock before using
                               5787                 :                :              * PageGetLSN() to read the LSN in a tear free way. This is done
                               5788                 :                :              * in BufferGetLSNAtomic().
                               5789                 :                :              *
                               5790                 :                :              * If checksums are enabled, you might think we should reset the
                               5791                 :                :              * checksum here. That will happen when the page is written
                               5792                 :                :              * sometime later in this checkpoint cycle.
                               5793                 :                :              */
   55                          5794                 :          58084 :             buf_state = LockBufHdr(bufHdr);
   56                          5795                 :          58084 :             PageSetLSN(page, lsn);
   55                          5796                 :          58084 :             UnlockBufHdr(bufHdr);
                               5797                 :                :         }
                               5798                 :                : 
   56                          5799                 :          88137 :         pgBufferUsage.shared_blks_dirtied++;
                               5800         [ +  + ]:          88137 :         if (VacuumCostActive)
                               5801                 :           2479 :             VacuumCostBalance += VacuumCostPageDirty;
                               5802                 :                :     }
                               5803                 :                : }
                               5804                 :                : 
                               5805                 :                : /*
                               5806                 :                :  * MarkBufferDirtyHint
                               5807                 :                :  *
                               5808                 :                :  *  Mark a buffer dirty for non-critical changes.
                               5809                 :                :  *
                               5810                 :                :  * This is essentially the same as MarkBufferDirty, except:
                               5811                 :                :  *
                               5812                 :                :  * 1. The caller does not write WAL; so if checksums are enabled, we may need
                               5813                 :                :  *    to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
                               5814                 :                :  * 2. The caller might hold only a share-exclusive lock instead of an
                               5815                 :                :  *    exclusive lock on the buffer's content lock.
                               5816                 :                :  * 3. This function does not guarantee that the buffer is always marked dirty
                               5817                 :                :  *    (e.g. it cannot always do so on a hot standby), so it cannot be used
                               5818                 :                :  *    for important changes.
                                                    :                :  *
                                                    :                :  * buffer is the buffer to mark; an invalid ID is reported via
                                                    :                :  * elog(ERROR, "bad buffer ID: ...").  buffer_std is passed through
                                                    :                :  * unchanged to MarkSharedBufferDirtyHint().
                                                    :                :  *
                                                    :                :  * Local buffers are handed to MarkLocalBufferDirty() and skip the shared
                                                    :                :  * path entirely.  For shared buffers the current header state word is read
                                                    :                :  * atomically and passed on to MarkSharedBufferDirtyHint().
                                                    :                :  *
                                                    :                :  * NOTE(review): the descriptor is looked up from (buffer - 1) before the
                                                    :                :  * validity and local-buffer checks run.  The pointer is only dereferenced
                                                    :                :  * on the shared path, but confirm that GetBufferDescriptor() is safe to
                                                    :                :  * call with an index derived from an invalid or local buffer ID.
                               5819                 :                :  */
                               5820                 :                : inline void
                               5821                 :         440796 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
                               5822                 :                : {
                               5823                 :                :     BufferDesc *bufHdr;
                               5824                 :                : 
                               5825                 :         440796 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               5826                 :                : 
                               5827         [ -  + ]:         440796 :     if (!BufferIsValid(buffer))
   56 andres@anarazel.de       5828         [ #  # ]:UNC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
                               5829                 :                : 
   56 andres@anarazel.de       5830         [ +  + ]:GNC      440796 :     if (BufferIsLocal(buffer))
                               5831                 :                :     {
                               5832                 :          23969 :         MarkLocalBufferDirty(buffer);
                               5833                 :          23969 :         return;
                               5834                 :                :     }
                               5835                 :                : 
                               5836                 :         416827 :     MarkSharedBufferDirtyHint(buffer, bufHdr,
                               5837                 :         416827 :                               pg_atomic_read_u64(&bufHdr->state),
                               5838                 :                :                               buffer_std);
                               5839                 :                : }
                               5840                 :                : 
                               5841                 :                : /*
                               5842                 :                :  * Release buffer content locks for shared buffers.
                               5843                 :                :  *
                               5844                 :                :  * Used to clean up after errors.
                               5845                 :                :  *
                               5846                 :                :  * Currently, we can expect that resource owner cleanup, via
                               5847                 :                :  * ResOwnerReleaseBuffer(), took care of releasing buffer content locks per
                               5848                 :                :  * se; the only thing we need to deal with here is clearing any PIN_COUNT
                               5849                 :                :  * request that was in progress.
                                                    :                :  *
                                                    :                :  * Concretely: if PinCountWaitBuf is set, take that buffer's header lock,
                                                    :                :  * clear BM_PIN_COUNT_WAITER when the flag is set and this backend is the
                                                    :                :  * recorded waiter, release the header lock, and reset PinCountWaitBuf to
                                                    :                :  * NULL.  If PinCountWaitBuf is NULL, nothing is done.
                               5850                 :                :  */
                               5851                 :                : void
 9244 tgl@sss.pgh.pa.us        5852                 :CBC       63696 : UnlockBuffers(void)
                               5853                 :                : {
 3823 rhaas@postgresql.org     5854                 :          63696 :     BufferDesc *buf = PinCountWaitBuf;
                               5855                 :                : 
 7871 tgl@sss.pgh.pa.us        5856         [ -  + ]:          63696 :     if (buf)
                               5857                 :                :     {
                               5858                 :                :         uint64      buf_state;
  110 andres@anarazel.de       5859                 :UNC           0 :         uint64      unset_bits = 0;
                               5860                 :                : 
 3677 andres@anarazel.de       5861                 :UBC           0 :         buf_state = LockBufHdr(buf);
                               5862                 :                : 
                               5863                 :                :         /*
                               5864                 :                :          * Don't complain if the flag bit is not set; it could have been reset
                               5865                 :                :          * but we got a cancel/die interrupt before getting the signal.
                                                    :                :          *
                                                    :                :          * Only clear the flag if this backend is the recorded waiter;
                                                    :                :          * otherwise unset_bits stays 0 and the flag is left untouched.
                               5866                 :                :          */
                               5867         [ #  # ]:              0 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
  803 heikki.linnakangas@i     5868         [ #  # ]:              0 :             buf->wait_backend_pgprocno == MyProcNumber)
  180 andres@anarazel.de       5869                 :UNC           0 :             unset_bits = BM_PIN_COUNT_WAITER;
                               5870                 :                : 
                               5871                 :              0 :         UnlockBufHdrExt(buf, buf_state,
                               5872                 :                :                         0, unset_bits,
                               5873                 :                :                         0);
                               5874                 :                : 
 7732 tgl@sss.pgh.pa.us        5875                 :UBC           0 :         PinCountWaitBuf = NULL;
                               5876                 :                :     }
10003 vadim4o@yahoo.com        5877                 :CBC       63696 : }
                               5878                 :                : 
                               5879                 :                : /*
                               5880                 :                :  * Acquire the buffer content lock in the specified mode
                               5881                 :                :  *
                               5882                 :                :  * If the lock is not available, sleep until it is.
                               5883                 :                :  *
                               5884                 :                :  * Side effect: cancel/die interrupts are held off until lock release.
                               5885                 :                :  *
                               5886                 :                :  * This uses almost the same locking approach as lwlock.c's
                               5887                 :                :  * LWLockAcquire(). See documentation at the top of lwlock.c for a more
                               5888                 :                :  * detailed discussion.
                               5889                 :                :  *
                               5890                 :                :  * The reason that this, and most of the other BufferLock* functions, get both
                               5891                 :                :  * the Buffer and BufferDesc* as parameters, is that looking up one from the
                               5892                 :                :  * other repeatedly shows up noticeably in profiles.
                               5893                 :                :  *
                               5894                 :                :  * Callers should provide a constant for mode, for more efficient code
                               5895                 :                :  * generation.
                                                    :                :  *
                                                    :                :  * mode is expected to be one of the real lock modes; BUFFER_LOCK_UNLOCK is
                                                    :                :  * marked unreachable in the wait path.  The backend must not already hold
                                                    :                :  * a content lock on this buffer (asserted below).  On return, the acquired
                                                    :                :  * mode has been recorded in the backend's private refcount entry for the
                                                    :                :  * buffer.
                               5896                 :                :  */
                               5897                 :                : static inline void
  110 andres@anarazel.de       5898                 :GNC   111095916 : BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
                               5899                 :                : {
                               5900                 :                :     PrivateRefCountEntry *entry;
                               5901                 :      111095916 :     int         extraWaits = 0;
                               5902                 :                : 
                               5903                 :                :     /*
                               5904                 :                :      * Look up the private refcount entry now; it seems better to do this
                               5905                 :                :      * before acquiring the lock than while holding it.
                               5906                 :                :      */
                               5907                 :      111095916 :     entry = GetPrivateRefCountEntry(buffer, true);
                               5908                 :                : 
                               5909                 :                :     /*
                               5910                 :                :      * We better not already hold a lock on the buffer.
                               5911                 :                :      */
                               5912         [ -  + ]:      111095916 :     Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);
                               5913                 :                : 
                               5914                 :                :     /*
                               5915                 :                :      * Lock out cancel/die interrupts until we exit the code section protected
                               5916                 :                :      * by the content lock.  This ensures that interrupts will not interfere
                               5917                 :                :      * with manipulations of data structures in shared memory.
                               5918                 :                :      */
                               5919                 :      111095916 :     HOLD_INTERRUPTS();
                               5920                 :                : 
                               5921                 :                :     for (;;)
                               5922                 :          28803 :     {
  109                          5923                 :      111124719 :         uint32      wait_event = 0; /* initialized to avoid compiler warning */
                               5924                 :                :         bool        mustwait;
                               5925                 :                : 
                               5926                 :                :         /*
                               5927                 :                :          * Try to grab the lock the first time, we're not in the waitqueue
                               5928                 :                :          * yet/anymore.
                               5929                 :                :          */
  110                          5930                 :      111124719 :         mustwait = BufferLockAttempt(buf_hdr, mode);
                               5931                 :                : 
                               5932         [ +  + ]:      111124719 :         if (likely(!mustwait))
                               5933                 :                :         {
                               5934                 :      111095147 :             break;
                               5935                 :                :         }
                               5936                 :                : 
                               5937                 :                :         /*
                               5938                 :                :          * Ok, at this point we couldn't grab the lock on the first try. We
                               5939                 :                :          * cannot simply queue ourselves to the end of the list and wait to be
                               5940                 :                :          * woken up because by now the lock could long have been released.
                               5941                 :                :          * Instead add us to the queue and try to grab the lock again. If we
                               5942                 :                :          * succeed we need to revert the queuing and be happy, otherwise we
                               5943                 :                :          * recheck the lock. If we still couldn't grab it, we know that the
                               5944                 :                :          * other locker will see our queue entries when releasing since they
                               5945                 :                :          * existed before we checked for the lock.
                               5946                 :                :          */
                               5947                 :                : 
                               5948                 :                :         /* add to the queue */
                               5949                 :          29572 :         BufferLockQueueSelf(buf_hdr, mode);
                               5950                 :                : 
                               5951                 :                :         /* we're now guaranteed to be woken up if necessary */
                               5952                 :          29572 :         mustwait = BufferLockAttempt(buf_hdr, mode);
                               5953                 :                : 
                               5954                 :                :         /* ok, grabbed the lock the second time round, need to undo queueing */
                               5955         [ +  + ]:          29572 :         if (!mustwait)
                               5956                 :                :         {
                               5957                 :            769 :             BufferLockDequeueSelf(buf_hdr);
                               5958                 :            769 :             break;
                               5959                 :                :         }
                               5960                 :                : 
                               5961   [ +  +  +  -  :          28803 :         switch (mode)
                                                 - ]
                               5962                 :                :         {
                               5963                 :          16215 :             case BUFFER_LOCK_EXCLUSIVE:
                               5964                 :          16215 :                 wait_event = WAIT_EVENT_BUFFER_EXCLUSIVE;
                               5965                 :          16215 :                 break;
                               5966                 :            104 :             case BUFFER_LOCK_SHARE_EXCLUSIVE:
                               5967                 :            104 :                 wait_event = WAIT_EVENT_BUFFER_SHARE_EXCLUSIVE;
                               5968                 :            104 :                 break;
                               5969                 :          12484 :             case BUFFER_LOCK_SHARE:
                               5970                 :          12484 :                 wait_event = WAIT_EVENT_BUFFER_SHARED;
                               5971                 :          12484 :                 break;
  110 andres@anarazel.de       5972                 :UNC           0 :             case BUFFER_LOCK_UNLOCK:
                               5973                 :              0 :                 pg_unreachable();
                               5974                 :                : 
                               5975                 :                :         }
  110 andres@anarazel.de       5976                 :GNC       28803 :         pgstat_report_wait_start(wait_event);
                               5977                 :                : 
                               5978                 :                :         /*
                               5979                 :                :          * Wait until awakened.
                               5980                 :                :          *
                               5981                 :                :          * It is possible that we get awakened for a reason other than being
                               5982                 :                :          * signaled by BufferLockWakeup().  If so, loop back and wait again.
                               5983                 :                :          * Once we've gotten the lock, re-increment the sema by the number of
                               5984                 :                :          * additional signals received.
                               5985                 :                :          */
                               5986                 :                :         for (;;)
                               5987                 :                :         {
                               5988                 :          28803 :             PGSemaphoreLock(MyProc->sem);
                               5989         [ +  - ]:          28803 :             if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
                               5990                 :          28803 :                 break;
  110 andres@anarazel.de       5991                 :UNC           0 :             extraWaits++;
                               5992                 :                :         }
                               5993                 :                : 
  110 andres@anarazel.de       5994                 :GNC       28803 :         pgstat_report_wait_end();
                               5995                 :                : 
                               5996                 :                :         /* Retrying, allow BufferLockReleaseSub to release waiters again. */
                               5997                 :          28803 :         pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
                               5998                 :                :     }
                               5999                 :                : 
                               6000                 :                :     /* Remember that we now hold this lock */
                               6001                 :      111095916 :     entry->data.lockmode = mode;
                               6002                 :                : 
                               6003                 :                :     /*
                               6004                 :                :      * Fix the process wait semaphore's count for any absorbed wakeups.
                               6005                 :                :      */
                               6006         [ -  + ]:      111095916 :     while (unlikely(extraWaits-- > 0))
  110 andres@anarazel.de       6007                 :UNC           0 :         PGSemaphoreUnlock(MyProc->sem);
  110 andres@anarazel.de       6008                 :GNC   111095916 : }
                               6009                 :                : 
                               6010                 :                : /*
                               6011                 :                :  * Release a previously acquired buffer content lock.
                                                    :                :  *
                                                    :                :  * The mode being released is obtained from BufferLockDisownInternal(); the
                                                    :                :  * matching amount from BufferLockReleaseSub() is then atomically subtracted
                                                    :                :  * from the buffer header state, and the result is handed to
                                                    :                :  * BufferLockProcessRelease().  Finally RESUME_INTERRUPTS() is called,
                                                    :                :  * pairing with the HOLD_INTERRUPTS() issued in BufferLockAcquire().
                                                    :                :  *
                                                    :                :  * NOTE(review): "oldstate" receives the return value of
                                                    :                :  * pg_atomic_sub_fetch_u64(), which going by its name is presumably the
                                                    :                :  * state after the subtraction rather than before it -- confirm, and
                                                    :                :  * consider renaming the variable.
                               6012                 :                :  */
                               6013                 :                : static void
                               6014                 :       63598113 : BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
                               6015                 :                : {
                               6016                 :                :     BufferLockMode mode;
                               6017                 :                :     uint64      oldstate;
                               6018                 :                :     uint64      sub;
                               6019                 :                : 
                               6020                 :       63598113 :     mode = BufferLockDisownInternal(buffer, buf_hdr);
                               6021                 :                : 
                               6022                 :                :     /*
                               6023                 :                :      * Release my hold on the lock; after that it can immediately be acquired
                               6024                 :                :      * by others, even if we still have to wake up other waiters.
                               6025                 :                :      */
                               6026                 :       63598113 :     sub = BufferLockReleaseSub(mode);
                               6027                 :                : 
                               6028                 :       63598113 :     oldstate = pg_atomic_sub_fetch_u64(&buf_hdr->state, sub);
                               6029                 :                : 
                               6030                 :       63598113 :     BufferLockProcessRelease(buf_hdr, mode, oldstate);
                               6031                 :                : 
                               6032                 :                :     /*
                               6033                 :                :      * Now okay to allow cancel/die interrupts.
                               6034                 :                :      */
                               6035         [ -  + ]:       63598113 :     RESUME_INTERRUPTS();
                               6036                 :       63598113 : }
                               6037                 :                : 
                               6038                 :                : 
                               6039                 :                : /*
                               6040                 :                :  * Acquire the content lock for the buffer, but only if we don't have to wait.
                               6041                 :                :  *
                               6042                 :                :  * It is allowed to try to conditionally acquire a lock on a buffer that this
                               6043                 :                :  * backend has already locked, but the lock acquisition will always fail, even
                               6044                 :                :  * if the new lock acquisition does not conflict with an already held lock
                               6045                 :                :  * (e.g. two share locks). This is because we currently do not have space to
                               6046                 :                :  * track multiple lock ownerships of the same buffer within one backend.  That
                               6047                 :                :  * is ok for the current uses of BufferLockConditional().
                               6048                 :                :  */
                               6049                 :                : static bool
                               6050                 :        1752898 : BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
                               6051                 :                : {
                               6052                 :        1752898 :     PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
                               6053                 :                :     bool        mustwait;
                               6054                 :                : 
                               6055                 :                :     /*
                               6056                 :                :      * As described above, if we're trying to lock a buffer this backend
                               6057                 :                :      * already has locked, return false, independent of the existing and
                               6058                 :                :      * desired lock level.
                               6059                 :                :      */
   96                          6060         [ -  + ]:        1752898 :     if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
   96 andres@anarazel.de       6061                 :UNC           0 :         return false;
                               6062                 :                : 
                               6063                 :                :     /*
                               6064                 :                :      * Lock out cancel/die interrupts until we exit the code section protected
                               6065                 :                :      * by the content lock.  This ensures that interrupts will not interfere
                               6066                 :                :      * with manipulations of data structures in shared memory.
                               6067                 :                :      */
  110 andres@anarazel.de       6068                 :GNC     1752898 :     HOLD_INTERRUPTS();
                               6069                 :                : 
                               6070                 :                :     /* Check for the lock */
                               6071                 :        1752898 :     mustwait = BufferLockAttempt(buf_hdr, mode);
                               6072                 :                : 
                               6073         [ +  + ]:        1752898 :     if (mustwait)
                               6074                 :                :     {
                               6075                 :                :         /* Failed to get lock, so release interrupt holdoff */
                               6076         [ -  + ]:            864 :         RESUME_INTERRUPTS();
                               6077                 :                :     }
                               6078                 :                :     else
                               6079                 :                :     {
                               6080                 :        1752034 :         entry->data.lockmode = mode;
                               6081                 :                :     }
                               6082                 :                : 
                               6083                 :        1752898 :     return !mustwait;
                               6084                 :                : }
                               6085                 :                : 
                               6086                 :                : /*
                               6087                 :                :  * Internal function that tries to atomically acquire the content lock in the
                               6088                 :                :  * passed in mode.
                               6089                 :                :  *
                               6090                 :                :  * This function will not block waiting for a lock to become free - that's the
                               6091                 :                :  * caller's job.
                               6092                 :                :  *
                               6093                 :                :  * Similar to LWLockAttemptLock().
                               6094                 :                :  */
                               6095                 :                : static inline bool
                               6096                 :      112907189 : BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
                               6097                 :                : {
                               6098                 :                :     uint64      old_state;
                               6099                 :                : 
                               6100                 :                :     /*
                               6101                 :                :      * Read once outside the loop, later iterations will get the newer value
                               6102                 :                :      * via compare & exchange.
                               6103                 :                :      */
                               6104                 :      112907189 :     old_state = pg_atomic_read_u64(&buf_hdr->state);
                               6105                 :                : 
                               6106                 :                :     /* loop until we've determined whether we could acquire the lock or not */
                               6107                 :                :     while (true)
                               6108                 :           4998 :     {
                               6109                 :                :         uint64      desired_state;
                               6110                 :                :         bool        lock_free;
                               6111                 :                : 
                               6112                 :      112912187 :         desired_state = old_state;
                               6113                 :                : 
                               6114         [ +  + ]:      112912187 :         if (mode == BUFFER_LOCK_EXCLUSIVE)
                               6115                 :                :         {
                               6116                 :       38014773 :             lock_free = (old_state & BM_LOCK_MASK) == 0;
                               6117         [ +  + ]:       38014773 :             if (lock_free)
                               6118                 :       37980912 :                 desired_state += BM_LOCK_VAL_EXCLUSIVE;
                               6119                 :                :         }
                               6120         [ +  + ]:       74897414 :         else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
                               6121                 :                :         {
                               6122                 :         734358 :             lock_free = (old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) == 0;
                               6123         [ +  + ]:         734358 :             if (lock_free)
                               6124                 :         734148 :                 desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;
                               6125                 :                :         }
                               6126                 :                :         else
                               6127                 :                :         {
                               6128                 :       74163056 :             lock_free = (old_state & BM_LOCK_VAL_EXCLUSIVE) == 0;
                               6129         [ +  + ]:       74163056 :             if (lock_free)
                               6130                 :       74137693 :                 desired_state += BM_LOCK_VAL_SHARED;
                               6131                 :                :         }
                               6132                 :                : 
                               6133                 :                :         /*
                               6134                 :                :          * Attempt to swap in the state we are expecting. If we didn't see
                               6135                 :                :          * lock to be free, that's just the old value. If we saw it as free,
                               6136                 :                :          * we'll attempt to mark it acquired. The reason that we always swap
                               6137                 :                :          * in the value is that this doubles as a memory barrier. We could try
                               6138                 :                :          * to be smarter and only swap in values if we saw the lock as free,
                               6139                 :                :          * but benchmarks haven't shown it as beneficial so far.
                               6140                 :                :          *
                               6141                 :                :          * Retry if the value changed since we last looked at it.
                               6142                 :                :          */
                               6143         [ +  + ]:      112912187 :         if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
                               6144                 :                :                                                   &old_state, desired_state)))
                               6145                 :                :         {
                               6146         [ +  + ]:      112907189 :             if (lock_free)
                               6147                 :                :             {
                               6148                 :                :                 /* Great! Got the lock. */
                               6149                 :      112847950 :                 return false;
                               6150                 :                :             }
                               6151                 :                :             else
                               6152                 :          59239 :                 return true;    /* somebody else has the lock */
                               6153                 :                :         }
                               6154                 :                :     }
                               6155                 :                : 
                               6156                 :                :     pg_unreachable();
                               6157                 :                : }
                               6158                 :                : 
                               6159                 :                : /*
                               6160                 :                :  * Add ourselves to the end of the content lock's wait queue.
                               6161                 :                :  */
                               6162                 :                : static void
                               6163                 :          29572 : BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
                               6164                 :                : {
                               6165                 :                :     /*
                               6166                 :                :      * If we don't have a PGPROC structure, there's no way to wait. This
                               6167                 :                :      * should never occur, since MyProc should only be null during shared
                               6168                 :                :      * memory initialization.
                               6169                 :                :      */
                               6170         [ -  + ]:          29572 :     if (MyProc == NULL)
  110 andres@anarazel.de       6171         [ #  # ]:UNC           0 :         elog(PANIC, "cannot wait without a PGPROC structure");
                               6172                 :                : 
  110 andres@anarazel.de       6173         [ -  + ]:GNC       29572 :     if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
  110 andres@anarazel.de       6174         [ #  # ]:UNC           0 :         elog(PANIC, "queueing for lock while waiting on another one");
                               6175                 :                : 
  110 andres@anarazel.de       6176                 :GNC       29572 :     LockBufHdr(buf_hdr);
                               6177                 :                : 
                               6178                 :                :     /* setting the flag is protected by the spinlock */
                               6179                 :          29572 :     pg_atomic_fetch_or_u64(&buf_hdr->state, BM_LOCK_HAS_WAITERS);
                               6180                 :                : 
                               6181                 :                :     /*
                               6182                 :                :      * These are currently used both for lwlocks and buffer content locks,
                               6183                 :                :      * which is acceptable, although not pretty, because a backend can't wait
                               6184                 :                :      * for both types of locks at the same time.
                               6185                 :                :      */
                               6186                 :          29572 :     MyProc->lwWaiting = LW_WS_WAITING;
                               6187                 :          29572 :     MyProc->lwWaitMode = mode;
                               6188                 :                : 
                               6189                 :          29572 :     proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
                               6190                 :                : 
                               6191                 :                :     /* Can release the mutex now */
                               6192                 :          29572 :     UnlockBufHdr(buf_hdr);
                               6193                 :          29572 : }
                               6194                 :                : 
                               6195                 :                : /*
                               6196                 :                :  * Remove ourselves from the waitlist.
                               6197                 :                :  *
                               6198                 :                :  * This is used if we queued ourselves because we thought we needed to sleep
                               6199                 :                :  * but, after further checking, we discovered that we don't actually need to
                               6200                 :                :  * do so.
                               6201                 :                :  */
                               6202                 :                : static void
                               6203                 :            769 : BufferLockDequeueSelf(BufferDesc *buf_hdr)
                               6204                 :                : {
                               6205                 :                :     bool        on_waitlist;
                               6206                 :                : 
                               6207                 :            769 :     LockBufHdr(buf_hdr);
                               6208                 :                : 
                               6209                 :            769 :     on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
                               6210         [ +  + ]:            769 :     if (on_waitlist)
                               6211                 :            551 :         proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
                               6212                 :                : 
                               6213         [ +  + ]:            769 :     if (proclist_is_empty(&buf_hdr->lock_waiters) &&
                               6214         [ +  + ]:            746 :         (pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS) != 0)
                               6215                 :                :     {
                               6216                 :            528 :         pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_HAS_WAITERS);
                               6217                 :                :     }
                               6218                 :                : 
                               6219                 :                :     /* XXX: combine with fetch_and above? */
                               6220                 :            769 :     UnlockBufHdr(buf_hdr);
                               6221                 :                : 
                               6222                 :                :     /* clear waiting state again, nice for debugging */
                               6223         [ +  + ]:            769 :     if (on_waitlist)
                               6224                 :            551 :         MyProc->lwWaiting = LW_WS_NOT_WAITING;
                               6225                 :                :     else
                               6226                 :                :     {
                               6227                 :            218 :         int         extraWaits = 0;
                               6228                 :                : 
                               6229                 :                : 
                               6230                 :                :         /*
                               6231                 :                :          * Somebody else dequeued us and has or will wake us up. Deal with the
                               6232                 :                :          * superfluous absorption of a wakeup.
                               6233                 :                :          */
                               6234                 :                : 
                               6235                 :                :         /*
                               6236                 :                :          * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
                               6237                 :                :          * removed ourselves - they'll have set it.
                               6238                 :                :          */
                               6239                 :            218 :         pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
                               6240                 :                : 
                               6241                 :                :         /*
                               6242                 :                :          * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
                               6243                 :                :          * get reset at some inconvenient point later. Most of the time this
                               6244                 :                :          * will immediately return.
                               6245                 :                :          */
                               6246                 :                :         for (;;)
                               6247                 :                :         {
                               6248                 :            218 :             PGSemaphoreLock(MyProc->sem);
                               6249         [ +  - ]:            218 :             if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
                               6250                 :            218 :                 break;
  110 andres@anarazel.de       6251                 :UNC           0 :             extraWaits++;
                               6252                 :                :         }
                               6253                 :                : 
                               6254                 :                :         /*
                               6255                 :                :          * Fix the process wait semaphore's count for any absorbed wakeups.
                               6256                 :                :          */
  110 andres@anarazel.de       6257         [ -  + ]:GNC         218 :         while (extraWaits-- > 0)
  110 andres@anarazel.de       6258                 :UNC           0 :             PGSemaphoreUnlock(MyProc->sem);
                               6259                 :                :     }
  110 andres@anarazel.de       6260                 :GNC         769 : }
                               6261                 :                : 
                               6262                 :                : /*
                               6263                 :                :  * Stop treating lock as held by current backend.
                               6264                 :                :  *
                               6265                 :                :  * After calling this function it's the caller's responsibility to ensure that
                               6266                 :                :  * the lock gets released, even in case of an error. This is only desirable if
                               6267                 :                :  * the lock is going to be released in a different process than the process
                               6268                 :                :  * that acquired it.
                               6269                 :                :  */
                               6270                 :                : static inline void
  110 andres@anarazel.de       6271                 :UNC           0 : BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
                               6272                 :                : {
                               6273                 :              0 :     BufferLockDisownInternal(buffer, buf_hdr);
                               6274         [ #  # ]:              0 :     RESUME_INTERRUPTS();
                               6275                 :              0 : }
                               6276                 :                : 
                               6277                 :                : /*
                               6278                 :                :  * Stop treating lock as held by current backend.
                               6279                 :                :  *
                               6280                 :                :  * This is the code that can be shared between actually releasing a lock
                               6281                 :                :  * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
                               6282                 :                :  * without releasing the lock (BufferLockDisown()).
                               6283                 :                :  */
                               6284                 :                : static inline int
  110 andres@anarazel.de       6285                 :GNC   112847950 : BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
                               6286                 :                : {
                               6287                 :                :     BufferLockMode mode;
                               6288                 :                :     PrivateRefCountEntry *ref;
                               6289                 :                : 
                               6290                 :      112847950 :     ref = GetPrivateRefCountEntry(buffer, false);
                               6291         [ -  + ]:      112847950 :     if (ref == NULL)
  110 andres@anarazel.de       6292         [ #  # ]:UNC           0 :         elog(ERROR, "lock %d is not held", buffer);
  110 andres@anarazel.de       6293                 :GNC   112847950 :     mode = ref->data.lockmode;
                               6294                 :      112847950 :     ref->data.lockmode = BUFFER_LOCK_UNLOCK;
                               6295                 :                : 
                               6296                 :      112847950 :     return mode;
                               6297                 :                : }
                               6298                 :                : 
                               6299                 :                : /*
                               6300                 :                :  * Wake up all the lockers that currently have a chance to acquire the lock.
                               6301                 :                :  *
                               6302                 :                :  * wake_exclusive indicates whether exclusive lock waiters should be woken up.
                               6303                 :                :  */
                               6304                 :                : static void
                               6305                 :          27093 : BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive)
                               6306                 :                : {
                               6307                 :          27093 :     bool        new_wake_in_progress = false;
                               6308                 :          27093 :     bool        wake_share_exclusive = true;
                               6309                 :                :     proclist_head wakeup;
                               6310                 :                :     proclist_mutable_iter iter;
                               6311                 :                : 
                               6312                 :          27093 :     proclist_init(&wakeup);
                               6313                 :                : 
                               6314                 :                :     /* lock wait list while collecting backends to wake up */
                               6315                 :          27093 :     LockBufHdr(buf_hdr);
                               6316                 :                : 
                               6317   [ +  +  +  +  :          41269 :     proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
                                              +  + ]
                               6318                 :                :     {
                               6319                 :          30506 :         PGPROC     *waiter = GetPGProcByNumber(iter.cur);
                               6320                 :                : 
                               6321                 :                :         /*
                               6322                 :                :          * Already woke up a conflicting lock, so skip over this wait list
                               6323                 :                :          * entry.
                               6324                 :                :          */
                               6325   [ +  +  +  + ]:          30506 :         if (!wake_exclusive && waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
                               6326                 :           1476 :             continue;
                               6327   [ +  +  -  + ]:          29030 :         if (!wake_share_exclusive && waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
  110 andres@anarazel.de       6328                 :UNC           0 :             continue;
                               6329                 :                : 
  110 andres@anarazel.de       6330                 :GNC       29030 :         proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
                               6331                 :          29030 :         proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
                               6332                 :                : 
                               6333                 :                :         /*
                               6334                 :                :          * Prevent additional wakeups until retryer gets to run. Backends that
                               6335                 :                :          * are just waiting for the lock to become free don't retry
                               6336                 :                :          * automatically.
                               6337                 :                :          */
                               6338                 :          29030 :         new_wake_in_progress = true;
                               6339                 :                : 
                               6340                 :                :         /*
                               6341                 :                :          * Signal that the process isn't on the wait list anymore. This allows
                               6342                 :                :          * BufferLockDequeueSelf() to remove itself from the waitlist with a
                               6343                 :                :          * proclist_delete(), rather than having to check if it has been
                               6344                 :                :          * removed from the list.
                               6345                 :                :          */
                               6346         [ -  + ]:          29030 :         Assert(waiter->lwWaiting == LW_WS_WAITING);
                               6347                 :          29030 :         waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
                               6348                 :                : 
                               6349                 :                :         /*
                               6350                 :                :          * Don't wakeup further waiters after waking a conflicting waiter.
                               6351                 :                :          */
                               6352         [ +  + ]:          29030 :         if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
                               6353                 :                :         {
                               6354                 :                :             /*
                               6355                 :                :              * Share locks conflict with exclusive locks.
                               6356                 :                :              */
                               6357                 :          12588 :             wake_exclusive = false;
                               6358                 :                :         }
                               6359         [ +  + ]:          16442 :         else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
                               6360                 :                :         {
                               6361                 :                :             /*
                               6362                 :                :              * Share-exclusive locks conflict with share-exclusive and
                               6363                 :                :              * exclusive locks.
                               6364                 :                :              */
                               6365                 :            112 :             wake_exclusive = false;
                               6366                 :            112 :             wake_share_exclusive = false;
                               6367                 :                :         }
                               6368         [ +  - ]:          16330 :         else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
                               6369                 :                :         {
                               6370                 :                :             /*
                               6371                 :                :              * Exclusive locks conflict with all other locks, there's no point
                               6372                 :                :              * in waking up anybody else.
                               6373                 :                :              */
                               6374                 :          16330 :             break;
                               6375                 :                :         }
                               6376                 :                :     }
                               6377                 :                : 
                               6378   [ +  +  -  + ]:          27093 :     Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS);
                               6379                 :                : 
                               6380                 :                :     /* unset required flags, and release lock, in one fell swoop */
                               6381                 :                :     {
                               6382                 :                :         uint64      old_state;
                               6383                 :                :         uint64      desired_state;
                               6384                 :                : 
                               6385                 :          27093 :         old_state = pg_atomic_read_u64(&buf_hdr->state);
                               6386                 :                :         while (true)
                               6387                 :                :         {
                               6388                 :          27098 :             desired_state = old_state;
                               6389                 :                : 
                               6390                 :                :             /* compute desired flags */
                               6391                 :                : 
                               6392         [ +  + ]:          27098 :             if (new_wake_in_progress)
                               6393                 :          26972 :                 desired_state |= BM_LOCK_WAKE_IN_PROGRESS;
                               6394                 :                :             else
                               6395                 :            126 :                 desired_state &= ~BM_LOCK_WAKE_IN_PROGRESS;
                               6396                 :                : 
                               6397         [ +  + ]:          27098 :             if (proclist_is_empty(&buf_hdr->lock_waiters))
                               6398                 :          22074 :                 desired_state &= ~BM_LOCK_HAS_WAITERS;
                               6399                 :                : 
                               6400                 :          27098 :             desired_state &= ~BM_LOCKED;    /* release lock */
                               6401                 :                : 
                               6402         [ +  + ]:          27098 :             if (pg_atomic_compare_exchange_u64(&buf_hdr->state, &old_state,
                               6403                 :                :                                                desired_state))
                               6404                 :          27093 :                 break;
                               6405                 :                :         }
                               6406                 :                :     }
                               6407                 :                : 
                               6408                 :                :     /* Awaken any waiters I removed from the queue. */
                               6409   [ +  +  +  +  :          56123 :     proclist_foreach_modify(iter, &wakeup, lwWaitLink)
                                              +  + ]
                               6410                 :                :     {
                               6411                 :          29030 :         PGPROC     *waiter = GetPGProcByNumber(iter.cur);
                               6412                 :                : 
                               6413                 :          29030 :         proclist_delete(&wakeup, iter.cur, lwWaitLink);
                               6414                 :                : 
                               6415                 :                :         /*
                               6416                 :                :          * Guarantee that lwWaiting being unset only becomes visible once the
                               6417                 :                :          * unlink from the list has completed. Otherwise the target backend
                               6418                 :                :          * could be woken up for another reason and enqueue for a new lock - if
                               6419                 :                :          * that happens before the list unlink happens, the list would end up
                               6420                 :                :          * being corrupted.
                               6421                 :                :          *
                               6422                 :                :          * The barrier pairs with the LockBufHdr() when enqueuing for another
                               6423                 :                :          * lock.
                               6424                 :                :          */
                               6425                 :          29030 :         pg_write_barrier();
                               6426                 :          29030 :         waiter->lwWaiting = LW_WS_NOT_WAITING;
                               6427                 :          29030 :         PGSemaphoreUnlock(waiter->sem);
                               6428                 :                :     }
                               6429                 :          27093 : }
                               6430                 :                : 
                               6431                 :                : /*
                               6432                 :                :  * Compute subtraction from buffer state for a release of a held lock in
                               6433                 :                :  * `mode`.
                               6434                 :                :  *
                               6435                 :                :  * This is separated from BufferLockUnlock() as we want to combine the lock
                               6436                 :                :  * release with other atomic operations when possible, leading to the lock
                               6437                 :                :  * release being done in multiple places, each needing to compute what to
                               6438                 :                :  * subtract from the lock state.
                               6439                 :                :  */
                               6440                 :                : static inline uint64
                               6441                 :      112847950 : BufferLockReleaseSub(BufferLockMode mode)
                               6442                 :                : {
                               6443                 :                :     /*
                               6444                 :                :      * Turns out that a switch() leads gcc to generate sufficiently worse code
                               6445                 :                :      * for this to show up in profiles...
                               6446                 :                :      */
                               6447         [ +  + ]:      112847950 :     if (mode == BUFFER_LOCK_EXCLUSIVE)
                               6448                 :       37980747 :         return BM_LOCK_VAL_EXCLUSIVE;
                               6449         [ +  + ]:       74867203 :     else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
                               6450                 :        4167237 :         return BM_LOCK_VAL_SHARE_EXCLUSIVE;
                               6451                 :                :     else
                               6452                 :                :     {
                               6453         [ -  + ]:       70699966 :         Assert(mode == BUFFER_LOCK_SHARE);
                               6454                 :       70699966 :         return BM_LOCK_VAL_SHARED;
                               6455                 :                :     }
                               6456                 :                : 
                               6457                 :                :     return 0;                   /* keep compiler quiet */
                               6458                 :                : }
                               6459                 :                : 
                               6460                 :                : /*
                               6461                 :                :  * Handle work that needs to be done after releasing a lock that was held in
                               6462                 :                :  * `mode`, where `lockstate` is the result of the atomic operation modifying
                               6463                 :                :  * the state variable.
                               6464                 :                :  *
                               6465                 :                :  * This is separated from BufferLockUnlock() as we want to combine the lock
                               6466                 :                :  * release with other atomic operations when possible, leading to the lock
                               6467                 :                :  * release being done in multiple places.
                               6468                 :                :  */
                               6469                 :                : static void
                               6470                 :      112847950 : BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
                               6471                 :                : {
                               6472                 :      112847950 :     bool        check_waiters = false;
                               6473                 :      112847950 :     bool        wake_exclusive = false;
                               6474                 :                : 
                               6475                 :                :     /* nobody else can have that kind of lock */
                               6476         [ -  + ]:      112847950 :     Assert(!(lockstate & BM_LOCK_VAL_EXCLUSIVE));
                               6477                 :                : 
                               6478                 :                :     /*
                               6479                 :                :      * If we're still waiting for backends to get scheduled, don't wake them
                               6480                 :                :      * up again. Otherwise check if we need to look through the waitqueue to
                               6481                 :                :      * wake other backends.
                               6482                 :                :      */
                               6483         [ +  + ]:      112847950 :     if ((lockstate & BM_LOCK_HAS_WAITERS) &&
                               6484         [ +  + ]:          96856 :         !(lockstate & BM_LOCK_WAKE_IN_PROGRESS))
                               6485                 :                :     {
                               6486         [ +  + ]:          51454 :         if ((lockstate & BM_LOCK_MASK) == 0)
                               6487                 :                :         {
                               6488                 :                :             /*
                               6489                 :                :              * We released a lock and the lock was, in that moment, free. We
                               6490                 :                :              * therefore can wake waiters for any kind of lock.
                               6491                 :                :              */
                               6492                 :          27088 :             check_waiters = true;
                               6493                 :          27088 :             wake_exclusive = true;
                               6494                 :                :         }
                               6495         [ +  + ]:          24366 :         else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
                               6496                 :                :         {
                               6497                 :                :             /*
                               6498                 :                :              * We released the lock, but another backend still holds a lock.
                               6499                 :                :              * We can't have released an exclusive lock, as there couldn't
                               6500                 :                :              * have been other lock holders. If we released a share lock, no
                               6501                 :                :              * waiters need to be woken up, as there must be other share
                               6502                 :                :              * lockers. However, if we held a share-exclusive lock, another
                               6503                 :                :              * backend now could acquire a share-exclusive lock.
                               6504                 :                :              */
                               6505                 :              5 :             check_waiters = true;
                               6506                 :              5 :             wake_exclusive = false;
                               6507                 :                :         }
                               6508                 :                :     }
                               6509                 :                : 
                               6510                 :                :     /*
                               6511                 :                :      * As waking up waiters requires the spinlock to be acquired, only do so
                               6512                 :                :      * if necessary.
                               6513                 :                :      */
                               6514         [ +  + ]:      112847950 :     if (check_waiters)
                               6515                 :          27093 :         BufferLockWakeup(buf_hdr, wake_exclusive);
                               6516                 :      112847950 : }
                               6517                 :                : 
                               6518                 :                : /*
                               6519                 :                :  * BufferLockHeldByMeInMode - test whether my process holds the content lock
                               6520                 :                :  * in the specified mode
                               6521                 :                :  *
                               6522                 :                :  * This is meant as debug support only.
                               6523                 :                :  */
                               6524                 :                : static bool
                               6525                 :      139226354 : BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
                               6526                 :                : {
                               6527                 :                :     PrivateRefCountEntry *entry =
                               6528                 :      139226354 :         GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
                               6529                 :                : 
                               6530         [ -  + ]:      139226354 :     if (!entry)
  110 andres@anarazel.de       6531                 :UNC           0 :         return false;
                               6532                 :                :     else
  110 andres@anarazel.de       6533                 :GNC   139226354 :         return entry->data.lockmode == mode;
                               6534                 :                : }
                               6535                 :                : 
                               6536                 :                : /*
                               6537                 :                :  * BufferLockHeldByMe - test whether my process holds the content lock in any
                               6538                 :                :  * mode
                               6539                 :                :  *
                               6540                 :                :  * This is meant as debug support only.
                               6541                 :                :  */
                               6542                 :                : static bool
                               6543                 :       29258501 : BufferLockHeldByMe(BufferDesc *buf_hdr)
                               6544                 :                : {
                               6545                 :                :     PrivateRefCountEntry *entry =
                               6546                 :       29258501 :         GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
                               6547                 :                : 
                               6548         [ -  + ]:       29258501 :     if (!entry)
  110 andres@anarazel.de       6549                 :UNC           0 :         return false;
                               6550                 :                :     else
  110 andres@anarazel.de       6551                 :GNC    29258501 :         return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
                               6552                 :                : }
                               6553                 :                : 
                               6554                 :                : /*
                               6555                 :                :  * Release the content lock for the buffer.
                               6556                 :                :  */
                               6557                 :                : void
                               6558                 :       68436947 : UnlockBuffer(Buffer buffer)
                               6559                 :                : {
                               6560                 :                :     BufferDesc *buf_hdr;
                               6561                 :                : 
 2115 pg@bowt.ie               6562   [ -  +  +  +  :CBC    68436947 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
10003 vadim4o@yahoo.com        6563         [ +  + ]:       68436947 :     if (BufferIsLocal(buffer))
 7340 tgl@sss.pgh.pa.us        6564                 :        5201989 :         return;                 /* local buffers need no lock */
                               6565                 :                : 
  110 andres@anarazel.de       6566                 :GNC    63234958 :     buf_hdr = GetBufferDescriptor(buffer - 1);
                               6567                 :       63234958 :     BufferLockUnlock(buffer, buf_hdr);
                               6568                 :                : }
                               6569                 :                : 
                               6570                 :                : /*
                               6571                 :                :  * Acquire the content lock for the buffer.
                               6572                 :                :  */
                               6573                 :                : void
                               6574                 :      117467835 : LockBufferInternal(Buffer buffer, BufferLockMode mode)
                               6575                 :                : {
                               6576                 :                :     BufferDesc *buf_hdr;
                               6577                 :                : 
                               6578                 :                :     /*
                               6579                 :                :      * We can't wait if we haven't got a PGPROC.  This should only occur
                               6580                 :                :      * during bootstrap or shared memory initialization.  Put an Assert here
                               6581                 :                :      * to catch unsafe coding practices.
                               6582                 :                :      */
                               6583   [ -  +  -  - ]:      117467835 :     Assert(!(MyProc == NULL && IsUnderPostmaster));
                               6584                 :                : 
                               6585                 :                :     /* handled in LockBuffer() wrapper */
                               6586         [ -  + ]:      117467835 :     Assert(mode != BUFFER_LOCK_UNLOCK);
                               6587                 :                : 
                               6588   [ -  +  +  +  :      117467835 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               6589         [ +  + ]:      117467835 :     if (BufferIsLocal(buffer))
                               6590                 :        6734958 :         return;                 /* local buffers need no lock */
                               6591                 :                : 
                               6592                 :      110732877 :     buf_hdr = GetBufferDescriptor(buffer - 1);
                               6593                 :                : 
                               6594                 :                :     /*
                               6595                 :                :      * Test the most frequent lock modes first. While a switch (mode) would be
                               6596                 :                :      * nice, at least gcc generates considerably worse code for it.
                               6597                 :                :      *
                               6598                 :                :      * Call BufferLockAcquire() with a constant argument for mode, to generate
                               6599                 :                :      * more efficient code for the different lock modes.
                               6600                 :                :      */
                               6601         [ +  + ]:      110732877 :     if (mode == BUFFER_LOCK_SHARE)
                               6602                 :       74133055 :         BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE);
10003 vadim4o@yahoo.com        6603         [ +  - ]:CBC    36599822 :     else if (mode == BUFFER_LOCK_EXCLUSIVE)
  110 andres@anarazel.de       6604                 :GNC    36599822 :         BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_EXCLUSIVE);
  110 andres@anarazel.de       6605         [ #  # ]:UNC           0 :     else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
                               6606                 :              0 :         BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE);
                               6607                 :                :     else
 8321 tgl@sss.pgh.pa.us        6608         [ #  # ]:UBC           0 :         elog(ERROR, "unrecognized buffer lock mode: %d", mode);
                               6609                 :                : }
                               6610                 :                : 
                               6611                 :                : /*
                               6612                 :                :  * Acquire the content_lock for the buffer, but only if we don't have to wait.
                               6613                 :                :  *
                               6614                 :                :  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
                               6615                 :                :  */
                               6616                 :                : bool
 8304 tgl@sss.pgh.pa.us        6617                 :CBC     1467970 : ConditionalLockBuffer(Buffer buffer)
                               6618                 :                : {
                               6619                 :                :     BufferDesc *buf;
                               6620                 :                : 
 2115 pg@bowt.ie               6621   [ -  +  +  +  :        1467970 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
 8304 tgl@sss.pgh.pa.us        6622         [ +  + ]:        1467970 :     if (BufferIsLocal(buffer))
                               6623                 :          86197 :         return true;            /* act as though we got it */
                               6624                 :                : 
 4114 andres@anarazel.de       6625                 :        1381773 :     buf = GetBufferDescriptor(buffer - 1);
                               6626                 :                : 
  110 andres@anarazel.de       6627                 :GNC     1381773 :     return BufferLockConditional(buffer, buf, BUFFER_LOCK_EXCLUSIVE);
                               6628                 :                : }
                               6629                 :                : 
                               6630                 :                : /*
                               6631                 :                :  * Verify that this backend is pinning the buffer exactly once.
                               6632                 :                :  *
                               6633                 :                :  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
                               6634                 :                :  * holds a pin on the buffer.  We do not care whether some other backend does.
                               6635                 :                :  */
                               6636                 :                : void
 1126 andres@anarazel.de       6637                 :CBC     4632669 : CheckBufferIsPinnedOnce(Buffer buffer)
                               6638                 :                : {
                               6639         [ +  + ]:        4632669 :     if (BufferIsLocal(buffer))
                               6640                 :                :     {
                               6641         [ -  + ]:           1049 :         if (LocalRefCount[-buffer - 1] != 1)
 1126 andres@anarazel.de       6642         [ #  # ]:UBC           0 :             elog(ERROR, "incorrect local pin count: %d",
                               6643                 :                :                  LocalRefCount[-buffer - 1]);
                               6644                 :                :     }
                               6645                 :                :     else
                               6646                 :                :     {
 1126 andres@anarazel.de       6647         [ -  + ]:CBC     4631620 :         if (GetPrivateRefCount(buffer) != 1)
 1126 andres@anarazel.de       6648         [ #  # ]:UBC           0 :             elog(ERROR, "incorrect local pin count: %d",
                               6649                 :                :                  GetPrivateRefCount(buffer));
                               6650                 :                :     }
 1126 andres@anarazel.de       6651                 :CBC     4632669 : }
                               6652                 :                : 
                               6653                 :                : /*
                               6654                 :                :  * LockBufferForCleanup - lock a buffer in preparation for deleting items
                               6655                 :                :  *
                               6656                 :                :  * Items may be deleted from a disk page only when the caller (a) holds an
                               6657                 :                :  * exclusive lock on the buffer and (b) has observed that no other backend
                               6658                 :                :  * holds a pin on the buffer.  If there is a pin, then the other backend
                               6659                 :                :  * might have a pointer into the buffer (for example, a heapscan reference
                               6660                 :                :  * to an item --- see README for more details).  It's OK if a pin is added
                               6661                 :                :  * after the cleanup starts, however; the newly-arrived backend will be
                               6662                 :                :  * unable to look at the page until we release the exclusive lock.
                               6663                 :                :  *
                               6664                 :                :  * To implement this protocol, a would-be deleter must pin the buffer (exactly
                               6665                 :                :  * once) and then call LockBufferForCleanup().  LockBufferForCleanup() is
                               6666                 :                :  * similar to LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops
                               6667                 :                :  * until it has successfully observed pin count = 1.
                               6668                 :                :  */
                               6669                 :                : void
 9069 tgl@sss.pgh.pa.us        6670                 :          33597 : LockBufferForCleanup(Buffer buffer)
                               6671                 :                : {
                               6672                 :                :     BufferDesc *bufHdr;
 1943 fujii@postgresql.org     6673                 :          33597 :     TimestampTz waitStart = 0;  /* 0 until the wait start time is recorded */
 1170 drowley@postgresql.o     6674                 :          33597 :     bool        waiting = false;    /* "waiting" ps display suffix added? */
 1943 fujii@postgresql.org     6675                 :          33597 :     bool        logged_recovery_conflict = false;   /* long-wait message emitted? */
                               6676                 :                : 
 2115 pg@bowt.ie               6677   [ -  +  +  +  :          33597 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
 7871 tgl@sss.pgh.pa.us        6678         [ -  + ]:          33597 :     Assert(PinCountWaitBuf == NULL);
                               6679                 :                : 
 1126 andres@anarazel.de       6680                 :          33597 :     CheckBufferIsPinnedOnce(buffer);
                               6681                 :                : 
                               6682                 :                :     /*
                               6683                 :                :      * We do not yet need to be worried about in-progress AIOs holding a pin,
                               6684                 :                :      * as we, so far, only support doing reads via AIO and this function can
                               6685                 :                :      * only be called once the buffer is valid (i.e. no read can be in
                               6686                 :                :      * flight).
                               6687                 :                :      */
                               6688                 :                : 
                               6689                 :                :     /* Local buffer: nobody else to wait for, so return without locking */
 9069 tgl@sss.pgh.pa.us        6690         [ +  + ]:          33597 :     if (BufferIsLocal(buffer))
                               6691                 :             18 :         return;
                               6692                 :                : 
 4114 andres@anarazel.de       6693                 :          33579 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               6694                 :                : 
                               6695                 :                :     for (;;)
 9069 tgl@sss.pgh.pa.us        6696                 :             68 :     {
                               6697                 :                :         uint64      buf_state;
  110 andres@anarazel.de       6698                 :GNC       33647 :         uint64      unset_bits = 0;
                               6699                 :                : 
                               6700                 :                :         /* Acquire exclusive lock, then check pin count under header lock */
 9069 tgl@sss.pgh.pa.us        6701                 :CBC       33647 :         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 3677 andres@anarazel.de       6702                 :          33647 :         buf_state = LockBufHdr(bufHdr);
                               6703                 :                : 
                               6704         [ -  + ]:          33647 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               6705         [ +  + ]:          33647 :         if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
                               6706                 :                :         {
                               6707                 :                :             /* Successfully acquired exclusive lock with pincount 1 */
  180 andres@anarazel.de       6708                 :GNC       33579 :             UnlockBufHdr(bufHdr);
                               6709                 :                : 
                               6710                 :                :             /*
                               6711                 :                :              * Emit the log message if recovery conflict on buffer pin was
                               6712                 :                :              * resolved but the startup process waited longer than
                               6713                 :                :              * deadlock_timeout for it.
                               6714                 :                :              */
 1938 fujii@postgresql.org     6715         [ +  + ]:CBC       33579 :             if (logged_recovery_conflict)
   84 heikki.linnakangas@i     6716                 :GNC           2 :                 LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
                               6717                 :                :                                     waitStart, GetCurrentTimestamp(),
                               6718                 :                :                                     NULL, false);
                               6719                 :                : 
 1170 drowley@postgresql.o     6720         [ +  + ]:CBC       33579 :             if (waiting)
                               6721                 :                :             {
                               6722                 :                :                 /* reset ps display to remove the suffix if we added one */
                               6723                 :              2 :                 set_ps_display_remove_suffix();
                               6724                 :              2 :                 waiting = false;
                               6725                 :                :             }
 9069 tgl@sss.pgh.pa.us        6726                 :          33579 :             return;
                               6727                 :                :         }
                               6728                 :                :         /* Failed, so mark myself as waiting for pincount 1 */
 3677 andres@anarazel.de       6729         [ -  + ]:             68 :         if (buf_state & BM_PIN_COUNT_WAITER)
                               6730                 :                :         {
  180 andres@anarazel.de       6731                 :UNC           0 :             UnlockBufHdr(bufHdr);
 9069 tgl@sss.pgh.pa.us        6732                 :UBC           0 :             LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 8321                          6733         [ #  # ]:              0 :             elog(ERROR, "multiple backends attempting to wait for pincount 1");
                               6734                 :                :         }
  803 heikki.linnakangas@i     6735                 :CBC          68 :         bufHdr->wait_backend_pgprocno = MyProcNumber;
 7871 tgl@sss.pgh.pa.us        6736                 :             68 :         PinCountWaitBuf = bufHdr;
  180 andres@anarazel.de       6737                 :GNC          68 :         UnlockBufHdrExt(bufHdr, buf_state,
                               6738                 :                :                         BM_PIN_COUNT_WAITER, 0,
                               6739                 :                :                         0);
 9069 tgl@sss.pgh.pa.us        6740                 :CBC          68 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                               6741                 :                : 
                               6742                 :                :         /* Wait to be signaled by UnpinBuffer() */
 5946 simon@2ndQuadrant.co     6743         [ +  + ]:             68 :         if (InHotStandby)
                               6744                 :                :         {
 1170 drowley@postgresql.o     6745         [ +  + ]:             11 :             if (!waiting)
                               6746                 :                :             {
                               6747                 :                :                 /* adjust the process title to indicate that it's waiting */
                               6748                 :              2 :                 set_ps_display_suffix("waiting");
                               6749                 :              2 :                 waiting = true;
                               6750                 :                :             }
                               6751                 :                : 
                               6752                 :                :             /*
                               6753                 :                :              * Emit the log message if the startup process is waiting longer
                               6754                 :                :              * than deadlock_timeout for recovery conflict on buffer pin.
                               6755                 :                :              *
                               6756                 :                :              * Skip this the first time through, because the startup process
                               6757                 :                :              * has not started waiting yet at that point; that is why the wait
                               6758                 :                :              * start timestamp is only set after this check.
                               6759                 :                :              */
 1943 fujii@postgresql.org     6760   [ +  +  +  + ]:             11 :             if (waitStart != 0 && !logged_recovery_conflict)
                               6761                 :                :             {
                               6762                 :              4 :                 TimestampTz now = GetCurrentTimestamp();
                               6763                 :                : 
                               6764         [ +  + ]:              4 :                 if (TimestampDifferenceExceeds(waitStart, now,
                               6765                 :                :                                                DeadlockTimeout))
                               6766                 :                :                 {
   84 heikki.linnakangas@i     6767                 :GNC           2 :                     LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
                               6768                 :                :                                         waitStart, now, NULL, true);
 1943 fujii@postgresql.org     6769                 :CBC           2 :                     logged_recovery_conflict = true;
                               6770                 :                :                 }
                               6771                 :                :             }
                               6772                 :                : 
                               6773                 :                :             /*
                               6774                 :                :              * Set the wait start timestamp if logging is enabled and first
                               6775                 :                :              * time through.
                               6776                 :                :              */
                               6777   [ +  -  +  + ]:             11 :             if (log_recovery_conflict_waits && waitStart == 0)
                               6778                 :              2 :                 waitStart = GetCurrentTimestamp();
                               6779                 :                : 
                               6780                 :                :             /* Publish the bufid that the Startup process waits on */
 5946 simon@2ndQuadrant.co     6781                 :             11 :             SetStartupBufferPinWaitBufId(buffer - 1);
                               6782                 :                :             /* Set alarm and then wait to be signaled by UnpinBuffer() */
                               6783                 :             11 :             ResolveRecoveryConflictWithBufferPin();
                               6784                 :                :             /* Reset the published bufid */
                               6785                 :             11 :             SetStartupBufferPinWaitBufId(-1);
                               6786                 :                :         }
                               6787                 :                :         else
  153 andres@anarazel.de       6788                 :GNC          57 :             ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
                               6789                 :                : 
                               6790                 :                :         /*
                               6791                 :                :          * Remove flag marking us as waiter. Normally this will not be set
                               6792                 :                :          * anymore, but ProcWaitForSignal() can return for other signals as
                               6793                 :                :          * well.  We take care to only reset the flag if we're the waiter, as
                               6794                 :                :          * theoretically another backend could have started waiting. That's
                               6795                 :                :          * impossible with the current usages due to table level locking, but
                               6796                 :                :          * better be safe.
                               6797                 :                :          */
 3677 andres@anarazel.de       6798                 :CBC          68 :         buf_state = LockBufHdr(bufHdr);
                               6799         [ +  + ]:             68 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
  803 heikki.linnakangas@i     6800         [ +  - ]:              9 :             bufHdr->wait_backend_pgprocno == MyProcNumber)
  180 andres@anarazel.de       6801                 :GNC           9 :             unset_bits |= BM_PIN_COUNT_WAITER;
                               6802                 :                : 
                               6803                 :             68 :         UnlockBufHdrExt(bufHdr, buf_state,
                               6804                 :                :                         0, unset_bits,
                               6805                 :                :                         0);
                               6806                 :                : 
 7871 tgl@sss.pgh.pa.us        6807                 :CBC          68 :         PinCountWaitBuf = NULL;
                               6808                 :                :         /* Loop back and try again */
                               6809                 :                :     }
                               6810                 :                : }
                               6811                 :                : 
                               6812                 :                : /*
                               6813                 :                :  * Check called from ProcessRecoveryConflictInterrupts() when the Startup
                               6814                 :                :  * process requests cancellation of all pin holders that are blocking it.
                               6815                 :                :  */
                               6816                 :                : bool
 5946 simon@2ndQuadrant.co     6817                 :              4 : HoldingBufferPinThatDelaysRecovery(void)
                               6818                 :                : {
 5912 bruce@momjian.us         6819                 :              4 :     int         bufid = GetStartupBufferPinWaitBufId();
                               6820                 :                : 
                               6821                 :                :     /*
                               6822                 :                :      * If we get woken slowly then it's possible that the Startup process was
                               6823                 :                :      * already woken by other backends before we got here. Also possible that
                               6824                 :                :      * we get here by multiple interrupts or interrupts at inappropriate
                               6825                 :                :      * times, so make sure we do nothing if the bufid is not set.
                               6826                 :                :      */
 5946 simon@2ndQuadrant.co     6827         [ +  + ]:              4 :     if (bufid < 0)
                               6828                 :              2 :         return false;
                               6829                 :                :     /* Published id is buffer - 1; report whether we hold a pin on it */
 4266 andres@anarazel.de       6830         [ +  - ]:              2 :     if (GetPrivateRefCount(bufid + 1) > 0)
 5946 simon@2ndQuadrant.co     6831                 :              2 :         return true;
                               6832                 :                : 
 5946 simon@2ndQuadrant.co     6833                 :UBC           0 :     return false;
                               6834                 :                : }
                               6835                 :                : 
                               6836                 :                : /*
                               6837                 :                :  * ConditionalLockBufferForCleanup - like LockBufferForCleanup, but don't wait
                               6838                 :                :  *
                               6839                 :                :  * We won't loop, but just check once to see if the pin count is OK.  If
                               6840                 :                :  * not, return false with no lock held.
                               6841                 :                :  */
                               6842                 :                : bool
 6802 tgl@sss.pgh.pa.us        6843                 :CBC      224579 : ConditionalLockBufferForCleanup(Buffer buffer)
                               6844                 :                : {
                               6845                 :                :     BufferDesc *bufHdr;
                               6846                 :                :     uint64      buf_state,
                               6847                 :                :                 refcount;
                               6848                 :                : 
                               6849         [ -  + ]:         224579 :     Assert(BufferIsValid(buffer));
                               6850                 :                : 
                               6851                 :                :     /* see AIO related comment in LockBufferForCleanup() */
                               6852                 :                : 
                               6853         [ +  + ]:         224579 :     if (BufferIsLocal(buffer))
                               6854                 :                :     {
 3677 andres@anarazel.de       6855                 :          12854 :         refcount = LocalRefCount[-buffer - 1];
                               6856                 :                :         /* There should be exactly one pin */
                               6857         [ -  + ]:          12854 :         Assert(refcount > 0);
                               6858         [ +  + ]:          12854 :         if (refcount != 1)
 6802 tgl@sss.pgh.pa.us        6859                 :           1540 :             return false;
                               6860                 :                :         /* Nobody else to wait for */
                               6861                 :          11314 :         return true;
                               6862                 :                :     }
                               6863                 :                : 
                               6864                 :                :     /* We should hold exactly one pin ourselves (private refcount) */
 3677 andres@anarazel.de       6865                 :         211725 :     refcount = GetPrivateRefCount(buffer);
                               6866         [ -  + ]:         211725 :     Assert(refcount);
                               6867         [ +  + ]:         211725 :     if (refcount != 1)
 6802 tgl@sss.pgh.pa.us        6868                 :            321 :         return false;
                               6869                 :                : 
                               6870                 :                :     /* Try to acquire the exclusive lock without waiting */
                               6871         [ +  + ]:         211404 :     if (!ConditionalLockBuffer(buffer))
                               6872                 :             84 :         return false;
                               6873                 :                : 
 4114 andres@anarazel.de       6874                 :         211320 :     bufHdr = GetBufferDescriptor(buffer - 1);
 3677                          6875                 :         211320 :     buf_state = LockBufHdr(bufHdr);
                               6876                 :         211320 :     refcount = BUF_STATE_GET_REFCOUNT(buf_state);
                               6877                 :                : 
                               6878         [ -  + ]:         211320 :     Assert(refcount > 0);
                               6879         [ +  + ]:         211320 :     if (refcount == 1)
                               6880                 :                :     {
                               6881                 :                :         /* Acquired exclusive lock with pincount 1; return still holding it */
  180 andres@anarazel.de       6882                 :GNC      211103 :         UnlockBufHdr(bufHdr);
 6802 tgl@sss.pgh.pa.us        6883                 :         211103 :         return true;
                               6884                 :                :     }
                               6885                 :                : 
                               6886                 :                :     /* Pin count is not 1, so release the lock */
  180 andres@anarazel.de       6887                 :            217 :     UnlockBufHdr(bufHdr);
 6802 tgl@sss.pgh.pa.us        6888                 :            217 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                               6889                 :            217 :     return false;
                               6890                 :                : }
                               6891                 :                : 
                               6892                 :                : /*
                               6893                 :                :  * IsBufferCleanupOK - is our already-held exclusive lock a cleanup lock?
                               6894                 :                :  *
                               6895                 :                :  * Check whether it's OK to perform cleanup on a buffer we've already locked
                               6896                 :                :  * exclusively.  Returns true if the pin count is 1: our exclusive lock then
                               6897                 :                :  * happens to be a cleanup lock, and we can proceed with anything that would
                               6898                 :                :  * have been allowable had we sought one originally.  Lock is kept either way.
                               6899                 :                :  */
                               6900                 :                : bool
 3469 rhaas@postgresql.org     6901                 :           2714 : IsBufferCleanupOK(Buffer buffer)
                               6902                 :                : {
                               6903                 :                :     BufferDesc *bufHdr;
                               6904                 :                :     uint64      buf_state;
                               6905                 :                : 
                               6906         [ -  + ]:           2714 :     Assert(BufferIsValid(buffer));
                               6907                 :                : 
                               6908                 :                :     /* see AIO related comment in LockBufferForCleanup() */
                               6909                 :                : 
                               6910         [ -  + ]:           2714 :     if (BufferIsLocal(buffer))
                               6911                 :                :     {
                               6912                 :                :         /* There should be exactly one pin */
 3469 rhaas@postgresql.org     6913         [ #  # ]:UNC           0 :         if (LocalRefCount[-buffer - 1] != 1)
                               6914                 :              0 :             return false;
                               6915                 :                :         /* Nobody else to wait for */
                               6916                 :              0 :         return true;
                               6917                 :                :     }
                               6918                 :                : 
                               6919                 :                :     /* We should hold exactly one pin ourselves (private refcount) */
 3469 rhaas@postgresql.org     6920         [ -  + ]:GNC        2714 :     if (GetPrivateRefCount(buffer) != 1)
 3469 rhaas@postgresql.org     6921                 :UNC           0 :         return false;
                               6922                 :                : 
 3469 rhaas@postgresql.org     6923                 :GNC        2714 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               6924                 :                : 
                               6925                 :                :     /* caller must hold exclusive lock on buffer */
  209 andres@anarazel.de       6926         [ -  + ]:           2714 :     Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
                               6927                 :                : 
 3469 rhaas@postgresql.org     6928                 :           2714 :     buf_state = LockBufHdr(bufHdr);
                               6929                 :                : 
                               6930         [ -  + ]:           2714 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               6931         [ +  - ]:           2714 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
                               6932                 :                :     {
                               6933                 :                :         /* pincount is OK. */
  180 andres@anarazel.de       6934                 :           2714 :         UnlockBufHdr(bufHdr);
 3469 rhaas@postgresql.org     6935                 :           2714 :         return true;
                               6936                 :                :     }
                               6937                 :                : 
  180 andres@anarazel.de       6938                 :UNC           0 :     UnlockBufHdr(bufHdr);
 3469 rhaas@postgresql.org     6939                 :              0 :     return false;
                               6940                 :                : }
                               6941                 :                : 
                               6942                 :                : /*
                               6943                 :                :  * Helper for BufferBeginSetHintBits() and BufferSetHintBits16().
                               6944                 :                :  *
                               6945                 :                :  * This checks if the current lock mode already suffices to allow hint bits
                               6946                 :                :  * being set and, if not, whether the current lock can be upgraded.
                               6947                 :                :  *
                               6948                 :                :  * Updates *lockstate when returning true.
                               6949                 :                :  */
                               6950                 :                : static inline bool
   56 andres@anarazel.de       6951                 :GNC    15207755 : SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
                               6952                 :                : {
                               6953                 :                :     uint64      old_state;
                               6954                 :                :     PrivateRefCountEntry *ref;
                               6955                 :                :     BufferLockMode mode;
                               6956                 :                : 
                               6957                 :       15207755 :     ref = GetPrivateRefCountEntry(buffer, true);
                               6958                 :                : 
                               6959         [ -  + ]:       15207755 :     if (ref == NULL)
   56 andres@anarazel.de       6960         [ #  # ]:UNC           0 :         elog(ERROR, "buffer is not pinned");
                               6961                 :                : 
   56 andres@anarazel.de       6962                 :GNC    15207755 :     mode = ref->data.lockmode;
                               6963         [ -  + ]:       15207755 :     if (mode == BUFFER_LOCK_UNLOCK)
   56 andres@anarazel.de       6964         [ #  # ]:UNC           0 :         elog(ERROR, "buffer is not locked");
                               6965                 :                : 
                               6966                 :                :     /* we're done if we are already holding a sufficient lock level */
   56 andres@anarazel.de       6967   [ +  +  +  + ]:GNC    15207755 :     if (mode == BUFFER_LOCK_EXCLUSIVE || mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
                               6968                 :                :     {
                               6969                 :       11774544 :         *lockstate = pg_atomic_read_u64(&buf_hdr->state);
                               6970                 :       11774544 :         return true;
                               6971                 :                :     }
                               6972                 :                : 
                               6973                 :                :     /*
                               6974                 :                :      * We are only holding a share lock right now, try to upgrade it to
                               6975                 :                :      * SHARE_EXCLUSIVE.
                               6976                 :                :      */
                               6977         [ -  + ]:        3433211 :     Assert(mode == BUFFER_LOCK_SHARE);
                               6978                 :                : 
                               6979                 :        3433211 :     old_state = pg_atomic_read_u64(&buf_hdr->state);
                               6980                 :                :     while (true)
                               6981                 :              7 :     {
                               6982                 :                :         uint64      desired_state;
                               6983                 :                : 
                               6984                 :        3433218 :         desired_state = old_state;
                               6985                 :                : 
                               6986                 :                :         /*
                               6987                 :                :          * Can't upgrade if somebody else holds the lock in exclusive or
                               6988                 :                :          * share-exclusive mode.
                               6989                 :                :          */
                               6990         [ +  + ]:        3433218 :         if (unlikely((old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) != 0))
                               6991                 :                :         {
                               6992                 :            122 :             return false;
                               6993                 :                :         }
                               6994                 :                : 
                               6995                 :                :         /* currently held lock state */
                               6996                 :        3433096 :         desired_state -= BM_LOCK_VAL_SHARED;
                               6997                 :                : 
                               6998                 :                :         /* new lock level */
                               6999                 :        3433096 :         desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;
                               7000                 :                : 
                               7001         [ +  + ]:        3433096 :         if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
                               7002                 :                :                                                   &old_state, desired_state)))
                               7003                 :                :         {
                               7004                 :        3433089 :             ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE;
                               7005                 :        3433089 :             *lockstate = desired_state;
                               7006                 :                : 
                               7007                 :        3433089 :             return true;
                               7008                 :                :         }
                               7009                 :                :     }
                               7010                 :                : }
                               7011                 :                : 
                               7012                 :                : /*
                               7013                 :                :  * Try to acquire the right to set hint bits on the buffer.
                               7014                 :                :  *
                               7015                 :                :  * To be allowed to set hint bits, this backend needs to hold either a
                               7016                 :                :  * share-exclusive or an exclusive lock. In case this backend only holds a
                               7017                 :                :  * share lock, this function will try to upgrade the lock to
                               7018                 :                :  * share-exclusive. The caller is only allowed to set hint bits if true is
                               7019                 :                :  * returned.
                               7020                 :                :  *
                               7021                 :                :  * Once BufferBeginSetHintBits() has returned true, hint bits may be set
                               7022                 :                :  * without further calls to BufferBeginSetHintBits(), until the buffer is
                               7023                 :                :  * unlocked.
                               7024                 :                :  *
                               7025                 :                :  *
                               7026                 :                :  * Requiring a share-exclusive lock to set hint bits prevents setting hint
                               7027                 :                :  * bits on buffers that are currently being written out, which could corrupt
                               7028                 :                :  * the checksum on the page. Flushing buffers also requires a share-exclusive
                               7029                 :                :  * lock.
                               7030                 :                :  *
                               7031                 :                :  * Due to a lock >= share-exclusive being required to set hint bits, only one
                               7032                 :                :  * backend can set hint bits at a time. Allowing multiple backends to set hint
                               7033                 :                :  * bits would require more complicated locking: For setting hint bits we'd
                               7034                 :                :  * need to store the count of backends currently setting hint bits, for I/O we
                               7035                 :                :  * would need another lock-level conflicting with the hint-setting
                               7036                 :                :  * lock-level. Given that the share-exclusive lock for setting hint bits is
                               7037                 :                :  * only held for a short time, that backends often would just set the same
                               7038                 :                :  * hint bits and that the cost of occasionally not setting hint bits in hotly
                               7039                 :                :  * accessed pages is fairly low, this seems like an acceptable tradeoff.
                               7040                 :                :  */
                               7041                 :                : bool
                               7042                 :         305948 : BufferBeginSetHintBits(Buffer buffer)
                               7043                 :                : {
                               7044                 :                :     BufferDesc *buf_hdr;
                               7045                 :                :     uint64      lockstate;
                               7046                 :                : 
                               7047         [ +  + ]:         305948 :     if (BufferIsLocal(buffer))
                               7048                 :                :     {
                               7049                 :                :         /*
                               7050                 :                :          * NB: Will need to check if there is a write in progress, once it is
                               7051                 :                :          * possible for writes to be done asynchronously.
                               7052                 :                :          */
   56 andres@anarazel.de       7053                 :CBC        2437 :         return true;
                               7054                 :                :     }
                               7055                 :                : 
   56 andres@anarazel.de       7056                 :GNC      303511 :     buf_hdr = GetBufferDescriptor(buffer - 1);
                               7057                 :                : 
                               7058                 :         303511 :     return SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate);
                               7059                 :                : }
                               7060                 :                : 
                               7061                 :                : /*
                               7062                 :                :  * End a phase of setting hint bits on this buffer, started with
                               7063                 :                :  * BufferBeginSetHintBits().
                               7064                 :                :  *
                               7065                 :                :  * This would strictly speaking not be required (i.e. the caller could do
                               7066                 :                :  * MarkBufferDirtyHint() if so desired), but allows us to perform some sanity
                               7067                 :                :  * checks.
                               7068                 :                :  */
                               7069                 :                : void
                               7070                 :         305933 : BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
                               7071                 :                : {
                               7072         [ +  + ]:         305933 :     if (!BufferIsLocal(buffer))
                               7073   [ +  +  -  + ]:         303496 :         Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) ||
                               7074                 :                :                BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
                               7075                 :                : 
                               7076         [ +  + ]:         305933 :     if (mark_dirty)
                               7077                 :         232833 :         MarkBufferDirtyHint(buffer, buffer_std);
                               7078                 :         305933 : }
                               7079                 :                : 
                               7080                 :                : /*
                               7081                 :                :  * Try to set hint bits on a single 16bit value in a buffer.
                               7082                 :                :  *
                               7083                 :                :  * If hint bits are allowed to be set, set *ptr = val, try to mark the buffer
                               7084                 :                :  * dirty and return true. Otherwise false is returned.
                               7085                 :                :  *
                               7086                 :                :  * *ptr needs to be a pointer to memory within the buffer.
                               7087                 :                :  *
                               7088                 :                :  * This is a bit faster than BufferBeginSetHintBits() /
                               7089                 :                :  * BufferFinishSetHintBits() when setting hints once in a buffer, but slower
                               7090                 :                :  * than that pair when setting hint bits multiple times in the same buffer.
                               7091                 :                :  */
                               7092                 :                : bool
                               7093                 :       15816708 : BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer)
                               7094                 :                : {
                               7095                 :                :     BufferDesc *buf_hdr;
                               7096                 :                :     uint64      lockstate;
                               7097                 :                : #ifdef USE_ASSERT_CHECKING
                               7098                 :                :     char       *page;
                               7099                 :                : 
                               7100                 :                :     /* verify that the address is on the page */
                               7101                 :       15816708 :     page = BufferGetPage(buffer);
                               7102   [ +  -  -  + ]:       15816708 :     Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ));
                               7103                 :                : #endif
                               7104                 :                : 
   56 andres@anarazel.de       7105         [ +  + ]:CBC    15816708 :     if (BufferIsLocal(buffer))
                               7106                 :                :     {
   56 andres@anarazel.de       7107                 :GNC      912464 :         *ptr = val;
                               7108                 :                : 
                               7109                 :         912464 :         MarkLocalBufferDirty(buffer);
                               7110                 :                : 
   56 andres@anarazel.de       7111                 :GBC      912464 :         return true;
                               7112                 :                :     }
                               7113                 :                : 
   56 andres@anarazel.de       7114                 :GNC    14904244 :     buf_hdr = GetBufferDescriptor(buffer - 1);
                               7115                 :                : 
                               7116         [ +  + ]:       14904244 :     if (SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate))
                               7117                 :                :     {
                               7118                 :       14904137 :         *ptr = val;
                               7119                 :                : 
                               7120                 :       14904137 :         MarkSharedBufferDirtyHint(buffer, buf_hdr, lockstate, true);
                               7121                 :                : 
   56 andres@anarazel.de       7122                 :CBC    14904137 :         return true;
                               7123                 :                :     }
                               7124                 :                : 
   56 andres@anarazel.de       7125                 :GBC         107 :     return false;
                               7126                 :                : }
                               7127                 :                : 
                               7128                 :                : 
                               7129                 :                : /*
                               7130                 :                :  *  Functions for buffer I/O handling
                               7131                 :                :  *
                               7132                 :                :  *  Note that these are used only for shared buffers, not local ones.
                               7133                 :                :  */
                               7134                 :                : 
                               7135                 :                : /*
                               7136                 :                :  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
                               7137                 :                :  */
                               7138                 :                : static void
 3823 rhaas@postgresql.org     7139                 :CBC         306 : WaitIO(BufferDesc *buf)
                               7140                 :                : {
 1881 tmunro@postgresql.or     7141                 :            306 :     ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
                               7142                 :                : 
                               7143                 :                :     /*
                               7144                 :                :      * Should never end up here with unsubmitted IO, as no AIO-unaware code
                               7145                 :                :      * may be used while in batch mode and AIO-aware code needs to have
                               7146                 :                :      * submitted all staged IO to avoid deadlocks & slowness.
                               7147                 :                :      */
   39 andres@anarazel.de       7148         [ -  + ]:GNC         306 :     Assert(!pgaio_have_staged());
                               7149                 :                : 
 1881 tmunro@postgresql.or     7150                 :CBC         306 :     ConditionVariablePrepareToSleep(cv);
                               7151                 :                :     for (;;)
 7732 tgl@sss.pgh.pa.us        7152                 :            306 :     {
                               7153                 :                :         uint64      buf_state;
                               7154                 :                :         PgAioWaitRef iow;
                               7155                 :                : 
                               7156                 :                :         /*
                               7157                 :                :          * It may not be necessary to acquire the spinlock to check the flag
                               7158                 :                :          * here, but since this test is essential for correctness, we'd better
                               7159                 :                :          * play it safe.
                               7160                 :                :          */
 3677 andres@anarazel.de       7161                 :            612 :         buf_state = LockBufHdr(buf);
                               7162                 :                : 
                               7163                 :                :         /*
                               7164                 :                :          * Copy the wait reference while holding the spinlock. This prevents
                               7165                 :                :          * a concurrent TerminateBufferIO() in another backend from clearing
                               7166                 :                :          * the wref while it's being read.
                               7167                 :                :          */
  401                          7168                 :            612 :         iow = buf->io_wref;
  180 andres@anarazel.de       7169                 :GNC         612 :         UnlockBufHdr(buf);
                               7170                 :                : 
                               7171                 :                :         /* no IO in progress, we don't need to wait */
 3677 andres@anarazel.de       7172         [ +  + ]:CBC         612 :         if (!(buf_state & BM_IO_IN_PROGRESS))
 7732 tgl@sss.pgh.pa.us        7173                 :            306 :             break;
                               7174                 :                : 
                               7175                 :                :         /*
                               7176                 :                :          * The buffer has asynchronous IO in progress, wait for it to
                               7177                 :                :          * complete.
                               7178                 :                :          */
  401 andres@anarazel.de       7179         [ +  + ]:            306 :         if (pgaio_wref_valid(&iow))
                               7180                 :                :         {
                               7181                 :             37 :             pgaio_wref_wait(&iow);
                               7182                 :                : 
                               7183                 :                :             /*
                               7184                 :                :              * The AIO subsystem internally uses condition variables and thus
                               7185                 :                :              * might remove this backend from the BufferDesc's CV. While that
                               7186                 :                :              * wouldn't cause a correctness issue (the first CV sleep just
                               7187                 :                :              * immediately returns if not already registered), it seems worth
                               7188                 :                :              * avoiding unnecessary loop iterations, given that we take care
                               7189                 :                :              * to do so at the start of the function.
                               7190                 :                :              */
                               7191                 :             37 :             ConditionVariablePrepareToSleep(cv);
                               7192                 :             37 :             continue;
                               7193                 :                :         }
                               7194                 :                : 
                               7195                 :                :         /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
 1881 tmunro@postgresql.or     7196                 :            269 :         ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
                               7197                 :                :     }
                               7198                 :            306 :     ConditionVariableCancelSleep();
 7732 tgl@sss.pgh.pa.us        7199                 :            306 : }
                               7200                 :                : 
                               7201                 :                : /*
                               7202                 :                :  * StartSharedBufferIO: begin I/O on this buffer
                               7203                 :                :  *  (Assumptions)
                               7204                 :                :  *  The buffer is Pinned
                               7205                 :                :  *
                               7206                 :                :  * In several scenarios the buffer may already be undergoing I/O in this or
                               7207                 :                :  * another backend. How to best handle that depends on the caller's
                               7208                 :                :  * situation. It might be appropriate to wait synchronously (e.g., because the
                               7209                 :                :  * buffer is about to be invalidated); wait asynchronously, using the buffer's
                               7210                 :                :  * IO wait reference (e.g., because the caller is doing readahead and doesn't
                               7211                 :                :  * need the buffer to be ready immediately); or to not wait at all (e.g.,
                               7212                 :                :  * because the caller is trying to combine IO for this buffer with another
                               7213                 :                :  * buffer).
                               7214                 :                :  *
                               7215                 :                :  * How and whether to wait is controlled by the wait and io_wref
                               7216                 :                :  * parameters. In detail:
                               7217                 :                :  *
                               7218                 :                :  * - If the caller passes a non-NULL io_wref and the buffer has an I/O wait
                               7219                 :                :  *   reference, *io_wref is set to the buffer's io_wref and
                               7220                 :                :  *   BUFFER_IO_IN_PROGRESS is returned. This is done regardless of the wait
                               7221                 :                :  *   parameter.
                               7222                 :                :  *
                               7223                 :                :  * - If the caller passes a NULL io_wref (i.e. the caller does not want to
                               7224                 :                :  *   asynchronously wait for the completion of the IO), wait = false and the
                               7225                 :                :  *   buffer is undergoing IO, BUFFER_IO_IN_PROGRESS is returned.
                               7226                 :                :  *
                               7227                 :                :  * - If wait = true and either the buffer does not have a wait reference,
                               7228                 :                :  *   or the caller passes io_wref = NULL, WaitIO() is used to wait for the IO
                               7229                 :                :  *   to complete.  To avoid the potential of deadlocks and unnecessary delays,
                               7230                 :                :  *   all staged I/O is submitted before waiting.
                               7231                 :                :  *
                               7232                 :                :  * Input operations are only attempted on buffers that are not BM_VALID, and
                               7233                 :                :  * output operations only on buffers that are BM_VALID and BM_DIRTY, so we can
                               7234                 :                :  * always tell if the work is already done.  If no I/O is necessary,
                               7235                 :                :  * BUFFER_IO_ALREADY_DONE is returned.
                               7236                 :                :  *
                               7237                 :                :  * If we successfully marked the buffer as BM_IO_IN_PROGRESS,
                               7238                 :                :  * BUFFER_IO_READY_FOR_IO is returned.
                               7239                 :                :  */
                               7240                 :                : StartBufferIOResult
   39 andres@anarazel.de       7241                 :GNC     2956721 : StartSharedBufferIO(BufferDesc *buf, bool forInput, bool wait, PgAioWaitRef *io_wref)
                               7242                 :                : {
                               7243                 :                :     uint64      buf_state;
                               7244                 :                : 
  909 heikki.linnakangas@i     7245                 :CBC     2956721 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               7246                 :                : 
                               7247                 :                :     for (;;)
                               7248                 :                :     {
 3677 andres@anarazel.de       7249                 :        2957025 :         buf_state = LockBufHdr(buf);
                               7250                 :                : 
                               7251         [ +  + ]:        2957025 :         if (!(buf_state & BM_IO_IN_PROGRESS))
 7732 tgl@sss.pgh.pa.us        7252                 :        2954570 :             break;
                               7253                 :                : 
                               7254                 :                :         /* Join the existing IO */
   39 andres@anarazel.de       7255   [ +  +  +  + ]:GNC        2455 :         if (io_wref != NULL && pgaio_wref_valid(&buf->io_wref))
                               7256                 :                :         {
                               7257                 :           2136 :             *io_wref = buf->io_wref;
                               7258                 :           2136 :             UnlockBufHdr(buf);
                               7259                 :                : 
                               7260                 :           2136 :             return BUFFER_IO_IN_PROGRESS;
                               7261                 :                :         }
                               7262         [ +  + ]:            319 :         else if (!wait)
                               7263                 :                :         {
                               7264                 :             15 :             UnlockBufHdr(buf);
                               7265                 :             15 :             return BUFFER_IO_IN_PROGRESS;
                               7266                 :                :         }
                               7267                 :                :         else
                               7268                 :                :         {
                               7269                 :                :             /*
                               7270                 :                :              * With wait = true, we always have to wait if the caller has
                               7271                 :                :              * passed io_wref = NULL.
                               7272                 :                :              *
                               7273                 :                :              * Even with io_wref != NULL, we have to wait if the buffer's wait
                               7274                 :                :              * ref is not valid but the IO is in progress, i.e. someone else
                               7275                 :                :              * started IO but hasn't set the wait ref yet. We have no choice
                               7276                 :                :              * but to wait until the IO completes.
                               7277                 :                :              */
                               7278                 :            304 :             UnlockBufHdr(buf);
                               7279                 :                : 
                               7280                 :                :             /*
                               7281                 :                :              * If this backend currently has staged IO, submit it before
                               7282                 :                :              * waiting for in-progress IO, to avoid potential deadlocks and
                               7283                 :                :              * unnecessary delays.
                               7284                 :                :              */
                               7285                 :            304 :             pgaio_submit_staged();
                               7286                 :                : 
                               7287                 :            304 :             WaitIO(buf);
                               7288                 :                :         }
                               7289                 :                :     }
                               7290                 :                : 
                               7291                 :                :     /* Once we get here, there is definitely no I/O active on this buffer */
                               7292                 :                : 
                               7293                 :                :     /* Check if someone else already did the I/O */
 3677 andres@anarazel.de       7294   [ +  +  +  + ]:CBC     2954570 :     if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
                               7295                 :                :     {
  180 andres@anarazel.de       7296                 :GNC         544 :         UnlockBufHdr(buf);
   39                          7297                 :            544 :         return BUFFER_IO_ALREADY_DONE;
                               7298                 :                :     }
                               7299                 :                : 
                               7300                 :                :     /*
                               7301                 :                :      * No IO in progress and not already done; we will start IO. It's possible
                               7302                 :                :      * that the IO was in progress but we're not done, because the IO errored
                               7303                 :                :      * out. We'll do the IO ourselves.
                               7304                 :                :      */
  180                          7305                 :        2954026 :     UnlockBufHdrExt(buf, buf_state,
                               7306                 :                :                     BM_IO_IN_PROGRESS, 0,
                               7307                 :                :                     0);
                               7308                 :                : 
 1126 andres@anarazel.de       7309                 :CBC     2954026 :     ResourceOwnerRememberBufferIO(CurrentResourceOwner,
                               7310                 :                :                                   BufferDescriptorGetBuffer(buf));
                               7311                 :                : 
   39 andres@anarazel.de       7312                 :GNC     2954026 :     return BUFFER_IO_READY_FOR_IO;
                               7313                 :                : }
                               7314                 :                : 
                               7315                 :                : /*
                               7316                 :                :  * Wrapper around StartSharedBufferIO / StartLocalBufferIO. Only to be used
                               7317                 :                :  * when the caller doesn't otherwise need to care about local vs shared. See
                               7318                 :                :  * StartSharedBufferIO() for details.
                               7319                 :                :  */
                               7320                 :                : StartBufferIOResult
                               7321                 :        1656808 : StartBufferIO(Buffer buffer, bool forInput, bool wait, PgAioWaitRef *io_wref)
                               7322                 :                : {
                               7323                 :                :     BufferDesc *buf_hdr;
                               7324                 :                : 
                               7325         [ +  + ]:        1656808 :     if (BufferIsLocal(buffer))
                               7326                 :                :     {
                               7327                 :          11065 :         buf_hdr = GetLocalBufferDescriptor(-buffer - 1);
                               7328                 :                : 
                               7329                 :          11065 :         return StartLocalBufferIO(buf_hdr, forInput, wait, io_wref);
                               7330                 :                :     }
                               7331                 :                :     else
                               7332                 :                :     {
                               7333                 :        1645743 :         buf_hdr = GetBufferDescriptor(buffer - 1);
                               7334                 :                : 
                               7335                 :        1645743 :         return StartSharedBufferIO(buf_hdr, forInput, wait, io_wref);
                               7336                 :                :     }
                               7337                 :                : }
                               7338                 :                : 
                               7339                 :                : /*
                               7340                 :                :  * TerminateBufferIO: release a buffer we were doing I/O on
                               7341                 :                :  *  (Assumptions)
                               7342                 :                :  *  My process is executing IO for the buffer
                               7343                 :                :  *  BM_IO_IN_PROGRESS bit is set for the buffer
                               7344                 :                :  *  The buffer is Pinned
                               7345                 :                :  *
                               7346                 :                :  * If clear_dirty is true, we clear the buffer's BM_DIRTY flag.  This is
                               7347                 :                :  * appropriate when terminating a successful write.
                               7348                 :                :  *
                               7349                 :                :  * set_flag_bits gets ORed into the buffer's flags.  It must include
                               7350                 :                :  * BM_IO_ERROR in a failure case.  For successful completion it could
                               7351                 :                :  * be 0, or BM_VALID if we just finished reading in the page.
                               7352                 :                :  *
                               7353                 :                :  * If forget_owner is true, we release the buffer I/O from the current
                               7354                 :                :  * resource owner. (forget_owner=false is used when the resource owner itself
                               7355                 :                :  * is being released)
                               7356                 :                :  */
                               7357                 :                : void
  110                          7358                 :        2785027 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits,
                               7359                 :                :                   bool forget_owner, bool release_aio)
                               7360                 :                : {
                               7361                 :                :     uint64      buf_state;
                               7362                 :        2785027 :     uint64      unset_flag_bits = 0;
  180                          7363                 :        2785027 :     int         refcount_change = 0;
                               7364                 :                : 
 3677 andres@anarazel.de       7365                 :CBC     2785027 :     buf_state = LockBufHdr(buf);
                               7366                 :                : 
                               7367         [ -  + ]:        2785027 :     Assert(buf_state & BM_IO_IN_PROGRESS);
  180 andres@anarazel.de       7368                 :GNC     2785027 :     unset_flag_bits |= BM_IO_IN_PROGRESS;
                               7369                 :                : 
                               7370                 :                :     /* Clear earlier errors, if this IO failed, it'll be marked again */
                               7371                 :        2785027 :     unset_flag_bits |= BM_IO_ERROR;
                               7372                 :                : 
   55                          7373         [ +  + ]:        2785027 :     if (clear_dirty)
  180                          7374                 :         711748 :         unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;
                               7375                 :                : 
  401 andres@anarazel.de       7376         [ +  + ]:CBC     2785027 :     if (release_aio)
                               7377                 :                :     {
                               7378                 :                :         /* release ownership by the AIO subsystem */
                               7379         [ -  + ]:        1474206 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
  180 andres@anarazel.de       7380                 :GNC     1474206 :         refcount_change = -1;
  401 andres@anarazel.de       7381                 :CBC     1474206 :         pgaio_wref_clear(&buf->io_wref);
                               7382                 :                :     }
                               7383                 :                : 
  180 andres@anarazel.de       7384                 :GNC     2785027 :     buf_state = UnlockBufHdrExt(buf, buf_state,
                               7385                 :                :                                 set_flag_bits, unset_flag_bits,
                               7386                 :                :                                 refcount_change);
                               7387                 :                : 
  909 heikki.linnakangas@i     7388         [ +  + ]:CBC     2785027 :     if (forget_owner)
                               7389                 :        1310793 :         ResourceOwnerForgetBufferIO(CurrentResourceOwner,
                               7390                 :                :                                     BufferDescriptorGetBuffer(buf));
                               7391                 :                : 
 1881 tmunro@postgresql.or     7392                 :        2785027 :     ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
                               7393                 :                : 
                               7394                 :                :     /*
                               7395                 :                :      * Support LockBufferForCleanup()
                               7396                 :                :      *
                               7397                 :                :      * We may have just released the last pin other than the waiter's. In most
                               7398                 :                :      * cases, this backend holds another pin on the buffer. But, if, for
                               7399                 :                :      * example, this backend is completing an IO issued by another backend, it
                               7400                 :                :      * may be time to wake the waiter.
                               7401                 :                :      */
  401 andres@anarazel.de       7402   [ +  +  -  + ]:        2785027 :     if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
  401 andres@anarazel.de       7403                 :UBC           0 :         WakePinCountWaiter(buf);
 9605 inoue@tpf.co.jp          7404                 :CBC     2785027 : }
                               7405                 :                : 
                               7406                 :                : /*
                               7407                 :                :  * AbortBufferIO: Clean up active buffer I/O after an error.
                               7408                 :                :  *
                               7409                 :                :  *  All LWLocks & content locks we might have held have been released, but we
                               7410                 :                :  *  haven't yet released buffer pins, so the buffer is still pinned.
                               7411                 :                :  *
                               7412                 :                :  *  If I/O was in progress, we always set BM_IO_ERROR, even though it's
                               7413                 :                :  *  possible the error condition wasn't related to the I/O.
                               7414                 :                :  *
                               7415                 :                :  *  Note: this does not remove the buffer I/O from the resource owner.
                               7416                 :                :  *  That's correct when we're releasing the whole resource owner, but
                               7417                 :                :  *  beware if you use this in other contexts.
                               7418                 :                :  */
                               7419                 :                : static void
 1118 pg@bowt.ie               7420                 :             15 : AbortBufferIO(Buffer buffer)
                               7421                 :                : {
                               7422                 :             15 :     BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
                               7423                 :                :     uint64      buf_state;
                               7424                 :                : 
 1126 andres@anarazel.de       7425                 :             15 :     buf_state = LockBufHdr(buf_hdr);
                               7426         [ -  + ]:             15 :     Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
                               7427                 :                : 
                               7428         [ +  - ]:             15 :     if (!(buf_state & BM_VALID))
                               7429                 :                :     {
                               7430         [ -  + ]:             15 :         Assert(!(buf_state & BM_DIRTY));
  180 andres@anarazel.de       7431                 :GNC          15 :         UnlockBufHdr(buf_hdr);
                               7432                 :                :     }
                               7433                 :                :     else
                               7434                 :                :     {
 1124 andres@anarazel.de       7435         [ #  # ]:UBC           0 :         Assert(buf_state & BM_DIRTY);
  180 andres@anarazel.de       7436                 :UNC           0 :         UnlockBufHdr(buf_hdr);
                               7437                 :                : 
                               7438                 :                :         /* Issue notice if this is not the first failure... */
 1126 andres@anarazel.de       7439         [ #  # ]:UBC           0 :         if (buf_state & BM_IO_ERROR)
                               7440                 :                :         {
                               7441                 :                :             /* Buffer is pinned, so we can read tag without spinlock */
                               7442         [ #  # ]:              0 :             ereport(WARNING,
                               7443                 :                :                     (errcode(ERRCODE_IO_ERROR),
                               7444                 :                :                      errmsg("could not write block %u of %s",
                               7445                 :                :                             buf_hdr->tag.blockNum,
                               7446                 :                :                             relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
                               7447                 :                :                                         BufTagGetForkNum(&buf_hdr->tag)).str),
                               7448                 :                :                      errdetail("Multiple failures --- write error might be permanent.")));
                               7449                 :                :         }
                               7450                 :                :     }
                               7451                 :                : 
  401 andres@anarazel.de       7452                 :CBC          15 :     TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
 9605 inoue@tpf.co.jp          7453                 :             15 : }
                               7454                 :                : 
                               7455                 :                : /*
                               7456                 :                :  * Error context callback for errors occurring during shared buffer writes.
                               7457                 :                :  */
                               7458                 :                : static void
 5744 rhaas@postgresql.org     7459                 :             41 : shared_buffer_write_error_callback(void *arg)
                               7460                 :                : {
 3823                          7461                 :             41 :     BufferDesc *bufHdr = (BufferDesc *) arg;
                               7462                 :                : 
                               7463                 :                :     /* Buffer is pinned, so we can read the tag without locking the spinlock */
 8396 tgl@sss.pgh.pa.us        7464         [ +  - ]:             41 :     if (bufHdr != NULL)
  252 peter@eisentraut.org     7465                 :             82 :         errcontext("writing block %u of relation \"%s\"",
                               7466                 :                :                    bufHdr->tag.blockNum,
  434 andres@anarazel.de       7467                 :             41 :                    relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
                               7468                 :                :                                BufTagGetForkNum(&bufHdr->tag)).str);
 5744 rhaas@postgresql.org     7469                 :             41 : }
                               7470                 :                : 
                               7471                 :                : /*
                               7472                 :                :  * Error context callback for errors occurring during local buffer writes.
                               7473                 :                :  */
                               7474                 :                : static void
 5744 rhaas@postgresql.org     7475                 :UBC           0 : local_buffer_write_error_callback(void *arg)
                               7476                 :                : {
 3823                          7477                 :              0 :     BufferDesc *bufHdr = (BufferDesc *) arg;
                               7478                 :                : 
 5744                          7479         [ #  # ]:              0 :     if (bufHdr != NULL)
  252 peter@eisentraut.org     7480                 :              0 :         errcontext("writing block %u of relation \"%s\"",
                               7481                 :                :                    bufHdr->tag.blockNum,
  434 andres@anarazel.de       7482                 :              0 :                    relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
                               7483                 :                :                                   MyProcNumber,
                               7484                 :                :                                   BufTagGetForkNum(&bufHdr->tag)).str);
 8396 tgl@sss.pgh.pa.us        7485                 :              0 : }
                               7486                 :                : 
                               7487                 :                : /*
                               7488                 :                :  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
                               7489                 :                :  */
                               7490                 :                : static int
 1399 rhaas@postgresql.org     7491                 :CBC    13256335 : rlocator_comparator(const void *p1, const void *p2)
                               7492                 :                : {
                               7493                 :       13256335 :     RelFileLocator n1 = *(const RelFileLocator *) p1;
                               7494                 :       13256335 :     RelFileLocator n2 = *(const RelFileLocator *) p2;
                               7495                 :                : 
                               7496         [ +  + ]:       13256335 :     if (n1.relNumber < n2.relNumber)
 4856 alvherre@alvh.no-ip.     7497                 :       13207970 :         return -1;
 1399 rhaas@postgresql.org     7498         [ +  + ]:          48365 :     else if (n1.relNumber > n2.relNumber)
 4856 alvherre@alvh.no-ip.     7499                 :          46596 :         return 1;
                               7500                 :                : 
 1399 rhaas@postgresql.org     7501         [ -  + ]:           1769 :     if (n1.dbOid < n2.dbOid)
 4856 alvherre@alvh.no-ip.     7502                 :UBC           0 :         return -1;
 1399 rhaas@postgresql.org     7503         [ -  + ]:CBC        1769 :     else if (n1.dbOid > n2.dbOid)
 4856 alvherre@alvh.no-ip.     7504                 :UBC           0 :         return 1;
                               7505                 :                : 
 1399 rhaas@postgresql.org     7506         [ -  + ]:CBC        1769 :     if (n1.spcOid < n2.spcOid)
 4856 alvherre@alvh.no-ip.     7507                 :UBC           0 :         return -1;
 1399 rhaas@postgresql.org     7508         [ -  + ]:CBC        1769 :     else if (n1.spcOid > n2.spcOid)
 4856 alvherre@alvh.no-ip.     7509                 :UBC           0 :         return 1;
                               7510                 :                :     else
 4856 alvherre@alvh.no-ip.     7511                 :CBC        1769 :         return 0;
                               7512                 :                : }
                               7513                 :                : 
                               7514                 :                : /*
                               7515                 :                :  * Lock buffer header - set BM_LOCKED in buffer state.
                               7516                 :                :  */
                               7517                 :                : uint64
 3677 andres@anarazel.de       7518                 :       28281999 : LockBufHdr(BufferDesc *desc)
                               7519                 :                : {
                               7520                 :                :     uint64      old_buf_state;
                               7521                 :                : 
 1126                          7522         [ -  + ]:       28281999 :     Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
                               7523                 :                : 
                               7524                 :                :     while (true)
                               7525                 :                :     {
                               7526                 :                :         /*
                               7527                 :                :          * Always try once to acquire the lock directly, without setting up
                               7528                 :                :          * the spin-delay infrastructure. The work necessary for that shows up
                               7529                 :                :          * in profiles and is rarely necessary.
                               7530                 :                :          */
  110 andres@anarazel.de       7531                 :GNC    28282465 :         old_buf_state = pg_atomic_fetch_or_u64(&desc->state, BM_LOCKED);
  137                          7532         [ +  + ]:       28282465 :         if (likely(!(old_buf_state & BM_LOCKED)))
                               7533                 :       28281999 :             break;              /* got lock */
                               7534                 :                : 
                               7535                 :                :         /* and then spin without atomic operations until lock is released */
                               7536                 :                :         {
                               7537                 :                :             SpinDelayStatus delayStatus;
                               7538                 :                : 
                               7539                 :            466 :             init_local_spin_delay(&delayStatus);
                               7540                 :                : 
                               7541         [ +  + ]:           5850 :             while (old_buf_state & BM_LOCKED)
                               7542                 :                :             {
                               7543                 :           5384 :                 perform_spin_delay(&delayStatus);
  110                          7544                 :           5384 :                 old_buf_state = pg_atomic_read_u64(&desc->state);
                               7545                 :                :             }
  137                          7546                 :            466 :             finish_spin_delay(&delayStatus);
                               7547                 :                :         }
                               7548                 :                : 
                               7549                 :                :         /*
                               7550                 :                :          * Retry. The lock might obviously already be re-acquired by the time
                               7551                 :                :          * we're attempting to get it again.
                               7552                 :                :          */
                               7553                 :                :     }
                               7554                 :                : 
 3677 andres@anarazel.de       7555                 :CBC    28281999 :     return old_buf_state | BM_LOCKED;
                               7556                 :                : }
                               7557                 :                : 
                               7558                 :                : /*
                               7559                 :                :  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
                               7560                 :                :  * state at that point.
                               7561                 :                :  *
                               7562                 :                :  * Obviously the buffer could be locked by the time the value is returned, so
                               7563                 :                :  * this is primarily useful in CAS style loops.
                               7564                 :                :  */
                               7565                 :                : pg_noinline uint64
                               7566                 :            385 : WaitBufHdrUnlocked(BufferDesc *buf)
                               7567                 :                : {
                               7568                 :                :     SpinDelayStatus delayStatus;
                               7569                 :                :     uint64      buf_state;
                               7570                 :                : 
 3673                          7571                 :            385 :     init_local_spin_delay(&delayStatus);
                               7572                 :                : 
  110 andres@anarazel.de       7573                 :GNC         385 :     buf_state = pg_atomic_read_u64(&buf->state);
                               7574                 :                : 
 3677 andres@anarazel.de       7575         [ +  + ]:CBC        8855 :     while (buf_state & BM_LOCKED)
                               7576                 :                :     {
                               7577                 :           8470 :         perform_spin_delay(&delayStatus);
  110 andres@anarazel.de       7578                 :GNC        8470 :         buf_state = pg_atomic_read_u64(&buf->state);
                               7579                 :                :     }
                               7580                 :                : 
 3677 andres@anarazel.de       7581                 :CBC         385 :     finish_spin_delay(&delayStatus);
                               7582                 :                : 
                               7583                 :            385 :     return buf_state;
                               7584                 :                : }
                               7585                 :                : 
                               7586                 :                : /*
                               7587                 :                :  * BufferTag comparator.
                               7588                 :                :  */
                               7589                 :                : static inline int
 1880 tmunro@postgresql.or     7590                 :UBC           0 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
                               7591                 :                : {
                               7592                 :                :     int         ret;
                               7593                 :                :     RelFileLocator rlocatora;
                               7594                 :                :     RelFileLocator rlocatorb;
                               7595                 :                : 
 1350 rhaas@postgresql.org     7596                 :              0 :     rlocatora = BufTagGetRelFileLocator(ba);
                               7597                 :              0 :     rlocatorb = BufTagGetRelFileLocator(bb);
                               7598                 :                : 
                               7599                 :              0 :     ret = rlocator_comparator(&rlocatora, &rlocatorb);
                               7600                 :                : 
 3728 andres@anarazel.de       7601         [ #  # ]:              0 :     if (ret != 0)
                               7602                 :              0 :         return ret;
                               7603                 :                : 
 1350 rhaas@postgresql.org     7604         [ #  # ]:              0 :     if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
 3728 andres@anarazel.de       7605                 :              0 :         return -1;
 1350 rhaas@postgresql.org     7606         [ #  # ]:              0 :     if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
 3728 andres@anarazel.de       7607                 :              0 :         return 1;
                               7608                 :                : 
                               7609         [ #  # ]:              0 :     if (ba->blockNum < bb->blockNum)
                               7610                 :              0 :         return -1;
                               7611         [ #  # ]:              0 :     if (ba->blockNum > bb->blockNum)
                               7612                 :              0 :         return 1;
                               7613                 :                : 
                               7614                 :              0 :     return 0;
                               7615                 :                : }
                               7616                 :                : 
                               7617                 :                : /*
                               7618                 :                :  * Comparator determining the writeout order in a checkpoint.
                               7619                 :                :  *
                               7620                 :                :  * It is important that tablespaces are compared first; the logic balancing
                               7621                 :                :  * writes between tablespaces relies on it.
                               7622                 :                :  */
                               7623                 :                : static inline int
 1880 tmunro@postgresql.or     7624                 :CBC     3503709 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
                               7625                 :                : {
                               7626                 :                :     /* compare tablespace */
 3728 andres@anarazel.de       7627         [ +  + ]:        3503709 :     if (a->tsId < b->tsId)
                               7628                 :          11200 :         return -1;
                               7629         [ +  + ]:        3492509 :     else if (a->tsId > b->tsId)
                               7630                 :          29707 :         return 1;
                               7631                 :                :     /* compare relation */
 1399 rhaas@postgresql.org     7632         [ +  + ]:        3462802 :     if (a->relNumber < b->relNumber)
 3728 andres@anarazel.de       7633                 :         973922 :         return -1;
 1399 rhaas@postgresql.org     7634         [ +  + ]:        2488880 :     else if (a->relNumber > b->relNumber)
 3728 andres@anarazel.de       7635                 :         939896 :         return 1;
                               7636                 :                :     /* compare fork */
                               7637         [ +  + ]:        1548984 :     else if (a->forkNum < b->forkNum)
                               7638                 :          64720 :         return -1;
                               7639         [ +  + ]:        1484264 :     else if (a->forkNum > b->forkNum)
                               7640                 :          73711 :         return 1;
                               7641                 :                :     /* compare block number */
                               7642         [ +  + ]:        1410553 :     else if (a->blockNum < b->blockNum)
                               7643                 :         684552 :         return -1;
 3037 tgl@sss.pgh.pa.us        7644         [ +  + ]:         726001 :     else if (a->blockNum > b->blockNum)
 3728 andres@anarazel.de       7645                 :         672254 :         return 1;
                               7646                 :                :     /* equal page IDs are unlikely, but not impossible */
 3037 tgl@sss.pgh.pa.us        7647                 :          53747 :     return 0;
                               7648                 :                : }
                               7649                 :                : 
                               7650                 :                : /*
                               7651                 :                :  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
                               7652                 :                :  * progress.
                               7653                 :                :  */
                               7654                 :                : static int
 3728 andres@anarazel.de       7655                 :         286676 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
                               7656                 :                : {
  270 peter@eisentraut.org     7657                 :GNC      286676 :     CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
                               7658                 :         286676 :     CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
                               7659                 :                : 
                               7660                 :                :     /* we want a min-heap, so return 1 when a < b */
 3728 andres@anarazel.de       7661         [ +  + ]:CBC      286676 :     if (sa->progress < sb->progress)
                               7662                 :         258702 :         return 1;
                               7663         [ +  + ]:          27974 :     else if (sa->progress == sb->progress)
                               7664                 :            780 :         return 0;
                               7665                 :                :     else
                               7666                 :          27194 :         return -1;
                               7667                 :                : }
                               7668                 :                : 
                               7669                 :                : /*
                               7670                 :                :  * Initialize a writeback context, discarding potential previous state.
                               7671                 :                :  *
                               7672                 :                :  * *max_pending is a pointer instead of an immediate value, so the coalesce
                               7673                 :                :  * limits can easily be changed by the GUC mechanism, and so calling code does
                               7674                 :                :  * not have to check the current configuration. A value of 0 means that no
                               7675                 :                :  * writeback control will be performed.
                               7676                 :                :  */
                               7677                 :                : void
                               7678                 :           3085 : WritebackContextInit(WritebackContext *context, int *max_pending)
                               7679                 :                : {
                               7680         [ -  + ]:           3085 :     Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
                               7681                 :                : 
                               7682                 :           3085 :     context->max_pending = max_pending;
                               7683                 :           3085 :     context->nr_pending = 0;
                               7684                 :           3085 : }
                               7685                 :                : 
                               7686                 :                : /*
                               7687                 :                :  * Add buffer to list of pending writeback requests.
                               7688                 :                :  */
                               7689                 :                : void
 1084                          7690                 :         707750 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
                               7691                 :                :                               BufferTag *tag)
                               7692                 :                : {
                               7693                 :                :     PendingWriteback *pending;
                               7694                 :                : 
                               7695                 :                :     /*
                               7696                 :                :      * As pg_flush_data() doesn't do anything with fsync disabled, there's no
                               7697                 :                :      * point in tracking in that case.
                               7698                 :                :      */
  574                          7699         [ +  + ]:         707750 :     if (io_direct_flags & IO_DIRECT_DATA ||
                               7700         [ +  + ]:         707219 :         !enableFsync)
 1123 tmunro@postgresql.or     7701                 :         707748 :         return;
                               7702                 :                : 
                               7703                 :                :     /*
                               7704                 :                :      * Add buffer to the pending writeback array, unless writeback control is
                               7705                 :                :      * disabled.
                               7706                 :                :      */
 1084 andres@anarazel.de       7707         [ -  + ]:              2 :     if (*wb_context->max_pending > 0)
                               7708                 :                :     {
 1084 andres@anarazel.de       7709         [ #  # ]:UBC           0 :         Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
                               7710                 :                : 
                               7711                 :              0 :         pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
                               7712                 :                : 
 3728                          7713                 :              0 :         pending->tag = *tag;
                               7714                 :                :     }
                               7715                 :                : 
                               7716                 :                :     /*
                               7717                 :                :      * Perform pending flushes if the writeback limit is exceeded. This
                               7718                 :                :      * includes the case where previously an item has been added, but control
                               7719                 :                :      * is now disabled.
                               7720                 :                :      */
 1084 andres@anarazel.de       7721         [ +  - ]:CBC           2 :     if (wb_context->nr_pending >= *wb_context->max_pending)
                               7722                 :              2 :         IssuePendingWritebacks(wb_context, io_context);
                               7723                 :                : }
                               7724                 :                : 
                               7725                 :                : #define ST_SORT sort_pending_writebacks
                               7726                 :                : #define ST_ELEMENT_TYPE PendingWriteback
                               7727                 :                : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
                               7728                 :                : #define ST_SCOPE static
                               7729                 :                : #define ST_DEFINE
                               7730                 :                : #include "lib/sort_template.h"
                               7731                 :                : 
                               7732                 :                : /*
                               7733                 :                :  * Issue all pending writeback requests, previously scheduled with
                               7734                 :                :  * ScheduleBufferTagForWriteback, to the OS.
                               7735                 :                :  *
                               7736                 :                :  * Because this is only used to improve the OS's IO scheduling, we try to never
                               7737                 :                :  * error out - it's just a hint.
                               7738                 :                :  */
                               7739                 :                : void
                               7740                 :           1208 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
                               7741                 :                : {
                               7742                 :                :     instr_time  io_start;
                               7743                 :                :     int         i;
                               7744                 :                : 
                               7745         [ +  - ]:           1208 :     if (wb_context->nr_pending == 0)
 3728                          7746                 :           1208 :         return;
                               7747                 :                : 
                               7748                 :                :     /*
                               7749                 :                :      * Executing the writes in-order can make them a lot faster, and allows us to
                               7750                 :                :      * merge writeback requests to consecutive blocks into larger writebacks.
                               7751                 :                :      */
 1084 andres@anarazel.de       7752                 :UBC           0 :     sort_pending_writebacks(wb_context->pending_writebacks,
                               7753                 :              0 :                             wb_context->nr_pending);
                               7754                 :                : 
  433 michael@paquier.xyz      7755                 :              0 :     io_start = pgstat_prepare_io_time(track_io_timing);
                               7756                 :                : 
                               7757                 :                :     /*
                               7758                 :                :      * Coalesce neighbouring writes, but nothing else. For that we iterate
                               7759                 :                :      * through the, now sorted, array of pending flushes, and look forward to
                               7760                 :                :      * find all neighbouring (or identical) writes.
                               7761                 :                :      */
 1084 andres@anarazel.de       7762         [ #  # ]:              0 :     for (i = 0; i < wb_context->nr_pending; i++)
                               7763                 :                :     {
                               7764                 :                :         PendingWriteback *cur;
                               7765                 :                :         PendingWriteback *next;
                               7766                 :                :         SMgrRelation reln;
                               7767                 :                :         int         ahead;
                               7768                 :                :         BufferTag   tag;
                               7769                 :                :         RelFileLocator currlocator;
 3728                          7770                 :              0 :         Size        nblocks = 1;
                               7771                 :                : 
 1084                          7772                 :              0 :         cur = &wb_context->pending_writebacks[i];
 3728                          7773                 :              0 :         tag = cur->tag;
 1350 rhaas@postgresql.org     7774                 :              0 :         currlocator = BufTagGetRelFileLocator(&tag);
                               7775                 :                : 
                               7776                 :                :         /*
                               7777                 :                :          * Peek ahead, into following writeback requests, to see if they can
                               7778                 :                :          * be combined with the current one.
                               7779                 :                :          */
 1084 andres@anarazel.de       7780         [ #  # ]:              0 :         for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
                               7781                 :                :         {
                               7782                 :                : 
                               7783                 :              0 :             next = &wb_context->pending_writebacks[i + ahead + 1];
                               7784                 :                : 
                               7785                 :                :             /* different file, stop */
 1350 rhaas@postgresql.org     7786   [ #  #  #  #  :              0 :             if (!RelFileLocatorEquals(currlocator,
                                              #  # ]
                               7787         [ #  # ]:              0 :                                       BufTagGetRelFileLocator(&next->tag)) ||
                               7788                 :              0 :                 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
                               7789                 :                :                 break;
                               7790                 :                : 
                               7791                 :                :             /* ok, block queued twice, skip */
 3728 andres@anarazel.de       7792         [ #  # ]:              0 :             if (cur->tag.blockNum == next->tag.blockNum)
                               7793                 :              0 :                 continue;
                               7794                 :                : 
                               7795                 :                :             /* only merge consecutive writes */
                               7796         [ #  # ]:              0 :             if (cur->tag.blockNum + 1 != next->tag.blockNum)
                               7797                 :              0 :                 break;
                               7798                 :                : 
                               7799                 :              0 :             nblocks++;
                               7800                 :              0 :             cur = next;
                               7801                 :                :         }
                               7802                 :                : 
                               7803                 :              0 :         i += ahead;
                               7804                 :                : 
                               7805                 :                :         /* and finally tell the kernel to write the data to storage */
  793 heikki.linnakangas@i     7806                 :              0 :         reln = smgropen(currlocator, INVALID_PROC_NUMBER);
 1350 rhaas@postgresql.org     7807                 :              0 :         smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
                               7808                 :                :     }
                               7809                 :                : 
                               7810                 :                :     /*
                               7811                 :                :      * Assume that writeback requests are only issued for buffers containing
                               7812                 :                :      * blocks of permanent relations.
                               7813                 :                :      */
 1084 andres@anarazel.de       7814                 :              0 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
  476 michael@paquier.xyz      7815                 :              0 :                             IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
                               7816                 :                : 
 1084 andres@anarazel.de       7817                 :              0 :     wb_context->nr_pending = 0;
                               7818                 :                : }
                               7819                 :                : 
                               7820                 :                : /* ResourceOwner callbacks */
                               7821                 :                : 
                               7822                 :                : static void
  909 heikki.linnakangas@i     7823                 :CBC          15 : ResOwnerReleaseBufferIO(Datum res)
                               7824                 :                : {
                               7825                 :             15 :     Buffer      buffer = DatumGetInt32(res);
                               7826                 :                : 
                               7827                 :             15 :     AbortBufferIO(buffer);
                               7828                 :             15 : }
                               7829                 :                : 
                               7830                 :                : static char *
  909 heikki.linnakangas@i     7831                 :UBC           0 : ResOwnerPrintBufferIO(Datum res)
                               7832                 :                : {
                               7833                 :              0 :     Buffer      buffer = DatumGetInt32(res);
                               7834                 :                : 
                               7835                 :              0 :     return psprintf("lost track of buffer IO on buffer %d", buffer);
                               7836                 :                : }
                               7837                 :                : 
                               7838                 :                : /*
                               7839                 :                :  * Release buffer as part of resource owner cleanup. This will only be called
                               7840                 :                :  * if the buffer is pinned. If this backend held the content lock at the time
                               7841                 :                :  * of the error we also need to release that (note that it is not possible to
                               7842                 :                :  * hold a content lock without a pin).
                               7843                 :                :  */
                               7844                 :                : static void
  110 andres@anarazel.de       7845                 :GNC       10609 : ResOwnerReleaseBuffer(Datum res)
                               7846                 :                : {
  909 heikki.linnakangas@i     7847                 :CBC       10609 :     Buffer      buffer = DatumGetInt32(res);
                               7848                 :                : 
                               7849                 :                :     /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
                               7850         [ -  + ]:          10609 :     if (!BufferIsValid(buffer))
  909 heikki.linnakangas@i     7851         [ #  # ]:UBC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
                               7852                 :                : 
  909 heikki.linnakangas@i     7853         [ +  + ]:CBC       10609 :     if (BufferIsLocal(buffer))
                               7854                 :           3998 :         UnpinLocalBufferNoOwner(buffer);
                               7855                 :                :     else
                               7856                 :                :     {
                               7857                 :                :         PrivateRefCountEntry *ref;
                               7858                 :                : 
  110 andres@anarazel.de       7859                 :GNC        6611 :         ref = GetPrivateRefCountEntry(buffer, false);
                               7860                 :                : 
                               7861                 :                :         /* not having a private refcount would imply resowner corruption */
                               7862         [ -  + ]:           6611 :         Assert(ref != NULL);
                               7863                 :                : 
                               7864                 :                :         /*
                               7865                 :                :          * If the buffer was locked at the time of the resowner release,
                               7866                 :                :          * release the lock now. This should only happen after errors.
                               7867                 :                :          */
                               7868         [ +  + ]:           6611 :         if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
                               7869                 :                :         {
                               7870                 :            116 :             BufferDesc *buf = GetBufferDescriptor(buffer - 1);
                               7871                 :                : 
                               7872                 :            116 :             HOLD_INTERRUPTS();  /* match the upcoming RESUME_INTERRUPTS */
                               7873                 :            116 :             BufferLockUnlock(buffer, buf);
                               7874                 :                :         }
                               7875                 :                : 
  909 heikki.linnakangas@i     7876                 :CBC        6611 :         UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
                               7877                 :                :     }
                               7878                 :          10609 : }
                               7879                 :                : 
                               7880                 :                : static char *
  110 andres@anarazel.de       7881                 :UNC           0 : ResOwnerPrintBuffer(Datum res)
                               7882                 :                : {
  909 heikki.linnakangas@i     7883                 :UBC           0 :     return DebugPrintBufferRefcount(DatumGetInt32(res));
                               7884                 :                : }
                               7885                 :                : 
                               7886                 :                : /*
                               7887                 :                :  * Helper function to evict unpinned buffer whose buffer header lock is
                               7888                 :                :  * already acquired.
                               7889                 :                :  */
                               7890                 :                : static bool
  392 andres@anarazel.de       7891                 :CBC        2796 : EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
                               7892                 :                : {
                               7893                 :                :     uint64      buf_state;
                               7894                 :                :     bool        result;
                               7895                 :                : 
                               7896                 :           2796 :     *buffer_flushed = false;
                               7897                 :                : 
  110 andres@anarazel.de       7898                 :GNC        2796 :     buf_state = pg_atomic_read_u64(&(desc->state));
  392 andres@anarazel.de       7899         [ -  + ]:CBC        2796 :     Assert(buf_state & BM_LOCKED);
                               7900                 :                : 
  758 tmunro@postgresql.or     7901         [ -  + ]:           2796 :     if ((buf_state & BM_VALID) == 0)
                               7902                 :                :     {
  180 andres@anarazel.de       7903                 :UNC           0 :         UnlockBufHdr(desc);
  758 tmunro@postgresql.or     7904                 :UBC           0 :         return false;
                               7905                 :                :     }
                               7906                 :                : 
                               7907                 :                :     /* Check that it's not pinned already. */
  758 tmunro@postgresql.or     7908         [ -  + ]:CBC        2796 :     if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
                               7909                 :                :     {
  180 andres@anarazel.de       7910                 :UNC           0 :         UnlockBufHdr(desc);
  758 tmunro@postgresql.or     7911                 :UBC           0 :         return false;
                               7912                 :                :     }
                               7913                 :                : 
  758 tmunro@postgresql.or     7914                 :CBC        2796 :     PinBuffer_Locked(desc);     /* releases spinlock */
                               7915                 :                : 
                               7916                 :                :     /* If it was dirty, try to clean it once. */
                               7917         [ +  + ]:           2796 :     if (buf_state & BM_DIRTY)
                               7918                 :                :     {
  209 andres@anarazel.de       7919                 :GNC        1143 :         FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
  392 andres@anarazel.de       7920                 :CBC        1143 :         *buffer_flushed = true;
                               7921                 :                :     }
                               7922                 :                : 
                               7923                 :                :     /* This will return false if it becomes dirty or someone else pins it. */
  758 tmunro@postgresql.or     7924                 :           2796 :     result = InvalidateVictimBuffer(desc);
                               7925                 :                : 
                               7926                 :           2796 :     UnpinBuffer(desc);
                               7927                 :                : 
                               7928                 :           2796 :     return result;
                               7929                 :                : }
                               7930                 :                : 
                               7931                 :                : /*
                               7932                 :                :  * Try to evict the current block in a shared buffer.
                               7933                 :                :  *
                               7934                 :                :  * This function is intended for testing/development use only!
                               7935                 :                :  *
                               7936                 :                :  * To succeed, the buffer must not be pinned on entry, so if the caller had a
                               7937                 :                :  * particular block in mind, it might already have been replaced by some other
                               7938                 :                :  * block by the time this function runs.  It's also unpinned on return, so the
                               7939                 :                :  * buffer might be occupied again by the time control is returned, potentially
                               7940                 :                :  * even by the same block.  This inherent raciness without other interlocking
                               7941                 :                :  * makes the function unsuitable for non-testing usage.
                               7942                 :                :  *
                               7943                 :                :  * *buffer_flushed is set to true if the buffer was dirty and has been
                               7944                 :                :  * flushed, false otherwise.  However, *buffer_flushed=true does not
                               7945                 :                :  * necessarily mean that we flushed the buffer; it could have been flushed by
                               7946                 :                :  * someone else.
                               7947                 :                :  *
                               7948                 :                :  * Returns true if the buffer was valid and it has now been made invalid.
                               7949                 :                :  * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
                               7950                 :                :  * or if the buffer becomes dirty again while we're trying to write it out.
                               7951                 :                :  */
                               7952                 :                : bool
  392 andres@anarazel.de       7953                 :            205 : EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
                               7954                 :                : {
                               7955                 :                :     BufferDesc *desc;
                               7956                 :                : 
                               7957   [ +  -  -  + ]:            205 :     Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
                               7958                 :                : 
                               7959                 :                :     /* Make sure we can pin the buffer. */
                               7960                 :            205 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               7961                 :            205 :     ReservePrivateRefCountEntry();
                               7962                 :                : 
                               7963                 :            205 :     desc = GetBufferDescriptor(buf - 1);
                               7964                 :            205 :     LockBufHdr(desc);
                               7965                 :                : 
                               7966                 :            205 :     return EvictUnpinnedBufferInternal(desc, buffer_flushed);
                               7967                 :                : }
                               7968                 :                : 
                               7969                 :                : /*
                               7970                 :                :  * Try to evict all the shared buffers.
                               7971                 :                :  *
                               7972                 :                :  * This function is intended for testing/development use only! See
                               7973                 :                :  * EvictUnpinnedBuffer().
                               7974                 :                :  *
                               7975                 :                :  * The buffers_* parameters are mandatory and indicate the total count of
                               7976                 :                :  * buffers that:
                               7977                 :                :  * - buffers_evicted - were evicted
                               7978                 :                :  * - buffers_flushed - were flushed
                               7979                 :                :  * - buffers_skipped - could not be evicted
                               7980                 :                :  */
                               7981                 :                : void
                               7982                 :              1 : EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
                               7983                 :                :                         int32 *buffers_skipped)
                               7984                 :                : {
                               7985                 :              1 :     *buffers_evicted = 0;
                               7986                 :              1 :     *buffers_skipped = 0;
                               7987                 :              1 :     *buffers_flushed = 0;
                               7988                 :                : 
                               7989         [ +  + ]:          16385 :     for (int buf = 1; buf <= NBuffers; buf++)
                               7990                 :                :     {
                               7991                 :          16384 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
                               7992                 :                :         uint64      buf_state;
                               7993                 :                :         bool        buffer_flushed;
                               7994                 :                : 
  182 msawada@postgresql.o     7995         [ -  + ]:          16384 :         CHECK_FOR_INTERRUPTS();
                               7996                 :                : 
  110 andres@anarazel.de       7997                 :GNC       16384 :         buf_state = pg_atomic_read_u64(&desc->state);
  392 andres@anarazel.de       7998         [ +  + ]:CBC       16384 :         if (!(buf_state & BM_VALID))
                               7999                 :          14316 :             continue;
                               8000                 :                : 
                               8001                 :           2068 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               8002                 :           2068 :         ReservePrivateRefCountEntry();
                               8003                 :                : 
                               8004                 :           2068 :         LockBufHdr(desc);
                               8005                 :                : 
                               8006         [ +  - ]:           2068 :         if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
                               8007                 :           2068 :             (*buffers_evicted)++;
                               8008                 :                :         else
  392 andres@anarazel.de       8009                 :UBC           0 :             (*buffers_skipped)++;
                               8010                 :                : 
  392 andres@anarazel.de       8011         [ +  + ]:CBC        2068 :         if (buffer_flushed)
                               8012                 :            998 :             (*buffers_flushed)++;
                               8013                 :                :     }
                               8014                 :              1 : }
                               8015                 :                : 
                               8016                 :                : /*
                               8017                 :                :  * Try to evict all the shared buffers containing provided relation's pages.
                               8018                 :                :  *
                               8019                 :                :  * This function is intended for testing/development use only! See
                               8020                 :                :  * EvictUnpinnedBuffer().
                               8021                 :                :  *
                               8022                 :                :  * The caller must hold at least AccessShareLock on the relation to prevent
                               8023                 :                :  * the relation from being dropped.
                               8024                 :                :  *
                               8025                 :                :  * The buffers_* parameters are mandatory and indicate the total count of
                               8026                 :                :  * buffers that:
                               8027                 :                :  * - buffers_evicted - were evicted
                               8028                 :                :  * - buffers_flushed - were flushed
                               8029                 :                :  * - buffers_skipped - could not be evicted
                               8030                 :                :  */
                               8031                 :                : void
                               8032                 :             44 : EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
                               8033                 :                :                         int32 *buffers_flushed, int32 *buffers_skipped)
                               8034                 :                : {
                               8035         [ -  + ]:             44 :     Assert(!RelationUsesLocalBuffers(rel));
                               8036                 :                : 
                               8037                 :             44 :     *buffers_skipped = 0;
                               8038                 :             44 :     *buffers_evicted = 0;
                               8039                 :             44 :     *buffers_flushed = 0;
                               8040                 :                : 
                               8041         [ +  + ]:         720940 :     for (int buf = 1; buf <= NBuffers; buf++)
                               8042                 :                :     {
                               8043                 :         720896 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
  110 andres@anarazel.de       8044                 :GNC      720896 :         uint64      buf_state = pg_atomic_read_u64(&(desc->state));
                               8045                 :                :         bool        buffer_flushed;
                               8046                 :                : 
  182 msawada@postgresql.o     8047         [ -  + ]:CBC      720896 :         CHECK_FOR_INTERRUPTS();
                               8048                 :                : 
                               8049                 :                :         /* An unlocked precheck should be safe and saves some cycles. */
  392 andres@anarazel.de       8050         [ +  + ]:         720896 :         if ((buf_state & BM_VALID) == 0 ||
                               8051         [ +  + ]:          98775 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
                               8052                 :         720373 :             continue;
                               8053                 :                : 
                               8054                 :                :         /* Make sure we can pin the buffer. */
  392 andres@anarazel.de       8055                 :GBC         523 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               8056                 :            523 :         ReservePrivateRefCountEntry();
                               8057                 :                : 
                               8058                 :            523 :         buf_state = LockBufHdr(desc);
                               8059                 :                : 
                               8060                 :                :         /* recheck, could have changed without the lock */
                               8061         [ +  - ]:            523 :         if ((buf_state & BM_VALID) == 0 ||
                               8062         [ -  + ]:            523 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
                               8063                 :                :         {
  180 andres@anarazel.de       8064                 :UNC           0 :             UnlockBufHdr(desc);
  392 andres@anarazel.de       8065                 :UBC           0 :             continue;
                               8066                 :                :         }
                               8067                 :                : 
  392 andres@anarazel.de       8068         [ +  - ]:GBC         523 :         if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
                               8069                 :            523 :             (*buffers_evicted)++;
                               8070                 :                :         else
  392 andres@anarazel.de       8071                 :UBC           0 :             (*buffers_skipped)++;
                               8072                 :                : 
  392 andres@anarazel.de       8073         [ +  + ]:GBC         523 :         if (buffer_flushed)
                               8074                 :            117 :             (*buffers_flushed)++;
                               8075                 :                :     }
  392 andres@anarazel.de       8076                 :CBC          44 : }
                               8077                 :                : 
                               8078                 :                : /*
                               8079                 :                :  * Helper function to mark an unpinned buffer dirty.  The caller must already
                               8080                 :                :  * hold the buffer header lock.
                               8081                 :                :  */
                               8082                 :                : static bool
  158 michael@paquier.xyz      8083                 :GNC          36 : MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
                               8084                 :                :                                 bool *buffer_already_dirty)
                               8085                 :                : {
                               8086                 :                :     uint64      buf_state;
                               8087                 :             36 :     bool        result = false;
                               8088                 :                : 
                               8089                 :             36 :     *buffer_already_dirty = false;
                               8090                 :                : 
  110 andres@anarazel.de       8091                 :             36 :     buf_state = pg_atomic_read_u64(&(desc->state));
  158 michael@paquier.xyz      8092         [ -  + ]:             36 :     Assert(buf_state & BM_LOCKED);
                               8093                 :                : 
                               8094         [ +  + ]:             36 :     if ((buf_state & BM_VALID) == 0)
                               8095                 :                :     {
                               8096                 :              1 :         UnlockBufHdr(desc);
                               8097                 :              1 :         return false;
                               8098                 :                :     }
                               8099                 :                : 
                               8100                 :                :     /* Check that it's not pinned already. */
                               8101         [ -  + ]:             35 :     if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
                               8102                 :                :     {
  158 michael@paquier.xyz      8103                 :UNC           0 :         UnlockBufHdr(desc);
                               8104                 :              0 :         return false;
                               8105                 :                :     }
                               8106                 :                : 
                               8107                 :                :     /* Pin the buffer and then release the buffer spinlock */
  158 michael@paquier.xyz      8108                 :GNC          35 :     PinBuffer_Locked(desc);
                               8109                 :                : 
                               8110                 :                :     /* If it was not already dirty, mark it as dirty. */
                               8111         [ +  + ]:             35 :     if (!(buf_state & BM_DIRTY))
                               8112                 :                :     {
  110 andres@anarazel.de       8113                 :             16 :         BufferLockAcquire(buf, desc, BUFFER_LOCK_EXCLUSIVE);
  158 michael@paquier.xyz      8114                 :             16 :         MarkBufferDirty(buf);
                               8115                 :             16 :         result = true;
  110 andres@anarazel.de       8116                 :             16 :         BufferLockUnlock(buf, desc);
                               8117                 :                :     }
                               8118                 :                :     else
  158 michael@paquier.xyz      8119                 :             19 :         *buffer_already_dirty = true;
                               8120                 :                : 
                               8121                 :             35 :     UnpinBuffer(desc);
                               8122                 :                : 
                               8123                 :             35 :     return result;
                               8124                 :                : }
                               8125                 :                : 
                               8126                 :                : /*
                               8127                 :                :  * Try to mark the provided shared buffer as dirty.
                               8128                 :                :  *
                               8129                 :                :  * This function is intended for testing/development use only!
                               8130                 :                :  *
                               8131                 :                :  * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
                               8132                 :                :  *
                               8133                 :                :  * The buffer_already_dirty parameter is mandatory and indicates whether the
                               8134                 :                :  * buffer could not be dirtied because it was already dirty.
                               8135                 :                :  *
                               8136                 :                :  * Returns true if the buffer has successfully been marked as dirty.
                               8137                 :                :  */
                               8138                 :                : bool
                               8139                 :              1 : MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
                               8140                 :                : {
                               8141                 :                :     BufferDesc *desc;
                               8142                 :              1 :     bool        buffer_dirtied = false;
                               8143                 :                : 
                               8144         [ -  + ]:              1 :     Assert(!BufferIsLocal(buf));
                               8145                 :                : 
                               8146                 :                :     /* Make sure we can pin the buffer. */
                               8147                 :              1 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               8148                 :              1 :     ReservePrivateRefCountEntry();
                               8149                 :                : 
                               8150                 :              1 :     desc = GetBufferDescriptor(buf - 1);
                               8151                 :              1 :     LockBufHdr(desc);
                               8152                 :                : 
                               8153                 :              1 :     buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
                               8154                 :                :     /* Both cannot be true at the same time */
                               8155   [ -  +  -  - ]:              1 :     Assert(!(buffer_dirtied && *buffer_already_dirty));
                               8156                 :                : 
                               8157                 :              1 :     return buffer_dirtied;
                               8158                 :                : }
                               8159                 :                : 
                               8160                 :                : /*
                               8161                 :                :  * Try to mark all the shared buffers containing provided relation's pages as
                               8162                 :                :  * dirty.
                               8163                 :                :  *
                               8164                 :                :  * This function is intended for testing/development use only! See
                               8165                 :                :  * MarkDirtyUnpinnedBuffer().
                               8166                 :                :  *
                               8167                 :                :  * The buffers_* parameters are mandatory and indicate the total count of
                               8168                 :                :  * buffers that:
                               8169                 :                :  * - buffers_dirtied - were dirtied
                               8170                 :                :  * - buffers_already_dirty - were already dirty
                               8171                 :                :  * - buffers_skipped - could not be dirtied for a reason other than the
                               8172                 :                :  * buffer already being dirty.
                               8173                 :                :  */
                               8174                 :                : void
                               8175                 :              1 : MarkDirtyRelUnpinnedBuffers(Relation rel,
                               8176                 :                :                             int32 *buffers_dirtied,
                               8177                 :                :                             int32 *buffers_already_dirty,
                               8178                 :                :                             int32 *buffers_skipped)
                               8179                 :                : {
                               8180         [ -  + ]:              1 :     Assert(!RelationUsesLocalBuffers(rel));
                               8181                 :                : 
                               8182                 :              1 :     *buffers_dirtied = 0;
                               8183                 :              1 :     *buffers_already_dirty = 0;
                               8184                 :              1 :     *buffers_skipped = 0;
                               8185                 :                : 
                               8186         [ +  + ]:          16385 :     for (int buf = 1; buf <= NBuffers; buf++)
                               8187                 :                :     {
                               8188                 :          16384 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
  110 andres@anarazel.de       8189                 :          16384 :         uint64      buf_state = pg_atomic_read_u64(&(desc->state));
                               8190                 :                :         bool        buffer_already_dirty;
                               8191                 :                : 
  158 michael@paquier.xyz      8192         [ -  + ]:          16384 :         CHECK_FOR_INTERRUPTS();
                               8193                 :                : 
                               8194                 :                :         /* An unlocked precheck should be safe and saves some cycles. */
                               8195         [ +  + ]:          16384 :         if ((buf_state & BM_VALID) == 0 ||
                               8196         [ +  - ]:             27 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
                               8197                 :          16384 :             continue;
                               8198                 :                : 
                               8199                 :                :         /* Make sure we can pin the buffer. */
  158 michael@paquier.xyz      8200                 :UNC           0 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               8201                 :              0 :         ReservePrivateRefCountEntry();
                               8202                 :                : 
                               8203                 :              0 :         buf_state = LockBufHdr(desc);
                               8204                 :                : 
                               8205                 :                :         /* recheck, could have changed without the lock */
                               8206         [ #  # ]:              0 :         if ((buf_state & BM_VALID) == 0 ||
                               8207         [ #  # ]:              0 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
                               8208                 :                :         {
                               8209                 :              0 :             UnlockBufHdr(desc);
                               8210                 :              0 :             continue;
                               8211                 :                :         }
                               8212                 :                : 
                               8213         [ #  # ]:              0 :         if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
                               8214                 :              0 :             (*buffers_dirtied)++;
                               8215         [ #  # ]:              0 :         else if (buffer_already_dirty)
                               8216                 :              0 :             (*buffers_already_dirty)++;
                               8217                 :                :         else
                               8218                 :              0 :             (*buffers_skipped)++;
                               8219                 :                :     }
  158 michael@paquier.xyz      8220                 :GNC           1 : }
                               8221                 :                : 
                               8222                 :                : /*
                               8223                 :                :  * Try to mark all the shared buffers as dirty.
                               8224                 :                :  *
                               8225                 :                :  * This function is intended for testing/development use only! See
                               8226                 :                :  * MarkDirtyUnpinnedBuffer().
                               8227                 :                :  *
                               8228                 :                :  * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
                               8229                 :                :  * parameters.
                               8230                 :                :  */
                               8231                 :                : void
                               8232                 :              1 : MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
                               8233                 :                :                             int32 *buffers_already_dirty,
                               8234                 :                :                             int32 *buffers_skipped)
                               8235                 :                : {
                               8236                 :              1 :     *buffers_dirtied = 0;
                               8237                 :              1 :     *buffers_already_dirty = 0;
                               8238                 :              1 :     *buffers_skipped = 0;
                               8239                 :                : 
                               8240         [ +  + ]:          16385 :     for (int buf = 1; buf <= NBuffers; buf++)
                               8241                 :                :     {
                               8242                 :          16384 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
                               8243                 :                :         uint64      buf_state;
                               8244                 :                :         bool        buffer_already_dirty;
                               8245                 :                : 
                               8246         [ -  + ]:          16384 :         CHECK_FOR_INTERRUPTS();
                               8247                 :                : 
  110 andres@anarazel.de       8248                 :          16384 :         buf_state = pg_atomic_read_u64(&desc->state);
  158 michael@paquier.xyz      8249         [ +  + ]:          16384 :         if (!(buf_state & BM_VALID))
                               8250                 :          16349 :             continue;
                               8251                 :                : 
                               8252                 :             35 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               8253                 :             35 :         ReservePrivateRefCountEntry();
                               8254                 :                : 
                               8255                 :             35 :         LockBufHdr(desc);
                               8256                 :                : 
                               8257         [ +  + ]:             35 :         if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
                               8258                 :             16 :             (*buffers_dirtied)++;
                               8259         [ +  - ]:             19 :         else if (buffer_already_dirty)
                               8260                 :             19 :             (*buffers_already_dirty)++;
                               8261                 :                :         else
  158 michael@paquier.xyz      8262                 :UNC           0 :             (*buffers_skipped)++;
                               8263                 :                :     }
  158 michael@paquier.xyz      8264                 :GNC           1 : }
                               8265                 :                : 
                               8266                 :                : /*
                               8267                 :                :  * Generic implementation of the AIO handle staging callback for readv/writev
                               8268                 :                :  * on local/shared buffers.
                               8269                 :                :  *
                               8270                 :                :  * Each readv/writev can target multiple buffers. The buffers have already
                               8271                 :                :  * been registered with the IO handle.
                               8272                 :                :  *
                               8273                 :                :  * To make the IO ready for execution ("staging"), we need to ensure that the
                               8274                 :                :  * targeted buffers are in an appropriate state while the IO is ongoing. For
                               8275                 :                :  * that the AIO subsystem needs to have its own buffer pin, otherwise an error
                               8276                 :                :  * in this backend could lead to this backend's buffer pin being released as
                               8277                 :                :  * part of error handling, which in turn could lead to the buffer being
                               8278                 :                :  * replaced while IO is ongoing.
                               8279                 :                :  */
                               8280                 :                : static pg_attribute_always_inline void
  401 andres@anarazel.de       8281                 :CBC     1472227 : buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
                               8282                 :                : {
                               8283                 :                :     uint64     *io_data;
                               8284                 :                :     uint8       handle_data_len;
                               8285                 :                :     PgAioWaitRef io_ref;
                               8286                 :        1472227 :     BufferTag   first PG_USED_FOR_ASSERTS_ONLY = {0};
                               8287                 :                : 
                               8288                 :        1472227 :     io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
                               8289                 :                : 
                               8290                 :        1472227 :     pgaio_io_get_wref(ioh, &io_ref);
                               8291                 :                : 
                               8292                 :                :     /* iterate over all buffers affected by the vectored readv/writev */
                               8293         [ +  + ]:        3126562 :     for (int i = 0; i < handle_data_len; i++)
                               8294                 :                :     {
                               8295                 :        1654335 :         Buffer      buffer = (Buffer) io_data[i];
                               8296                 :        1654335 :         BufferDesc *buf_hdr = is_temp ?
                               8297                 :          11130 :             GetLocalBufferDescriptor(-buffer - 1)
                               8298         [ +  + ]:        1654335 :             : GetBufferDescriptor(buffer - 1);
                               8299                 :                :         uint64      buf_state;
                               8300                 :                : 
                               8301                 :                :         /*
                               8302                 :                :          * Check that all the buffers are actually ones that could conceivably
                               8303                 :                :          * be done in one IO, i.e. are sequential. This is the last
                               8304                 :                :          * buffer-aware code before IO is actually executed and confusion
                               8305                 :                :          * about which buffers are targeted by IO can be hard to debug, making
                               8306                 :                :          * it worth doing extra-paranoid checks.
                               8307                 :                :          */
                               8308         [ +  + ]:        1654335 :         if (i == 0)
                               8309                 :        1472227 :             first = buf_hdr->tag;
                               8310                 :                :         else
                               8311                 :                :         {
                               8312         [ -  + ]:         182108 :             Assert(buf_hdr->tag.relNumber == first.relNumber);
                               8313         [ -  + ]:         182108 :             Assert(buf_hdr->tag.blockNum == first.blockNum + i);
                               8314                 :                :         }
                               8315                 :                : 
                               8316         [ +  + ]:        1654335 :         if (is_temp)
  110 andres@anarazel.de       8317                 :GNC       11130 :             buf_state = pg_atomic_read_u64(&buf_hdr->state);
                               8318                 :                :         else
  401 andres@anarazel.de       8319                 :CBC     1643205 :             buf_state = LockBufHdr(buf_hdr);
                               8320                 :                : 
                               8321                 :                :         /* verify the buffer is in the expected state */
                               8322         [ -  + ]:        1654335 :         Assert(buf_state & BM_TAG_VALID);
                               8323         [ -  + ]:        1654335 :         if (is_write)
                               8324                 :                :         {
  401 andres@anarazel.de       8325         [ #  # ]:UBC           0 :             Assert(buf_state & BM_VALID);
                               8326         [ #  # ]:              0 :             Assert(buf_state & BM_DIRTY);
                               8327                 :                :         }
                               8328                 :                :         else
                               8329                 :                :         {
  401 andres@anarazel.de       8330         [ -  + ]:CBC     1654335 :             Assert(!(buf_state & BM_VALID));
                               8331         [ -  + ]:        1654335 :             Assert(!(buf_state & BM_DIRTY));
                               8332                 :                :         }
                               8333                 :                : 
                               8334                 :                :         /* temp buffers don't use BM_IO_IN_PROGRESS */
                               8335         [ +  + ]:        1654335 :         if (!is_temp)
                               8336         [ -  + ]:        1643205 :             Assert(buf_state & BM_IO_IN_PROGRESS);
                               8337                 :                : 
                               8338         [ -  + ]:        1654335 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
                               8339                 :                : 
                               8340                 :                :         /*
                               8341                 :                :          * Reflect that the buffer is now owned by the AIO subsystem.
                               8342                 :                :          *
                               8343                 :                :          * For local buffers: This can't be done just via LocalRefCount, as
                               8344                 :                :          * one might initially think, as this backend could error out while
                               8345                 :                :          * AIO is still in progress, releasing all the pins by the backend
                               8346                 :                :          * itself.
                               8347                 :                :          *
                               8348                 :                :          * This pin is released again in TerminateBufferIO().
                               8349                 :                :          */
                               8350                 :        1654335 :         buf_hdr->io_wref = io_ref;
                               8351                 :                : 
                               8352         [ +  + ]:        1654335 :         if (is_temp)
                               8353                 :                :         {
  180 andres@anarazel.de       8354                 :GNC       11130 :             buf_state += BUF_REFCOUNT_ONE;
  110                          8355                 :          11130 :             pg_atomic_unlocked_write_u64(&buf_hdr->state, buf_state);
                               8356                 :                :         }
                               8357                 :                :         else
  180                          8358                 :        1643205 :             UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1);
                               8359                 :                : 
                               8360                 :                :         /*
                               8361                 :                :          * Ensure the content lock that prevents buffer modifications while
                               8362                 :                :          * the buffer is being written out is not released early due to an
                               8363                 :                :          * error.
                               8364                 :                :          */
  401 andres@anarazel.de       8365   [ -  +  -  - ]:CBC     1654335 :         if (is_write && !is_temp)
                               8366                 :                :         {
  110 andres@anarazel.de       8367         [ #  # ]:UNC           0 :             Assert(BufferLockHeldByMe(buf_hdr));
                               8368                 :                : 
                               8369                 :                :             /*
                               8370                 :                :              * Lock is now owned by AIO subsystem.
                               8371                 :                :              */
                               8372                 :              0 :             BufferLockDisown(buffer, buf_hdr);
                               8373                 :                :         }
                               8374                 :                : 
                               8375                 :                :         /*
                               8376                 :                :          * Stop tracking this buffer via the resowner - the AIO system now
                               8377                 :                :          * keeps track.
                               8378                 :                :          */
  401 andres@anarazel.de       8379         [ +  + ]:CBC     1654335 :         if (!is_temp)
                               8380                 :        1643205 :             ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
                               8381                 :                :     }
                               8382                 :        1472227 : }
                               8383                 :                : 
                               8384                 :                : /*
                               8385                 :                :  * Decode readv errors as encoded by buffer_readv_encode_error().
                               8386                 :                :  */
                               8387                 :                : static inline void
                               8388                 :            962 : buffer_readv_decode_error(PgAioResult result,
                               8389                 :                :                           bool *zeroed_any,
                               8390                 :                :                           bool *ignored_any,
                               8391                 :                :                           uint8 *zeroed_or_error_count,
                               8392                 :                :                           uint8 *checkfail_count,
                               8393                 :                :                           uint8 *first_off)
                               8394                 :                : {
                               8395                 :            962 :     uint32      rem_error = result.error_data;
                               8396                 :                : 
                               8397                 :                :     /* see static asserts in buffer_readv_encode_error */
                               8398                 :                : #define READV_COUNT_BITS    7
                               8399                 :                : #define READV_COUNT_MASK    ((1 << READV_COUNT_BITS) - 1)
                               8400                 :                : 
                               8401                 :            962 :     *zeroed_any = rem_error & 1;
                               8402                 :            962 :     rem_error >>= 1;
                               8403                 :                : 
                               8404                 :            962 :     *ignored_any = rem_error & 1;
                               8405                 :            962 :     rem_error >>= 1;
                               8406                 :                : 
                               8407                 :            962 :     *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
                               8408                 :            962 :     rem_error >>= READV_COUNT_BITS;
                               8409                 :                : 
                               8410                 :            962 :     *checkfail_count = rem_error & READV_COUNT_MASK;
                               8411                 :            962 :     rem_error >>= READV_COUNT_BITS;
                               8412                 :                : 
                               8413                 :            962 :     *first_off = rem_error & READV_COUNT_MASK;
                               8414                 :            962 :     rem_error >>= READV_COUNT_BITS;
                               8415                 :            962 : }
                               8416                 :                : 
                               8417                 :                : /*
                               8418                 :                :  * Helper to encode errors for buffer_readv_complete()
                               8419                 :                :  *
                               8420                 :                :  * Errors are encoded as follows:
                               8421                 :                :  * - bit 0 indicates whether any page was zeroed (1) or not (0)
                               8422                 :                :  * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
                               8423                 :                :  * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
                               8424                 :                :  * - next READV_COUNT_BITS bits indicate the number of checksum failures
                               8425                 :                :  * - next READV_COUNT_BITS bits indicate the first offset of the first page
                               8426                 :                :  *   that was errored or zeroed or, if no errors/zeroes, the first ignored
                               8427                 :                :  *   checksum
                               8428                 :                :  */
                               8429                 :                : static inline void
                               8430                 :            288 : buffer_readv_encode_error(PgAioResult *result,
                               8431                 :                :                           bool is_temp,
                               8432                 :                :                           bool zeroed_any,
                               8433                 :                :                           bool ignored_any,
                               8434                 :                :                           uint8 error_count,
                               8435                 :                :                           uint8 zeroed_count,
                               8436                 :                :                           uint8 checkfail_count,
                               8437                 :                :                           uint8 first_error_off,
                               8438                 :                :                           uint8 first_zeroed_off,
                               8439                 :                :                           uint8 first_ignored_off)
                               8440                 :                : {
                               8441                 :                : 
                               8442                 :            288 :     uint8       shift = 0;
                               8443         [ +  + ]:            288 :     uint8       zeroed_or_error_count =
                               8444                 :                :         error_count > 0 ? error_count : zeroed_count;
                               8445                 :                :     uint8       first_off;
                               8446                 :                : 
                               8447                 :                :     StaticAssertDecl(PG_IOV_MAX <= 1 << READV_COUNT_BITS,
                               8448                 :                :                      "PG_IOV_MAX is bigger than reserved space for error data");
                               8449                 :                :     StaticAssertDecl((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS,
                               8450                 :                :                      "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
                               8451                 :                : 
                               8452                 :                :     /*
                               8453                 :                :      * We only have space to encode one offset - but luckily that's good
                               8454                 :                :      * enough. If there is an error, the error is the interesting offset, same
                               8455                 :                :      * with a zeroed buffer vs an ignored buffer.
                               8456                 :                :      */
                               8457         [ +  + ]:            288 :     if (error_count > 0)
                               8458                 :            141 :         first_off = first_error_off;
                               8459         [ +  + ]:            147 :     else if (zeroed_count > 0)
                               8460                 :            120 :         first_off = first_zeroed_off;
                               8461                 :                :     else
                               8462                 :             27 :         first_off = first_ignored_off;
                               8463                 :                : 
                               8464   [ +  +  -  + ]:            288 :     Assert(!zeroed_any || error_count == 0);
                               8465                 :                : 
                               8466                 :            288 :     result->error_data = 0;
                               8467                 :                : 
                               8468                 :            288 :     result->error_data |= zeroed_any << shift;
                               8469                 :            288 :     shift += 1;
                               8470                 :                : 
                               8471                 :            288 :     result->error_data |= ignored_any << shift;
                               8472                 :            288 :     shift += 1;
                               8473                 :                : 
                               8474                 :            288 :     result->error_data |= ((uint32) zeroed_or_error_count) << shift;
                               8475                 :            288 :     shift += READV_COUNT_BITS;
                               8476                 :                : 
                               8477                 :            288 :     result->error_data |= ((uint32) checkfail_count) << shift;
                               8478                 :            288 :     shift += READV_COUNT_BITS;
                               8479                 :                : 
                               8480                 :            288 :     result->error_data |= ((uint32) first_off) << shift;
                               8481                 :            288 :     shift += READV_COUNT_BITS;
                               8482                 :                : 
                               8483         [ +  + ]:            288 :     result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
                               8484                 :                :         PGAIO_HCB_SHARED_BUFFER_READV;
                               8485                 :                : 
                               8486         [ +  + ]:            288 :     if (error_count > 0)
                               8487                 :            141 :         result->status = PGAIO_RS_ERROR;
                               8488                 :                :     else
                               8489                 :            147 :         result->status = PGAIO_RS_WARNING;
                               8490                 :                : 
                               8491                 :                :     /*
                               8492                 :                :      * The encoding is complicated enough to warrant cross-checking it against
                               8493                 :                :      * the decode function.
                               8494                 :                :      */
                               8495                 :                : #ifdef USE_ASSERT_CHECKING
                               8496                 :                :     {
                               8497                 :                :         bool        zeroed_any_2,
                               8498                 :                :                     ignored_any_2;
                               8499                 :                :         uint8       zeroed_or_error_count_2,
                               8500                 :                :                     checkfail_count_2,
                               8501                 :                :                     first_off_2;
                               8502                 :                : 
                               8503                 :            288 :         buffer_readv_decode_error(*result,
                               8504                 :                :                                   &zeroed_any_2, &ignored_any_2,
                               8505                 :                :                                   &zeroed_or_error_count_2,
                               8506                 :                :                                   &checkfail_count_2,
                               8507                 :                :                                   &first_off_2);
                               8508         [ -  + ]:            288 :         Assert(zeroed_any == zeroed_any_2);
                               8509         [ -  + ]:            288 :         Assert(ignored_any == ignored_any_2);
                               8510         [ -  + ]:            288 :         Assert(zeroed_or_error_count == zeroed_or_error_count_2);
                               8511         [ -  + ]:            288 :         Assert(checkfail_count == checkfail_count_2);
                               8512         [ -  + ]:            288 :         Assert(first_off == first_off_2);
                               8513                 :                :     }
                               8514                 :                : #endif
                               8515                 :                : 
                               8516                 :                : #undef READV_COUNT_BITS
                               8517                 :                : #undef READV_COUNT_MASK
                               8518                 :            288 : }
                               8519                 :                : 
                               8520                 :                : /*
                               8521                 :                :  * Helper for AIO readv completion callbacks, supporting both shared and temp
                               8522                 :                :  * buffers. Gets called once for each buffer in a multi-page read.
                               8523                 :                :  */
                               8524                 :                : static pg_attribute_always_inline void
                               8525                 :        1485336 : buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
                               8526                 :                :                           uint8 flags, bool failed, bool is_temp,
                               8527                 :                :                           bool *buffer_invalid,
                               8528                 :                :                           bool *failed_checksum,
                               8529                 :                :                           bool *ignored_checksum,
                               8530                 :                :                           bool *zeroed_buffer)
                               8531                 :                : {
                               8532                 :        1485336 :     BufferDesc *buf_hdr = is_temp ?
                               8533                 :          11130 :         GetLocalBufferDescriptor(-buffer - 1)
                               8534         [ +  + ]:        1485336 :         : GetBufferDescriptor(buffer - 1);
                               8535                 :        1485336 :     BufferTag   tag = buf_hdr->tag;
                               8536                 :        1485336 :     char       *bufdata = BufferGetBlock(buffer);
                               8537                 :                :     uint64      set_flag_bits;
                               8538                 :                :     int         piv_flags;
                               8539                 :                : 
                               8540                 :                :     /* check that the buffer is in the expected state for a read */
                               8541                 :                : #ifdef USE_ASSERT_CHECKING
                               8542                 :                :     {
  110 andres@anarazel.de       8543                 :GNC     1485336 :         uint64      buf_state = pg_atomic_read_u64(&buf_hdr->state);
                               8544                 :                : 
  401 andres@anarazel.de       8545         [ -  + ]:CBC     1485336 :         Assert(buf_state & BM_TAG_VALID);
                               8546         [ -  + ]:        1485336 :         Assert(!(buf_state & BM_VALID));
                               8547                 :                :         /* temp buffers don't use BM_IO_IN_PROGRESS */
                               8548         [ +  + ]:        1485336 :         if (!is_temp)
                               8549         [ -  + ]:        1474206 :             Assert(buf_state & BM_IO_IN_PROGRESS);
                               8550         [ -  + ]:        1485336 :         Assert(!(buf_state & BM_DIRTY));
                               8551                 :                :     }
                               8552                 :                : #endif
                               8553                 :                : 
                               8554                 :        1485336 :     *buffer_invalid = false;
                               8555                 :        1485336 :     *failed_checksum = false;
                               8556                 :        1485336 :     *ignored_checksum = false;
                               8557                 :        1485336 :     *zeroed_buffer = false;
                               8558                 :                : 
                               8559                 :                :     /*
                               8560                 :                :      * We ask PageIsVerified() to only log the message about checksum errors,
                               8561                 :                :      * as the completion might be run in any backend (or IO workers). We will
                               8562                 :                :      * report checksum errors in buffer_readv_report().
                               8563                 :                :      */
                               8564                 :        1485336 :     piv_flags = PIV_LOG_LOG;
                               8565                 :                : 
                               8566                 :                :     /* the local ignore_checksum_failure may differ from the definer's */
                               8567         [ +  + ]:        1485336 :     if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
                               8568                 :             57 :         piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
                               8569                 :                : 
                               8570                 :                :     /*
                               8571                 :                :      * If the buffers are marked for zero on error, we want to log that in
                               8572                 :                :      * case of a checksum failure.
                               8573                 :                :      */
   32 dgustafsson@postgres     8574         [ +  + ]:GNC     1485336 :     if (flags & READ_BUFFERS_ZERO_ON_ERROR)
                               8575                 :          45842 :         piv_flags |= PIV_ZERO_BUFFERS_ON_ERROR;
                               8576                 :                : 
                               8577                 :                :     /* Check for garbage data. */
  401 andres@anarazel.de       8578         [ +  + ]:CBC     1485336 :     if (!failed)
                               8579                 :                :     {
                               8580                 :                :         /*
                               8581                 :                :          * If the buffer is not currently pinned by this backend, e.g. because
                               8582                 :                :          * we're completing this IO after an error, the buffer data will have
                               8583                 :                :          * been marked as inaccessible when the buffer was unpinned. The AIO
                               8584                 :                :          * subsystem holds a pin, but that doesn't prevent the buffer from
                               8585                 :                :          * having been marked as inaccessible. The completion might also be
                               8586                 :                :          * executed in a different process.
                               8587                 :                :          */
                               8588                 :                : #ifdef USE_VALGRIND
                               8589                 :                :         if (!BufferIsPinned(buffer))
                               8590                 :                :             VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
                               8591                 :                : #endif
                               8592                 :                : 
                               8593         [ +  + ]:        1484167 :         if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
                               8594                 :                :                             failed_checksum))
                               8595                 :                :         {
                               8596         [ +  + ]:            144 :             if (flags & READ_BUFFERS_ZERO_ON_ERROR)
                               8597                 :                :             {
                               8598                 :             69 :                 memset(bufdata, 0, BLCKSZ);
                               8599                 :             69 :                 *zeroed_buffer = true;
                               8600                 :                :             }
                               8601                 :                :             else
                               8602                 :                :             {
                               8603                 :             75 :                 *buffer_invalid = true;
                               8604                 :                :                 /* mark buffer as having failed */
                               8605                 :             75 :                 failed = true;
                               8606                 :                :             }
                               8607                 :                :         }
                               8608         [ +  + ]:        1484023 :         else if (*failed_checksum)
                               8609                 :             18 :             *ignored_checksum = true;
                               8610                 :                : 
                               8611                 :                :         /* undo what we did above */
                               8612                 :                : #ifdef USE_VALGRIND
                               8613                 :                :         if (!BufferIsPinned(buffer))
                               8614                 :                :             VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
                               8615                 :                : #endif
                               8616                 :                : 
                               8617                 :                :         /*
                               8618                 :                :          * Immediately log a message about the invalid page, but only to the
                               8619                 :                :          * server log. The reason to only log to the server log is that this
                               8620                 :                :          * may be executed in a different backend than the one that originated
                               8621                 :                :          * the request. The reason to do so immediately is that the originator
                               8622                 :                :          * might not process the query result immediately (because it is busy
                               8623                 :                :          * doing another part of query processing) or at all (e.g. if it was
                               8624                 :                :          * cancelled or errored out due to another IO also failing). The
                               8625                 :                :          * definer of the IO will emit an ERROR or WARNING when processing the
                               8626                 :                :          * IO's results.
                               8627                 :                :          *
                               8628                 :                :          * To avoid duplicating the code to emit these log messages, we reuse
                               8629                 :                :          * buffer_readv_report().
                               8630                 :                :          */
                               8631   [ +  +  +  +  :        1484167 :         if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
                                              +  + ]
                               8632                 :                :         {
  396                          8633                 :            162 :             PgAioResult result_one = {0};
                               8634                 :                : 
  401                          8635                 :            162 :             buffer_readv_encode_error(&result_one, is_temp,
                               8636                 :            162 :                                       *zeroed_buffer,
                               8637                 :            162 :                                       *ignored_checksum,
                               8638                 :            162 :                                       *buffer_invalid,
                               8639                 :            162 :                                       *zeroed_buffer ? 1 : 0,
                               8640                 :            162 :                                       *failed_checksum ? 1 : 0,
                               8641                 :                :                                       buf_off, buf_off, buf_off);
                               8642                 :            162 :             pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
                               8643                 :                :         }
                               8644                 :                :     }
                               8645                 :                : 
                               8646                 :                :     /* Terminate I/O, setting BM_VALID on success or BM_IO_ERROR on failure. */
                               8647         [ +  + ]:        1485336 :     set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
                               8648         [ +  + ]:        1485336 :     if (is_temp)
                               8649                 :          11130 :         TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
                               8650                 :                :     else
                               8651                 :        1474206 :         TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
                               8652                 :                : 
                               8653                 :                :     /*
                               8654                 :                :      * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
                               8655                 :                :      * callback may not be executed in the same backend that called
                               8656                 :                :      * BUFFER_READ_START. The alternative would be to defer calling the
                               8657                 :                :      * tracepoint to a later point (e.g. the local completion callback for
                               8658                 :                :      * shared buffer reads), which seems even less helpful.
                               8659                 :                :      */
                               8660                 :                :     TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
                               8661                 :                :                                       tag.blockNum,
                               8662                 :                :                                       tag.spcOid,
                               8663                 :                :                                       tag.dbOid,
                               8664                 :                :                                       tag.relNumber,
                               8665                 :                :                                       is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
                               8666                 :                :                                       false);
                               8667                 :        1485336 : }
                               8668                 :                : 
                               8669                 :                : /*
                               8670                 :                :  * Perform completion handling of a single AIO read. This read may cover
                               8671                 :                :  * multiple blocks / buffers.
                               8672                 :                :  *
                               8673                 :                :  * Shared between shared and local buffers, to reduce code duplication.
                               8674                 :                :  */
                               8675                 :                : static pg_attribute_always_inline PgAioResult
                               8676                 :        1337520 : buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
                               8677                 :                :                       uint8 cb_data, bool is_temp)
                               8678                 :                : {
                               8679                 :        1337520 :     PgAioResult result = prior_result;
                               8680                 :        1337520 :     PgAioTargetData *td = pgaio_io_get_target_data(ioh);
                               8681                 :        1337520 :     uint8       first_error_off = 0;
                               8682                 :        1337520 :     uint8       first_zeroed_off = 0;
                               8683                 :        1337520 :     uint8       first_ignored_off = 0;
                               8684                 :        1337520 :     uint8       error_count = 0;
                               8685                 :        1337520 :     uint8       zeroed_count = 0;
                               8686                 :        1337520 :     uint8       ignored_count = 0;
                               8687                 :        1337520 :     uint8       checkfail_count = 0;
                               8688                 :                :     uint64     *io_data;
                               8689                 :                :     uint8       handle_data_len;
                               8690                 :                : 
                               8691         [ +  + ]:        1337520 :     if (is_temp)
                               8692                 :                :     {
                               8693         [ -  + ]:           2884 :         Assert(td->smgr.is_temp);
                               8694         [ -  + ]:           2884 :         Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
                               8695                 :                :     }
                               8696                 :                :     else
                               8697         [ -  + ]:        1334636 :         Assert(!td->smgr.is_temp);
                               8698                 :                : 
                               8699                 :                :     /*
                               8700                 :                :      * Iterate over all the buffers affected by this IO and call the
                               8701                 :                :      * per-buffer completion function for each buffer.
                               8702                 :                :      */
                               8703                 :        1337520 :     io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
                               8704         [ +  + ]:        2822856 :     for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
                               8705                 :                :     {
                               8706                 :        1485336 :         Buffer      buf = io_data[buf_off];
                               8707                 :                :         bool        failed;
                               8708                 :        1485336 :         bool        failed_verification = false;
                               8709                 :        1485336 :         bool        failed_checksum = false;
                               8710                 :        1485336 :         bool        zeroed_buffer = false;
                               8711                 :        1485336 :         bool        ignored_checksum = false;
                               8712                 :                : 
                               8713         [ -  + ]:        1485336 :         Assert(BufferIsValid(buf));
                               8714                 :                : 
                               8715                 :                :         /*
                               8716                 :                :          * If the entire I/O failed at a lower level, each buffer needs to be
                               8717                 :                :          * marked as failed. In case of a partial read, the first few buffers
                               8718                 :                :          * may be ok.
                               8719                 :                :          */
                               8720                 :        1485336 :         failed =
                               8721                 :        1485336 :             prior_result.status == PGAIO_RS_ERROR
                               8722   [ +  +  +  + ]:        1485336 :             || prior_result.result <= buf_off;
                               8723                 :                : 
                               8724                 :        1485336 :         buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
                               8725                 :                :                                   &failed_verification,
                               8726                 :                :                                   &failed_checksum,
                               8727                 :                :                                   &ignored_checksum,
                               8728                 :                :                                   &zeroed_buffer);
                               8729                 :                : 
                               8730                 :                :         /*
                               8731                 :                :          * Track information about the number of different kinds of error
                               8732                 :                :          * conditions across all pages, as there can be multiple pages failing
                               8733                 :                :          * verification as part of one IO.
                               8734                 :                :          */
                               8735   [ +  +  +  -  :        1485336 :         if (failed_verification && !zeroed_buffer && error_count++ == 0)
                                              +  + ]
                               8736                 :             66 :             first_error_off = buf_off;
                               8737   [ +  +  +  + ]:        1485336 :         if (zeroed_buffer && zeroed_count++ == 0)
                               8738                 :             51 :             first_zeroed_off = buf_off;
                               8739   [ +  +  +  + ]:        1485336 :         if (ignored_checksum && ignored_count++ == 0)
                               8740                 :             15 :             first_ignored_off = buf_off;
                               8741         [ +  + ]:        1485336 :         if (failed_checksum)
                               8742                 :             48 :             checkfail_count++;
                               8743                 :                :     }
                               8744                 :                : 
                               8745                 :                :     /*
                               8746                 :                :      * If the smgr read succeeded [partially] and page verification failed for
                               8747                 :                :      * some of the pages, adjust the IO's result state appropriately.
                               8748                 :                :      */
                               8749   [ +  +  +  + ]:        1337520 :     if (prior_result.status != PGAIO_RS_ERROR &&
                               8750   [ +  +  +  + ]:        1337438 :         (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
                               8751                 :                :     {
                               8752                 :            126 :         buffer_readv_encode_error(&result, is_temp,
                               8753                 :                :                                   zeroed_count > 0, ignored_count > 0,
                               8754                 :                :                                   error_count, zeroed_count, checkfail_count,
                               8755                 :                :                                   first_error_off, first_zeroed_off,
                               8756                 :                :                                   first_ignored_off);
                               8757                 :            126 :         pgaio_result_report(result, td, DEBUG1);
                               8758                 :                :     }
                               8759                 :                : 
                               8760                 :                :     /*
                               8761                 :                :      * For shared relations this reporting is done in
                               8762                 :                :      * shared_buffer_readv_complete_local().
                               8763                 :                :      */
                               8764   [ +  +  +  + ]:        1337520 :     if (is_temp && checkfail_count > 0)
                               8765                 :              3 :         pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
                               8766                 :                :                                               checkfail_count);
                               8767                 :                : 
                               8768                 :        1337520 :     return result;
                               8769                 :                : }
                               8770                 :                : 
                               8771                 :                : /*
                               8772                 :                :  * AIO error reporting callback for aio_shared_buffer_readv_cb and
                               8773                 :                :  * aio_local_buffer_readv_cb.
                               8774                 :                :  *
                               8775                 :                :  * The error is encoded / decoded in buffer_readv_encode_error() /
                               8776                 :                :  * buffer_readv_decode_error().
                               8777                 :                :  */
                               8778                 :                : static void
                               8779                 :            408 : buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
                               8780                 :                :                     int elevel)
                               8781                 :                : {
                               8782                 :            408 :     int         nblocks = td->smgr.nblocks;
                               8783                 :            408 :     BlockNumber first = td->smgr.blockNum;
                               8784                 :            408 :     BlockNumber last = first + nblocks - 1;
                               8785                 :            408 :     ProcNumber  errProc =
                               8786         [ +  + ]:            408 :         td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
                               8787                 :                :     RelPathStr  rpath =
                               8788                 :            408 :         relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
                               8789                 :                :     bool        zeroed_any,
                               8790                 :                :                 ignored_any;
                               8791                 :                :     uint8       zeroed_or_error_count,
                               8792                 :                :                 checkfail_count,
                               8793                 :                :                 first_off;
                               8794                 :                :     uint8       affected_count;
                               8795                 :                :     const char *msg_one,
                               8796                 :                :                *msg_mult,
                               8797                 :                :                *det_mult,
                               8798                 :                :                *hint_mult;
                               8799                 :                : 
                               8800                 :            408 :     buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
                               8801                 :                :                               &zeroed_or_error_count,
                               8802                 :                :                               &checkfail_count,
                               8803                 :                :                               &first_off);
                               8804                 :                : 
                               8805                 :                :     /*
                               8806                 :                :      * Treat a read that had both zeroed buffers *and* ignored checksums as a
                               8807                 :                :      * special case, it's too irregular to be emitted the same way as the
                               8808                 :                :      * other cases.
                               8809                 :                :      */
                               8810   [ +  +  +  + ]:            408 :     if (zeroed_any && ignored_any)
                               8811                 :                :     {
                               8812   [ +  -  -  + ]:              6 :         Assert(zeroed_any && ignored_any);
                               8813         [ -  + ]:              6 :         Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
                               8814         [ -  + ]:              6 :         Assert(result.status != PGAIO_RS_ERROR);
                               8815                 :              6 :         affected_count = zeroed_or_error_count;
                               8816                 :                : 
                               8817   [ +  -  +  - ]:              6 :         ereport(elevel,
                               8818                 :                :                 errcode(ERRCODE_DATA_CORRUPTED),
                               8819                 :                :                 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
                               8820                 :                :                        affected_count, checkfail_count, first, last, rpath.str),
                               8821                 :                :                 affected_count > 1 ?
                               8822                 :                :                 errdetail("Block %u held the first zeroed page.",
                               8823                 :                :                           first + first_off) : 0,
                               8824                 :                :                 errhint_plural("See server log for details about the other %d invalid block.",
                               8825                 :                :                                "See server log for details about the other %d invalid blocks.",
                               8826                 :                :                                affected_count + checkfail_count - 1,
                               8827                 :                :                                affected_count + checkfail_count - 1));
                               8828                 :              6 :         return;
                               8829                 :                :     }
                               8830                 :                : 
                               8831                 :                :     /*
                               8832                 :                :      * The other messages are highly repetitive. To avoid duplicating a long
                               8833                 :                :      * and complicated ereport(), gather the translated format strings
                               8834                 :                :      * separately and then do one common ereport.
                               8835                 :                :      */
                               8836         [ +  + ]:            402 :     if (result.status == PGAIO_RS_ERROR)
                               8837                 :                :     {
                               8838         [ -  + ]:            204 :         Assert(!zeroed_any);    /* can't have invalid pages when zeroing them */
                               8839                 :            204 :         affected_count = zeroed_or_error_count;
  252 peter@eisentraut.org     8840                 :            204 :         msg_one = _("invalid page in block %u of relation \"%s\"");
                               8841                 :            204 :         msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
                               8842                 :            204 :         det_mult = _("Block %u held the first invalid page.");
  401 andres@anarazel.de       8843                 :            204 :         hint_mult = _("See server log for the other %u invalid block(s).");
                               8844                 :                :     }
                               8845   [ +  +  +  - ]:            198 :     else if (zeroed_any && !ignored_any)
                               8846                 :                :     {
                               8847                 :            162 :         affected_count = zeroed_or_error_count;
  252 peter@eisentraut.org     8848                 :            162 :         msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
                               8849                 :            162 :         msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
                               8850                 :            162 :         det_mult = _("Block %u held the first zeroed page.");
  401 andres@anarazel.de       8851                 :            162 :         hint_mult = _("See server log for the other %u zeroed block(s).");
                               8852                 :                :     }
                               8853   [ +  -  +  - ]:             36 :     else if (!zeroed_any && ignored_any)
                               8854                 :                :     {
                               8855                 :             36 :         affected_count = checkfail_count;
  252 peter@eisentraut.org     8856                 :             36 :         msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
                               8857                 :             36 :         msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
                               8858                 :             36 :         det_mult = _("Block %u held the first ignored page.");
  401 andres@anarazel.de       8859                 :             36 :         hint_mult = _("See server log for the other %u ignored block(s).");
                               8860                 :                :     }
                               8861                 :                :     else
  401 andres@anarazel.de       8862                 :UBC           0 :         pg_unreachable();
                               8863                 :                : 
  401 andres@anarazel.de       8864   [ +  -  +  +  :CBC         402 :     ereport(elevel,
                                        +  +  +  + ]
                               8865                 :                :             errcode(ERRCODE_DATA_CORRUPTED),
                               8866                 :                :             affected_count == 1 ?
                               8867                 :                :             errmsg_internal(msg_one, first + first_off, rpath.str) :
                               8868                 :                :             errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
                               8869                 :                :             affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
                               8870                 :                :             affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
                               8871                 :                : }
                               8872                 :                : 
                               8873                 :                : static void
                               8874                 :        1469343 : shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
                               8875                 :                : {
                               8876                 :        1469343 :     buffer_stage_common(ioh, false, false);
                               8877                 :        1469343 : }
                               8878                 :                : 
                               8879                 :                : static PgAioResult
                               8880                 :        1334636 : shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
                               8881                 :                :                              uint8 cb_data)
                               8882                 :                : {
                               8883                 :        1334636 :     return buffer_readv_complete(ioh, prior_result, cb_data, false);
                               8884                 :                : }
                               8885                 :                : 
                               8886                 :                : /*
                               8887                 :                :  * We need a backend-local completion callback for shared buffers, to be able
                               8888                 :                :  * to report checksum errors correctly. Unfortunately that can only safely
                               8889                 :                :  * happen if the reporting backend has previously called
                               8890                 :                :  * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
                               8891                 :                :  * the backend that started the IO. Hence this callback.
                               8892                 :                :  */
                               8893                 :                : static PgAioResult
                               8894                 :        1469343 : shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
                               8895                 :                :                                    uint8 cb_data)
                               8896                 :                : {
                               8897                 :                :     bool        zeroed_any,
                               8898                 :                :                 ignored_any;
                               8899                 :                :     uint8       zeroed_or_error_count,
                               8900                 :                :                 checkfail_count,
                               8901                 :                :                 first_off;
                               8902                 :                : 
                               8903         [ +  + ]:        1469343 :     if (prior_result.status == PGAIO_RS_OK)
                               8904                 :        1469077 :         return prior_result;
                               8905                 :                : 
                               8906                 :            266 :     buffer_readv_decode_error(prior_result,
                               8907                 :                :                               &zeroed_any,
                               8908                 :                :                               &ignored_any,
                               8909                 :                :                               &zeroed_or_error_count,
                               8910                 :                :                               &checkfail_count,
                               8911                 :                :                               &first_off);
                               8912                 :                : 
                               8913         [ +  + ]:            266 :     if (checkfail_count)
                               8914                 :                :     {
                               8915                 :             36 :         PgAioTargetData *td = pgaio_io_get_target_data(ioh);
                               8916                 :                : 
                               8917                 :             36 :         pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
                               8918                 :                :                                               checkfail_count);
                               8919                 :                :     }
                               8920                 :                : 
                               8921                 :            266 :     return prior_result;
                               8922                 :                : }
                               8923                 :                : 
                               8924                 :                : static void
                               8925                 :           2884 : local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
                               8926                 :                : {
                               8927                 :           2884 :     buffer_stage_common(ioh, false, true);
                               8928                 :           2884 : }
                               8929                 :                : 
                               8930                 :                : static PgAioResult
                               8931                 :           2884 : local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
                               8932                 :                :                             uint8 cb_data)
                               8933                 :                : {
                               8934                 :           2884 :     return buffer_readv_complete(ioh, prior_result, cb_data, true);
                               8935                 :                : }
                               8936                 :                : 
                               8937                 :                : /* readv callback is passed READ_BUFFERS_* flags as callback data */
                               8938                 :                : const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
                               8939                 :                :     .stage = shared_buffer_readv_stage,
                               8940                 :                :     .complete_shared = shared_buffer_readv_complete,
                               8941                 :                :     /* need a local callback to report checksum failures */
                               8942                 :                :     .complete_local = shared_buffer_readv_complete_local,
                               8943                 :                :     .report = buffer_readv_report,
                               8944                 :                : };
                               8945                 :                : 
                               8946                 :                : /* readv callback is passed READ_BUFFERS_* flags as callback data */
                               8947                 :                : const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
                               8948                 :                :     .stage = local_buffer_readv_stage,
                               8949                 :                : 
                               8950                 :                :     /*
                               8951                 :                :      * Note that this, in contrast to the shared_buffers case, uses
                               8952                 :                :      * complete_local, as only the issuing backend has access to the required
                               8953                 :                :      * datastructures. This is important in case the IO completion may be
                               8954                 :                :      * consumed incidentally by another backend.
                               8955                 :                :      */
                               8956                 :                :     .complete_local = local_buffer_readv_complete,
                               8957                 :                :     .report = buffer_readv_report,
                               8958                 :                : };
        

Generated by: LCOV version 2.5.0-beta