Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * syncscan.c
4 : : * scan synchronization support
5 : : *
6 : : * When multiple backends run a sequential scan on the same table, we try
7 : : * to keep them synchronized to reduce the overall I/O needed. The goal is
8 : : * to read each page into shared buffer cache only once, and let all backends
9 : : * that take part in the shared scan process the page before it falls out of
10 : : * the cache.
11 : : *
12 : : * Since the "leader" in a pack of backends doing a seqscan will have to wait
13 : : * for I/O, while the "followers" don't, there is a strong self-synchronizing
14 : : * effect once we can get the backends examining approximately the same part
15 : : * of the table at the same time. Hence all that is really needed is to get
16 : : * a new backend beginning a seqscan to begin it close to where other backends
17 : : * are reading. We can scan the table circularly, from block X up to the
18 : : * end and then from block 0 to X-1, to ensure we visit all rows while still
19 : : * participating in the common scan.
20 : : *
21 : : * To accomplish that, we keep track of the scan position of each table, and
22 : : * start new scans close to where the previous scan(s) are. We don't try to
23 : : * do any extra synchronization to keep the scans together afterwards; some
24 : : * scans might progress much more slowly than others, for example if the
25 : : * results need to be transferred to the client over a slow network, and we
26 : : * don't want such queries to slow down others.
27 : : *
28 : : * There can realistically only be a few large sequential scans on different
29 : : * tables in progress at any time. Therefore we just keep the scan positions
30 : : * in a small LRU list which we scan every time we need to look up or update a
31 : : * scan position. The whole mechanism is only applied for tables exceeding
32 : : * a threshold size (but that is not the concern of this module).
33 : : *
34 : : * INTERFACE ROUTINES
35 : : * ss_get_location - return current scan location of a relation
36 : : * ss_report_location - update current scan location
37 : : *
38 : : *
39 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
40 : : * Portions Copyright (c) 1994, Regents of the University of California
41 : : *
42 : : * IDENTIFICATION
43 : : * src/backend/access/common/syncscan.c
44 : : *
45 : : *-------------------------------------------------------------------------
46 : : */
47 : : #include "postgres.h"
48 : :
49 : : #include "access/syncscan.h"
50 : : #include "miscadmin.h"
51 : : #include "storage/lwlock.h"
52 : : #include "storage/shmem.h"
53 : : #include "storage/subsystems.h"
54 : : #include "utils/rel.h"
55 : :
56 : :
57 : : /* GUC variables */
58 : : #ifdef TRACE_SYNCSCAN
59 : : bool trace_syncscan = false;
60 : : #endif
61 : :
62 : :
63 : : /*
64 : : * Size of the LRU list.
65 : : *
66 : : * Note: the code assumes that SYNC_SCAN_NELEM > 1.
67 : : *
68 : : * XXX: What's a good value? It should be large enough to hold the
69 : : * maximum number of large tables scanned simultaneously. But a larger value
70 : : * means more traversing of the LRU list when starting a new scan.
71 : : */
72 : : #define SYNC_SCAN_NELEM 20
73 : :
74 : : /*
75 : : * Interval between reports of the location of the current scan, in pages.
76 : : *
77 : : * Note: This should be smaller than the ring size (see buffer/freelist.c)
78 : : * we use for bulk reads. Otherwise a scan joining other scans might start
79 : : * from a page that's no longer in the buffer cache. This is a bit fuzzy;
80 : : * there's no guarantee that the new scan will read the page before it leaves
81 : : * the buffer cache anyway, and on the other hand the page is most likely
82 : : * still in the OS cache.
83 : : */
84 : : #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
85 : :
86 : :
87 : : /*
88 : : * The scan locations structure is essentially a doubly-linked LRU with head
89 : : * and tail pointer, but designed to hold a fixed maximum number of elements in
90 : : * fixed-size shared memory.
91 : : */
92 : : typedef struct ss_scan_location_t
93 : : {
94 : : RelFileLocator relfilelocator; /* identity of a relation */
95 : : BlockNumber location; /* last-reported location in the relation */
96 : : } ss_scan_location_t;
97 : :
98 : : typedef struct ss_lru_item_t
99 : : {
100 : : struct ss_lru_item_t *prev;
101 : : struct ss_lru_item_t *next;
102 : : ss_scan_location_t location;
103 : : } ss_lru_item_t;
104 : :
105 : : typedef struct ss_scan_locations_t
106 : : {
107 : : ss_lru_item_t *head;
108 : : ss_lru_item_t *tail;
109 : : ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
110 : : } ss_scan_locations_t;
111 : :
112 : : #define SizeOfScanLocations(N) \
113 : : (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
114 : :
115 : : static void SyncScanShmemRequest(void *arg);
116 : : static void SyncScanShmemInit(void *arg);
117 : :
118 : : const ShmemCallbacks SyncScanShmemCallbacks = {
119 : : .request_fn = SyncScanShmemRequest,
120 : : .init_fn = SyncScanShmemInit,
121 : : };
122 : :
123 : : /* Pointer to struct in shared memory */
124 : : static ss_scan_locations_t *scan_locations;
125 : :
126 : : /* prototypes for internal functions */
127 : : static BlockNumber ss_search(RelFileLocator relfilelocator,
128 : : BlockNumber location, bool set);
129 : :
130 : :
131 : : /*
132 : : * SyncScanShmemRequest --- register this module's shared memory
133 : : */
134 : : static void
29 heikki.linnakangas@i 135 :GNC 1244 : SyncScanShmemRequest(void *arg)
136 : : {
137 : 1244 : ShmemRequestStruct(.name = "Sync Scan Locations List",
138 : : .size = SizeOfScanLocations(SYNC_SCAN_NELEM),
139 : : .ptr = (void **) &scan_locations,
140 : : );
6906 tgl@sss.pgh.pa.us 141 :GIC 1244 : }
142 : :
143 : : /*
144 : : * SyncScanShmemInit --- initialize this module's shared memory
145 : : */
146 : : static void
29 heikki.linnakangas@i 147 :GNC 1241 : SyncScanShmemInit(void *arg)
148 : : {
149 : : int i;
150 : :
151 : 1241 : scan_locations->head = &scan_locations->items[0];
152 : 1241 : scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
153 : :
154 [ + + ]: 26061 : for (i = 0; i < SYNC_SCAN_NELEM; i++)
155 : : {
156 : 24820 : ss_lru_item_t *item = &scan_locations->items[i];
157 : :
158 : : /*
159 : : * Initialize all slots with invalid values. As scans are started,
160 : : * these invalid entries will fall off the LRU list and get replaced
161 : : * with real entries.
162 : : */
163 : 24820 : item->location.relfilelocator.spcOid = InvalidOid;
164 : 24820 : item->location.relfilelocator.dbOid = InvalidOid;
165 : 24820 : item->location.relfilelocator.relNumber = InvalidRelFileNumber;
166 : 24820 : item->location.location = InvalidBlockNumber;
167 : :
168 : 24820 : item->prev = (i > 0) ?
169 [ + + ]: 24820 : (&scan_locations->items[i - 1]) : NULL;
170 : 24820 : item->next = (i < SYNC_SCAN_NELEM - 1) ?
171 [ + + ]: 24820 : (&scan_locations->items[i + 1]) : NULL;
172 : : }
6906 tgl@sss.pgh.pa.us 173 :CBC 1241 : }
174 : :
175 : : /*
176 : : * ss_search --- search the scan_locations structure for an entry with the
177 : : * given relfilelocator.
178 : : *
179 : : * If "set" is true, the location is updated to the given location. If no
180 : : * entry for the given relfilelocator is found, it will be created at the head
181 : : * of the list with the given location, even if "set" is false.
182 : : *
183 : : * In any case, the location after possible update is returned.
184 : : *
185 : : * Caller is responsible for having acquired suitable lock on the shared
186 : : * data structure.
187 : : */
188 : : static BlockNumber
1399 rhaas@postgresql.org 189 : 3905 : ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
190 : : {
191 : : ss_lru_item_t *item;
192 : :
6906 tgl@sss.pgh.pa.us 193 : 3905 : item = scan_locations->head;
194 : : for (;;)
195 : 665 : {
196 : : bool match;
197 : :
1399 rhaas@postgresql.org 198 [ + + + + : 4570 : match = RelFileLocatorEquals(item->location.relfilelocator,
+ + ]
199 : : relfilelocator);
200 : :
6906 tgl@sss.pgh.pa.us 201 [ + + + + ]: 4570 : if (match || item->next == NULL)
202 : : {
203 : : /*
204 : : * If we reached the end of list and no match was found, take over
205 : : * the last entry
206 : : */
207 [ + + ]: 3905 : if (!match)
208 : : {
1399 rhaas@postgresql.org 209 : 35 : item->location.relfilelocator = relfilelocator;
6906 tgl@sss.pgh.pa.us 210 : 35 : item->location.location = location;
211 : : }
212 [ + + ]: 3870 : else if (set)
213 : 3825 : item->location.location = location;
214 : :
215 : : /* Move the entry to the front of the LRU list */
216 [ + + ]: 3905 : if (item != scan_locations->head)
217 : : {
218 : : /* unlink */
219 [ + - ]: 35 : if (item == scan_locations->tail)
220 : 35 : scan_locations->tail = item->prev;
221 : 35 : item->prev->next = item->next;
222 [ - + ]: 35 : if (item->next)
6906 tgl@sss.pgh.pa.us 223 :UBC 0 : item->next->prev = item->prev;
224 : :
225 : : /* link */
6906 tgl@sss.pgh.pa.us 226 :CBC 35 : item->prev = NULL;
227 : 35 : item->next = scan_locations->head;
228 : 35 : scan_locations->head->prev = item;
229 : 35 : scan_locations->head = item;
230 : : }
231 : :
232 : 3905 : return item->location.location;
233 : : }
234 : :
235 : 665 : item = item->next;
236 : : }
237 : :
238 : : /* not reached */
239 : : }
240 : :
241 : : /*
242 : : * ss_get_location --- get the optimal starting location for scan
243 : : *
244 : : * Returns the last-reported location of a sequential scan on the
245 : : * relation, or 0 if no valid location is found.
246 : : *
247 : : * We expect the caller has just done RelationGetNumberOfBlocks(), and
248 : : * so that number is passed in rather than computing it again. The result
249 : : * is guaranteed less than relnblocks (assuming that's > 0).
250 : : */
251 : : BlockNumber
252 : 80 : ss_get_location(Relation rel, BlockNumber relnblocks)
253 : : {
254 : : BlockNumber startloc;
255 : :
256 : 80 : LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
1399 rhaas@postgresql.org 257 : 80 : startloc = ss_search(rel->rd_locator, 0, false);
6906 tgl@sss.pgh.pa.us 258 : 80 : LWLockRelease(SyncScanLock);
259 : :
260 : : /*
261 : : * If the location is not a valid block number for this scan, start at 0.
262 : : *
263 : : * This can happen if for instance a VACUUM truncated the table since the
264 : : * location was saved.
265 : : */
266 [ - + ]: 80 : if (startloc >= relnblocks)
6906 tgl@sss.pgh.pa.us 267 :UBC 0 : startloc = 0;
268 : :
269 : : #ifdef TRACE_SYNCSCAN
270 : : if (trace_syncscan)
271 : : elog(LOG,
272 : : "SYNC_SCAN: start \"%s\" (size %u) at %u",
273 : : RelationGetRelationName(rel), relnblocks, startloc);
274 : : #endif
275 : :
6906 tgl@sss.pgh.pa.us 276 :CBC 80 : return startloc;
277 : : }
278 : :
279 : : /*
280 : : * ss_report_location --- update the current scan location
281 : : *
282 : : * Writes an entry into the shared Sync Scan state of the form
283 : : * (relfilelocator, blocknumber), overwriting any existing entry for the
284 : : * same relfilelocator.
285 : : */
286 : : void
287 : 60491 : ss_report_location(Relation rel, BlockNumber location)
288 : : {
289 : : #ifdef TRACE_SYNCSCAN
290 : : if (trace_syncscan)
291 : : {
292 : : if ((location % 1024) == 0)
293 : : elog(LOG,
294 : : "SYNC_SCAN: scanning \"%s\" at %u",
295 : : RelationGetRelationName(rel), location);
296 : : }
297 : : #endif
298 : :
299 : : /*
300 : : * To reduce lock contention, only report scan progress every N pages. For
301 : : * the same reason, don't block if the lock isn't immediately available.
302 : : * Missing a few updates isn't critical, it just means that a new scan
303 : : * that wants to join the pack will start a little bit behind the head of
304 : : * the scan. Hopefully the pages are still in OS cache and the scan
305 : : * catches up quickly.
306 : : */
307 [ + + ]: 60491 : if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
308 : : {
309 [ + - ]: 3825 : if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
310 : : {
1399 rhaas@postgresql.org 311 : 3825 : (void) ss_search(rel->rd_locator, location, true);
6906 tgl@sss.pgh.pa.us 312 : 3825 : LWLockRelease(SyncScanLock);
313 : : }
314 : : #ifdef TRACE_SYNCSCAN
315 : : else if (trace_syncscan)
316 : : elog(LOG,
317 : : "SYNC_SCAN: missed update for \"%s\" at %u",
318 : : RelationGetRelationName(rel), location);
319 : : #endif
320 : : }
321 : 60491 : }
|