Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * blkreftable.c
4 : : * Block reference tables.
5 : : *
6 : : * A block reference table is used to keep track of which blocks have
7 : : * been modified by WAL records within a certain LSN range.
8 : : *
9 : : * For each relation fork, we keep track of all blocks that have appeared
10 : : * in block reference in the WAL. We also keep track of the "limit block",
11 : : * which is the smallest relation length in blocks known to have occurred
12 : : * during that range of WAL records. This should be set to 0 if the relation
13 : : * fork is created or destroyed, and to the post-truncation length if
14 : : * truncated.
15 : : *
16 : : * Whenever we set the limit block, we also forget about any modified blocks
17 : : * beyond that point. Those blocks don't exist any more. Such blocks can
18 : : * later be marked as modified again; if that happens, it means the relation
19 : : * was re-extended.
20 : : *
21 : : * Portions Copyright (c) 2010-2026, PostgreSQL Global Development Group
22 : : *
23 : : * src/common/blkreftable.c
24 : : *
25 : : *-------------------------------------------------------------------------
26 : : */
27 : :
28 : :
29 : : #ifndef FRONTEND
30 : : #include "postgres.h"
31 : : #else
32 : : #include "postgres_fe.h"
33 : : #endif
34 : :
35 : : #ifdef FRONTEND
36 : : #include "common/logging.h"
37 : : #endif
38 : :
39 : : #include "common/blkreftable.h"
40 : : #include "common/hashfn.h"
41 : : #include "port/pg_crc32c.h"
42 : :
43 : : /*
44 : : * A block reference table keeps track of the status of each relation
45 : : * fork individually.
46 : : */
47 : : typedef struct BlockRefTableKey
48 : : {
49 : : RelFileLocator rlocator;
50 : : ForkNumber forknum;
51 : : } BlockRefTableKey;
52 : :
53 : : /*
54 : : * We could need to store data either for a relation in which only a
55 : : * tiny fraction of the blocks have been modified or for a relation in
56 : : * which nearly every block has been modified, and we want a
57 : : * space-efficient representation in both cases. To accomplish this,
58 : : * we divide the relation into chunks of 2^16 blocks and choose between
59 : : * an array representation and a bitmap representation for each chunk.
60 : : *
61 : : * When the number of modified blocks in a given chunk is small, we
62 : : * essentially store an array of block numbers, but we need not store the
63 : : * entire block number: instead, we store each block number as a 2-byte
64 : : * offset from the start of the chunk.
65 : : *
66 : : * When the number of modified blocks in a given chunk is large, we switch
67 : : * to a bitmap representation.
68 : : *
69 : : * These same basic representational choices are used both when a block
70 : : * reference table is stored in memory and when it is serialized to disk.
71 : : *
72 : : * In the in-memory representation, we initially allocate each chunk with
73 : : * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
74 : : * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
75 : : * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
76 : : * to a bitmap, and thus never needs to grow further.
77 : : */
78 : : #define BLOCKS_PER_CHUNK (1 << 16)
79 : : #define BLOCKS_PER_ENTRY (BITS_PER_BYTE * sizeof(uint16))
80 : : #define MAX_ENTRIES_PER_CHUNK (BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
81 : : #define INITIAL_ENTRIES_PER_CHUNK 16
82 : : typedef uint16 *BlockRefTableChunk;
83 : :
84 : : /*
85 : : * State for one relation fork.
86 : : *
87 : : * 'rlocator' and 'forknum' identify the relation fork to which this entry
88 : : * pertains.
89 : : *
90 : : * 'limit_block' is the shortest known length of the relation in blocks
91 : : * within the LSN range covered by a particular block reference table.
92 : : * It should be set to 0 if the relation fork is created or dropped. If the
93 : : * relation fork is truncated, it should be set to the number of blocks that
94 : : * remain after truncation.
95 : : *
96 : : * 'nchunks' is the allocated length of each of the three arrays that follow.
97 : : * We can only represent the status of block numbers less than nchunks *
98 : : * BLOCKS_PER_CHUNK.
99 : : *
100 : : * 'chunk_size' is an array storing the allocated size of each chunk.
101 : : *
102 : : * 'chunk_usage' is an array storing the number of elements used in each
103 : : * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
104 : : * chunk is used as an array; else the corresponding chunk is used as a bitmap.
105 : : * When used as a bitmap, the least significant bit of the first array element
106 : : * is the status of the lowest-numbered block covered by this chunk.
107 : : *
108 : : * 'chunk_data' is the array of chunks.
109 : : */
110 : : struct BlockRefTableEntry
111 : : {
112 : : BlockRefTableKey key;
113 : : BlockNumber limit_block;
114 : : char status;
115 : : uint32 nchunks;
116 : : uint16 *chunk_size;
117 : : uint16 *chunk_usage;
118 : : BlockRefTableChunk *chunk_data;
119 : : };
120 : :
121 : : /* Declare and define a hash table over type BlockRefTableEntry. */
122 : : #define SH_PREFIX blockreftable
123 : : #define SH_ELEMENT_TYPE BlockRefTableEntry
124 : : #define SH_KEY_TYPE BlockRefTableKey
125 : : #define SH_KEY key
126 : : #define SH_HASH_KEY(tb, key) \
127 : : hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
128 : : #define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
129 : : #define SH_SCOPE static inline
130 : : #ifdef FRONTEND
131 : : #define SH_RAW_ALLOCATOR pg_malloc0
132 : : #endif
133 : : #define SH_DEFINE
134 : : #define SH_DECLARE
135 : : #include "lib/simplehash.h"
136 : :
137 : : /*
138 : : * A block reference table is basically just the hash table, but we don't
139 : : * want to expose that to outside callers.
140 : : *
141 : : * We keep track of the memory context in use explicitly too, so that it's
142 : : * easy to place all of our allocations in the same context.
143 : : */
144 : : struct BlockRefTable
145 : : {
146 : : blockreftable_hash *hash;
147 : : #ifndef FRONTEND
148 : : MemoryContext mcxt;
149 : : #endif
150 : : };
151 : :
152 : : /*
153 : : * On-disk serialization format for block reference table entries.
154 : : */
155 : : typedef struct BlockRefTableSerializedEntry
156 : : {
157 : : RelFileLocator rlocator;
158 : : ForkNumber forknum;
159 : : BlockNumber limit_block;
160 : : uint32 nchunks;
161 : : } BlockRefTableSerializedEntry;
162 : :
163 : : /*
164 : : * Buffer size, so that we avoid doing many small I/Os.
165 : : */
166 : : #define BUFSIZE 65536
167 : :
168 : : /*
169 : : * Ad-hoc buffer for file I/O.
170 : : */
171 : : typedef struct BlockRefTableBuffer
172 : : {
173 : : io_callback_fn io_callback;
174 : : void *io_callback_arg;
175 : : char data[BUFSIZE];
176 : : int used;
177 : : int cursor;
178 : : pg_crc32c crc;
179 : : } BlockRefTableBuffer;
180 : :
181 : : /*
182 : : * State for keeping track of progress while incrementally reading a block
183 : : * table reference file from disk.
184 : : *
185 : : * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
186 : : * combination that is currently being read, and consumed_chunks is the number
187 : : * of those that have been read. (We always read all the information for
188 : : * a single chunk at one time, so we don't need to be able to represent the
189 : : * state where a chunk has been partially read.)
190 : : *
191 : : * chunk_size is the array of chunk sizes. The length is given by total_chunks.
192 : : *
193 : : * chunk_data holds the current chunk.
194 : : *
195 : : * chunk_position helps us figure out how much progress we've made in returning
196 : : * the block numbers for the current chunk to the caller. If the chunk is a
197 : : * bitmap, it's the number of bits we've scanned; otherwise, it's the number
198 : : * of chunk entries we've scanned.
199 : : */
200 : : struct BlockRefTableReader
201 : : {
202 : : BlockRefTableBuffer buffer;
203 : : char *error_filename;
204 : : report_error_fn error_callback;
205 : : void *error_callback_arg;
206 : : uint32 total_chunks;
207 : : uint32 consumed_chunks;
208 : : uint16 *chunk_size;
209 : : uint16 chunk_data[MAX_ENTRIES_PER_CHUNK];
210 : : uint32 chunk_position;
211 : : };
212 : :
213 : : /*
214 : : * State for keeping track of progress while incrementally writing a block
215 : : * reference table file to disk.
216 : : */
217 : : struct BlockRefTableWriter
218 : : {
219 : : BlockRefTableBuffer buffer;
220 : : };
221 : :
222 : : /* Function prototypes. */
223 : : static int BlockRefTableComparator(const void *a, const void *b);
224 : : static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
225 : : static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
226 : : int length);
227 : : static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
228 : : int length);
229 : : static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer);
230 : :
231 : : /*
232 : : * Create an empty block reference table.
233 : : */
234 : : BlockRefTable *
892 rhaas@postgresql.org 235 :CBC 32 : CreateEmptyBlockRefTable(void)
236 : : {
172 michael@paquier.xyz 237 :GNC 32 : BlockRefTable *brtab = palloc_object(BlockRefTable);
238 : :
239 : : /*
240 : : * Even completely empty database has a few hundred relation forks, so it
241 : : * seems best to size the hash on the assumption that we're going to have
242 : : * at least a few thousand entries.
243 : : */
244 : : #ifdef FRONTEND
892 rhaas@postgresql.org 245 :UBC 0 : brtab->hash = blockreftable_create(4096, NULL);
246 : : #else
892 rhaas@postgresql.org 247 :CBC 32 : brtab->mcxt = CurrentMemoryContext;
248 : 32 : brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
249 : : #endif
250 : :
251 : 32 : return brtab;
252 : : }
253 : :
254 : : /*
255 : : * Set the "limit block" for a relation fork and forget any modified blocks
256 : : * with equal or higher block numbers.
257 : : *
258 : : * The "limit block" is the shortest known length of the relation within the
259 : : * range of WAL records covered by this block reference table.
260 : : */
261 : : void
262 : 407 : BlockRefTableSetLimitBlock(BlockRefTable *brtab,
263 : : const RelFileLocator *rlocator,
264 : : ForkNumber forknum,
265 : : BlockNumber limit_block)
266 : : {
267 : : BlockRefTableEntry *brtentry;
436 peter@eisentraut.org 268 : 407 : BlockRefTableKey key = {0}; /* make sure any padding is zero */
269 : : bool found;
270 : :
892 rhaas@postgresql.org 271 : 407 : memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
272 : 407 : key.forknum = forknum;
273 : 407 : brtentry = blockreftable_insert(brtab->hash, key, &found);
274 : :
275 [ + + ]: 407 : if (!found)
276 : : {
277 : : /*
278 : : * We have no existing data about this relation fork, so just record
279 : : * the limit_block value supplied by the caller, and make sure other
280 : : * parts of the entry are properly initialized.
281 : : */
282 : 401 : brtentry->limit_block = limit_block;
283 : 401 : brtentry->nchunks = 0;
284 : 401 : brtentry->chunk_size = NULL;
285 : 401 : brtentry->chunk_usage = NULL;
286 : 401 : brtentry->chunk_data = NULL;
287 : 401 : return;
288 : : }
289 : :
290 : 6 : BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
291 : : }
292 : :
293 : : /*
294 : : * Mark a block in a given relation fork as known to have been modified.
295 : : */
296 : : void
297 : 55469 : BlockRefTableMarkBlockModified(BlockRefTable *brtab,
298 : : const RelFileLocator *rlocator,
299 : : ForkNumber forknum,
300 : : BlockNumber blknum)
301 : : {
302 : : BlockRefTableEntry *brtentry;
436 peter@eisentraut.org 303 : 55469 : BlockRefTableKey key = {0}; /* make sure any padding is zero */
304 : : bool found;
305 : : #ifndef FRONTEND
892 rhaas@postgresql.org 306 : 55469 : MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);
307 : : #endif
308 : :
309 : 55469 : memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
310 : 55469 : key.forknum = forknum;
311 : 55469 : brtentry = blockreftable_insert(brtab->hash, key, &found);
312 : :
313 [ + + ]: 55469 : if (!found)
314 : : {
315 : : /*
316 : : * We want to set the initial limit block value to something higher
317 : : * than any legal block number. InvalidBlockNumber fits the bill.
318 : : */
319 : 644 : brtentry->limit_block = InvalidBlockNumber;
320 : 644 : brtentry->nchunks = 0;
321 : 644 : brtentry->chunk_size = NULL;
322 : 644 : brtentry->chunk_usage = NULL;
323 : 644 : brtentry->chunk_data = NULL;
324 : : }
325 : :
326 : 55469 : BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);
327 : :
328 : : #ifndef FRONTEND
329 : 55469 : MemoryContextSwitchTo(oldcontext);
330 : : #endif
331 : 55469 : }
332 : :
333 : : /*
334 : : * Get an entry from a block reference table.
335 : : *
336 : : * If the entry does not exist, this function returns NULL. Otherwise, it
337 : : * returns the entry and sets *limit_block to the value from the entry.
338 : : */
339 : : BlockRefTableEntry *
340 : 20013 : BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator,
341 : : ForkNumber forknum, BlockNumber *limit_block)
342 : : {
436 peter@eisentraut.org 343 : 20013 : BlockRefTableKey key = {0}; /* make sure any padding is zero */
344 : : BlockRefTableEntry *entry;
345 : :
892 rhaas@postgresql.org 346 [ - + ]: 20013 : Assert(limit_block != NULL);
347 : :
348 : 20013 : memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
349 : 20013 : key.forknum = forknum;
350 : 20013 : entry = blockreftable_lookup(brtab->hash, key);
351 : :
352 [ + + ]: 20013 : if (entry != NULL)
353 : 317 : *limit_block = entry->limit_block;
354 : :
355 : 20013 : return entry;
356 : : }
357 : :
358 : : /*
359 : : * Get block numbers from a table entry.
360 : : *
361 : : * 'blocks' must point to enough space to hold at least 'nblocks' block
362 : : * numbers, and any block numbers we manage to get will be written there.
363 : : * The return value is the number of block numbers actually written.
364 : : *
365 : : * We do not return block numbers unless they are greater than or equal to
366 : : * start_blkno and strictly less than stop_blkno.
367 : : */
368 : : int
369 : 36 : BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
370 : : BlockNumber start_blkno,
371 : : BlockNumber stop_blkno,
372 : : BlockNumber *blocks,
373 : : int nblocks)
374 : : {
375 : : uint32 start_chunkno;
376 : : uint32 stop_chunkno;
377 : : uint32 chunkno;
378 : 36 : int nresults = 0;
379 : :
380 [ - + ]: 36 : Assert(entry != NULL);
381 : :
382 : : /*
383 : : * Figure out which chunks could potentially contain blocks of interest.
384 : : *
385 : : * We need to be careful about overflow here, because stop_blkno could be
386 : : * InvalidBlockNumber or something very close to it.
387 : : */
388 : 36 : start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
389 : 36 : stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
390 [ + - ]: 36 : if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
391 : 36 : ++stop_chunkno;
392 [ + + ]: 36 : if (stop_chunkno > entry->nchunks)
393 : 1 : stop_chunkno = entry->nchunks;
394 : :
395 : : /*
396 : : * Loop over chunks.
397 : : */
398 [ + + ]: 71 : for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
399 : : {
400 : 35 : uint16 chunk_usage = entry->chunk_usage[chunkno];
401 : 35 : BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
402 : 35 : unsigned start_offset = 0;
403 : 35 : unsigned stop_offset = BLOCKS_PER_CHUNK;
404 : :
405 : : /*
406 : : * If the start and/or stop block number falls within this chunk, the
407 : : * whole chunk may not be of interest. Figure out which portion we
408 : : * care about, if it's not the whole thing.
409 : : */
410 [ + - ]: 35 : if (chunkno == start_chunkno)
411 : 35 : start_offset = start_blkno % BLOCKS_PER_CHUNK;
412 [ + - ]: 35 : if (chunkno == stop_chunkno - 1)
413 : : {
785 414 [ - + ]: 35 : Assert(stop_blkno > chunkno * BLOCKS_PER_CHUNK);
415 : 35 : stop_offset = stop_blkno - (chunkno * BLOCKS_PER_CHUNK);
416 [ - + ]: 35 : Assert(stop_offset <= BLOCKS_PER_CHUNK);
417 : : }
418 : :
419 : : /*
420 : : * Handling differs depending on whether this is an array of offsets
421 : : * or a bitmap.
422 : : */
892 423 [ - + ]: 35 : if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
424 : : {
425 : : unsigned i;
426 : :
427 : : /* It's a bitmap, so test every relevant bit. */
892 rhaas@postgresql.org 428 [ # # ]:UBC 0 : for (i = start_offset; i < stop_offset; ++i)
429 : : {
430 : 0 : uint16 w = chunk_data[i / BLOCKS_PER_ENTRY];
431 : :
432 [ # # ]: 0 : if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
433 : : {
434 : 0 : BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;
435 : :
436 : 0 : blocks[nresults++] = blkno;
437 : :
438 : : /* Early exit if we run out of output space. */
439 [ # # ]: 0 : if (nresults == nblocks)
440 : 0 : return nresults;
441 : : }
442 : : }
443 : : }
444 : : else
445 : : {
446 : : unsigned i;
447 : :
448 : : /* It's an array of offsets, so check each one. */
892 rhaas@postgresql.org 449 [ + + ]:CBC 113 : for (i = 0; i < chunk_usage; ++i)
450 : : {
451 : 78 : uint16 offset = chunk_data[i];
452 : :
453 [ + - + - ]: 78 : if (offset >= start_offset && offset < stop_offset)
454 : : {
455 : 78 : BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;
456 : :
457 : 78 : blocks[nresults++] = blkno;
458 : :
459 : : /* Early exit if we run out of output space. */
460 [ - + ]: 78 : if (nresults == nblocks)
892 rhaas@postgresql.org 461 :UBC 0 : return nresults;
462 : : }
463 : : }
464 : : }
465 : : }
466 : :
892 rhaas@postgresql.org 467 :CBC 36 : return nresults;
468 : : }
469 : :
470 : : /*
471 : : * Serialize a block reference table to a file.
472 : : */
473 : : void
474 : 18 : WriteBlockRefTable(BlockRefTable *brtab,
475 : : io_callback_fn write_callback,
476 : : void *write_callback_arg)
477 : : {
478 : 18 : BlockRefTableSerializedEntry *sdata = NULL;
479 : : BlockRefTableBuffer buffer;
480 : 18 : uint32 magic = BLOCKREFTABLE_MAGIC;
481 : :
482 : : /* Prepare buffer. */
483 : 18 : memset(&buffer, 0, sizeof(BlockRefTableBuffer));
484 : 18 : buffer.io_callback = write_callback;
485 : 18 : buffer.io_callback_arg = write_callback_arg;
486 : 18 : INIT_CRC32C(buffer.crc);
487 : :
488 : : /* Write magic number. */
489 : 18 : BlockRefTableWrite(&buffer, &magic, sizeof(uint32));
490 : :
491 : : /* Write the entries, assuming there are some. */
492 [ + + ]: 18 : if (brtab->hash->members > 0)
493 : : {
494 : 15 : unsigned i = 0;
495 : : blockreftable_iterator it;
496 : : BlockRefTableEntry *brtentry;
497 : :
498 : : /* Extract entries into serializable format and sort them. */
499 : : sdata =
172 michael@paquier.xyz 500 : 15 : palloc_array(BlockRefTableSerializedEntry, brtab->hash->members);
892 rhaas@postgresql.org 501 : 15 : blockreftable_start_iterate(brtab->hash, &it);
502 [ + + ]: 690 : while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
503 : : {
504 : 675 : BlockRefTableSerializedEntry *sentry = &sdata[i++];
505 : :
506 : 675 : sentry->rlocator = brtentry->key.rlocator;
507 : 675 : sentry->forknum = brtentry->key.forknum;
508 : 675 : sentry->limit_block = brtentry->limit_block;
509 : 675 : sentry->nchunks = brtentry->nchunks;
510 : :
511 : : /* trim trailing zero entries */
512 [ + + ]: 10606 : while (sentry->nchunks > 0 &&
513 [ + + ]: 10592 : brtentry->chunk_usage[sentry->nchunks - 1] == 0)
514 : 9931 : sentry->nchunks--;
515 : : }
516 [ - + ]: 15 : Assert(i == brtab->hash->members);
517 : 15 : qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
518 : : BlockRefTableComparator);
519 : :
520 : : /* Loop over entries in sorted order and serialize each one. */
521 [ + + ]: 690 : for (i = 0; i < brtab->hash->members; ++i)
522 : : {
523 : 675 : BlockRefTableSerializedEntry *sentry = &sdata[i];
436 peter@eisentraut.org 524 : 675 : BlockRefTableKey key = {0}; /* make sure any padding is zero */
525 : : unsigned j;
526 : :
527 : : /* Write the serialized entry itself. */
892 rhaas@postgresql.org 528 : 675 : BlockRefTableWrite(&buffer, sentry,
529 : : sizeof(BlockRefTableSerializedEntry));
530 : :
531 : : /* Look up the original entry so we can access the chunks. */
532 : 675 : memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
533 : 675 : key.forknum = sentry->forknum;
534 : 675 : brtentry = blockreftable_lookup(brtab->hash, key);
535 [ - + ]: 675 : Assert(brtentry != NULL);
536 : :
537 : : /* Write the untruncated portion of the chunk length array. */
538 [ + + ]: 675 : if (sentry->nchunks != 0)
539 : 661 : BlockRefTableWrite(&buffer, brtentry->chunk_usage,
540 : 661 : sentry->nchunks * sizeof(uint16));
541 : :
542 : : /* Write the contents of each chunk. */
543 [ + + ]: 11267 : for (j = 0; j < brtentry->nchunks; ++j)
544 : : {
545 [ + + ]: 10592 : if (brtentry->chunk_usage[j] == 0)
546 : 9931 : continue;
547 : 661 : BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
548 : 661 : brtentry->chunk_usage[j] * sizeof(uint16));
549 : : }
550 : : }
551 : : }
552 : :
553 : : /* Write out appropriate terminator and CRC and flush buffer. */
554 : 18 : BlockRefTableFileTerminate(&buffer);
555 : 18 : }
556 : :
557 : : /*
558 : : * Prepare to incrementally read a block reference table file.
559 : : *
560 : : * 'read_callback' is a function that can be called to read data from the
561 : : * underlying file (or other data source) into our internal buffer.
562 : : *
563 : : * 'read_callback_arg' is an opaque argument to be passed to read_callback.
564 : : *
565 : : * 'error_filename' is the filename that should be included in error messages
566 : : * if the file is found to be malformed. The value is not copied, so the
567 : : * caller should ensure that it remains valid until done with this
568 : : * BlockRefTableReader.
569 : : *
570 : : * 'error_callback' is a function to be called if the file is found to be
571 : : * malformed. This is not used for I/O errors, which must be handled internally
572 : : * by read_callback.
573 : : *
574 : : * 'error_callback_arg' is an opaque argument to be passed to error_callback.
575 : : */
576 : : BlockRefTableReader *
577 : 29 : CreateBlockRefTableReader(io_callback_fn read_callback,
578 : : void *read_callback_arg,
579 : : char *error_filename,
580 : : report_error_fn error_callback,
581 : : void *error_callback_arg)
582 : : {
583 : : BlockRefTableReader *reader;
584 : : uint32 magic;
585 : :
586 : : /* Initialize data structure. */
172 michael@paquier.xyz 587 :GNC 29 : reader = palloc0_object(BlockRefTableReader);
892 rhaas@postgresql.org 588 :CBC 29 : reader->buffer.io_callback = read_callback;
589 : 29 : reader->buffer.io_callback_arg = read_callback_arg;
590 : 29 : reader->error_filename = error_filename;
591 : 29 : reader->error_callback = error_callback;
592 : 29 : reader->error_callback_arg = error_callback_arg;
593 : 29 : INIT_CRC32C(reader->buffer.crc);
594 : :
595 : : /* Verify magic number. */
596 : 29 : BlockRefTableRead(reader, &magic, sizeof(uint32));
597 [ - + ]: 29 : if (magic != BLOCKREFTABLE_MAGIC)
892 rhaas@postgresql.org 598 :UBC 0 : error_callback(error_callback_arg,
599 : : "file \"%s\" has wrong magic number: expected %u, found %u",
600 : : error_filename,
601 : : BLOCKREFTABLE_MAGIC, magic);
602 : :
892 rhaas@postgresql.org 603 :CBC 29 : return reader;
604 : : }
605 : :
606 : : /*
607 : : * Read next relation fork covered by this block reference table file.
608 : : *
609 : : * After calling this function, you must call BlockRefTableReaderGetBlocks
610 : : * until it returns 0 before calling it again.
611 : : */
612 : : bool
613 : 740 : BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
614 : : RelFileLocator *rlocator,
615 : : ForkNumber *forknum,
616 : : BlockNumber *limit_block)
617 : : {
618 : : BlockRefTableSerializedEntry sentry;
436 peter@eisentraut.org 619 : 740 : BlockRefTableSerializedEntry zentry = {0};
620 : :
621 : : /*
622 : : * Sanity check: caller must read all blocks from all chunks before moving
623 : : * on to the next relation.
624 : : */
892 rhaas@postgresql.org 625 [ - + ]: 740 : Assert(reader->total_chunks == reader->consumed_chunks);
626 : :
627 : : /* Read serialized entry. */
628 : 740 : BlockRefTableRead(reader, &sentry,
629 : : sizeof(BlockRefTableSerializedEntry));
630 : :
631 : : /*
632 : : * If we just read the sentinel entry indicating that we've reached the
633 : : * end, read and check the CRC.
634 : : */
635 [ + + ]: 740 : if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
636 : : {
637 : : pg_crc32c expected_crc;
638 : : pg_crc32c actual_crc;
639 : :
640 : : /*
641 : : * We want to know the CRC of the file excluding the 4-byte CRC
642 : : * itself, so copy the current value of the CRC accumulator before
643 : : * reading those bytes, and use the copy to finalize the calculation.
644 : : */
645 : 29 : expected_crc = reader->buffer.crc;
646 : 29 : FIN_CRC32C(expected_crc);
647 : :
648 : : /* Now we can read the actual value. */
649 : 29 : BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));
650 : :
651 : : /* Throw an error if there is a mismatch. */
652 [ - + ]: 29 : if (!EQ_CRC32C(expected_crc, actual_crc))
892 rhaas@postgresql.org 653 :UBC 0 : reader->error_callback(reader->error_callback_arg,
654 : : "file \"%s\" has wrong checksum: expected %08X, found %08X",
655 : : reader->error_filename, expected_crc, actual_crc);
656 : :
892 rhaas@postgresql.org 657 :CBC 29 : return false;
658 : : }
659 : :
660 : : /*
661 : : * Sanity-check the nchunks value. In the backend, palloc_array would
662 : : * enforce this anyway (with a more generic error message); but in
663 : : * frontend it would not, potentially allowing BlockRefTableRead's length
664 : : * parameter to overflow.
665 : : */
19 tgl@sss.pgh.pa.us 666 [ - + ]: 711 : if (sentry.nchunks > MaxAllocSize / sizeof(uint16))
667 : : {
19 tgl@sss.pgh.pa.us 668 :UBC 0 : reader->error_callback(reader->error_callback_arg,
669 : : "file \"%s\" has oversized chunk size array",
670 : : reader->error_filename);
671 : 0 : return false;
672 : : }
673 : :
674 : : /* Read chunk size array. */
892 rhaas@postgresql.org 675 [ + + ]:CBC 711 : if (reader->chunk_size != NULL)
676 : 691 : pfree(reader->chunk_size);
172 michael@paquier.xyz 677 : 711 : reader->chunk_size = palloc_array(uint16, sentry.nchunks);
892 rhaas@postgresql.org 678 : 711 : BlockRefTableRead(reader, reader->chunk_size,
679 : 711 : sentry.nchunks * sizeof(uint16));
680 : :
681 : : /* Set up for chunk scan. */
682 : 711 : reader->total_chunks = sentry.nchunks;
683 : 711 : reader->consumed_chunks = 0;
684 : :
685 : : /* Return data to caller. */
686 : 711 : memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
687 : 711 : *forknum = sentry.forknum;
688 : 711 : *limit_block = sentry.limit_block;
689 : 711 : return true;
690 : : }
691 : :
692 : : /*
693 : : * Get modified blocks associated with the relation fork returned by
694 : : * the most recent call to BlockRefTableReaderNextRelation.
695 : : *
696 : : * On return, block numbers will be written into the 'blocks' array, whose
697 : : * length should be passed via 'nblocks'. The return value is the number of
698 : : * entries actually written into the 'blocks' array, which may be less than
699 : : * 'nblocks' if we run out of modified blocks in the relation fork before
700 : : * we run out of room in the array.
701 : : */
702 : : unsigned
703 : 1309 : BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
704 : : BlockNumber *blocks,
705 : : int nblocks)
706 : : {
707 : 1309 : unsigned blocks_found = 0;
708 : :
709 : : /* Must provide space for at least one block number to be returned. */
710 [ + - ]: 1309 : Assert(nblocks > 0);
711 : :
712 : : /* Loop collecting blocks to return to caller. */
713 : : for (;;)
714 : 599 : {
715 : : uint16 next_chunk_size;
716 : :
717 : : /*
718 : : * If we've read at least one chunk, maybe it contains some block
719 : : * numbers that could satisfy caller's request.
720 : : */
721 [ + + ]: 1908 : if (reader->consumed_chunks > 0)
722 : : {
723 : 1197 : uint32 chunkno = reader->consumed_chunks - 1;
724 : 1197 : uint16 chunk_size = reader->chunk_size[chunkno];
725 : :
726 [ - + ]: 1197 : if (chunk_size == MAX_ENTRIES_PER_CHUNK)
727 : : {
728 : : /* Bitmap format, so search for bits that are set. */
892 rhaas@postgresql.org 729 [ # # ]:UBC 0 : while (reader->chunk_position < BLOCKS_PER_CHUNK &&
730 [ # # ]: 0 : blocks_found < nblocks)
731 : : {
732 : 0 : uint16 chunkoffset = reader->chunk_position;
733 : : uint16 w;
734 : :
735 : 0 : w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
736 [ # # ]: 0 : if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
737 : 0 : blocks[blocks_found++] =
738 : 0 : chunkno * BLOCKS_PER_CHUNK + chunkoffset;
739 : 0 : ++reader->chunk_position;
740 : : }
741 : : }
742 : : else
743 : : {
744 : : /* Not in bitmap format, so each entry is a 2-byte offset. */
892 rhaas@postgresql.org 745 [ + + ]:CBC 3217 : while (reader->chunk_position < chunk_size &&
746 [ + - ]: 2020 : blocks_found < nblocks)
747 : : {
748 : 2020 : blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
749 : 2020 : + reader->chunk_data[reader->chunk_position];
750 : 2020 : ++reader->chunk_position;
751 : : }
752 : : }
753 : : }
754 : :
755 : : /* We found enough blocks, so we're done. */
756 [ - + ]: 1908 : if (blocks_found >= nblocks)
892 rhaas@postgresql.org 757 :UBC 0 : break;
758 : :
759 : : /*
760 : : * We didn't find enough blocks, so we must need the next chunk. If
761 : : * there are none left, though, then we're done anyway.
762 : : */
892 rhaas@postgresql.org 763 [ + + ]:CBC 1908 : if (reader->consumed_chunks == reader->total_chunks)
764 : 1309 : break;
765 : :
766 : : /*
767 : : * Read data for next chunk and reset scan position to beginning of
768 : : * chunk. Note that the next chunk might be empty, in which case we
769 : : * consume the chunk without actually consuming any bytes from the
770 : : * underlying file.
771 : : */
772 : 599 : next_chunk_size = reader->chunk_size[reader->consumed_chunks];
773 [ + - ]: 599 : if (next_chunk_size > 0)
774 : 599 : BlockRefTableRead(reader, reader->chunk_data,
775 : 599 : next_chunk_size * sizeof(uint16));
776 : 599 : ++reader->consumed_chunks;
777 : 599 : reader->chunk_position = 0;
778 : : }
779 : :
780 : 1309 : return blocks_found;
781 : : }
782 : :
783 : : /*
784 : : * Release memory used while reading a block reference table from a file.
785 : : */
786 : : void
787 : 29 : DestroyBlockRefTableReader(BlockRefTableReader *reader)
788 : : {
789 [ + + ]: 29 : if (reader->chunk_size != NULL)
790 : : {
791 : 20 : pfree(reader->chunk_size);
792 : 20 : reader->chunk_size = NULL;
793 : : }
794 : 29 : pfree(reader);
795 : 29 : }
796 : :
797 : : /*
798 : : * Prepare to write a block reference table file incrementally.
799 : : *
800 : : * Caller must be able to supply BlockRefTableEntry objects sorted in the
801 : : * appropriate order.
802 : : */
803 : : BlockRefTableWriter *
892 rhaas@postgresql.org 804 :UBC 0 : CreateBlockRefTableWriter(io_callback_fn write_callback,
805 : : void *write_callback_arg)
806 : : {
807 : : BlockRefTableWriter *writer;
808 : 0 : uint32 magic = BLOCKREFTABLE_MAGIC;
809 : :
810 : : /* Prepare buffer and CRC check and save callbacks. */
172 michael@paquier.xyz 811 :UNC 0 : writer = palloc0_object(BlockRefTableWriter);
892 rhaas@postgresql.org 812 :UBC 0 : writer->buffer.io_callback = write_callback;
813 : 0 : writer->buffer.io_callback_arg = write_callback_arg;
814 : 0 : INIT_CRC32C(writer->buffer.crc);
815 : :
816 : : /* Write magic number. */
817 : 0 : BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));
818 : :
819 : 0 : return writer;
820 : : }
821 : :
822 : : /*
823 : : * Append one entry to a block reference table file.
824 : : *
825 : : * Note that entries must be written in the proper order, that is, sorted by
826 : : * tablespace, then database, then relfilenumber, then fork number. Caller
827 : : * is responsible for supplying data in the correct order. If that seems hard,
828 : : * use an in-memory BlockRefTable instead.
829 : : */
830 : : void
831 : 0 : BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
832 : : {
833 : : BlockRefTableSerializedEntry sentry;
834 : : unsigned j;
835 : :
836 : : /* Convert to serialized entry format. */
837 : 0 : sentry.rlocator = entry->key.rlocator;
838 : 0 : sentry.forknum = entry->key.forknum;
839 : 0 : sentry.limit_block = entry->limit_block;
840 : 0 : sentry.nchunks = entry->nchunks;
841 : :
842 : : /* Trim trailing zero entries. */
843 [ # # # # ]: 0 : while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
844 : 0 : sentry.nchunks--;
845 : :
846 : : /* Write the serialized entry itself. */
847 : 0 : BlockRefTableWrite(&writer->buffer, &sentry,
848 : : sizeof(BlockRefTableSerializedEntry));
849 : :
850 : : /* Write the untruncated portion of the chunk length array. */
851 [ # # ]: 0 : if (sentry.nchunks != 0)
852 : 0 : BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
853 : 0 : sentry.nchunks * sizeof(uint16));
854 : :
855 : : /* Write the contents of each chunk. */
856 [ # # ]: 0 : for (j = 0; j < entry->nchunks; ++j)
857 : : {
858 [ # # ]: 0 : if (entry->chunk_usage[j] == 0)
859 : 0 : continue;
860 : 0 : BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
861 : 0 : entry->chunk_usage[j] * sizeof(uint16));
862 : : }
863 : 0 : }
864 : :
865 : : /*
866 : : * Finalize an incremental write of a block reference table file.
867 : : */
868 : : void
869 : 0 : DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
870 : : {
871 : 0 : BlockRefTableFileTerminate(&writer->buffer);
872 : 0 : pfree(writer);
873 : 0 : }
874 : :
875 : : /*
876 : : * Allocate a standalone BlockRefTableEntry.
877 : : *
878 : : * When we're manipulating a full in-memory BlockRefTable, the entries are
879 : : * part of the hash table and are allocated by simplehash. This routine is
880 : : * used by callers that want to write out a BlockRefTable to a file without
881 : : * needing to store the whole thing in memory at once.
882 : : *
883 : : * Entries allocated by this function can be manipulated using the functions
884 : : * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
885 : : * and then written using BlockRefTableWriteEntry and freed using
886 : : * BlockRefTableFreeEntry.
887 : : */
888 : : BlockRefTableEntry *
889 : 0 : CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
890 : : {
172 michael@paquier.xyz 891 :UNC 0 : BlockRefTableEntry *entry = palloc0_object(BlockRefTableEntry);
892 : :
892 rhaas@postgresql.org 893 :UBC 0 : memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
894 : 0 : entry->key.forknum = forknum;
895 : 0 : entry->limit_block = InvalidBlockNumber;
896 : :
897 : 0 : return entry;
898 : : }
899 : :
900 : : /*
901 : : * Update a BlockRefTableEntry with a new value for the "limit block" and
902 : : * forget any equal-or-higher-numbered modified blocks.
903 : : *
904 : : * The "limit block" is the shortest known length of the relation within the
905 : : * range of WAL records covered by this block reference table.
906 : : */
907 : : void
892 rhaas@postgresql.org 908 :CBC 6 : BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry,
909 : : BlockNumber limit_block)
910 : : {
911 : : unsigned chunkno;
912 : : unsigned limit_chunkno;
913 : : unsigned limit_chunkoffset;
914 : : BlockRefTableChunk limit_chunk;
915 : :
916 : : /* If we already have an equal or lower limit block, do nothing. */
917 [ + + ]: 6 : if (limit_block >= entry->limit_block)
918 : 4 : return;
919 : :
920 : : /* Record the new limit block value. */
921 : 2 : entry->limit_block = limit_block;
922 : :
923 : : /*
924 : : * Figure out which chunk would store the state of the new limit block,
925 : : * and which offset within that chunk.
926 : : */
927 : 2 : limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
928 : 2 : limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;
929 : :
930 : : /*
931 : : * If the number of chunks is not large enough for any blocks with equal
932 : : * or higher block numbers to exist, then there is nothing further to do.
933 : : */
934 [ - + ]: 2 : if (limit_chunkno >= entry->nchunks)
892 rhaas@postgresql.org 935 :UBC 0 : return;
936 : :
937 : : /* Discard entire contents of any higher-numbered chunks. */
892 rhaas@postgresql.org 938 [ + + ]:CBC 32 : for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
939 : 30 : entry->chunk_usage[chunkno] = 0;
940 : :
941 : : /*
942 : : * Next, we need to discard any offsets within the chunk that would
943 : : * contain the limit_block. We must handle this differently depending on
944 : : * whether the chunk that would contain limit_block is a bitmap or an
945 : : * array of offsets.
946 : : */
947 : 2 : limit_chunk = entry->chunk_data[limit_chunkno];
948 [ - + ]: 2 : if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
949 : : {
950 : : unsigned chunkoffset;
951 : :
952 : : /* It's a bitmap. Unset bits. */
892 rhaas@postgresql.org 953 [ # # ]:UBC 0 : for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
954 : 0 : ++chunkoffset)
955 : 0 : limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
956 : 0 : ~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
957 : : }
958 : : else
959 : : {
960 : : unsigned i,
892 rhaas@postgresql.org 961 :CBC 2 : j = 0;
962 : :
963 : : /* It's an offset array. Filter out large offsets. */
964 [ + + ]: 4 : for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
965 : : {
966 [ - + ]: 2 : Assert(j <= i);
967 [ + + ]: 2 : if (limit_chunk[i] < limit_chunkoffset)
968 : 1 : limit_chunk[j++] = limit_chunk[i];
969 : : }
970 [ - + ]: 2 : Assert(j <= entry->chunk_usage[limit_chunkno]);
971 : 2 : entry->chunk_usage[limit_chunkno] = j;
972 : : }
973 : : }
974 : :
975 : : /*
976 : : * Mark a block in a given BlockRefTableEntry as known to have been modified.
977 : : */
978 : : void
979 : 55469 : BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
980 : : ForkNumber forknum,
981 : : BlockNumber blknum)
982 : : {
983 : : unsigned chunkno;
984 : : unsigned chunkoffset;
985 : : unsigned i;
986 : :
987 : : /*
988 : : * Which chunk should store the state of this block? And what is the
989 : : * offset of this block relative to the start of that chunk?
990 : : */
991 : 55469 : chunkno = blknum / BLOCKS_PER_CHUNK;
992 : 55469 : chunkoffset = blknum % BLOCKS_PER_CHUNK;
993 : :
994 : : /*
995 : : * If 'nchunks' isn't big enough for us to be able to represent the state
996 : : * of this block, we need to enlarge our arrays.
997 : : */
998 [ + + ]: 55469 : if (chunkno >= entry->nchunks)
999 : : {
1000 : : unsigned max_chunks;
1001 : : unsigned extra_chunks;
1002 : :
1003 : : /*
1004 : : * New array size is a power of 2, at least 16, big enough so that
1005 : : * chunkno will be a valid array index.
1006 : : */
1007 : 928 : max_chunks = Max(16, entry->nchunks);
1008 [ - + ]: 928 : while (max_chunks < chunkno + 1)
870 rhaas@postgresql.org 1009 :UBC 0 : max_chunks *= 2;
892 rhaas@postgresql.org 1010 :CBC 928 : extra_chunks = max_chunks - entry->nchunks;
1011 : :
1012 [ + - ]: 928 : if (entry->nchunks == 0)
1013 : : {
172 michael@paquier.xyz 1014 : 928 : entry->chunk_size = palloc0_array(uint16, max_chunks);
1015 : 928 : entry->chunk_usage = palloc0_array(uint16, max_chunks);
1016 : 928 : entry->chunk_data = palloc0_array(BlockRefTableChunk, max_chunks);
1017 : : }
1018 : : else
1019 : : {
892 rhaas@postgresql.org 1020 :UBC 0 : entry->chunk_size = repalloc(entry->chunk_size,
1021 : : sizeof(uint16) * max_chunks);
1022 : 0 : memset(&entry->chunk_size[entry->nchunks], 0,
1023 : : extra_chunks * sizeof(uint16));
1024 : 0 : entry->chunk_usage = repalloc(entry->chunk_usage,
1025 : : sizeof(uint16) * max_chunks);
1026 : 0 : memset(&entry->chunk_usage[entry->nchunks], 0,
1027 : : extra_chunks * sizeof(uint16));
1028 : 0 : entry->chunk_data = repalloc(entry->chunk_data,
1029 : : sizeof(BlockRefTableChunk) * max_chunks);
1030 : 0 : memset(&entry->chunk_data[entry->nchunks], 0,
1031 : : extra_chunks * sizeof(BlockRefTableChunk));
1032 : : }
892 rhaas@postgresql.org 1033 :CBC 928 : entry->nchunks = max_chunks;
1034 : : }
1035 : :
1036 : : /*
1037 : : * If the chunk that covers this block number doesn't exist yet, create it
1038 : : * as an array and add the appropriate offset to it. We make it pretty
1039 : : * small initially, because there might only be 1 or a few block
1040 : : * references in this chunk and we don't want to use up too much memory.
1041 : : */
1042 [ + + ]: 55469 : if (entry->chunk_size[chunkno] == 0)
1043 : : {
1044 : 1856 : entry->chunk_data[chunkno] =
172 michael@paquier.xyz 1045 : 928 : palloc_array(uint16, INITIAL_ENTRIES_PER_CHUNK);
892 rhaas@postgresql.org 1046 : 928 : entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
1047 : 928 : entry->chunk_data[chunkno][0] = chunkoffset;
1048 : 928 : entry->chunk_usage[chunkno] = 1;
1049 : 928 : return;
1050 : : }
1051 : :
1052 : : /*
1053 : : * If the number of entries in this chunk is already maximum, it must be a
1054 : : * bitmap. Just set the appropriate bit.
1055 : : */
1056 [ - + ]: 54541 : if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
1057 : : {
892 rhaas@postgresql.org 1058 :UBC 0 : BlockRefTableChunk chunk = entry->chunk_data[chunkno];
1059 : :
1060 : 0 : chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
1061 : 0 : 1 << (chunkoffset % BLOCKS_PER_ENTRY);
1062 : 0 : return;
1063 : : }
1064 : :
1065 : : /*
1066 : : * There is an existing chunk and it's in array format. Let's find out
1067 : : * whether it already has an entry for this block. If so, we do not need
1068 : : * to do anything.
1069 : : */
892 rhaas@postgresql.org 1070 [ + + ]:CBC 365120 : for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
1071 : : {
1072 [ + + ]: 362980 : if (entry->chunk_data[chunkno][i] == chunkoffset)
1073 : 52401 : return;
1074 : : }
1075 : :
1076 : : /*
1077 : : * If the number of entries currently used is one less than the maximum,
1078 : : * it's time to convert to bitmap format.
1079 : : */
1080 [ - + ]: 2140 : if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
1081 : : {
1082 : : BlockRefTableChunk newchunk;
1083 : : unsigned j;
1084 : :
1085 : : /* Allocate a new chunk. */
892 rhaas@postgresql.org 1086 :UBC 0 : newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));
1087 : :
1088 : : /* Set the bit for each existing entry. */
1089 [ # # ]: 0 : for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
1090 : : {
1091 : 0 : unsigned coff = entry->chunk_data[chunkno][j];
1092 : :
1093 : 0 : newchunk[coff / BLOCKS_PER_ENTRY] |=
1094 : 0 : 1 << (coff % BLOCKS_PER_ENTRY);
1095 : : }
1096 : :
1097 : : /* Set the bit for the new entry. */
1098 : 0 : newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
1099 : 0 : 1 << (chunkoffset % BLOCKS_PER_ENTRY);
1100 : :
1101 : : /* Swap the new chunk into place and update metadata. */
1102 : 0 : pfree(entry->chunk_data[chunkno]);
1103 : 0 : entry->chunk_data[chunkno] = newchunk;
1104 : 0 : entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
1105 : 0 : entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
1106 : 0 : return;
1107 : : }
1108 : :
1109 : : /*
1110 : : * OK, we currently have an array, and we don't need to convert to a
1111 : : * bitmap, but we do need to add a new element. If there's not enough
1112 : : * room, we'll have to expand the array.
1113 : : */
892 rhaas@postgresql.org 1114 [ + + ]:CBC 2140 : if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
1115 : : {
1116 : 48 : unsigned newsize = entry->chunk_size[chunkno] * 2;
1117 : :
1118 [ - + ]: 48 : Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
1119 : 48 : entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
1120 : : newsize * sizeof(uint16));
1121 : 48 : entry->chunk_size[chunkno] = newsize;
1122 : : }
1123 : :
1124 : : /* Now we can add the new entry. */
1125 : 2140 : entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] =
1126 : : chunkoffset;
1127 : 2140 : entry->chunk_usage[chunkno]++;
1128 : : }
1129 : :
1130 : : /*
1131 : : * Release memory for a BlockRefTableEntry that was created by
1132 : : * CreateBlockRefTableEntry.
1133 : : */
1134 : : void
892 rhaas@postgresql.org 1135 :UBC 0 : BlockRefTableFreeEntry(BlockRefTableEntry *entry)
1136 : : {
1137 [ # # ]: 0 : if (entry->chunk_size != NULL)
1138 : : {
1139 : 0 : pfree(entry->chunk_size);
1140 : 0 : entry->chunk_size = NULL;
1141 : : }
1142 : :
1143 [ # # ]: 0 : if (entry->chunk_usage != NULL)
1144 : : {
1145 : 0 : pfree(entry->chunk_usage);
1146 : 0 : entry->chunk_usage = NULL;
1147 : : }
1148 : :
1149 [ # # ]: 0 : if (entry->chunk_data != NULL)
1150 : : {
1151 : 0 : pfree(entry->chunk_data);
1152 : 0 : entry->chunk_data = NULL;
1153 : : }
1154 : :
1155 : 0 : pfree(entry);
1156 : 0 : }
1157 : :
1158 : : /*
1159 : : * Comparator for BlockRefTableSerializedEntry objects.
1160 : : *
1161 : : * We make the tablespace OID the first column of the sort key to match
1162 : : * the on-disk tree structure.
1163 : : */
1164 : : static int
892 rhaas@postgresql.org 1165 :CBC 4806 : BlockRefTableComparator(const void *a, const void *b)
1166 : : {
1167 : 4806 : const BlockRefTableSerializedEntry *sa = a;
1168 : 4806 : const BlockRefTableSerializedEntry *sb = b;
1169 : :
1170 [ + + ]: 4806 : if (sa->rlocator.spcOid > sb->rlocator.spcOid)
1171 : 152 : return 1;
1172 [ + + ]: 4654 : if (sa->rlocator.spcOid < sb->rlocator.spcOid)
1173 : 130 : return -1;
1174 : :
1175 [ - + ]: 4524 : if (sa->rlocator.dbOid > sb->rlocator.dbOid)
892 rhaas@postgresql.org 1176 :UBC 0 : return 1;
892 rhaas@postgresql.org 1177 [ - + ]:CBC 4524 : if (sa->rlocator.dbOid < sb->rlocator.dbOid)
892 rhaas@postgresql.org 1178 :UBC 0 : return -1;
1179 : :
892 rhaas@postgresql.org 1180 [ + + ]:CBC 4524 : if (sa->rlocator.relNumber > sb->rlocator.relNumber)
1181 : 2258 : return 1;
1182 [ + + ]: 2266 : if (sa->rlocator.relNumber < sb->rlocator.relNumber)
1183 : 2159 : return -1;
1184 : :
1185 [ + + ]: 107 : if (sa->forknum > sb->forknum)
1186 : 57 : return 1;
1187 [ + - ]: 50 : if (sa->forknum < sb->forknum)
1188 : 50 : return -1;
1189 : :
892 rhaas@postgresql.org 1190 :UBC 0 : return 0;
1191 : : }
1192 : :
1193 : : /*
1194 : : * Flush any buffered data out of a BlockRefTableBuffer.
1195 : : */
1196 : : static void
892 rhaas@postgresql.org 1197 :CBC 18 : BlockRefTableFlush(BlockRefTableBuffer *buffer)
1198 : : {
1199 : 18 : buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
1200 : 18 : buffer->used = 0;
1201 : 18 : }
1202 : :
1203 : : /*
1204 : : * Read data from a BlockRefTableBuffer, and update the running CRC
1205 : : * calculation for the returned data (but not any data that we may have
1206 : : * buffered but not yet actually returned).
1207 : : */
1208 : : static void
1209 : 2108 : BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
1210 : : {
1211 : 2108 : BlockRefTableBuffer *buffer = &reader->buffer;
1212 : :
1213 : : /* Loop until read is fully satisfied. */
1214 [ + + ]: 4133 : while (length > 0)
1215 : : {
1216 [ + + ]: 2025 : if (buffer->cursor < buffer->used)
1217 : : {
1218 : : /*
1219 : : * If any buffered data is available, use that to satisfy as much
1220 : : * of the request as possible.
1221 : : */
1222 : 1996 : int bytes_to_copy = Min(length, buffer->used - buffer->cursor);
1223 : :
1224 : 1996 : memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
1225 : 1996 : COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
1226 : : bytes_to_copy);
1227 : 1996 : buffer->cursor += bytes_to_copy;
1228 : 1996 : data = ((char *) data) + bytes_to_copy;
1229 : 1996 : length -= bytes_to_copy;
1230 : : }
1231 [ - + ]: 29 : else if (length >= BUFSIZE)
1232 : : {
1233 : : /*
1234 : : * If the request length is long, read directly into caller's
1235 : : * buffer.
1236 : : */
1237 : : int bytes_read;
1238 : :
892 rhaas@postgresql.org 1239 :UBC 0 : bytes_read = buffer->io_callback(buffer->io_callback_arg,
1240 : : data, length);
1241 : 0 : COMP_CRC32C(buffer->crc, data, bytes_read);
1242 : 0 : data = ((char *) data) + bytes_read;
1243 : 0 : length -= bytes_read;
1244 : :
1245 : : /* If we didn't get anything, that's bad. */
1246 [ # # ]: 0 : if (bytes_read == 0)
1247 : 0 : reader->error_callback(reader->error_callback_arg,
1248 : : "file \"%s\" ends unexpectedly",
1249 : : reader->error_filename);
1250 : : }
1251 : : else
1252 : : {
1253 : : /*
1254 : : * Refill our buffer.
1255 : : */
892 rhaas@postgresql.org 1256 :CBC 58 : buffer->used = buffer->io_callback(buffer->io_callback_arg,
1257 : 29 : buffer->data, BUFSIZE);
1258 : 29 : buffer->cursor = 0;
1259 : :
1260 : : /* If we didn't get anything, that's bad. */
1261 [ - + ]: 29 : if (buffer->used == 0)
892 rhaas@postgresql.org 1262 :UBC 0 : reader->error_callback(reader->error_callback_arg,
1263 : : "file \"%s\" ends unexpectedly",
1264 : : reader->error_filename);
1265 : : }
1266 : : }
892 rhaas@postgresql.org 1267 :CBC 2108 : }
1268 : :
1269 : : /*
1270 : : * Supply data to a BlockRefTableBuffer for write to the underlying File,
1271 : : * and update the running CRC calculation for that data.
1272 : : */
1273 : : static void
1274 : 2051 : BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
1275 : : {
1276 : : /* Update running CRC calculation. */
1277 : 2051 : COMP_CRC32C(buffer->crc, data, length);
1278 : :
1279 : : /* If the new data can't fit into the buffer, flush the buffer. */
1280 [ - + ]: 2051 : if (buffer->used + length > BUFSIZE)
1281 : : {
892 rhaas@postgresql.org 1282 :UBC 0 : buffer->io_callback(buffer->io_callback_arg, buffer->data,
1283 : : buffer->used);
1284 : 0 : buffer->used = 0;
1285 : : }
1286 : :
1287 : : /* If the new data would fill the buffer, or more, write it directly. */
892 rhaas@postgresql.org 1288 [ - + ]:CBC 2051 : if (length >= BUFSIZE)
1289 : : {
892 rhaas@postgresql.org 1290 :UBC 0 : buffer->io_callback(buffer->io_callback_arg, data, length);
1291 : 0 : return;
1292 : : }
1293 : :
1294 : : /* Otherwise, copy the new data into the buffer. */
892 rhaas@postgresql.org 1295 :CBC 2051 : memcpy(&buffer->data[buffer->used], data, length);
1296 : 2051 : buffer->used += length;
1297 [ - + ]: 2051 : Assert(buffer->used <= BUFSIZE);
1298 : : }
1299 : :
1300 : : /*
1301 : : * Generate the sentinel and CRC required at the end of a block reference
1302 : : * table file and flush them out of our internal buffer.
1303 : : */
1304 : : static void
1305 : 18 : BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
1306 : : {
436 peter@eisentraut.org 1307 : 18 : BlockRefTableSerializedEntry zentry = {0};
1308 : : pg_crc32c crc;
1309 : :
1310 : : /* Write a sentinel indicating that there are no more entries. */
892 rhaas@postgresql.org 1311 : 18 : BlockRefTableWrite(buffer, &zentry,
1312 : : sizeof(BlockRefTableSerializedEntry));
1313 : :
1314 : : /*
1315 : : * Writing the checksum will perturb the ongoing checksum calculation, so
1316 : : * copy the state first and finalize the computation using the copy.
1317 : : */
1318 : 18 : crc = buffer->crc;
1319 : 18 : FIN_CRC32C(crc);
1320 : 18 : BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));
1321 : :
1322 : : /* Flush any leftover data out of our buffer. */
1323 : 18 : BlockRefTableFlush(buffer);
1324 : 18 : }
|