Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * fd.c
4 : : * Virtual file descriptor code.
5 : : *
6 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : * IDENTIFICATION
10 : : * src/backend/storage/file/fd.c
11 : : *
12 : : * NOTES:
13 : : *
14 : : * This code manages a cache of 'virtual' file descriptors (VFDs).
15 : : * The server opens many file descriptors for a variety of reasons,
16 : : * including base tables, scratch files (e.g., sort and hash spool
17 : : * files), and random calls to C library routines like system(3); it
18 : : * is quite easy to exceed system limits on the number of open files a
19 : : * single process can have. (This is around 1024 on many modern
20 : : * operating systems, but may be lower on others.)
21 : : *
22 : : * VFDs are managed as an LRU pool, with actual OS file descriptors
23 : : * being opened and closed as needed. Obviously, if a file is
24 : : * opened using these interfaces, all subsequent operations must also
25 : : * be through these interfaces (the File type is not a real file
26 : : * descriptor).
27 : : *
28 : : * For this scheme to work, most (if not all) routines throughout the
29 : : * server should use these interfaces instead of calling the C library
30 : : * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 : : * may find ourselves short of real file descriptors anyway.
32 : : *
33 : : * INTERFACE ROUTINES
34 : : *
35 : : * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 : : * A File opened with OpenTemporaryFile is automatically deleted when the
37 : : * File is closed, either explicitly or implicitly at end of transaction or
38 : : * process exit. PathNameOpenFile is intended for files that are held open
39 : : * for a long time, like relation files. It is the caller's responsibility
40 : : * to close them, there is no automatic mechanism in fd.c for that.
41 : : *
42 : : * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 : : * temporary files that have names so that they can be shared between
44 : : * backends. Such files are automatically closed and count against the
45 : : * temporary file limit of the backend that creates them, but unlike anonymous
46 : : * files they are not automatically deleted. See sharedfileset.c for a shared
47 : : * ownership mechanism that provides automatic cleanup for shared files when
48 : : * the last of a group of backends detaches.
49 : : *
50 : : * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 : : * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 : : * They behave like the corresponding native functions, except that the handle
53 : : * is registered with the current subtransaction, and will be automatically
54 : : * closed at abort. These are intended mainly for short operations like
55 : : * reading a configuration file; there is a limit on the number of files that
56 : : * can be opened using these functions at any one time.
57 : : *
58 : : * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 : : * release file descriptors in use by the virtual file descriptors if
60 : : * necessary. There is no automatic cleanup of file descriptors returned by
61 : : * BasicOpenFile, it is solely the caller's responsibility to close the file
62 : : * descriptor by calling close(2).
63 : : *
64 : : * If a non-virtual file descriptor needs to be held open for any length of
65 : : * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 : : * (and eventually ReleaseExternalFD), so that we can take it into account
67 : : * while deciding how many VFDs can be open. This applies to FDs obtained
68 : : * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 : : *
70 : : *-------------------------------------------------------------------------
71 : : */
72 : :
73 : : #include "postgres.h"
74 : :
75 : : #include <dirent.h>
76 : : #include <sys/file.h>
77 : : #include <sys/param.h>
78 : : #include <sys/resource.h> /* for getrlimit */
79 : : #include <sys/stat.h>
80 : : #include <sys/types.h>
81 : : #ifndef WIN32
82 : : #include <sys/mman.h>
83 : : #endif
84 : : #include <limits.h>
85 : : #include <unistd.h>
86 : : #include <fcntl.h>
87 : :
88 : : #include "access/xact.h"
89 : : #include "access/xlog.h"
90 : : #include "catalog/pg_tablespace.h"
91 : : #include "common/file_perm.h"
92 : : #include "common/file_utils.h"
93 : : #include "common/pg_prng.h"
94 : : #include "miscadmin.h"
95 : : #include "pgstat.h"
96 : : #include "postmaster/startup.h"
97 : : #include "storage/aio.h"
98 : : #include "storage/fd.h"
99 : : #include "storage/ipc.h"
100 : : #include "utils/guc.h"
101 : : #include "utils/guc_hooks.h"
102 : : #include "utils/resowner.h"
103 : : #include "utils/varlena.h"
104 : :
105 : : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
106 : : #if defined(HAVE_SYNC_FILE_RANGE)
107 : : #define PG_FLUSH_DATA_WORKS 1
108 : : #elif !defined(WIN32) && defined(MS_ASYNC)
109 : : #define PG_FLUSH_DATA_WORKS 1
110 : : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
111 : : #define PG_FLUSH_DATA_WORKS 1
112 : : #endif
113 : :
114 : : /*
115 : : * We must leave some file descriptors free for system(), the dynamic loader,
116 : : * and other code that tries to open files without consulting fd.c. This
117 : : * is the number left free. (While we try fairly hard to prevent EMFILE
118 : : * errors, there's never any guarantee that we won't get ENFILE due to
119 : : * other processes chewing up FDs. So it's a bad idea to try to open files
120 : : * without consulting fd.c. Nonetheless we cannot control all code.)
121 : : *
122 : : * Because this is just a fixed setting, we are effectively assuming that
123 : : * no such code will leave FDs open over the long term; otherwise the slop
124 : : * is likely to be insufficient. Note in particular that we expect that
125 : : * loading a shared library does not result in any permanent increase in
126 : : * the number of open files. (This appears to be true on most if not
127 : : * all platforms as of Feb 2004.)
128 : : */
129 : : #define NUM_RESERVED_FDS 10
130 : :
131 : : /*
132 : : * If we have fewer than this many usable FDs after allowing for the reserved
133 : : * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134 : : * much less than that. Note that this value ensures numExternalFDs can be
135 : : * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136 : : * will not pass unless that can grow to at least 14.)
137 : : */
138 : : #define FD_MINFREE 48
139 : :
140 : : /*
141 : : * A number of platforms allow individual processes to open many more files
142 : : * than they can really support when *many* processes do the same thing.
143 : : * This GUC parameter lets the DBA limit max_safe_fds to something less than
144 : : * what the postmaster's initial probe suggests will work.
145 : : */
146 : : int max_files_per_process = 1000;
147 : :
148 : : /*
149 : : * Maximum number of file descriptors to open for operations that fd.c knows
150 : : * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151 : : * to a conservative value, and remains that way indefinitely in bootstrap or
152 : : * standalone-backend cases. In normal postmaster operation, the postmaster
153 : : * calls set_max_safe_fds() late in initialization to update the value, and
154 : : * that value is then inherited by forked subprocesses.
155 : : *
156 : : * Note: the value of max_files_per_process is taken into account while
157 : : * setting this variable, and so need not be tested separately.
158 : : */
159 : : int max_safe_fds = FD_MINFREE; /* default if not changed */
160 : :
161 : : /* Whether it is safe to continue running after fsync() fails. */
162 : : bool data_sync_retry = false;
163 : :
164 : : /* How SyncDataDirectory() should do its job. */
165 : : int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
166 : :
167 : : /* Which kinds of files should be opened with PG_O_DIRECT. */
168 : : int io_direct_flags;
169 : :
170 : : /* Debugging.... */
171 : :
172 : : #ifdef FDDEBUG
173 : : #define DO_DB(A) \
174 : : do { \
175 : : int _do_db_save_errno = errno; \
176 : : A; \
177 : : errno = _do_db_save_errno; \
178 : : } while (0)
179 : : #else
180 : : #define DO_DB(A) \
181 : : ((void) 0)
182 : : #endif
183 : :
184 : : #define VFD_CLOSED (-1)
185 : :
186 : : #define FileIsValid(file) \
187 : : ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
188 : :
189 : : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
190 : :
191 : : /* these are the assigned bits in fdstate below: */
192 : : #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
193 : : #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
194 : : #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
195 : :
/*
 * One entry of the virtual file descriptor cache (VfdCache).
 *
 * A 'File' value is an index into VfdCache, so the File-typed members below
 * are links to other entries of the same array.  An entry is in use when
 * fileName is non-NULL, and is backed by a real kernel descriptor only while
 * fd != VFD_CLOSED.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state (FD_* bits above) */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;	/* other direction of the same list */
	pgoff_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	mode_t		fileMode;		/* mode to pass to open(2) */
} Vfd;
210 : :
211 : : /*
212 : : * Virtual File Descriptor array pointer and size. This grows as
213 : : * needed. 'File' values are indexes into this array.
214 : : * Note that VfdCache[0] is not a usable VFD, just a list header.
215 : : */
216 : : static Vfd *VfdCache;
217 : : static Size SizeVfdCache = 0;
218 : :
219 : : /*
220 : : * Number of file descriptors known to be in use by VFD entries.
221 : : */
222 : : static int nfile = 0;
223 : :
224 : : /*
225 : : * Flag to tell whether it's worth scanning VfdCache looking for temp files
226 : : * to close
227 : : */
228 : : static bool have_xact_temporary_files = false;
229 : :
230 : : /*
231 : : * Tracks the total size of all temporary files. Note: when temp_file_limit
232 : : * is being enforced, this cannot overflow since the limit cannot be more
233 : : * than INT_MAX kilobytes. When not enforcing, it could theoretically
234 : : * overflow, but we don't care.
235 : : */
236 : : static uint64 temporary_files_size = 0;
237 : :
238 : : /* Temporary file access initialized and not yet shut down? */
239 : : #ifdef USE_ASSERT_CHECKING
240 : : static bool temporary_files_allowed = false;
241 : : #endif
242 : :
243 : : /*
244 : : * List of OS handles opened with AllocateFile, AllocateDir and
245 : : * OpenTransientFile.
246 : : */
/*
 * Discriminator for AllocateDesc: tells which member of the 'desc' union is
 * meaningful and, presumably, which close routine applies (TODO confirm
 * against FreeDesc, which is not visible here).
 */
typedef enum
{
	AllocateDescFile,			/* FILE * handle (desc.file) */
	AllocateDescPipe,			/* pipe stream -- presumably from
								 * OpenPipeStream, also held in desc.file */
	AllocateDescDir,			/* DIR * handle (desc.dir) */
	AllocateDescRawFD,			/* plain kernel descriptor (desc.fd) */
} AllocateDescKind;
254 : :
/*
 * Bookkeeping record for one OS-level handle handed out by the
 * AllocateFile/AllocateDir/OpenTransientFile family.
 */
typedef struct
{
	AllocateDescKind kind;		/* which union member below is valid */
	SubTransactionId create_subid;	/* subtransaction the handle is
									 * associated with (presumably the one
									 * that opened it -- TODO confirm) */
	union
	{
		FILE	   *file;		/* stdio stream */
		DIR		   *dir;		/* directory stream */
		int			fd;			/* raw file descriptor */
	}			desc;
} AllocateDesc;
266 : :
267 : : static int numAllocatedDescs = 0;
268 : : static int maxAllocatedDescs = 0;
269 : : static AllocateDesc *allocatedDescs = NULL;
270 : :
271 : : /*
272 : : * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273 : : */
274 : : static int numExternalFDs = 0;
275 : :
276 : : /*
277 : : * Number of temporary files opened during the current session;
278 : : * this is used in generation of tempfile names.
279 : : */
280 : : static long tempFileCounter = 0;
281 : :
282 : : /*
283 : : * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284 : : * indicating that the current database's default tablespace should be used.)
285 : : * When numTempTableSpaces is -1, this has not been set in the current
286 : : * transaction.
287 : : */
288 : : static Oid *tempTableSpaces = NULL;
289 : : static int numTempTableSpaces = -1;
290 : : static int nextTempTableSpace = 0;
291 : :
292 : :
293 : : /*--------------------
294 : : *
295 : : * Private Routines
296 : : *
297 : : * Delete - delete a file from the Lru ring
298 : : * LruDelete - remove a file from the Lru ring and close its FD
299 : : * Insert - put a file at the front of the Lru ring
300 : : * LruInsert - put a file at the front of the Lru ring and open it
301 : : * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302 : : * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303 : : * AllocateVfd - grab a free (or new) file record (from VfdCache)
304 : : * FreeVfd - free a file record
305 : : *
306 : : * The Least Recently Used ring is a doubly linked list that begins and
307 : : * ends on element zero. Element zero is special -- it doesn't represent
308 : : * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309 : : * anchor that shows us the beginning/end of the ring.
310 : : * Only VFD elements that are currently really open (have an FD assigned) are
311 : : * in the Lru ring. Elements that are "virtually" open can be recognized
312 : : * by having a non-null fileName field.
313 : : *
314 : : * example:
315 : : *
316 : : * /--less----\ /---------\
317 : : * v \ v \
318 : : * #0 --more---> LeastRecentlyUsed --more-\ \
319 : : * ^\ | |
320 : : * \\less--> MostRecentlyUsedFile <---/ |
321 : : * \more---/ \--less--/
322 : : *
323 : : *--------------------
324 : : */
325 : : static void Delete(File file);
326 : : static void LruDelete(File file);
327 : : static void Insert(File file);
328 : : static int LruInsert(File file);
329 : : static bool ReleaseLruFile(void);
330 : : static void ReleaseLruFiles(void);
331 : : static File AllocateVfd(void);
332 : : static void FreeVfd(File file);
333 : :
334 : : static int FileAccess(File file);
335 : : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
336 : : static bool reserveAllocatedDesc(void);
337 : : static int FreeDesc(AllocateDesc *desc);
338 : :
339 : : static void BeforeShmemExit_Files(int code, Datum arg);
340 : : static void CleanupTempFiles(bool isCommit, bool isProcExit);
341 : : static void RemovePgTempRelationFiles(const char *tsdirname);
342 : : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
343 : :
344 : : static void walkdir(const char *path,
345 : : void (*action) (const char *fname, bool isdir, int elevel),
346 : : bool process_symlinks,
347 : : int elevel);
348 : : #ifdef PG_FLUSH_DATA_WORKS
349 : : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350 : : #endif
351 : : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352 : : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353 : :
354 : : static int fsync_parent_path(const char *fname, int elevel);
355 : :
356 : :
357 : : /* ResourceOwner callbacks to hold virtual file descriptors */
358 : : static void ResOwnerReleaseFile(Datum res);
359 : : static char *ResOwnerPrintFile(Datum res);
360 : :
/*
 * ResourceOwner descriptor for virtual file descriptors.  It is the 'kind'
 * argument passed to ResourceOwnerRemember/ResourceOwnerForget by the
 * wrappers below, and names the callbacks used to release or print a
 * remembered File.
 */
static const ResourceOwnerDesc file_resowner_desc =
{
	.name = "File",
	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
	.release_priority = RELEASE_PRIO_FILES,
	.ReleaseResource = ResOwnerReleaseFile,
	.DebugPrint = ResOwnerPrintFile
};
369 : :
370 : : /* Convenience wrappers over ResourceOwnerRemember/Forget */
371 : : static inline void
770 heikki.linnakangas@i 372 :CBC 4272 : ResourceOwnerRememberFile(ResourceOwner owner, File file)
373 : : {
374 : 4272 : ResourceOwnerRemember(owner, Int32GetDatum(file), &file_resowner_desc);
375 : 4272 : }
376 : : static inline void
377 : 4268 : ResourceOwnerForgetFile(ResourceOwner owner, File file)
378 : : {
379 : 4268 : ResourceOwnerForget(owner, Int32GetDatum(file), &file_resowner_desc);
380 : 4268 : }
381 : :
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when the platform offers it and
 * wal_sync_method asks for it; otherwise to pg_fsync_no_writethrough().
 * Returns whatever the chosen routine returns.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * fsync() implementations differ in which access modes they accept, and
	 * the rules differ between regular files and directories.  To keep
	 * callers portable, insist (in assert-enabled builds) on the combination
	 * that works everywhere: directories must have been opened O_RDONLY, and
	 * anything else must have been opened with write access.  The check runs
	 * even when fsync itself is disabled.
	 *
	 * A failing fstat() is deliberately ignored here; the fsync call below
	 * will report the problem.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL) & O_ACCMODE;

		if (!S_ISDIR(statbuf.st_mode))
			Assert(desc_flags != O_RDONLY);
		else
			Assert(desc_flags == O_RDONLY);
	}

	/* don't let the probing above leak an errno value to the caller */
	errno = 0;
#endif

	/* the wal_sync_method test is compiled only where it can matter */
#if defined(HAVE_FSYNC_WRITETHROUGH)
	if (wal_sync_method == WAL_SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
431 : :
432 : :
433 : : /*
434 : : * pg_fsync_no_writethrough --- same as fsync except does nothing if
435 : : * enableFsync is off
436 : : */
437 : : int
7516 bruce@momjian.us 438 : 64390 : pg_fsync_no_writethrough(int fd)
439 : : {
440 : : int rc;
441 : :
912 andres@anarazel.de 442 [ - + ]: 64390 : if (!enableFsync)
9140 tgl@sss.pgh.pa.us 443 : 64390 : return 0;
444 : :
912 andres@anarazel.de 445 :UBC 0 : retry:
446 : 0 : rc = fsync(fd);
447 : :
448 [ # # # # ]: 0 : if (rc == -1 && errno == EINTR)
449 : 0 : goto retry;
450 : :
451 : 0 : return rc;
452 : : }
453 : :
454 : : /*
455 : : * pg_fsync_writethrough
456 : : */
457 : : int
7516 bruce@momjian.us 458 : 0 : pg_fsync_writethrough(int fd)
459 : : {
460 [ # # ]: 0 : if (enableFsync)
461 : : {
462 : : #if defined(F_FULLFSYNC)
463 : : return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
464 : : #else
5777 tgl@sss.pgh.pa.us 465 : 0 : errno = ENOSYS;
7516 bruce@momjian.us 466 : 0 : return -1;
467 : : #endif
468 : : }
469 : : else
470 : 0 : return 0;
471 : : }
472 : :
473 : : /*
474 : : * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
475 : : */
476 : : int
9068 tgl@sss.pgh.pa.us 477 : 0 : pg_fdatasync(int fd)
478 : : {
479 : : int rc;
480 : :
912 andres@anarazel.de 481 [ # # ]: 0 : if (!enableFsync)
9068 tgl@sss.pgh.pa.us 482 : 0 : return 0;
483 : :
912 andres@anarazel.de 484 : 0 : retry:
485 : 0 : rc = fdatasync(fd);
486 : :
487 [ # # # # ]: 0 : if (rc == -1 && errno == EINTR)
488 : 0 : goto retry;
489 : :
490 : 0 : return rc;
491 : : }
492 : :
493 : : /*
494 : : * pg_file_exists -- check that a file exists.
495 : : *
496 : : * This requires an absolute path to the file. Returns true if the file is
497 : : * not a directory, false otherwise.
498 : : */
499 : : bool
705 michael@paquier.xyz 500 :CBC 19241 : pg_file_exists(const char *name)
501 : : {
502 : : struct stat st;
503 : :
504 [ - + ]: 19241 : Assert(name != NULL);
505 : :
506 [ + + ]: 19241 : if (stat(name, &st) == 0)
507 : 9726 : return !S_ISDIR(st.st_mode);
508 [ - + - - : 9515 : else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
- - ]
705 michael@paquier.xyz 509 [ # # ]:UBC 0 : ereport(ERROR,
510 : : (errcode_for_file_access(),
511 : : errmsg("could not access file \"%s\": %m", name)));
512 : :
705 michael@paquier.xyz 513 :CBC 9515 : return false;
514 : : }
515 : :
516 : : /*
517 : : * pg_flush_data --- advise OS that the described dirty data should be flushed
518 : : *
519 : : * offset of 0 with nbytes 0 means that the entire file should be flushed
520 : : */
521 : : void
34 michael@paquier.xyz 522 :GNC 38279 : pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
523 : : {
524 : : /*
525 : : * Right now file flushing is primarily used to avoid making later
526 : : * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
527 : : * if fsyncs are disabled - that's a decision we might want to make
528 : : * configurable at some point.
529 : : */
3589 andres@anarazel.de 530 [ + - ]:CBC 38279 : if (!enableFsync)
531 : 38279 : return;
532 : :
533 : : /*
534 : : * We compile all alternatives that are supported on the current platform,
535 : : * to find portability problems more easily.
536 : : */
537 : : #if defined(HAVE_SYNC_FILE_RANGE)
538 : : {
539 : : int rc;
540 : : static bool not_implemented_by_kernel = false;
541 : :
2488 tmunro@postgresql.or 542 [ # # ]:UBC 0 : if (not_implemented_by_kernel)
543 : 0 : return;
544 : :
912 andres@anarazel.de 545 : 0 : retry:
546 : :
547 : : /*
548 : : * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
549 : : * tells the OS that writeback for the specified blocks should be
550 : : * started, but that we don't want to wait for completion. Note that
551 : : * this call might block if too much dirty data exists in the range.
552 : : * This is the preferable method on OSs supporting it, as it works
553 : : * reliably when available (contrast to msync()) and doesn't flush out
554 : : * clean data (like FADV_DONTNEED).
555 : : */
3589 556 : 0 : rc = sync_file_range(fd, offset, nbytes,
557 : : SYNC_FILE_RANGE_WRITE);
558 [ # # ]: 0 : if (rc != 0)
559 : : {
560 : : int elevel;
561 : :
912 562 [ # # ]: 0 : if (rc == EINTR)
563 : 0 : goto retry;
564 : :
565 : : /*
566 : : * For systems that don't have an implementation of
567 : : * sync_file_range() such as Windows WSL, generate only one
568 : : * warning and then suppress all further attempts by this process.
569 : : */
2488 tmunro@postgresql.or 570 [ # # ]: 0 : if (errno == ENOSYS)
571 : : {
572 : 0 : elevel = WARNING;
573 : 0 : not_implemented_by_kernel = true;
574 : : }
575 : : else
576 : 0 : elevel = data_sync_elevel(WARNING);
577 : :
578 [ # # ]: 0 : ereport(elevel,
579 : : (errcode_for_file_access(),
580 : : errmsg("could not flush dirty data: %m")));
581 : : }
582 : :
3589 andres@anarazel.de 583 : 0 : return;
584 : : }
585 : : #endif
586 : : #if !defined(WIN32) && defined(MS_ASYNC)
587 : : {
588 : : void *p;
589 : : static int pagesize = 0;
590 : :
591 : : /*
592 : : * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
593 : : * writeback. On linux it only does so if MS_SYNC is specified, but
594 : : * then it does the writeback synchronously. Luckily all common linux
595 : : * systems have sync_file_range(). This is preferable over
596 : : * FADV_DONTNEED because it doesn't flush out clean data.
597 : : *
598 : : * We map the file (mmap()), tell the kernel to sync back the contents
599 : : * (msync()), and then remove the mapping again (munmap()).
600 : : */
601 : :
602 : : /* mmap() needs actual length if we want to map whole file */
603 : : if (offset == 0 && nbytes == 0)
604 : : {
605 : : nbytes = lseek(fd, 0, SEEK_END);
606 : : if (nbytes < 0)
607 : : {
608 : : ereport(WARNING,
609 : : (errcode_for_file_access(),
610 : : errmsg("could not determine dirty data size: %m")));
611 : : return;
612 : : }
613 : : }
614 : :
615 : : /*
616 : : * Some platforms reject partial-page mmap() attempts. To deal with
617 : : * that, just truncate the request to a page boundary. If any extra
618 : : * bytes don't get flushed, well, it's only a hint anyway.
619 : : */
620 : :
621 : : /* fetch pagesize only once */
622 : : if (pagesize == 0)
623 : : pagesize = sysconf(_SC_PAGESIZE);
624 : :
625 : : /* align length to pagesize, dropping any fractional page */
626 : : if (pagesize > 0)
627 : : nbytes = (nbytes / pagesize) * pagesize;
628 : :
629 : : /* fractional-page request is a no-op */
630 : : if (nbytes <= 0)
631 : : return;
632 : :
633 : : /*
634 : : * mmap could well fail, particularly on 32-bit platforms where there
635 : : * may simply not be enough address space. If so, silently fall
636 : : * through to the next implementation.
637 : : */
638 : : if (nbytes <= (pgoff_t) SSIZE_MAX)
639 : : p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
640 : : else
641 : : p = MAP_FAILED;
642 : :
643 : : if (p != MAP_FAILED)
644 : : {
645 : : int rc;
646 : :
647 : : rc = msync(p, (size_t) nbytes, MS_ASYNC);
648 : : if (rc != 0)
649 : : {
650 : : ereport(data_sync_elevel(WARNING),
651 : : (errcode_for_file_access(),
652 : : errmsg("could not flush dirty data: %m")));
653 : : /* NB: need to fall through to munmap()! */
654 : : }
655 : :
656 : : rc = munmap(p, (size_t) nbytes);
657 : : if (rc != 0)
658 : : {
659 : : /* FATAL error because mapping would remain */
660 : : ereport(FATAL,
661 : : (errcode_for_file_access(),
662 : : errmsg("could not munmap() while flushing data: %m")));
663 : : }
664 : :
665 : : return;
666 : : }
667 : : }
668 : : #endif
669 : : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
670 : : {
671 : : int rc;
672 : :
673 : : /*
674 : : * Signal the kernel that the passed in range should not be cached
675 : : * anymore. This has the, desired, side effect of writing out dirty
676 : : * data, and the, undesired, side effect of likely discarding useful
677 : : * clean cached blocks. For the latter reason this is the least
678 : : * preferable method.
679 : : */
680 : :
681 : : rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
682 : :
683 : : if (rc != 0)
684 : : {
685 : : /* don't error out, this is just a performance optimization */
686 : : ereport(WARNING,
687 : : (errcode_for_file_access(),
688 : : errmsg("could not flush dirty data: %m")));
689 : : }
690 : :
691 : : return;
692 : : }
693 : : #endif
694 : : }
695 : :
696 : : /*
697 : : * Truncate an open file to a given length.
698 : : */
699 : : static int
34 michael@paquier.xyz 700 :GNC 561 : pg_ftruncate(int fd, pgoff_t length)
701 : : {
702 : : int ret;
703 : :
912 andres@anarazel.de 704 :CBC 561 : retry:
705 : 561 : ret = ftruncate(fd, length);
706 : :
707 [ - + - - ]: 561 : if (ret == -1 && errno == EINTR)
912 andres@anarazel.de 708 :UBC 0 : goto retry;
709 : :
912 andres@anarazel.de 710 :CBC 561 : return ret;
711 : : }
712 : :
713 : : /*
714 : : * Truncate a file to a given length by name.
715 : : */
716 : : int
34 michael@paquier.xyz 717 :GNC 224908 : pg_truncate(const char *path, pgoff_t length)
718 : : {
719 : : int ret;
720 : : #ifdef WIN32
721 : : int save_errno;
722 : : int fd;
723 : :
724 : : fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
725 : : if (fd >= 0)
726 : : {
727 : : ret = pg_ftruncate(fd, length);
728 : : save_errno = errno;
729 : : CloseTransientFile(fd);
730 : : errno = save_errno;
731 : : }
732 : : else
733 : : ret = -1;
734 : : #else
735 : :
912 andres@anarazel.de 736 :CBC 224908 : retry:
737 : 224908 : ret = truncate(path, length);
738 : :
739 [ + + - + ]: 224908 : if (ret == -1 && errno == EINTR)
912 andres@anarazel.de 740 :UBC 0 : goto retry;
741 : : #endif
742 : :
912 andres@anarazel.de 743 :CBC 224908 : return ret;
744 : : }
745 : :
746 : : /*
747 : : * fsync_fname -- fsync a file or directory, handling errors properly
748 : : *
749 : : * Try to fsync a file or directory. When doing the latter, ignore errors that
750 : : * indicate the OS just doesn't allow/require fsyncing directories.
751 : : */
752 : : void
3570 753 : 20781 : fsync_fname(const char *fname, bool isdir)
754 : : {
2585 tmunro@postgresql.or 755 : 20781 : fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
3570 andres@anarazel.de 756 : 20781 : }
757 : :
758 : : /*
759 : : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
760 : : *
761 : : * This routine ensures that, after returning, the effect of renaming file
762 : : * persists in case of a crash. A crash while this routine is running will
763 : : * leave you with either the pre-existing or the moved file in place of the
764 : : * new file; no mixed state or truncated files are possible.
765 : : *
766 : : * It does so by using fsync on the old filename and the possibly existing
767 : : * target filename before the rename, and the target file and directory after.
768 : : *
769 : : * Note that rename() cannot be used across arbitrary directories, as they
770 : : * might not be on the same filesystem. Therefore this routine does not
771 : : * support renaming across directories.
772 : : *
773 : : * Log errors with the caller specified severity.
774 : : *
775 : : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
776 : : * valid upon return.
777 : : */
778 : : int
779 : 6314 : durable_rename(const char *oldfile, const char *newfile, int elevel)
780 : : {
781 : : int fd;
782 : :
783 : : /*
784 : : * First fsync the old and target path (if it exists), to ensure that they
785 : : * are properly persistent on disk. Syncing the target file is not
786 : : * strictly necessary, but it makes it easier to reason about crashes;
787 : : * because it's then guaranteed that either source or target file exists
788 : : * after a crash.
789 : : */
790 [ - + ]: 6314 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
3570 andres@anarazel.de 791 :UBC 0 : return -1;
792 : :
3007 peter_e@gmx.net 793 :CBC 6314 : fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
3570 andres@anarazel.de 794 [ + + ]: 6314 : if (fd < 0)
795 : : {
796 [ - + ]: 4387 : if (errno != ENOENT)
797 : : {
3570 andres@anarazel.de 798 [ # # ]:UBC 0 : ereport(elevel,
799 : : (errcode_for_file_access(),
800 : : errmsg("could not open file \"%s\": %m", newfile)));
801 : 0 : return -1;
802 : : }
803 : : }
804 : : else
805 : : {
3570 andres@anarazel.de 806 [ - + ]:CBC 1927 : if (pg_fsync(fd) != 0)
807 : : {
808 : : int save_errno;
809 : :
810 : : /* close file upon error, might not be in transaction context */
3570 andres@anarazel.de 811 :UBC 0 : save_errno = errno;
812 : 0 : CloseTransientFile(fd);
813 : 0 : errno = save_errno;
814 : :
815 [ # # ]: 0 : ereport(elevel,
816 : : (errcode_for_file_access(),
817 : : errmsg("could not fsync file \"%s\": %m", newfile)));
818 : 0 : return -1;
819 : : }
820 : :
2356 peter@eisentraut.org 821 [ - + ]:CBC 1927 : if (CloseTransientFile(fd) != 0)
822 : : {
2475 michael@paquier.xyz 823 [ # # ]:UBC 0 : ereport(elevel,
824 : : (errcode_for_file_access(),
825 : : errmsg("could not close file \"%s\": %m", newfile)));
826 : 0 : return -1;
827 : : }
828 : : }
829 : :
830 : : /* Time to do the real deal... */
3570 andres@anarazel.de 831 [ - + ]:CBC 6314 : if (rename(oldfile, newfile) < 0)
832 : : {
3570 andres@anarazel.de 833 [ # # ]:UBC 0 : ereport(elevel,
834 : : (errcode_for_file_access(),
835 : : errmsg("could not rename file \"%s\" to \"%s\": %m",
836 : : oldfile, newfile)));
837 : 0 : return -1;
838 : : }
839 : :
840 : : /*
841 : : * To guarantee renaming the file is persistent, fsync the file with its
842 : : * new name, and its containing directory.
843 : : */
3570 andres@anarazel.de 844 [ - + ]:CBC 6314 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
3570 andres@anarazel.de 845 :UBC 0 : return -1;
846 : :
3570 andres@anarazel.de 847 [ - + ]:CBC 6314 : if (fsync_parent_path(newfile, elevel) != 0)
3570 andres@anarazel.de 848 :UBC 0 : return -1;
849 : :
3570 andres@anarazel.de 850 :CBC 6314 : return 0;
851 : : }
852 : :
853 : : /*
854 : : * durable_unlink -- remove a file in a durable manner
855 : : *
856 : : * This routine ensures that, after returning, the effect of removing file
857 : : * persists in case of a crash. A crash while this routine is running will
858 : : * not leave the system in a mixed state.
859 : : *
860 : : * It does so by using fsync on the parent directory of the file after the
861 : : * actual removal is done.
862 : : *
863 : : * Log errors with the severity specified by caller.
864 : : *
865 : : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
866 : : * valid upon return.
867 : : */
int
durable_unlink(const char *fname, int elevel)
{
	/* Remove the directory entry; on failure, report at the caller's level. */
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * The removal is only persistent once the parent directory has been
	 * fsync'd.  fsync_parent_path() is given the same elevel; any nonzero
	 * result from it is mapped to -1.
	 */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
889 : :
890 : : /*
891 : : * InitFileAccess --- initialize this module during backend startup
892 : : *
893 : : * This is called during either normal or standalone backend start.
894 : : * It is *not* called in the postmaster.
895 : : *
896 : : * Note that this does not initialize temporary file access, that is
897 : : * separately initialized via InitTemporaryFileAccess().
898 : : */
899 : : void
7436 tgl@sss.pgh.pa.us 900 : 19580 : InitFileAccess(void)
901 : : {
7368 bruce@momjian.us 902 [ - + ]: 19580 : Assert(SizeVfdCache == 0); /* call me only once */
903 : :
904 : : /* initialize cache header entry */
7436 tgl@sss.pgh.pa.us 905 : 19580 : VfdCache = (Vfd *) malloc(sizeof(Vfd));
906 [ - + ]: 19580 : if (VfdCache == NULL)
7436 tgl@sss.pgh.pa.us 907 [ # # ]:UBC 0 : ereport(FATAL,
908 : : (errcode(ERRCODE_OUT_OF_MEMORY),
909 : : errmsg("out of memory")));
910 : :
308 peter@eisentraut.org 911 [ + - + - :CBC 156640 : MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
+ - + - +
+ ]
7436 tgl@sss.pgh.pa.us 912 : 19580 : VfdCache->fd = VFD_CLOSED;
913 : :
914 : 19580 : SizeVfdCache = 1;
1593 andres@anarazel.de 915 : 19580 : }
916 : :
917 : : /*
918 : : * InitTemporaryFileAccess --- initialize temporary file access during startup
919 : : *
920 : : * This is called during either normal or standalone backend start.
921 : : * It is *not* called in the postmaster.
922 : : *
923 : : * This is separate from InitFileAccess() because temporary file cleanup can
924 : : * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
925 : : * our reporting has to happen before that. Low level file access should be
926 : : * available for longer, hence the separate initialization / shutdown of
927 : : * temporary file handling.
928 : : */
929 : : void
930 : 19580 : InitTemporaryFileAccess(void)
931 : : {
1343 drowley@postgresql.o 932 [ - + ]: 19580 : Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
1593 andres@anarazel.de 933 [ - + ]: 19580 : Assert(!temporary_files_allowed); /* call me only once */
934 : :
935 : : /*
936 : : * Register before-shmem-exit hook to ensure temp files are dropped while
937 : : * we can still report stats.
938 : : */
939 : 19580 : before_shmem_exit(BeforeShmemExit_Files, 0);
940 : :
941 : : #ifdef USE_ASSERT_CHECKING
942 : 19580 : temporary_files_allowed = true;
943 : : #endif
7436 tgl@sss.pgh.pa.us 944 : 19580 : }
945 : :
946 : : /*
947 : : * count_usable_fds --- count how many FDs the system will let us open,
948 : : * and estimate how many are already open.
949 : : *
950 : : * We stop counting if usable_fds reaches max_to_probe. Note: a small
951 : : * value of max_to_probe might result in an underestimate of already_open;
952 : : * we must fill in any "gaps" in the set of used FDs before the calculation
953 : : * of already_open will give the right answer. In practice, max_to_probe
954 : : * of a couple of dozen should be enough to ensure good results.
955 : : *
956 : : * We assume stderr (FD 2) is available for dup'ing. While the calling
957 : : * script could theoretically close that, it would be a really bad idea,
958 : : * since then one risks loss of error messages from, e.g., libc.
959 : : */
960 : : static void
7437 961 : 1065 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
962 : : {
963 : : int *fd;
964 : : int size;
7968 965 : 1065 : int used = 0;
966 : 1065 : int highestfd = 0;
967 : : int j;
968 : :
969 : : #ifdef HAVE_GETRLIMIT
970 : : struct rlimit rlim;
971 : : int getrlimit_status;
972 : : #endif
973 : :
974 : 1065 : size = 1024;
975 : 1065 : fd = (int *) palloc(size * sizeof(int));
976 : :
977 : : #ifdef HAVE_GETRLIMIT
6132 peter_e@gmx.net 978 : 1065 : getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
979 [ - + ]: 1065 : if (getrlimit_status != 0)
6132 peter_e@gmx.net 980 [ # # ]:UBC 0 : ereport(WARNING, (errmsg("getrlimit failed: %m")));
981 : : #endif /* HAVE_GETRLIMIT */
982 : :
983 : : /* dup until failure or probe limit reached */
984 : : for (;;)
7968 tgl@sss.pgh.pa.us 985 :CBC 1063935 : {
986 : : int thisfd;
987 : :
988 : : #ifdef HAVE_GETRLIMIT
989 : :
990 : : /*
991 : : * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
992 : : * some platforms
993 : : */
6132 peter_e@gmx.net 994 [ + - - + ]: 1065000 : if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
6132 peter_e@gmx.net 995 :UBC 0 : break;
996 : : #endif
997 : :
1567 tgl@sss.pgh.pa.us 998 :CBC 1065000 : thisfd = dup(2);
7968 999 [ - + ]: 1065000 : if (thisfd < 0)
1000 : : {
1001 : : /* Expect EMFILE or ENFILE, else it's fishy */
7968 tgl@sss.pgh.pa.us 1002 [ # # # # ]:UBC 0 : if (errno != EMFILE && errno != ENFILE)
1567 1003 [ # # ]: 0 : elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
7968 1004 : 0 : break;
1005 : : }
1006 : :
7968 tgl@sss.pgh.pa.us 1007 [ - + ]:CBC 1065000 : if (used >= size)
1008 : : {
7968 tgl@sss.pgh.pa.us 1009 :UBC 0 : size *= 2;
1010 : 0 : fd = (int *) repalloc(fd, size * sizeof(int));
1011 : : }
7968 tgl@sss.pgh.pa.us 1012 :CBC 1065000 : fd[used++] = thisfd;
1013 : :
1014 [ + - ]: 1065000 : if (highestfd < thisfd)
1015 : 1065000 : highestfd = thisfd;
1016 : :
7437 1017 [ + + ]: 1065000 : if (used >= max_to_probe)
1018 : 1065 : break;
1019 : : }
1020 : :
1021 : : /* release the files we opened */
7968 1022 [ + + ]: 1066065 : for (j = 0; j < used; j++)
1023 : 1065000 : close(fd[j]);
1024 : :
1025 : 1065 : pfree(fd);
1026 : :
1027 : : /*
1028 : : * Return results. usable_fds is just the number of successful dups. We
1029 : : * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1030 : : * number) and so already_open is highestfd+1 - usable_fds.
1031 : : */
1032 : 1065 : *usable_fds = used;
7780 bruce@momjian.us 1033 : 1065 : *already_open = highestfd + 1 - used;
7968 tgl@sss.pgh.pa.us 1034 : 1065 : }
1035 : :
1036 : : /*
1037 : : * set_max_safe_fds
1038 : : * Determine number of file descriptors that fd.c is allowed to use
1039 : : */
1040 : : void
1041 : 1065 : set_max_safe_fds(void)
1042 : : {
1043 : : int usable_fds;
1044 : : int already_open;
1045 : :
1046 : : /*----------
1047 : : * We want to set max_safe_fds to
1048 : : * MIN(usable_fds, max_files_per_process)
1049 : : * less the slop factor for files that are opened without consulting
1050 : : * fd.c. This ensures that we won't allow to open more than
1051 : : * max_files_per_process, or the experimentally-determined EMFILE limit,
1052 : : * additional files.
1053 : : *----------
1054 : : */
7437 1055 : 1065 : count_usable_fds(max_files_per_process,
1056 : : &usable_fds, &already_open);
1057 : :
268 andres@anarazel.de 1058 : 1065 : max_safe_fds = Min(usable_fds, max_files_per_process);
1059 : :
1060 : : /*
1061 : : * Take off the FDs reserved for system() etc.
1062 : : */
7968 tgl@sss.pgh.pa.us 1063 : 1065 : max_safe_fds -= NUM_RESERVED_FDS;
1064 : :
1065 : : /*
1066 : : * Make sure we still have enough to get by.
1067 : : */
1068 [ - + ]: 1065 : if (max_safe_fds < FD_MINFREE)
7968 tgl@sss.pgh.pa.us 1069 [ # # ]:UBC 0 : ereport(FATAL,
1070 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1071 : : errmsg("insufficient file descriptors available to start server process"),
1072 : : errdetail("System allows %d, server needs at least %d, %d files are already open.",
1073 : : max_safe_fds + NUM_RESERVED_FDS,
1074 : : FD_MINFREE + NUM_RESERVED_FDS,
1075 : : already_open)));
1076 : :
7968 tgl@sss.pgh.pa.us 1077 [ + + ]:CBC 1065 : elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1078 : : max_safe_fds, usable_fds, already_open);
1079 : 1065 : }
1080 : :
1081 : : /*
1082 : : * Open a file with BasicOpenFilePerm() and pass default file mode for the
1083 : : * fileMode parameter.
1084 : : */
1085 : : int
3007 peter_e@gmx.net 1086 : 39081 : BasicOpenFile(const char *fileName, int fileFlags)
1087 : : {
2811 sfrost@snowman.net 1088 : 39081 : return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1089 : : }
1090 : :
1091 : : /*
1092 : : * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1093 : : *
1094 : : * This is exported for use by places that really want a plain kernel FD,
1095 : : * but need to be proof against running out of FDs. Once an FD has been
1096 : : * successfully returned, it is the caller's responsibility to ensure that
1097 : : * it will not be leaked on ereport()! Most users should *not* call this
1098 : : * routine directly, but instead use the VFD abstraction level, which
1099 : : * provides protection against descriptor leaks as well as management of
1100 : : * files that need to be open for more than a short period of time.
1101 : : *
1102 : : * Ideally this should be the *only* direct call of open() in the backend.
1103 : : * In practice, the postmaster calls open() directly, and there are some
1104 : : * direct open() calls done early in backend startup. Those are OK since
1105 : : * this module wouldn't have any open files to close at that point anyway.
1106 : : */
1107 : : int
3007 peter_e@gmx.net 1108 : 1748439 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1109 : : {
1110 : : int fd;
1111 : :
9329 tgl@sss.pgh.pa.us 1112 : 1748439 : tryAgain:
1113 : : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1114 : : fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1115 : : #else
1116 : 1748439 : fd = open(fileName, fileFlags, fileMode);
1117 : : #endif
1118 : :
1119 [ + + ]: 1748439 : if (fd >= 0)
1120 : : {
1121 : : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1122 : : if (fileFlags & PG_O_DIRECT)
1123 : : {
1124 : : if (fcntl(fd, F_NOCACHE, 1) < 0)
1125 : : {
1126 : : int save_errno = errno;
1127 : :
1128 : : close(fd);
1129 : : errno = save_errno;
1130 : : return -1;
1131 : : }
1132 : : }
1133 : : #endif
1134 : :
1135 : 1343967 : return fd; /* success! */
1136 : : }
1137 : :
9243 1138 [ + - - + ]: 404472 : if (errno == EMFILE || errno == ENFILE)
1139 : : {
9036 bruce@momjian.us 1140 :UBC 0 : int save_errno = errno;
1141 : :
8182 tgl@sss.pgh.pa.us 1142 [ # # ]: 0 : ereport(LOG,
1143 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1144 : : errmsg("out of file descriptors: %m; release and retry")));
9329 1145 : 0 : errno = 0;
9243 1146 [ # # ]: 0 : if (ReleaseLruFile())
1147 : 0 : goto tryAgain;
1148 : 0 : errno = save_errno;
1149 : : }
1150 : :
9329 tgl@sss.pgh.pa.us 1151 :CBC 404472 : return -1; /* failure */
1152 : : }
1153 : :
1154 : : /*
1155 : : * AcquireExternalFD - attempt to reserve an external file descriptor
1156 : : *
1157 : : * This should be used by callers that need to hold a file descriptor open
1158 : : * over more than a short interval, but cannot use any of the other facilities
1159 : : * provided by this module.
1160 : : *
1161 : : * The difference between this and the underlying ReserveExternalFD function
1162 : : * is that this will report failure (by setting errno and returning false)
1163 : : * if "too many" external FDs are already reserved. This should be used in
1164 : : * any code where the total number of FDs to be reserved is not predictable
1165 : : * and small.
1166 : : */
1167 : : bool
2123 1168 : 124137 : AcquireExternalFD(void)
1169 : : {
1170 : : /*
1171 : : * We don't want more than max_safe_fds / 3 FDs to be consumed for
1172 : : * "external" FDs.
1173 : : */
1174 [ + - ]: 124137 : if (numExternalFDs < max_safe_fds / 3)
1175 : : {
1176 : 124137 : ReserveExternalFD();
1177 : 124137 : return true;
1178 : : }
2123 tgl@sss.pgh.pa.us 1179 :UBC 0 : errno = EMFILE;
1180 : 0 : return false;
1181 : : }
1182 : :
1183 : : /*
1184 : : * ReserveExternalFD - report external consumption of a file descriptor
1185 : : *
1186 : : * This should be used by callers that need to hold a file descriptor open
1187 : : * over more than a short interval, but cannot use any of the other facilities
1188 : : * provided by this module. This just tracks the use of the FD and closes
1189 : : * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1190 : : *
1191 : : * Call this directly only in code where failure to reserve the FD would be
1192 : : * fatal; for example, the WAL-writing code does so, since the alternative is
1193 : : * session failure. Also, it's very unwise to do so in code that could
1194 : : * consume more than one FD per process.
1195 : : *
1196 : : * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1197 : : * available, it doesn't matter too much whether this is called before or
1198 : : * after actually opening the FD; but doing so beforehand reduces the risk of
1199 : : * an EMFILE failure if not everybody played nice. In any case, it's solely
1200 : : * caller's responsibility to keep the external-FD count in sync with reality.
1201 : : */
1202 : : void
2123 tgl@sss.pgh.pa.us 1203 :CBC 195955 : ReserveExternalFD(void)
1204 : : {
1205 : : /*
1206 : : * Release VFDs if needed to stay safe. Because we do this before
1207 : : * incrementing numExternalFDs, the final state will be as desired, i.e.,
1208 : : * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1209 : : */
1210 : 195955 : ReleaseLruFiles();
1211 : :
1212 : 195955 : numExternalFDs++;
1213 : 195955 : }
1214 : :
1215 : : /*
1216 : : * ReleaseExternalFD - report release of an external file descriptor
1217 : : *
1218 : : * This is guaranteed not to change errno, so it can be used in failure paths.
1219 : : */
1220 : : void
1221 : 176009 : ReleaseExternalFD(void)
1222 : : {
1223 [ - + ]: 176009 : Assert(numExternalFDs > 0);
1224 : 176009 : numExternalFDs--;
1225 : 176009 : }
1226 : :
1227 : :
#if defined(FDDEBUG)

/*
 * Debugging aid: log the VFD indexes found on the LRU ring.  The walk starts
 * at VfdCache[0].lruLessRecently and follows lruLessRecently links until it
 * arrives back at the header (index 0), which is printed as well.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	size_t		len;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);
	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;

		/* append at the current end of the string */
		len = strlen(buf);
		snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
	}
	len = strlen(buf);
	snprintf(buf + len, sizeof(buf) - len, "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1248 : :
1249 : : static void
10753 scrappy@hub.org 1250 : 1399432 : Delete(File file)
1251 : : {
1252 : : Vfd *vfdP;
1253 : :
9719 tgl@sss.pgh.pa.us 1254 [ - + ]: 1399432 : Assert(file != 0);
1255 : :
1256 : : DO_DB(elog(LOG, "Delete %d (%s)",
1257 : : file, VfdCache[file].fileName));
1258 : : DO_DB(_dump_lru());
1259 : :
1260 : 1399432 : vfdP = &VfdCache[file];
1261 : :
1262 : 1399432 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1263 : 1399432 : VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1264 : :
1265 : : DO_DB(_dump_lru());
10753 scrappy@hub.org 1266 : 1399432 : }
1267 : :
1268 : : static void
1269 : 2564 : LruDelete(File file)
1270 : : {
1271 : : Vfd *vfdP;
1272 : :
9719 tgl@sss.pgh.pa.us 1273 [ - + ]: 2564 : Assert(file != 0);
1274 : :
1275 : : DO_DB(elog(LOG, "LruDelete %d (%s)",
1276 : : file, VfdCache[file].fileName));
1277 : :
1278 : 2564 : vfdP = &VfdCache[file];
1279 : :
263 andres@anarazel.de 1280 : 2564 : pgaio_closing_fd(vfdP->fd);
1281 : :
1282 : : /*
1283 : : * Close the file. We aren't expecting this to fail; if it does, better
1284 : : * to leak the FD than to mess up our internal state.
1285 : : */
2356 peter@eisentraut.org 1286 [ - + ]: 2564 : if (close(vfdP->fd) != 0)
2585 tmunro@postgresql.or 1287 [ # # # # ]:UBC 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1288 : : "could not close file \"%s\": %m", vfdP->fileName);
9719 tgl@sss.pgh.pa.us 1289 :CBC 2564 : vfdP->fd = VFD_CLOSED;
3221 1290 : 2564 : --nfile;
1291 : :
1292 : : /* delete the vfd record from the LRU ring */
1293 : 2564 : Delete(file);
10753 scrappy@hub.org 1294 : 2564 : }
1295 : :
1296 : : static void
1297 : 1766681 : Insert(File file)
1298 : : {
1299 : : Vfd *vfdP;
1300 : :
9719 tgl@sss.pgh.pa.us 1301 [ - + ]: 1766681 : Assert(file != 0);
1302 : :
1303 : : DO_DB(elog(LOG, "Insert %d (%s)",
1304 : : file, VfdCache[file].fileName));
1305 : : DO_DB(_dump_lru());
1306 : :
10328 bruce@momjian.us 1307 : 1766681 : vfdP = &VfdCache[file];
1308 : :
1309 : 1766681 : vfdP->lruMoreRecently = 0;
1310 : 1766681 : vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1311 : 1766681 : VfdCache[0].lruLessRecently = file;
1312 : 1766681 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1313 : :
1314 : : DO_DB(_dump_lru());
10753 scrappy@hub.org 1315 : 1766681 : }
1316 : :
1317 : : /* returns 0 on success, -1 on re-open failure (with errno set) */
1318 : : static int
10328 bruce@momjian.us 1319 : 27 : LruInsert(File file)
1320 : : {
1321 : : Vfd *vfdP;
1322 : :
9719 tgl@sss.pgh.pa.us 1323 [ - + ]: 27 : Assert(file != 0);
1324 : :
1325 : : DO_DB(elog(LOG, "LruInsert %d (%s)",
1326 : : file, VfdCache[file].fileName));
1327 : :
10328 bruce@momjian.us 1328 : 27 : vfdP = &VfdCache[file];
1329 : :
1330 [ + - ]: 27 : if (FileIsNotOpen(file))
1331 : : {
1332 : : /* Close excess kernel FDs. */
4574 tgl@sss.pgh.pa.us 1333 : 27 : ReleaseLruFiles();
1334 : :
1335 : : /*
1336 : : * The open could still fail for lack of file descriptors, eg due to
1337 : : * overall system file table being full. So, be prepared to release
1338 : : * another FD if necessary...
1339 : : */
3007 peter_e@gmx.net 1340 : 27 : vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1341 : : vfdP->fileMode);
10328 bruce@momjian.us 1342 [ - + ]: 27 : if (vfdP->fd < 0)
1343 : : {
1344 : : DO_DB(elog(LOG, "re-open failed: %m"));
4598 tgl@sss.pgh.pa.us 1345 :UBC 0 : return -1;
1346 : : }
1347 : : else
1348 : : {
10328 bruce@momjian.us 1349 :CBC 27 : ++nfile;
1350 : : }
1351 : : }
1352 : :
1353 : : /*
1354 : : * put it at the head of the Lru ring
1355 : : */
1356 : :
1357 : 27 : Insert(file);
1358 : :
9969 1359 : 27 : return 0;
1360 : : }
1361 : :
1362 : : /*
1363 : : * Release one kernel FD by closing the least-recently-used VFD.
1364 : : */
1365 : : static bool
9243 tgl@sss.pgh.pa.us 1366 : 2428 : ReleaseLruFile(void)
1367 : : {
1368 : : DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1369 : :
1370 [ + - ]: 2428 : if (nfile > 0)
1371 : : {
1372 : : /*
1373 : : * There are opened files and so there should be at least one used vfd
1374 : : * in the ring.
1375 : : */
1376 [ - + ]: 2428 : Assert(VfdCache[0].lruMoreRecently != 0);
1377 : 2428 : LruDelete(VfdCache[0].lruMoreRecently);
1378 : 2428 : return true; /* freed a file */
1379 : : }
9243 tgl@sss.pgh.pa.us 1380 :UBC 0 : return false; /* no files available to free */
1381 : : }
1382 : :
1383 : : /*
1384 : : * Release kernel FDs as needed to get under the max_safe_fds limit.
1385 : : * After calling this, it's OK to try to open another file.
1386 : : */
1387 : : static void
4574 tgl@sss.pgh.pa.us 1388 :CBC 2028367 : ReleaseLruFiles(void)
1389 : : {
2123 1390 [ + + ]: 2030795 : while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1391 : : {
4574 1392 [ - + ]: 2428 : if (!ReleaseLruFile())
4574 tgl@sss.pgh.pa.us 1393 :UBC 0 : break;
1394 : : }
4574 tgl@sss.pgh.pa.us 1395 :CBC 2028367 : }
1396 : :
1397 : : static File
9243 1398 : 1309618 : AllocateVfd(void)
1399 : : {
1400 : : Index i;
1401 : : File file;
1402 : :
1403 : : DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1404 : :
7368 bruce@momjian.us 1405 [ - + ]: 1309618 : Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1406 : :
10328 1407 [ + + ]: 1309618 : if (VfdCache[0].nextFree == 0)
1408 : : {
1409 : : /*
1410 : : * The free list is empty so it is time to increase the size of the
1411 : : * array. We choose to double it each time this happens. However,
1412 : : * there's not much point in starting *real* small.
1413 : : */
9703 1414 : 22903 : Size newCacheSize = SizeVfdCache * 2;
1415 : : Vfd *newVfdCache;
1416 : :
9719 tgl@sss.pgh.pa.us 1417 [ + + ]: 22903 : if (newCacheSize < 32)
1418 : 16770 : newCacheSize = 32;
1419 : :
1420 : : /*
1421 : : * Be careful not to clobber VfdCache ptr if realloc fails.
1422 : : */
9024 1423 : 22903 : newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1424 [ - + ]: 22903 : if (newVfdCache == NULL)
8182 tgl@sss.pgh.pa.us 1425 [ # # ]:UBC 0 : ereport(ERROR,
1426 : : (errcode(ERRCODE_OUT_OF_MEMORY),
1427 : : errmsg("out of memory")));
9024 tgl@sss.pgh.pa.us 1428 :CBC 22903 : VfdCache = newVfdCache;
1429 : :
1430 : : /*
1431 : : * Initialize the new entries and link them into the free list.
1432 : : */
9719 1433 [ + + ]: 1100853 : for (i = SizeVfdCache; i < newCacheSize; i++)
1434 : : {
308 peter@eisentraut.org 1435 [ + - + - : 8623600 : MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
+ - + - +
+ ]
10328 bruce@momjian.us 1436 : 1077950 : VfdCache[i].nextFree = i + 1;
1437 : 1077950 : VfdCache[i].fd = VFD_CLOSED;
1438 : : }
9719 tgl@sss.pgh.pa.us 1439 : 22903 : VfdCache[newCacheSize - 1].nextFree = 0;
10328 bruce@momjian.us 1440 : 22903 : VfdCache[0].nextFree = SizeVfdCache;
1441 : :
1442 : : /*
1443 : : * Record the new size
1444 : : */
9719 tgl@sss.pgh.pa.us 1445 : 22903 : SizeVfdCache = newCacheSize;
1446 : : }
1447 : :
10328 bruce@momjian.us 1448 : 1309618 : file = VfdCache[0].nextFree;
1449 : :
1450 : 1309618 : VfdCache[0].nextFree = VfdCache[file].nextFree;
1451 : :
1452 : 1309618 : return file;
1453 : : }
1454 : :
1455 : : static void
10753 scrappy@hub.org 1456 : 940029 : FreeVfd(File file)
1457 : : {
9719 tgl@sss.pgh.pa.us 1458 : 940029 : Vfd *vfdP = &VfdCache[file];
1459 : :
1460 : : DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1461 : : file, vfdP->fileName ? vfdP->fileName : ""));
1462 : :
1463 [ + + ]: 940029 : if (vfdP->fileName != NULL)
1464 : : {
1465 : 541964 : free(vfdP->fileName);
1466 : 541964 : vfdP->fileName = NULL;
1467 : : }
9024 1468 : 940029 : vfdP->fdstate = 0x0;
1469 : :
9719 1470 : 940029 : vfdP->nextFree = VfdCache[0].nextFree;
10328 bruce@momjian.us 1471 : 940029 : VfdCache[0].nextFree = file;
10753 scrappy@hub.org 1472 : 940029 : }
1473 : :
1474 : : /* returns 0 on success, -1 on re-open failure (with errno set) */
1475 : : static int
1476 : 3209652 : FileAccess(File file)
1477 : : {
1478 : : int returnValue;
1479 : :
1480 : : DO_DB(elog(LOG, "FileAccess %d (%s)",
1481 : : file, VfdCache[file].fileName));
1482 : :
1483 : : /*
1484 : : * Is the file open? If not, open it and put it at the head of the LRU
1485 : : * ring (possibly closing the least recently used file to get an FD).
1486 : : */
1487 : :
10328 bruce@momjian.us 1488 [ + + ]: 3209652 : if (FileIsNotOpen(file))
1489 : : {
1490 : 27 : returnValue = LruInsert(file);
1491 [ - + ]: 27 : if (returnValue != 0)
10328 bruce@momjian.us 1492 :UBC 0 : return returnValue;
1493 : : }
9719 tgl@sss.pgh.pa.us 1494 [ + + ]:CBC 3209625 : else if (VfdCache[0].lruLessRecently != file)
1495 : : {
1496 : : /*
1497 : : * We now know that the file is open and that it is not the last one
1498 : : * accessed, so we need to move it to the head of the Lru ring.
1499 : : */
1500 : :
10328 bruce@momjian.us 1501 : 855101 : Delete(file);
1502 : 855101 : Insert(file);
1503 : : }
1504 : :
9969 1505 : 3209652 : return 0;
1506 : : }
1507 : :
1508 : : /*
1509 : : * Called whenever a temporary file is deleted to report its size.
1510 : : */
1511 : : static void
34 michael@paquier.xyz 1512 :GNC 2556 : ReportTemporaryFileUsage(const char *path, pgoff_t size)
1513 : : {
2938 andres@anarazel.de 1514 :CBC 2556 : pgstat_report_tempfile(size);
1515 : :
1516 [ + + ]: 2556 : if (log_temp_files >= 0)
1517 : : {
1518 [ + + ]: 806 : if ((size / 1024) >= log_temp_files)
1519 [ + - ]: 117 : ereport(LOG,
1520 : : (errmsg("temporary file: path \"%s\", size %lu",
1521 : : path, (unsigned long) size)));
1522 : : }
1523 : 2556 : }
1524 : :
1525 : : /*
1526 : : * Called to register a temporary file for automatic close.
1527 : : * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1528 : : * before the file was opened.
1529 : : */
1530 : : static void
1531 : 4272 : RegisterTemporaryFile(File file)
1532 : : {
1533 : 4272 : ResourceOwnerRememberFile(CurrentResourceOwner, file);
1534 : 4272 : VfdCache[file].resowner = CurrentResourceOwner;
1535 : :
1536 : : /* Backup mechanism for closing at end of xact. */
1537 : 4272 : VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1538 : 4272 : have_xact_temporary_files = true;
1539 : 4272 : }
1540 : :
1541 : : /*
1542 : : * Called when we get a shared invalidation message on some relation.
1543 : : */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	/* Without a kernel descriptor there is nothing to close. */
	if (FileIsNotOpen(file))
		return;

	/* Close the descriptor and remove the VFD from the LRU ring. */
	LruDelete(file);
}
#endif
1553 : :
1554 : : /*
1555 : : * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1556 : : * fileMode parameter.
1557 : : */
1558 : : File
3007 peter_e@gmx.net 1559 : 1309618 : PathNameOpenFile(const char *fileName, int fileFlags)
1560 : : {
2811 sfrost@snowman.net 1561 : 1309618 : return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1562 : : }
1563 : :
1564 : : /*
1565 : : * open a file in an arbitrary directory
1566 : : *
1567 : : * NB: if the passed pathname is relative (which it usually is),
1568 : : * it will be interpreted relative to the process' working directory
1569 : : * (which should always be $PGDATA when this code is running).
1570 : : */
1571 : : File
3007 peter_e@gmx.net 1572 : 1309618 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1573 : : {
1574 : : char *fnamecopy;
1575 : : File file;
1576 : : Vfd *vfdP;
1577 : :
1578 : : DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1579 : : fileName, fileFlags, fileMode));
1580 : :
1581 : : /*
1582 : : * We need a malloc'd copy of the file name; fail cleanly if no room.
1583 : : */
8182 tgl@sss.pgh.pa.us 1584 : 1309618 : fnamecopy = strdup(fileName);
1585 [ - + ]: 1309618 : if (fnamecopy == NULL)
8182 tgl@sss.pgh.pa.us 1586 [ # # ]:UBC 0 : ereport(ERROR,
1587 : : (errcode(ERRCODE_OUT_OF_MEMORY),
1588 : : errmsg("out of memory")));
1589 : :
10328 bruce@momjian.us 1590 :CBC 1309618 : file = AllocateVfd();
1591 : 1309618 : vfdP = &VfdCache[file];
1592 : :
1593 : : /* Close excess kernel FDs. */
4574 tgl@sss.pgh.pa.us 1594 : 1309618 : ReleaseLruFiles();
1595 : :
1596 : : /*
1597 : : * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1598 : : * client shouldn't be expected to know which kernel descriptors are
1599 : : * currently open, so it wouldn't make sense for them to be inherited by
1600 : : * executed subprograms.
1601 : : */
1020 tmunro@postgresql.or 1602 : 1309618 : fileFlags |= O_CLOEXEC;
1603 : :
3007 peter_e@gmx.net 1604 : 1309618 : vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1605 : :
10328 bruce@momjian.us 1606 [ + + ]: 1309618 : if (vfdP->fd < 0)
1607 : : {
4598 tgl@sss.pgh.pa.us 1608 : 398065 : int save_errno = errno;
1609 : :
10328 bruce@momjian.us 1610 : 398065 : FreeVfd(file);
8182 tgl@sss.pgh.pa.us 1611 : 398065 : free(fnamecopy);
4598 1612 : 398065 : errno = save_errno;
10328 bruce@momjian.us 1613 : 398065 : return -1;
1614 : : }
1615 : 911553 : ++nfile;
1616 : : DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1617 : : vfdP->fd));
1618 : :
8182 tgl@sss.pgh.pa.us 1619 : 911553 : vfdP->fileName = fnamecopy;
1620 : : /* Saved flags are adjusted to be OK for re-opening file */
9024 1621 : 911553 : vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
10328 bruce@momjian.us 1622 : 911553 : vfdP->fileMode = fileMode;
5267 tgl@sss.pgh.pa.us 1623 : 911553 : vfdP->fileSize = 0;
8534 1624 : 911553 : vfdP->fdstate = 0x0;
5858 heikki.linnakangas@i 1625 : 911553 : vfdP->resowner = NULL;
1626 : :
1857 tgl@sss.pgh.pa.us 1627 : 911553 : Insert(file);
1628 : :
10328 bruce@momjian.us 1629 : 911553 : return file;
1630 : : }
1631 : :
1632 : : /*
1633 : : * Create directory 'directory'. If necessary, create 'basedir', which must
1634 : : * be the directory above it. This is designed for creating the top-level
1635 : : * temporary directory on demand before creating a directory underneath it.
1636 : : * Do nothing if the directory already exists.
1637 : : *
1638 : : * Directories created within the top-level temporary directory should begin
1639 : : * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1640 : : * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1641 : : * that do not need any particular prefix.
1642 : : */
1643 : : void
2938 andres@anarazel.de 1644 : 185 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1645 : : {
2811 sfrost@snowman.net 1646 [ + + ]: 185 : if (MakePGDirectory(directory) < 0)
1647 : : {
2938 andres@anarazel.de 1648 [ + + ]: 20 : if (errno == EEXIST)
1649 : 10 : return;
1650 : :
1651 : : /*
1652 : : * Failed. Try to create basedir first in case it's missing. Tolerate
1653 : : * EEXIST to close a race against another process following the same
1654 : : * algorithm.
1655 : : */
2811 sfrost@snowman.net 1656 [ - + - - ]: 10 : if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
2938 andres@anarazel.de 1657 [ # # ]:UBC 0 : ereport(ERROR,
1658 : : (errcode_for_file_access(),
1659 : : errmsg("cannot create temporary directory \"%s\": %m",
1660 : : basedir)));
1661 : :
1662 : : /* Try again. */
2811 sfrost@snowman.net 1663 [ - + - - ]:CBC 10 : if (MakePGDirectory(directory) < 0 && errno != EEXIST)
2938 andres@anarazel.de 1664 [ # # ]:UBC 0 : ereport(ERROR,
1665 : : (errcode_for_file_access(),
1666 : : errmsg("cannot create temporary subdirectory \"%s\": %m",
1667 : : directory)));
1668 : : }
1669 : : }
1670 : :
1671 : : /*
1672 : : * Delete a directory and everything in it, if it exists.
1673 : : */
1674 : : void
2938 andres@anarazel.de 1675 :CBC 218 : PathNameDeleteTemporaryDir(const char *dirname)
1676 : : {
1677 : : struct stat statbuf;
1678 : :
1679 : : /* Silently ignore missing directory. */
1680 [ + + + - ]: 218 : if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1681 : 43 : return;
1682 : :
1683 : : /*
1684 : : * Currently, walkdir doesn't offer a way for our passed in function to
1685 : : * maintain state. Perhaps it should, so that we could tell the caller
1686 : : * whether this operation succeeded or failed. Since this operation is
1687 : : * used in a cleanup path, we wouldn't actually behave differently: we'll
1688 : : * just log failures.
1689 : : */
1690 : 175 : walkdir(dirname, unlink_if_exists_fname, false, LOG);
1691 : : }
1692 : :
1693 : : /*
1694 : : * Open a temporary file that will disappear when we close it.
1695 : : *
1696 : : * This routine takes care of generating an appropriate tempfile name.
1697 : : * There's no need to pass in fileFlags or fileMode either, since only
1698 : : * one setting makes any sense for a temp file.
1699 : : *
1700 : : * Unless interXact is true, the file is remembered by CurrentResourceOwner
1701 : : * to ensure it's closed and deleted when it's no longer needed, typically at
1702 : : * the end-of-transaction. In most cases, you don't want temporary files to
1703 : : * outlive the transaction that created them, so this should be false -- but
1704 : : * if you need "somewhat" temporary storage, this might be useful. In either
1705 : : * case, the file is removed when the File is explicitly closed.
1706 : : */
1707 : : File
6768 tgl@sss.pgh.pa.us 1708 : 1288 : OpenTemporaryFile(bool interXact)
1709 : : {
6772 1710 : 1288 : File file = 0;
1711 : :
1593 andres@anarazel.de 1712 [ - + ]: 1288 : Assert(temporary_files_allowed); /* check temp file access is up */
1713 : :
1714 : : /*
1715 : : * Make sure the current resource owner has space for this File before we
1716 : : * open it, if we'll be registering it below.
1717 : : */
2961 tgl@sss.pgh.pa.us 1718 [ + - ]: 1288 : if (!interXact)
770 heikki.linnakangas@i 1719 : 1288 : ResourceOwnerEnlarge(CurrentResourceOwner);
1720 : :
1721 : : /*
1722 : : * If some temp tablespace(s) have been given to us, try to use the next
1723 : : * one. If a given tablespace can't be found, we silently fall back to
1724 : : * the database's default tablespace.
1725 : : *
1726 : : * BUT: if the temp file is slated to outlive the current transaction,
1727 : : * force it into the database's default tablespace, so that it will not
1728 : : * pose a threat to possible tablespace drop attempts.
1729 : : */
6768 tgl@sss.pgh.pa.us 1730 [ + + + - ]: 1288 : if (numTempTableSpaces > 0 && !interXact)
1731 : : {
6607 bruce@momjian.us 1732 : 1 : Oid tblspcOid = GetNextTempTableSpace();
1733 : :
6768 tgl@sss.pgh.pa.us 1734 [ + - ]: 1 : if (OidIsValid(tblspcOid))
1735 : 1 : file = OpenTemporaryFileInTablespace(tblspcOid, false);
1736 : : }
1737 : :
1738 : : /*
1739 : : * If not, or if tablespace is bad, create in database's default
1740 : : * tablespace. MyDatabaseTableSpace should normally be set before we get
1741 : : * here, but just in case it isn't, fall back to pg_default tablespace.
1742 : : */
6772 1743 [ + + ]: 1288 : if (file <= 0)
1744 [ + + ]: 1287 : file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1745 : : MyDatabaseTableSpace :
1746 : : DEFAULTTABLESPACE_OID,
1747 : : true);
1748 : :
1749 : : /* Mark it for deletion at close and temporary file size limit */
2938 andres@anarazel.de 1750 : 1288 : VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1751 : :
1752 : : /* Register it with the current resource owner */
6772 tgl@sss.pgh.pa.us 1753 [ + - ]: 1288 : if (!interXact)
2938 andres@anarazel.de 1754 : 1288 : RegisterTemporaryFile(file);
1755 : :
6772 tgl@sss.pgh.pa.us 1756 : 1288 : return file;
1757 : : }
1758 : :
1759 : : /*
1760 : : * Return the path of the temp directory in a given tablespace.
1761 : : */
1762 : : void
2938 andres@anarazel.de 1763 : 8506 : TempTablespacePath(char *path, Oid tablespace)
1764 : : {
1765 : : /*
1766 : : * Identify the tempfile directory for this tablespace.
1767 : : *
1768 : : * If someone tries to specify pg_global, use pg_default instead.
1769 : : */
1770 [ + - + + ]: 8506 : if (tablespace == InvalidOid ||
1771 [ - + ]: 1 : tablespace == DEFAULTTABLESPACE_OID ||
1772 : : tablespace == GLOBALTABLESPACE_OID)
1773 : 8505 : snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1774 : : else
1775 : : {
1776 : : /* All other tablespaces are accessed via symlinks */
470 michael@paquier.xyz 1777 : 1 : snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1778 : : PG_TBLSPC_DIR, tablespace, TABLESPACE_VERSION_DIRECTORY,
1779 : : PG_TEMP_FILES_DIR);
1780 : : }
2938 andres@anarazel.de 1781 : 8506 : }
1782 : :
1783 : : /*
1784 : : * Open a temporary file in a specific tablespace.
1785 : : * Subroutine for OpenTemporaryFile, which see for details.
1786 : : */
1787 : : static File
1788 : 1288 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1789 : : {
1790 : : char tempdirpath[MAXPGPATH];
1791 : : char tempfilepath[MAXPGPATH];
1792 : : File file;
1793 : :
1794 : 1288 : TempTablespacePath(tempdirpath, tblspcOid);
1795 : :
1796 : : /*
1797 : : * Generate a tempfile name that should be unique within the current
1798 : : * database instance.
1799 : : */
6772 tgl@sss.pgh.pa.us 1800 : 1288 : snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1801 : : tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1802 : :
1803 : : /*
1804 : : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1805 : : * temp file that can be reused.
1806 : : */
1807 : 1288 : file = PathNameOpenFile(tempfilepath,
1808 : : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
9719 1809 [ + + ]: 1288 : if (file <= 0)
1810 : : {
1811 : : /*
1812 : : * We might need to create the tablespace's tempfile directory, if no
1813 : : * one has yet done so.
1814 : : *
1815 : : * Don't check for an error from MakePGDirectory; it could fail if
1816 : : * someone else just did the same thing. If it doesn't work then
1817 : : * we'll bomb out on the second create attempt, instead.
1818 : : */
2811 sfrost@snowman.net 1819 : 94 : (void) MakePGDirectory(tempdirpath);
1820 : :
6772 tgl@sss.pgh.pa.us 1821 : 94 : file = PathNameOpenFile(tempfilepath,
1822 : : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1823 [ - + - - ]: 94 : if (file <= 0 && rejectError)
8182 tgl@sss.pgh.pa.us 1824 [ # # ]:UBC 0 : elog(ERROR, "could not create temporary file \"%s\": %m",
1825 : : tempfilepath);
1826 : : }
1827 : :
9719 tgl@sss.pgh.pa.us 1828 :CBC 1288 : return file;
1829 : : }
1830 : :
1831 : :
1832 : : /*
1833 : : * Create a new file. The directory containing it must already exist. Files
1834 : : * created this way are subject to temp_file_limit and are automatically
1835 : : * closed at end of transaction, but are not automatically deleted on close
1836 : : * because they are intended to be shared between cooperating backends.
1837 : : *
1838 : : * If the file is inside the top-level temporary directory, its name should
1839 : : * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1840 : : * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1841 : : * inside a directory created with PathNameCreateTemporaryDir(), in which case
1842 : : * the prefix isn't needed.
1843 : : */
1844 : : File
2938 andres@anarazel.de 1845 : 1453 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1846 : : {
1847 : : File file;
1848 : :
1593 1849 [ - + ]: 1453 : Assert(temporary_files_allowed); /* check temp file access is up */
1850 : :
770 heikki.linnakangas@i 1851 : 1453 : ResourceOwnerEnlarge(CurrentResourceOwner);
1852 : :
1853 : : /*
1854 : : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1855 : : * temp file that can be reused.
1856 : : */
2938 andres@anarazel.de 1857 : 1453 : file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1858 [ + + ]: 1453 : if (file <= 0)
1859 : : {
1860 [ - + ]: 185 : if (error_on_failure)
2938 andres@anarazel.de 1861 [ # # ]:UBC 0 : ereport(ERROR,
1862 : : (errcode_for_file_access(),
1863 : : errmsg("could not create temporary file \"%s\": %m",
1864 : : path)));
1865 : : else
2938 andres@anarazel.de 1866 :CBC 185 : return file;
1867 : : }
1868 : :
1869 : : /* Mark it for temp_file_limit accounting. */
1870 : 1268 : VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1871 : :
1872 : : /* Register it for automatic close. */
1873 : 1268 : RegisterTemporaryFile(file);
1874 : :
1875 : 1268 : return file;
1876 : : }
1877 : :
1878 : : /*
1879 : : * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1880 : : * another backend. Files opened this way don't count against the
1881 : : * temp_file_limit of the caller, are automatically closed at the end of the
1882 : : * transaction but are not deleted on close.
1883 : : */
1884 : : File
1939 akapila@postgresql.o 1885 : 3704 : PathNameOpenTemporaryFile(const char *path, int mode)
1886 : : {
1887 : : File file;
1888 : :
1593 andres@anarazel.de 1889 [ - + ]: 3704 : Assert(temporary_files_allowed); /* check temp file access is up */
1890 : :
770 heikki.linnakangas@i 1891 : 3704 : ResourceOwnerEnlarge(CurrentResourceOwner);
1892 : :
1939 akapila@postgresql.o 1893 : 3704 : file = PathNameOpenFile(path, mode | PG_BINARY);
1894 : :
1895 : : /* If no such file, then we don't raise an error. */
2938 andres@anarazel.de 1896 [ + + - + ]: 3704 : if (file <= 0 && errno != ENOENT)
2938 andres@anarazel.de 1897 [ # # ]:UBC 0 : ereport(ERROR,
1898 : : (errcode_for_file_access(),
1899 : : errmsg("could not open temporary file \"%s\": %m",
1900 : : path)));
1901 : :
2938 andres@anarazel.de 1902 [ + + ]:CBC 3704 : if (file > 0)
1903 : : {
1904 : : /* Register it for automatic close. */
1905 : 1716 : RegisterTemporaryFile(file);
1906 : : }
1907 : :
1908 : 3704 : return file;
1909 : : }
1910 : :
1911 : : /*
1912 : : * Delete a file by pathname. Return true if the file existed, false if
1913 : : * didn't.
1914 : : */
1915 : : bool
1916 : 2887 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1917 : : {
1918 : : struct stat filestats;
1919 : : int stat_errno;
1920 : :
1921 : : /* Get the final size for pgstat reporting. */
1922 [ + + ]: 2887 : if (stat(path, &filestats) != 0)
1923 : 1619 : stat_errno = errno;
1924 : : else
1925 : 1268 : stat_errno = 0;
1926 : :
1927 : : /*
1928 : : * Unlike FileClose's automatic file deletion code, we tolerate
1929 : : * non-existence to support BufFileDeleteFileSet which doesn't know how
1930 : : * many segments it has to delete until it runs out.
1931 : : */
1932 [ + + ]: 2887 : if (stat_errno == ENOENT)
1933 : 1619 : return false;
1934 : :
1935 [ - + ]: 1268 : if (unlink(path) < 0)
1936 : : {
2938 andres@anarazel.de 1937 [ # # ]:UBC 0 : if (errno != ENOENT)
1938 [ # # # # ]: 0 : ereport(error_on_failure ? ERROR : LOG,
1939 : : (errcode_for_file_access(),
1940 : : errmsg("could not unlink temporary file \"%s\": %m",
1941 : : path)));
1942 : 0 : return false;
1943 : : }
1944 : :
2938 andres@anarazel.de 1945 [ + - ]:CBC 1268 : if (stat_errno == 0)
1946 : 1268 : ReportTemporaryFileUsage(path, filestats.st_size);
1947 : : else
1948 : : {
2938 andres@anarazel.de 1949 :UBC 0 : errno = stat_errno;
1950 [ # # ]: 0 : ereport(LOG,
1951 : : (errcode_for_file_access(),
1952 : : errmsg("could not stat file \"%s\": %m", path)));
1953 : : }
1954 : :
2938 andres@anarazel.de 1955 :CBC 1268 : return true;
1956 : : }
1957 : :
1958 : : /*
1959 : : * close a file when done with it
1960 : : */
1961 : : void
10753 scrappy@hub.org 1962 : 541964 : FileClose(File file)
1963 : : {
1964 : : Vfd *vfdP;
1965 : :
9719 tgl@sss.pgh.pa.us 1966 [ + - + - : 541964 : Assert(FileIsValid(file));
- + ]
1967 : :
1968 : : DO_DB(elog(LOG, "FileClose: %d (%s)",
1969 : : file, VfdCache[file].fileName));
1970 : :
8711 1971 : 541964 : vfdP = &VfdCache[file];
1972 : :
10328 bruce@momjian.us 1973 [ + + ]: 541964 : if (!FileIsNotOpen(file))
1974 : : {
263 andres@anarazel.de 1975 : 541767 : pgaio_closing_fd(vfdP->fd);
1976 : :
1977 : : /* close the file */
2356 peter@eisentraut.org 1978 [ - + ]: 541767 : if (close(vfdP->fd) != 0)
1979 : : {
1980 : : /*
1981 : : * We may need to panic on failure to close non-temporary files;
1982 : : * see LruDelete.
1983 : : */
2585 tmunro@postgresql.or 1984 [ # # # # ]:UBC 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1985 : : "could not close file \"%s\": %m", vfdP->fileName);
1986 : : }
1987 : :
10328 bruce@momjian.us 1988 :CBC 541767 : --nfile;
8711 tgl@sss.pgh.pa.us 1989 : 541767 : vfdP->fd = VFD_CLOSED;
1990 : :
1991 : : /* remove the file from the lru ring */
3221 1992 : 541767 : Delete(file);
1993 : : }
1994 : :
2938 andres@anarazel.de 1995 [ + + ]: 541964 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1996 : : {
1997 : : /* Subtract its size from current usage (do first in case of error) */
1998 : 2556 : temporary_files_size -= vfdP->fileSize;
1999 : 2556 : vfdP->fileSize = 0;
2000 : : }
2001 : :
2002 : : /*
2003 : : * Delete the file if it was temporary, and make a log entry if wanted
2004 : : */
2005 [ + + ]: 541964 : if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2006 : : {
2007 : : struct stat filestats;
2008 : : int stat_errno;
2009 : :
2010 : : /*
2011 : : * If we get an error, as could happen within the ereport/elog calls,
2012 : : * we'll come right back here during transaction abort. Reset the
2013 : : * flag to ensure that we can't get into an infinite loop. This code
2014 : : * is arranged to ensure that the worst-case consequence is failing to
2015 : : * emit log message(s), not failing to attempt the unlink.
2016 : : */
2017 : 1288 : vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2018 : :
2019 : :
2020 : : /* first try the stat() */
5074 magnus@hagander.net 2021 [ - + ]: 1288 : if (stat(vfdP->fileName, &filestats))
5074 magnus@hagander.net 2022 :UBC 0 : stat_errno = errno;
2023 : : else
5074 magnus@hagander.net 2024 :CBC 1288 : stat_errno = 0;
2025 : :
2026 : : /* in any case do the unlink */
2027 [ - + ]: 1288 : if (unlink(vfdP->fileName))
1839 peter@eisentraut.org 2028 [ # # ]:UBC 0 : ereport(LOG,
2029 : : (errcode_for_file_access(),
2030 : : errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2031 : :
2032 : : /* and last report the stat results */
5074 magnus@hagander.net 2033 [ + - ]:CBC 1288 : if (stat_errno == 0)
2938 andres@anarazel.de 2034 : 1288 : ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2035 : : else
2036 : : {
5072 magnus@hagander.net 2037 :UBC 0 : errno = stat_errno;
1839 peter@eisentraut.org 2038 [ # # ]: 0 : ereport(LOG,
2039 : : (errcode_for_file_access(),
2040 : : errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2041 : : }
2042 : : }
2043 : :
2044 : : /* Unregister it from the resource owner */
5858 heikki.linnakangas@i 2045 [ + + ]:CBC 541964 : if (vfdP->resowner)
2046 : 4268 : ResourceOwnerForgetFile(vfdP->resowner, file);
2047 : :
2048 : : /*
2049 : : * Return the Vfd slot to the free list
2050 : : */
9719 tgl@sss.pgh.pa.us 2051 : 541964 : FreeVfd(file);
10753 scrappy@hub.org 2052 : 541964 : }
2053 : :
2054 : : /*
2055 : : * FilePrefetch - initiate asynchronous read of a given range of the file.
2056 : : *
2057 : : * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2058 : : *
2059 : : * posix_fadvise() is the simplest standardized interface that accomplishes
2060 : : * this.
2061 : : */
2062 : : int
34 michael@paquier.xyz 2063 :GNC 8183 : FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2064 : : {
6183 tgl@sss.pgh.pa.us 2065 [ + - + - :CBC 8183 : Assert(FileIsValid(file));
- + ]
2066 : :
2067 : : DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2068 : : file, VfdCache[file].fileName,
2069 : : (int64) offset, (int64) amount));
2070 : :
2071 : : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2072 : : {
2073 : : int returnCode;
2074 : :
476 peter@eisentraut.org 2075 : 8183 : returnCode = FileAccess(file);
2076 [ + - ]: 8183 : if (returnCode < 0)
476 peter@eisentraut.org 2077 :UBC 0 : return returnCode;
2078 : :
912 andres@anarazel.de 2079 :CBC 8183 : retry:
476 peter@eisentraut.org 2080 : 8183 : pgstat_report_wait_start(wait_event_info);
2081 : 8183 : returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2082 : : POSIX_FADV_WILLNEED);
2083 : 8183 : pgstat_report_wait_end();
2084 : :
2085 [ - + ]: 8183 : if (returnCode == EINTR)
476 peter@eisentraut.org 2086 :UBC 0 : goto retry;
2087 : :
476 peter@eisentraut.org 2088 :CBC 8183 : return returnCode;
2089 : : }
2090 : : #elif defined(__darwin__)
2091 : : {
2092 : : struct radvisory
2093 : : {
2094 : : off_t ra_offset; /* offset into the file */
2095 : : int ra_count; /* size of the read */
2096 : : } ra;
2097 : : int returnCode;
2098 : :
2099 : : returnCode = FileAccess(file);
2100 : : if (returnCode < 0)
2101 : : return returnCode;
2102 : :
2103 : : ra.ra_offset = offset;
2104 : : ra.ra_count = amount;
2105 : : pgstat_report_wait_start(wait_event_info);
2106 : : returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
2107 : : pgstat_report_wait_end();
2108 : : if (returnCode != -1)
2109 : : return 0;
2110 : : else
2111 : : return errno;
2112 : : }
2113 : : #else
2114 : : return 0;
2115 : : #endif
2116 : : }
2117 : :
2118 : : void
34 michael@paquier.xyz 2119 :UNC 0 : FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
2120 : : {
2121 : : int returnCode;
2122 : :
3589 andres@anarazel.de 2123 [ # # # # :UBC 0 : Assert(FileIsValid(file));
# # ]
2124 : :
2125 : : DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2126 : : file, VfdCache[file].fileName,
2127 : : (int64) offset, (int64) nbytes));
2128 : :
3535 tgl@sss.pgh.pa.us 2129 [ # # ]: 0 : if (nbytes <= 0)
2130 : 0 : return;
2131 : :
984 tmunro@postgresql.or 2132 [ # # ]: 0 : if (VfdCache[file].fileFlags & PG_O_DIRECT)
2133 : 0 : return;
2134 : :
3589 andres@anarazel.de 2135 : 0 : returnCode = FileAccess(file);
2136 [ # # ]: 0 : if (returnCode < 0)
2137 : 0 : return;
2138 : :
3196 rhaas@postgresql.org 2139 : 0 : pgstat_report_wait_start(wait_event_info);
3535 tgl@sss.pgh.pa.us 2140 : 0 : pg_flush_data(VfdCache[file].fd, offset, nbytes);
3196 rhaas@postgresql.org 2141 : 0 : pgstat_report_wait_end();
2142 : : }
2143 : :
2144 : : ssize_t
34 michael@paquier.xyz 2145 :GNC 409475 : FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2146 : : uint32 wait_event_info)
2147 : : {
2148 : : ssize_t returnCode;
2149 : : Vfd *vfdP;
2150 : :
9719 tgl@sss.pgh.pa.us 2151 [ + - + - :CBC 409475 : Assert(FileIsValid(file));
- + ]
2152 : :
2153 : : DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2154 : : file, VfdCache[file].fileName,
2155 : : (int64) offset,
2156 : : iovcnt));
2157 : :
7870 2158 : 409475 : returnCode = FileAccess(file);
2159 [ - + ]: 409475 : if (returnCode < 0)
7870 tgl@sss.pgh.pa.us 2160 :UBC 0 : return returnCode;
2161 : :
3221 tgl@sss.pgh.pa.us 2162 :CBC 409475 : vfdP = &VfdCache[file];
2163 : :
7321 2164 : 409475 : retry:
3196 rhaas@postgresql.org 2165 : 409475 : pgstat_report_wait_start(wait_event_info);
736 tmunro@postgresql.or 2166 : 409475 : returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
3196 rhaas@postgresql.org 2167 : 409475 : pgstat_report_wait_end();
2168 : :
2597 tmunro@postgresql.or 2169 [ - + ]: 409475 : if (returnCode < 0)
2170 : : {
2171 : : /*
2172 : : * Windows may run out of kernel buffers and return "Insufficient
2173 : : * system resources" error. Wait a bit and retry to solve it.
2174 : : *
2175 : : * It is rumored that EINTR is also possible on some Unix filesystems,
2176 : : * in which case immediate retry is indicated.
2177 : : */
2178 : : #ifdef WIN32
2179 : : DWORD error = GetLastError();
2180 : :
2181 : : switch (error)
2182 : : {
2183 : : case ERROR_NO_SYSTEM_RESOURCES:
2184 : : pg_usleep(1000L);
2185 : : errno = EINTR;
2186 : : break;
2187 : : default:
2188 : : _dosmaperr(error);
2189 : : break;
2190 : : }
2191 : : #endif
2192 : : /* OK to retry if interrupted */
7321 tgl@sss.pgh.pa.us 2193 [ # # ]:UBC 0 : if (errno == EINTR)
2194 : 0 : goto retry;
2195 : : }
2196 : :
10328 bruce@momjian.us 2197 :CBC 409475 : return returnCode;
2198 : : }
2199 : :
2200 : : int
263 andres@anarazel.de 2201 : 1320622 : FileStartReadV(PgAioHandle *ioh, File file,
2202 : : int iovcnt, pgoff_t offset,
2203 : : uint32 wait_event_info)
2204 : : {
2205 : : int returnCode;
2206 : : Vfd *vfdP;
2207 : :
2208 [ + - + - : 1320622 : Assert(FileIsValid(file));
- + ]
2209 : :
2210 : : DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2211 : : file, VfdCache[file].fileName,
2212 : : (int64) offset,
2213 : : iovcnt));
2214 : :
2215 : 1320622 : returnCode = FileAccess(file);
2216 [ - + ]: 1320622 : if (returnCode < 0)
263 andres@anarazel.de 2217 :UBC 0 : return returnCode;
2218 : :
263 andres@anarazel.de 2219 :CBC 1320622 : vfdP = &VfdCache[file];
2220 : :
2221 : 1320622 : pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2222 : :
2223 : 1320622 : return 0;
2224 : : }
2225 : :
2226 : : ssize_t
34 michael@paquier.xyz 2227 :GNC 762373 : FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2228 : : uint32 wait_event_info)
2229 : : {
2230 : : ssize_t returnCode;
2231 : : Vfd *vfdP;
2232 : :
9719 tgl@sss.pgh.pa.us 2233 [ + - + - :CBC 762373 : Assert(FileIsValid(file));
- + ]
2234 : :
2235 : : DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2236 : : file, VfdCache[file].fileName,
2237 : : (int64) offset,
2238 : : iovcnt));
2239 : :
7870 2240 : 762373 : returnCode = FileAccess(file);
2241 [ - + ]: 762373 : if (returnCode < 0)
7870 tgl@sss.pgh.pa.us 2242 :UBC 0 : return returnCode;
2243 : :
3221 tgl@sss.pgh.pa.us 2244 :CBC 762373 : vfdP = &VfdCache[file];
2245 : :
2246 : : /*
2247 : : * If enforcing temp_file_limit and it's a temp file, check to see if the
2248 : : * write would overrun temp_file_limit, and throw error if so. Note: it's
2249 : : * really a modularity violation to throw error here; we should set errno
2250 : : * and return -1. However, there's no way to report a suitable error
2251 : : * message if we do that. All current callers would just throw error
2252 : : * immediately anyway, so this is safe at present.
2253 : : */
2938 andres@anarazel.de 2254 [ + - - - ]: 762373 : if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2255 : : {
34 michael@paquier.xyz 2256 :UNC 0 : pgoff_t past_write = offset;
2257 : :
736 tmunro@postgresql.or 2258 [ # # ]:UBC 0 : for (int i = 0; i < iovcnt; ++i)
2259 : 0 : past_write += iov[i].iov_len;
2260 : :
2597 2261 [ # # ]: 0 : if (past_write > vfdP->fileSize)
2262 : : {
4938 bruce@momjian.us 2263 : 0 : uint64 newTotal = temporary_files_size;
2264 : :
2597 tmunro@postgresql.or 2265 : 0 : newTotal += past_write - vfdP->fileSize;
5267 tgl@sss.pgh.pa.us 2266 [ # # ]: 0 : if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2267 [ # # ]: 0 : ereport(ERROR,
2268 : : (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2269 : : errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2270 : : temp_file_limit)));
2271 : : }
2272 : : }
2273 : :
7321 tgl@sss.pgh.pa.us 2274 :CBC 762373 : retry:
3196 rhaas@postgresql.org 2275 : 762373 : pgstat_report_wait_start(wait_event_info);
736 tmunro@postgresql.or 2276 : 762373 : returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
3196 rhaas@postgresql.org 2277 : 762373 : pgstat_report_wait_end();
2278 : :
7321 tgl@sss.pgh.pa.us 2279 [ + - ]: 762373 : if (returnCode >= 0)
2280 : : {
2281 : : /*
2282 : : * Some callers expect short writes to set errno, and traditionally we
2283 : : * have assumed that they imply disk space shortage. We don't want to
2284 : : * waste CPU cycles adding up the total size here, so we'll just set
2285 : : * it for all successful writes in case such a caller determines that
2286 : : * the write was short and ereports "%m".
2287 : : */
736 tmunro@postgresql.or 2288 : 762373 : errno = ENOSPC;
2289 : :
2290 : : /*
2291 : : * Maintain fileSize and temporary_files_size if it's a temp file.
2292 : : */
2938 andres@anarazel.de 2293 [ + + ]: 762373 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2294 : : {
34 michael@paquier.xyz 2295 :GNC 56398 : pgoff_t past_write = offset + returnCode;
2296 : :
2597 tmunro@postgresql.or 2297 [ + + ]:CBC 56398 : if (past_write > vfdP->fileSize)
2298 : : {
2299 : 39708 : temporary_files_size += past_write - vfdP->fileSize;
2300 : 39708 : vfdP->fileSize = past_write;
2301 : : }
2302 : : }
2303 : : }
2304 : : else
2305 : : {
2306 : : /*
2307 : : * See comments in FileReadV()
2308 : : */
2309 : : #ifdef WIN32
2310 : : DWORD error = GetLastError();
2311 : :
2312 : : switch (error)
2313 : : {
2314 : : case ERROR_NO_SYSTEM_RESOURCES:
2315 : : pg_usleep(1000L);
2316 : : errno = EINTR;
2317 : : break;
2318 : : default:
2319 : : _dosmaperr(error);
2320 : : break;
2321 : : }
2322 : : #endif
2323 : : /* OK to retry if interrupted */
7321 tgl@sss.pgh.pa.us 2324 [ # # ]:UBC 0 : if (errno == EINTR)
2325 : 0 : goto retry;
2326 : : }
2327 : :
10328 bruce@momjian.us 2328 :CBC 762373 : return returnCode;
2329 : : }
2330 : :
2331 : : int
3196 rhaas@postgresql.org 2332 : 296 : FileSync(File file, uint32 wait_event_info)
2333 : : {
2334 : : int returnCode;
2335 : :
7870 tgl@sss.pgh.pa.us 2336 [ + - + - : 296 : Assert(FileIsValid(file));
- + ]
2337 : :
2338 : : DO_DB(elog(LOG, "FileSync: %d (%s)",
2339 : : file, VfdCache[file].fileName));
2340 : :
2341 : 296 : returnCode = FileAccess(file);
2342 [ - + ]: 296 : if (returnCode < 0)
7870 tgl@sss.pgh.pa.us 2343 :UBC 0 : return returnCode;
2344 : :
3196 rhaas@postgresql.org 2345 :CBC 296 : pgstat_report_wait_start(wait_event_info);
2346 : 296 : returnCode = pg_fsync(VfdCache[file].fd);
2347 : 296 : pgstat_report_wait_end();
2348 : :
2349 : 296 : return returnCode;
2350 : : }
2351 : :
2352 : : /*
2353 : : * Zero a region of the file.
2354 : : *
2355 : : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2356 : : * appropriate error.
2357 : : */
2358 : : int
34 michael@paquier.xyz 2359 :GNC 212400 : FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2360 : : {
2361 : : int returnCode;
2362 : : ssize_t written;
2363 : :
987 andres@anarazel.de 2364 [ + - + - :CBC 212400 : Assert(FileIsValid(file));
- + ]
2365 : :
2366 : : DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2367 : : file, VfdCache[file].fileName,
2368 : : (int64) offset, (int64) amount));
2369 : :
2370 : 212400 : returnCode = FileAccess(file);
2371 [ - + ]: 212400 : if (returnCode < 0)
987 andres@anarazel.de 2372 :UBC 0 : return returnCode;
2373 : :
987 andres@anarazel.de 2374 :CBC 212400 : pgstat_report_wait_start(wait_event_info);
2375 : 212400 : written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2376 : 212400 : pgstat_report_wait_end();
2377 : :
2378 [ - + ]: 212400 : if (written < 0)
987 andres@anarazel.de 2379 :UBC 0 : return -1;
987 andres@anarazel.de 2380 [ - + ]:CBC 212400 : else if (written != amount)
2381 : : {
2382 : : /* if errno is unset, assume problem is no disk space */
987 andres@anarazel.de 2383 [ # # ]:UBC 0 : if (errno == 0)
2384 : 0 : errno = ENOSPC;
2385 : 0 : return -1;
2386 : : }
2387 : :
987 andres@anarazel.de 2388 :CBC 212400 : return 0;
2389 : : }
2390 : :
2391 : : /*
2392 : : * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2393 : : * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2394 : : * use FileZero() instead.
2395 : : *
2396 : : * Note that at least glibc() implements posix_fallocate() in userspace if not
2397 : : * implemented by the filesystem. That's not the case for all environments
2398 : : * though.
2399 : : *
2400 : : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2401 : : * appropriate error.
2402 : : */
2403 : : int
34 michael@paquier.xyz 2404 :GNC 525 : FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2405 : : {
2406 : : #ifdef HAVE_POSIX_FALLOCATE
2407 : : int returnCode;
2408 : :
987 andres@anarazel.de 2409 [ + - + - :CBC 525 : Assert(FileIsValid(file));
- + ]
2410 : :
2411 : : DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2412 : : file, VfdCache[file].fileName,
2413 : : (int64) offset, (int64) amount));
2414 : :
2415 : 525 : returnCode = FileAccess(file);
2416 [ + - ]: 525 : if (returnCode < 0)
987 andres@anarazel.de 2417 :UBC 0 : return -1;
2418 : :
912 andres@anarazel.de 2419 :CBC 525 : retry:
987 2420 : 525 : pgstat_report_wait_start(wait_event_info);
2421 : 525 : returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2422 : 525 : pgstat_report_wait_end();
2423 : :
2424 [ + - ]: 525 : if (returnCode == 0)
2425 : 525 : return 0;
912 andres@anarazel.de 2426 [ # # ]:UBC 0 : else if (returnCode == EINTR)
2427 : 0 : goto retry;
2428 : :
2429 : : /* for compatibility with %m printing etc */
987 2430 : 0 : errno = returnCode;
2431 : :
2432 : : /*
2433 : : * Return in cases of a "real" failure, if fallocate is not supported,
2434 : : * fall through to the FileZero() backed implementation.
2435 : : */
2436 [ # # # # ]: 0 : if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2437 : 0 : return -1;
2438 : : #endif
2439 : :
2440 : 0 : return FileZero(file, offset, amount, wait_event_info);
2441 : : }
2442 : :
2443 : : pgoff_t
2597 tmunro@postgresql.or 2444 :CBC 3294511 : FileSize(File file)
2445 : : {
9719 tgl@sss.pgh.pa.us 2446 [ + - + - : 3294511 : Assert(FileIsValid(file));
- + ]
2447 : :
2448 : : DO_DB(elog(LOG, "FileSize %d (%s)",
2449 : : file, VfdCache[file].fileName));
2450 : :
10328 bruce@momjian.us 2451 [ + + ]: 3294511 : if (FileIsNotOpen(file))
2452 : : {
2597 tmunro@postgresql.or 2453 [ - + ]: 19 : if (FileAccess(file) < 0)
34 michael@paquier.xyz 2454 :UNC 0 : return (pgoff_t) -1;
2455 : : }
2456 : :
2597 tmunro@postgresql.or 2457 :CBC 3294511 : return lseek(VfdCache[file].fd, 0, SEEK_END);
2458 : : }
2459 : :
2460 : : int
34 michael@paquier.xyz 2461 :GNC 561 : FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
2462 : : {
2463 : : int returnCode;
2464 : :
9719 tgl@sss.pgh.pa.us 2465 [ + - + - :CBC 561 : Assert(FileIsValid(file));
- + ]
2466 : :
2467 : : DO_DB(elog(LOG, "FileTruncate %d (%s)",
2468 : : file, VfdCache[file].fileName));
2469 : :
7870 2470 : 561 : returnCode = FileAccess(file);
2471 [ - + ]: 561 : if (returnCode < 0)
7870 tgl@sss.pgh.pa.us 2472 :UBC 0 : return returnCode;
2473 : :
3196 rhaas@postgresql.org 2474 :CBC 561 : pgstat_report_wait_start(wait_event_info);
912 andres@anarazel.de 2475 : 561 : returnCode = pg_ftruncate(VfdCache[file].fd, offset);
3196 rhaas@postgresql.org 2476 : 561 : pgstat_report_wait_end();
2477 : :
5267 tgl@sss.pgh.pa.us 2478 [ + - - + ]: 561 : if (returnCode == 0 && VfdCache[file].fileSize > offset)
2479 : : {
2480 : : /* adjust our state for truncation of a temp file */
2938 andres@anarazel.de 2481 [ # # ]:UBC 0 : Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
5267 tgl@sss.pgh.pa.us 2482 : 0 : temporary_files_size -= VfdCache[file].fileSize - offset;
2483 : 0 : VfdCache[file].fileSize = offset;
2484 : : }
2485 : :
9969 bruce@momjian.us 2486 :CBC 561 : return returnCode;
2487 : : }
2488 : :
2489 : : /*
2490 : : * Return the pathname associated with an open file.
2491 : : *
2492 : : * The returned string points to an internal buffer, which is valid until
2493 : : * the file is closed.
2494 : : */
2495 : : char *
5978 heikki.linnakangas@i 2496 : 22 : FilePathName(File file)
2497 : : {
2498 [ + - + - : 22 : Assert(FileIsValid(file));
- + ]
2499 : :
2500 : 22 : return VfdCache[file].fileName;
2501 : : }
2502 : :
2503 : : /*
2504 : : * Return the raw file descriptor of an opened file.
2505 : : *
2506 : : * The returned file descriptor will be valid until the file is closed, but
2507 : : * there are a lot of things that can make that happen. So the caller should
2508 : : * be careful not to do much of anything else before it finishes using the
2509 : : * returned file descriptor.
2510 : : */
2511 : : int
3571 rhaas@postgresql.org 2512 : 495198 : FileGetRawDesc(File file)
2513 : : {
2514 : : int returnCode;
2515 : :
263 andres@anarazel.de 2516 : 495198 : returnCode = FileAccess(file);
2517 [ - + ]: 495198 : if (returnCode < 0)
263 andres@anarazel.de 2518 :UBC 0 : return returnCode;
2519 : :
3571 rhaas@postgresql.org 2520 [ + - + - :CBC 495198 : Assert(FileIsValid(file));
- + ]
2521 : 495198 : return VfdCache[file].fd;
2522 : : }
2523 : :
2524 : : /*
2525 : : * FileGetRawFlags - returns the file flags on open(2)
2526 : : */
2527 : : int
3571 rhaas@postgresql.org 2528 :UBC 0 : FileGetRawFlags(File file)
2529 : : {
2530 [ # # # # : 0 : Assert(FileIsValid(file));
# # ]
2531 : 0 : return VfdCache[file].fileFlags;
2532 : : }
2533 : :
2534 : : /*
2535 : : * FileGetRawMode - returns the mode bitmask passed to open(2)
2536 : : */
2537 : : mode_t
2538 : 0 : FileGetRawMode(File file)
2539 : : {
2540 [ # # # # : 0 : Assert(FileIsValid(file));
# # ]
2541 : 0 : return VfdCache[file].fileMode;
2542 : : }
2543 : :
2544 : : /*
2545 : : * Make room for another allocatedDescs[] array entry if needed and possible.
2546 : : * Returns true if an array element is available.
2547 : : */
2548 : : static bool
4574 tgl@sss.pgh.pa.us 2549 :CBC 522767 : reserveAllocatedDesc(void)
2550 : : {
2551 : : AllocateDesc *newDescs;
2552 : : int newMax;
2553 : :
2554 : : /* Quick out if array already has a free slot. */
2555 [ + + ]: 522767 : if (numAllocatedDescs < maxAllocatedDescs)
2556 : 521684 : return true;
2557 : :
2558 : : /*
2559 : : * If the array hasn't yet been created in the current process, initialize
2560 : : * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2561 : : * we will ever need, anyway. We don't want to look at max_safe_fds
2562 : : * immediately because set_max_safe_fds() may not have run yet.
2563 : : */
2564 [ + - ]: 1083 : if (allocatedDescs == NULL)
2565 : : {
2123 2566 : 1083 : newMax = FD_MINFREE / 3;
4574 2567 : 1083 : newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2568 : : /* Out of memory already? Treat as fatal error. */
2569 [ - + ]: 1083 : if (newDescs == NULL)
4574 tgl@sss.pgh.pa.us 2570 [ # # ]:UBC 0 : ereport(ERROR,
2571 : : (errcode(ERRCODE_OUT_OF_MEMORY),
2572 : : errmsg("out of memory")));
4574 tgl@sss.pgh.pa.us 2573 :CBC 1083 : allocatedDescs = newDescs;
2574 : 1083 : maxAllocatedDescs = newMax;
2575 : 1083 : return true;
2576 : : }
2577 : :
2578 : : /*
2579 : : * Consider enlarging the array beyond the initial allocation used above.
2580 : : * By the time this happens, max_safe_fds should be known accurately.
2581 : : *
2582 : : * We mustn't let allocated descriptors hog all the available FDs, and in
2583 : : * practice we'd better leave a reasonable number of FDs for VFD use. So
2584 : : * set the maximum to max_safe_fds / 3. (This should certainly be at
2585 : : * least as large as the initial size, FD_MINFREE / 3, so we aren't
2586 : : * tightening the restriction here.) Recall that "external" FDs are
2587 : : * allowed to consume another third of max_safe_fds.
2588 : : */
2123 tgl@sss.pgh.pa.us 2589 :UBC 0 : newMax = max_safe_fds / 3;
4574 2590 [ # # ]: 0 : if (newMax > maxAllocatedDescs)
2591 : : {
2592 : 0 : newDescs = (AllocateDesc *) realloc(allocatedDescs,
2593 : : newMax * sizeof(AllocateDesc));
2594 : : /* Treat out-of-memory as a non-fatal error. */
2595 [ # # ]: 0 : if (newDescs == NULL)
2596 : 0 : return false;
2597 : 0 : allocatedDescs = newDescs;
2598 : 0 : maxAllocatedDescs = newMax;
2599 : 0 : return true;
2600 : : }
2601 : :
2602 : : /* Can't enlarge allocatedDescs[] any more. */
2603 : 0 : return false;
2604 : : }
2605 : :
2606 : : /*
2607 : : * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2608 : : * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2609 : : * necessary to open the file. When done, call FreeFile rather than fclose.
2610 : : *
2611 : : * Note that files that will be open for any significant length of time
2612 : : * should NOT be handled this way, since they cannot share kernel file
2613 : : * descriptors with other files; there is grave risk of running out of FDs
2614 : : * if anyone locks down too many FDs. Most callers of this routine are
2615 : : * simply reading a config file that they will read and close immediately.
2616 : : *
2617 : : * fd.c will automatically close all files opened with AllocateFile at
2618 : : * transaction commit or abort; this prevents FD leakage if a routine
2619 : : * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2620 : : *
2621 : : * Ideally this should be the *only* direct call of fopen() in the backend.
2622 : : */
2623 : : FILE *
7228 tgl@sss.pgh.pa.us 2624 :CBC 79756 : AllocateFile(const char *name, const char *mode)
2625 : : {
2626 : : FILE *file;
2627 : :
2628 : : DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2629 : : numAllocatedDescs, name));
2630 : :
2631 : : /* Can we allocate another non-virtual FD? */
4574 2632 [ - + ]: 79756 : if (!reserveAllocatedDesc())
4574 tgl@sss.pgh.pa.us 2633 [ # # ]:UBC 0 : ereport(ERROR,
2634 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2635 : : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2636 : : maxAllocatedDescs, name)));
2637 : :
2638 : : /* Close excess kernel FDs. */
4574 tgl@sss.pgh.pa.us 2639 :CBC 79756 : ReleaseLruFiles();
2640 : :
10348 bruce@momjian.us 2641 : 79756 : TryAgain:
9243 tgl@sss.pgh.pa.us 2642 [ + + ]: 79756 : if ((file = fopen(name, mode)) != NULL)
2643 : : {
7812 2644 : 73142 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2645 : :
2646 : 73142 : desc->kind = AllocateDescFile;
2647 : 73142 : desc->desc.file = file;
7762 2648 : 73142 : desc->create_subid = GetCurrentSubTransactionId();
7812 2649 : 73142 : numAllocatedDescs++;
2650 : 73142 : return desc->desc.file;
2651 : : }
2652 : :
9243 2653 [ + - - + ]: 6614 : if (errno == EMFILE || errno == ENFILE)
2654 : : {
9036 bruce@momjian.us 2655 :UBC 0 : int save_errno = errno;
2656 : :
8182 tgl@sss.pgh.pa.us 2657 [ # # ]: 0 : ereport(LOG,
2658 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2659 : : errmsg("out of file descriptors: %m; release and retry")));
9243 2660 : 0 : errno = 0;
2661 [ # # ]: 0 : if (ReleaseLruFile())
10328 bruce@momjian.us 2662 : 0 : goto TryAgain;
9243 tgl@sss.pgh.pa.us 2663 : 0 : errno = save_errno;
2664 : : }
2665 : :
9243 tgl@sss.pgh.pa.us 2666 :CBC 6614 : return NULL;
2667 : : }
2668 : :
2669 : : /*
2670 : : * Open a file with OpenTransientFilePerm() and pass default file mode for
2671 : : * the fileMode parameter.
2672 : : */
2673 : : int
3007 peter_e@gmx.net 2674 : 399603 : OpenTransientFile(const char *fileName, int fileFlags)
2675 : : {
2811 sfrost@snowman.net 2676 : 399603 : return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2677 : : }
2678 : :
2679 : : /*
2680 : : * Like AllocateFile, but returns an unbuffered fd like open(2)
2681 : : */
2682 : : int
3007 peter_e@gmx.net 2683 : 399609 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2684 : : {
2685 : : int fd;
2686 : :
2687 : : DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2688 : : numAllocatedDescs, fileName));
2689 : :
2690 : : /* Can we allocate another non-virtual FD? */
4574 tgl@sss.pgh.pa.us 2691 [ - + ]: 399609 : if (!reserveAllocatedDesc())
4574 tgl@sss.pgh.pa.us 2692 [ # # ]:UBC 0 : ereport(ERROR,
2693 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2694 : : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2695 : : maxAllocatedDescs, fileName)));
2696 : :
2697 : : /* Close excess kernel FDs. */
4574 tgl@sss.pgh.pa.us 2698 :CBC 399609 : ReleaseLruFiles();
2699 : :
3007 peter_e@gmx.net 2700 : 399609 : fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2701 : :
4768 heikki.linnakangas@i 2702 [ + + ]: 399609 : if (fd >= 0)
2703 : : {
2704 : 394751 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2705 : :
2706 : 394751 : desc->kind = AllocateDescRawFD;
2707 : 394751 : desc->desc.fd = fd;
2708 : 394751 : desc->create_subid = GetCurrentSubTransactionId();
2709 : 394751 : numAllocatedDescs++;
2710 : :
2711 : 394751 : return fd;
2712 : : }
2713 : :
2714 : 4858 : return -1; /* failure */
2715 : : }
2716 : :
2717 : : /*
2718 : : * Routines that want to initiate a pipe stream should use OpenPipeStream
2719 : : * rather than plain popen(). This lets fd.c deal with freeing FDs if
2720 : : * necessary. When done, call ClosePipeStream rather than pclose.
2721 : : *
2722 : : * This function also ensures that the popen'd program is run with default
2723 : : * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2724 : : * uses. This ensures desirable response to, eg, closing a read pipe early.
2725 : : */
2726 : : FILE *
4676 2727 : 61 : OpenPipeStream(const char *command, const char *mode)
2728 : : {
2729 : : FILE *file;
2730 : : int save_errno;
2731 : :
2732 : : DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2733 : : numAllocatedDescs, command));
2734 : :
2735 : : /* Can we allocate another non-virtual FD? */
4574 tgl@sss.pgh.pa.us 2736 [ - + ]: 61 : if (!reserveAllocatedDesc())
4574 tgl@sss.pgh.pa.us 2737 [ # # ]:UBC 0 : ereport(ERROR,
2738 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2739 : : errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2740 : : maxAllocatedDescs, command)));
2741 : :
2742 : : /* Close excess kernel FDs. */
4574 tgl@sss.pgh.pa.us 2743 :CBC 61 : ReleaseLruFiles();
2744 : :
4676 heikki.linnakangas@i 2745 : 61 : TryAgain:
1206 tgl@sss.pgh.pa.us 2746 : 61 : fflush(NULL);
2585 2747 : 61 : pqsignal(SIGPIPE, SIG_DFL);
4676 heikki.linnakangas@i 2748 : 61 : errno = 0;
2585 tgl@sss.pgh.pa.us 2749 : 61 : file = popen(command, mode);
2750 : 61 : save_errno = errno;
2751 : 61 : pqsignal(SIGPIPE, SIG_IGN);
2752 : 61 : errno = save_errno;
2753 [ + - ]: 61 : if (file != NULL)
2754 : : {
4676 heikki.linnakangas@i 2755 : 61 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2756 : :
2757 : 61 : desc->kind = AllocateDescPipe;
2758 : 61 : desc->desc.file = file;
2759 : 61 : desc->create_subid = GetCurrentSubTransactionId();
2760 : 61 : numAllocatedDescs++;
2761 : 61 : return desc->desc.file;
2762 : : }
2763 : :
4676 heikki.linnakangas@i 2764 [ # # # # ]:UBC 0 : if (errno == EMFILE || errno == ENFILE)
2765 : : {
2766 [ # # ]: 0 : ereport(LOG,
2767 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2768 : : errmsg("out of file descriptors: %m; release and retry")));
2769 [ # # ]: 0 : if (ReleaseLruFile())
2770 : 0 : goto TryAgain;
2771 : 0 : errno = save_errno;
2772 : : }
2773 : :
2774 : 0 : return NULL;
2775 : : }
2776 : :
2777 : : /*
2778 : : * Free an AllocateDesc of any type.
2779 : : *
2780 : : * The argument *must* point into the allocatedDescs[] array.
2781 : : */
2782 : : static int
7812 tgl@sss.pgh.pa.us 2783 :CBC 510426 : FreeDesc(AllocateDesc *desc)
2784 : : {
2785 : : int result;
2786 : :
2787 : : /* Close the underlying object */
2788 [ + + + + : 510426 : switch (desc->kind)
- ]
2789 : : {
2790 : 73142 : case AllocateDescFile:
2791 : 73142 : result = fclose(desc->desc.file);
2792 : 73142 : break;
4676 heikki.linnakangas@i 2793 : 61 : case AllocateDescPipe:
2794 : 61 : result = pclose(desc->desc.file);
2795 : 61 : break;
7812 tgl@sss.pgh.pa.us 2796 : 42472 : case AllocateDescDir:
2797 : 42472 : result = closedir(desc->desc.dir);
2798 : 42472 : break;
4768 heikki.linnakangas@i 2799 : 394751 : case AllocateDescRawFD:
263 andres@anarazel.de 2800 : 394751 : pgaio_closing_fd(desc->desc.fd);
4768 heikki.linnakangas@i 2801 : 394751 : result = close(desc->desc.fd);
2802 : 394751 : break;
7812 tgl@sss.pgh.pa.us 2803 :UBC 0 : default:
2804 [ # # ]: 0 : elog(ERROR, "AllocateDesc kind not recognized");
2805 : : result = 0; /* keep compiler quiet */
2806 : : break;
2807 : : }
2808 : :
2809 : : /* Compact storage in the allocatedDescs array */
7812 tgl@sss.pgh.pa.us 2810 :CBC 510426 : numAllocatedDescs--;
2811 : 510426 : *desc = allocatedDescs[numAllocatedDescs];
2812 : :
2813 : 510426 : return result;
2814 : : }
2815 : :
2816 : : /*
2817 : : * Close a file returned by AllocateFile.
2818 : : *
2819 : : * Note we do not check fclose's return value --- it is up to the caller
2820 : : * to handle close errors.
2821 : : */
2822 : : int
10327 bruce@momjian.us 2823 : 73126 : FreeFile(FILE *file)
2824 : : {
2825 : : int i;
2826 : :
2827 : : DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2828 : :
2829 : : /* Remove file from list of allocated files, if it's present */
7812 tgl@sss.pgh.pa.us 2830 [ + - ]: 73129 : for (i = numAllocatedDescs; --i >= 0;)
2831 : : {
2832 : 73129 : AllocateDesc *desc = &allocatedDescs[i];
2833 : :
2834 [ + - + + ]: 73129 : if (desc->kind == AllocateDescFile && desc->desc.file == file)
2835 : 73126 : return FreeDesc(desc);
2836 : : }
2837 : :
2838 : : /* Only get here if someone passes us a file not in allocatedDescs */
7812 tgl@sss.pgh.pa.us 2839 [ # # ]:UBC 0 : elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2840 : :
7996 2841 : 0 : return fclose(file);
2842 : : }
2843 : :
2844 : : /*
2845 : : * Close a file returned by OpenTransientFile.
2846 : : *
2847 : : * Note we do not check close's return value --- it is up to the caller
2848 : : * to handle close errors.
2849 : : */
2850 : : int
4768 heikki.linnakangas@i 2851 :CBC 394749 : CloseTransientFile(int fd)
2852 : : {
2853 : : int i;
2854 : :
2855 : : DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2856 : :
2857 : : /* Remove fd from list of allocated files, if it's present */
2858 [ + - ]: 394749 : for (i = numAllocatedDescs; --i >= 0;)
2859 : : {
2860 : 394749 : AllocateDesc *desc = &allocatedDescs[i];
2861 : :
2862 [ + - + - ]: 394749 : if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2863 : 394749 : return FreeDesc(desc);
2864 : : }
2865 : :
2866 : : /* Only get here if someone passes us a file not in allocatedDescs */
4768 heikki.linnakangas@i 2867 [ # # ]:UBC 0 : elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2868 : :
263 andres@anarazel.de 2869 : 0 : pgaio_closing_fd(fd);
2870 : :
4768 heikki.linnakangas@i 2871 : 0 : return close(fd);
2872 : : }
2873 : :
2874 : : /*
2875 : : * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2876 : : * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2877 : : * necessary to open the directory, and with closing it after an elog.
2878 : : * When done, call FreeDir rather than closedir.
2879 : : *
2880 : : * Returns NULL, with errno set, on failure. Note that failure detection
2881 : : * is commonly left to the following call of ReadDir or ReadDirExtended;
2882 : : * see the comments for ReadDir.
2883 : : *
2884 : : * Ideally this should be the *only* direct call of opendir() in the backend.
2885 : : */
2886 : : DIR *
7968 tgl@sss.pgh.pa.us 2887 :CBC 43341 : AllocateDir(const char *dirname)
2888 : : {
2889 : : DIR *dir;
2890 : :
2891 : : DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2892 : : numAllocatedDescs, dirname));
2893 : :
2894 : : /* Can we allocate another non-virtual FD? */
4574 2895 [ - + ]: 43341 : if (!reserveAllocatedDesc())
4574 tgl@sss.pgh.pa.us 2896 [ # # ]:UBC 0 : ereport(ERROR,
2897 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2898 : : errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2899 : : maxAllocatedDescs, dirname)));
2900 : :
2901 : : /* Close excess kernel FDs. */
4574 tgl@sss.pgh.pa.us 2902 :CBC 43341 : ReleaseLruFiles();
2903 : :
7968 2904 : 43341 : TryAgain:
2905 [ + + ]: 43341 : if ((dir = opendir(dirname)) != NULL)
2906 : : {
7812 2907 : 42472 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2908 : :
2909 : 42472 : desc->kind = AllocateDescDir;
2910 : 42472 : desc->desc.dir = dir;
7762 2911 : 42472 : desc->create_subid = GetCurrentSubTransactionId();
7812 2912 : 42472 : numAllocatedDescs++;
2913 : 42472 : return desc->desc.dir;
2914 : : }
2915 : :
7968 2916 [ + - - + ]: 869 : if (errno == EMFILE || errno == ENFILE)
2917 : : {
7968 tgl@sss.pgh.pa.us 2918 :UBC 0 : int save_errno = errno;
2919 : :
2920 [ # # ]: 0 : ereport(LOG,
2921 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2922 : : errmsg("out of file descriptors: %m; release and retry")));
2923 : 0 : errno = 0;
2924 [ # # ]: 0 : if (ReleaseLruFile())
2925 : 0 : goto TryAgain;
2926 : 0 : errno = save_errno;
2927 : : }
2928 : :
7968 tgl@sss.pgh.pa.us 2929 :CBC 869 : return NULL;
2930 : : }
2931 : :
2932 : : /*
2933 : : * Read a directory opened with AllocateDir, ereport'ing any error.
2934 : : *
2935 : : * This is easier to use than raw readdir() since it takes care of some
2936 : : * otherwise rather tedious and error-prone manipulation of errno. Also,
2937 : : * if you are happy with a generic error message for AllocateDir failure,
2938 : : * you can just do
2939 : : *
2940 : : * dir = AllocateDir(path);
2941 : : * while ((dirent = ReadDir(dir, path)) != NULL)
2942 : : * process dirent;
2943 : : * FreeDir(dir);
2944 : : *
2945 : : * since a NULL dir parameter is taken as indicating AllocateDir failed.
2946 : : * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2947 : : * use this shortcut.)
2948 : : *
2949 : : * The pathname passed to AllocateDir must be passed to this routine too,
2950 : : * but it is only used for error reporting.
2951 : : */
2952 : : struct dirent *
7486 2953 : 1238815 : ReadDir(DIR *dir, const char *dirname)
2954 : : {
3856 2955 : 1238815 : return ReadDirExtended(dir, dirname, ERROR);
2956 : : }
2957 : :
2958 : : /*
2959 : : * Alternate version of ReadDir that allows caller to specify the elevel
2960 : : * for any error report (whether it's reporting an initial failure of
2961 : : * AllocateDir or a subsequent directory read failure).
2962 : : *
2963 : : * If elevel < ERROR, returns NULL after any error. With the normal coding
2964 : : * pattern, this will result in falling out of the loop immediately as
2965 : : * though the directory contained no (more) entries.
2966 : : */
2967 : : struct dirent *
2968 : 2424924 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2969 : : {
2970 : : struct dirent *dent;
2971 : :
2972 : : /* Give a generic message for AllocateDir failure, if caller didn't */
7486 2973 [ + + ]: 2424924 : if (dir == NULL)
2974 : : {
3856 2975 [ + - ]: 3 : ereport(elevel,
2976 : : (errcode_for_file_access(),
2977 : : errmsg("could not open directory \"%s\": %m",
2978 : : dirname)));
3856 tgl@sss.pgh.pa.us 2979 :UBC 0 : return NULL;
2980 : : }
2981 : :
7486 tgl@sss.pgh.pa.us 2982 :CBC 2424921 : errno = 0;
2983 [ + + ]: 2424921 : if ((dent = readdir(dir)) != NULL)
2984 : 2393480 : return dent;
2985 : :
2986 [ - + ]: 31441 : if (errno)
3856 tgl@sss.pgh.pa.us 2987 [ # # ]:UBC 0 : ereport(elevel,
2988 : : (errcode_for_file_access(),
2989 : : errmsg("could not read directory \"%s\": %m",
2990 : : dirname)));
7486 tgl@sss.pgh.pa.us 2991 :CBC 31441 : return NULL;
2992 : : }
2993 : :
2994 : : /*
2995 : : * Close a directory opened with AllocateDir.
2996 : : *
2997 : : * Returns closedir's return value (with errno set if it's not 0).
2998 : : * Note we do not check the return value --- it is up to the caller
2999 : : * to handle close errors if wanted.
3000 : : *
3001 : : * Does nothing if dir == NULL; we assume that directory open failure was
3002 : : * already reported if desired.
3003 : : */
3004 : : int
7968 3005 : 42353 : FreeDir(DIR *dir)
3006 : : {
3007 : : int i;
3008 : :
3009 : : /* Nothing to do if AllocateDir failed */
2935 3010 [ - + ]: 42353 : if (dir == NULL)
2935 tgl@sss.pgh.pa.us 3011 :UBC 0 : return 0;
3012 : :
3013 : : DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3014 : :
3015 : : /* Remove dir from list of allocated dirs, if it's present */
7812 tgl@sss.pgh.pa.us 3016 [ + - ]:CBC 42353 : for (i = numAllocatedDescs; --i >= 0;)
3017 : : {
3018 : 42353 : AllocateDesc *desc = &allocatedDescs[i];
3019 : :
3020 [ + - + - ]: 42353 : if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3021 : 42353 : return FreeDesc(desc);
3022 : : }
3023 : :
3024 : : /* Only get here if someone passes us a dir not in allocatedDescs */
7812 tgl@sss.pgh.pa.us 3025 [ # # ]:UBC 0 : elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3026 : :
7968 3027 : 0 : return closedir(dir);
3028 : : }
3029 : :
3030 : :
3031 : : /*
3032 : : * Close a pipe stream returned by OpenPipeStream.
3033 : : */
3034 : : int
4676 heikki.linnakangas@i 3035 :CBC 61 : ClosePipeStream(FILE *file)
3036 : : {
3037 : : int i;
3038 : :
3039 : : DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3040 : :
3041 : : /* Remove file from list of allocated files, if it's present */
3042 [ + - ]: 61 : for (i = numAllocatedDescs; --i >= 0;)
3043 : : {
3044 : 61 : AllocateDesc *desc = &allocatedDescs[i];
3045 : :
3046 [ + - + - ]: 61 : if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3047 : 61 : return FreeDesc(desc);
3048 : : }
3049 : :
3050 : : /* Only get here if someone passes us a file not in allocatedDescs */
4676 heikki.linnakangas@i 3051 [ # # ]:UBC 0 : elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3052 : :
3053 : 0 : return pclose(file);
3054 : : }
3055 : :
3056 : : /*
3057 : : * closeAllVfds
3058 : : *
3059 : : * Force all VFDs into the physically-closed state, so that the fewest
3060 : : * possible number of kernel file descriptors are in use. There is no
3061 : : * change in the logical state of the VFDs.
3062 : : */
3063 : : void
9243 tgl@sss.pgh.pa.us 3064 :CBC 32 : closeAllVfds(void)
3065 : : {
3066 : : Index i;
3067 : :
9719 3068 [ + - ]: 32 : if (SizeVfdCache > 0)
3069 : : {
3101 3070 [ - + ]: 32 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
9719 3071 [ + + ]: 1024 : for (i = 1; i < SizeVfdCache; i++)
3072 : : {
3073 [ + + ]: 992 : if (!FileIsNotOpen(i))
3074 : 136 : LruDelete(i);
3075 : : }
3076 : : }
3077 : 32 : }
3078 : :
3079 : :
3080 : : /*
3081 : : * SetTempTablespaces
3082 : : *
3083 : : * Define a list (actually an array) of OIDs of tablespaces to use for
3084 : : * temporary files. This list will be used until end of transaction,
3085 : : * unless this function is called again before then. It is caller's
3086 : : * responsibility that the passed-in array has adequate lifespan (typically
3087 : : * it'd be allocated in TopTransactionContext).
3088 : : *
3089 : : * Some entries of the array may be InvalidOid, indicating that the current
3090 : : * database's default tablespace should be used.
3091 : : */
3092 : : void
6768 3093 : 3197 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3094 : : {
3095 [ - + ]: 3197 : Assert(numSpaces >= 0);
3096 : 3197 : tempTableSpaces = tableSpaces;
3097 : 3197 : numTempTableSpaces = numSpaces;
3098 : :
3099 : : /*
3100 : : * Select a random starting point in the list. This is to minimize
3101 : : * conflicts between backends that are most likely sharing the same list
3102 : : * of temp tablespaces. Note that if we create multiple temp files in the
3103 : : * same transaction, we'll advance circularly through the list --- this
3104 : : * ensures that large temporary sort files are nicely spread across all
3105 : : * available tablespaces.
3106 : : */
3107 [ - + ]: 3197 : if (numSpaces > 1)
1480 tgl@sss.pgh.pa.us 3108 :UBC 0 : nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
3109 : 0 : 0, numSpaces - 1);
3110 : : else
6768 tgl@sss.pgh.pa.us 3111 :CBC 3197 : nextTempTableSpace = 0;
3112 : 3197 : }
3113 : :
3114 : : /*
3115 : : * TempTablespacesAreSet
3116 : : *
3117 : : * Returns true if SetTempTablespaces has been called in current transaction.
3118 : : * (This is just so that tablespaces.c doesn't need its own per-transaction
3119 : : * state.)
3120 : : */
3121 : : bool
3122 : 4477 : TempTablespacesAreSet(void)
3123 : : {
3124 : 4477 : return (numTempTableSpaces >= 0);
3125 : : }
3126 : :
3127 : : /*
3128 : : * GetTempTablespaces
3129 : : *
3130 : : * Populate an array with the OIDs of the tablespaces that should be used for
3131 : : * temporary files. (Some entries may be InvalidOid, indicating that the
3132 : : * current database's default tablespace should be used.) At most numSpaces
3133 : : * entries will be filled.
3134 : : * Returns the number of OIDs that were copied into the output array.
3135 : : */
3136 : : int
2938 andres@anarazel.de 3137 : 194 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3138 : : {
3139 : : int i;
3140 : :
3141 [ - + ]: 194 : Assert(TempTablespacesAreSet());
3142 [ - + - - ]: 194 : for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2938 andres@anarazel.de 3143 :UBC 0 : tableSpaces[i] = tempTableSpaces[i];
3144 : :
2938 andres@anarazel.de 3145 :CBC 194 : return i;
3146 : : }
3147 : :
3148 : : /*
3149 : : * GetNextTempTableSpace
3150 : : *
3151 : : * Select the next temp tablespace to use. A result of InvalidOid means
3152 : : * to use the current database's default tablespace.
3153 : : */
3154 : : Oid
6768 tgl@sss.pgh.pa.us 3155 : 2241 : GetNextTempTableSpace(void)
3156 : : {
3157 [ + + ]: 2241 : if (numTempTableSpaces > 0)
3158 : : {
3159 : : /* Advance nextTempTableSpace counter with wraparound */
3160 [ + - ]: 1 : if (++nextTempTableSpace >= numTempTableSpaces)
3161 : 1 : nextTempTableSpace = 0;
3162 : 1 : return tempTableSpaces[nextTempTableSpace];
3163 : : }
3164 : 2240 : return InvalidOid;
3165 : : }
3166 : :
3167 : :
3168 : : /*
3169 : : * AtEOSubXact_Files
3170 : : *
3171 : : * Take care of subtransaction commit/abort. At abort, we close AllocateDescs
3172 : : * that the subtransaction may have opened. At commit, we reassign them to
3173 : : * the parent subtransaction. (Temporary files are tracked by ResourceOwners
3174 : : * instead.)
3175 : : */
3176 : : void
7762 3177 : 9140 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3178 : : SubTransactionId parentSubid)
3179 : : {
3180 : : Index i;
3181 : :
7812 3182 [ - + ]: 9140 : for (i = 0; i < numAllocatedDescs; i++)
3183 : : {
7762 tgl@sss.pgh.pa.us 3184 [ # # ]:UBC 0 : if (allocatedDescs[i].create_subid == mySubid)
3185 : : {
7812 3186 [ # # ]: 0 : if (isCommit)
7762 3187 : 0 : allocatedDescs[i].create_subid = parentSubid;
3188 : : else
3189 : : {
3190 : : /* have to recheck the item after FreeDesc (ugly) */
7812 3191 : 0 : FreeDesc(&allocatedDescs[i--]);
3192 : : }
3193 : : }
3194 : : }
7812 tgl@sss.pgh.pa.us 3195 :CBC 9140 : }
3196 : :
3197 : : /*
3198 : : * AtEOXact_Files
3199 : : *
3200 : : * This routine is called during transaction commit or abort. All still-open
3201 : : * per-transaction temporary file VFDs are closed, which also causes the
3202 : : * underlying files to be deleted (although they should've been closed already
3203 : : * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3204 : : * closed. We also forget any transaction-local temp tablespace list.
3205 : : *
3206 : : * The isCommit flag is used only to decide whether to emit warnings about
3207 : : * unclosed files.
3208 : : */
3209 : : void
2790 3210 : 331860 : AtEOXact_Files(bool isCommit)
3211 : : {
3212 : 331860 : CleanupTempFiles(isCommit, false);
6768 3213 : 331860 : tempTableSpaces = NULL;
3214 : 331860 : numTempTableSpaces = -1;
8268 3215 : 331860 : }
3216 : :
3217 : : /*
3218 : : * BeforeShmemExit_Files
3219 : : *
3220 : : * before_shmem_exit hook to clean up temp files during backend shutdown.
3221 : : * Here, we want to clean up *all* temp files including interXact ones.
3222 : : */
3223 : : static void
1593 andres@anarazel.de 3224 : 19580 : BeforeShmemExit_Files(int code, Datum arg)
3225 : : {
2790 tgl@sss.pgh.pa.us 3226 : 19580 : CleanupTempFiles(false, true);
3227 : :
3228 : : /* prevent further temp files from being created */
3229 : : #ifdef USE_ASSERT_CHECKING
1593 andres@anarazel.de 3230 : 19580 : temporary_files_allowed = false;
3231 : : #endif
8268 tgl@sss.pgh.pa.us 3232 : 19580 : }
3233 : :
3234 : : /*
3235 : : * Close temporary files and delete their underlying files.
3236 : : *
3237 : : * isCommit: if true, this is normal transaction commit, and we don't
3238 : : * expect any remaining files; warn if there are some.
3239 : : *
3240 : : * isProcExit: if true, this is being called as the backend process is
3241 : : * exiting. If that's the case, we should remove all temporary files; if
3242 : : * that's not the case, we are being called for transaction commit/abort
3243 : : * and should only remove transaction-local temp files. In either case,
3244 : : * also clean up "allocated" stdio files, dirs and fds.
3245 : : */
3246 : : static void
2790 3247 : 351440 : CleanupTempFiles(bool isCommit, bool isProcExit)
3248 : : {
3249 : : Index i;
3250 : :
3251 : : /*
3252 : : * Careful here: at proc_exit we need extra cleanup, not just
3253 : : * xact_temporary files.
3254 : : */
4809 3255 [ + + + + ]: 351440 : if (isProcExit || have_xact_temporary_files)
3256 : : {
3101 3257 [ - + ]: 20375 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
9719 3258 [ + + ]: 1136250 : for (i = 1; i < SizeVfdCache; i++)
3259 : : {
8268 3260 : 1115875 : unsigned short fdstate = VfdCache[i].fdstate;
3261 : :
2938 andres@anarazel.de 3262 [ + + - + ]: 1115875 : if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3263 [ + - ]: 4 : VfdCache[i].fileName != NULL)
3264 : : {
3265 : : /*
3266 : : * If we're in the process of exiting a backend process, close
3267 : : * all temporary files. Otherwise, only close temporary files
3268 : : * local to the current transaction. They should be closed by
3269 : : * the ResourceOwner mechanism already, so this is just a
3270 : : * debugging cross-check.
3271 : : */
4809 tgl@sss.pgh.pa.us 3272 [ + - ]: 4 : if (isProcExit)
3273 : 4 : FileClose(i);
2938 andres@anarazel.de 3274 [ # # ]:UBC 0 : else if (fdstate & FD_CLOSE_AT_EOXACT)
3275 : : {
4809 tgl@sss.pgh.pa.us 3276 [ # # ]: 0 : elog(WARNING,
3277 : : "temporary file %s not closed at end-of-transaction",
3278 : : VfdCache[i].fileName);
3279 : 0 : FileClose(i);
3280 : : }
3281 : : }
3282 : : }
3283 : :
4809 tgl@sss.pgh.pa.us 3284 :CBC 20375 : have_xact_temporary_files = false;
3285 : : }
3286 : :
3287 : : /* Complain if any allocated files remain open at commit. */
2790 3288 [ + + - + ]: 351440 : if (isCommit && numAllocatedDescs > 0)
2790 tgl@sss.pgh.pa.us 3289 [ # # ]:UBC 0 : elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3290 : : numAllocatedDescs);
3291 : :
3292 : : /* Clean up "allocated" stdio files, dirs and fds. */
7812 tgl@sss.pgh.pa.us 3293 [ + + ]:CBC 351577 : while (numAllocatedDescs > 0)
3294 : 137 : FreeDesc(&allocatedDescs[0]);
9719 3295 : 351440 : }
3296 : :
3297 : :
3298 : : /*
3299 : : * Remove temporary and temporary relation files left over from a prior
3300 : : * postmaster session
3301 : : *
3302 : : * This should be called during postmaster startup. It will forcibly
3303 : : * remove any leftover files created by OpenTemporaryFile and any leftover
3304 : : * temporary relation files created by mdcreate.
3305 : : *
3306 : : * During post-backend-crash restart cycle, this routine is called when
3307 : : * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3308 : : * queries are using temp files could result in useless storage usage that can
3309 : : * only be reclaimed by a service restart. The argument against enabling it is
3310 : : * that someone might want to examine the temporary files for debugging
3311 : : * purposes. This does however mean that OpenTemporaryFile had better allow for
3312 : : * collision with an existing temp file name.
3313 : : *
3314 : : * NOTE: this function and its subroutines generally report syscall failures
3315 : : * with ereport(LOG) and keep going. Removing temp files is not so critical
3316 : : * that we should fail to start the database when we can't do it.
3317 : : */
3318 : : void
8955 3319 : 845 : RemovePgTempFiles(void)
3320 : : {
3321 : : char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3322 : : DIR *spc_dir;
3323 : : struct dirent *spc_de;
3324 : :
3325 : : /*
3326 : : * First process temp files in pg_default ($PGDATA/base)
3327 : : */
6772 3328 : 845 : snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2901 3329 : 845 : RemovePgTempFilesInDir(temp_path, true, false);
5605 rhaas@postgresql.org 3330 : 845 : RemovePgTempRelationFiles("base");
3331 : :
3332 : : /*
3333 : : * Cycle through temp directories for all non-default tablespaces.
3334 : : */
470 michael@paquier.xyz 3335 : 845 : spc_dir = AllocateDir(PG_TBLSPC_DIR);
3336 : :
3337 [ + + ]: 2608 : while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3338 : : {
6772 tgl@sss.pgh.pa.us 3339 [ + + ]: 1763 : if (strcmp(spc_de->d_name, ".") == 0 ||
3340 [ + + ]: 918 : strcmp(spc_de->d_name, "..") == 0)
7658 3341 : 1690 : continue;
3342 : :
470 michael@paquier.xyz 3343 : 73 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3344 : 73 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY,
3345 : : PG_TEMP_FILES_DIR);
2901 tgl@sss.pgh.pa.us 3346 : 73 : RemovePgTempFilesInDir(temp_path, true, false);
3347 : :
470 michael@paquier.xyz 3348 : 73 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3349 : 73 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
5605 rhaas@postgresql.org 3350 : 73 : RemovePgTempRelationFiles(temp_path);
3351 : : }
3352 : :
6772 tgl@sss.pgh.pa.us 3353 : 845 : FreeDir(spc_dir);
3354 : :
3355 : : /*
3356 : : * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3357 : : * DataDir as well. However, that is *not* cleaned here because doing so
3358 : : * would create a race condition. It's done separately, earlier in
3359 : : * postmaster startup.
3360 : : */
7658 3361 : 845 : }
3362 : :
3363 : : /*
3364 : : * Process one pgsql_tmp directory for RemovePgTempFiles.
3365 : : *
3366 : : * If missing_ok is true, it's all right for the named directory to not exist.
3367 : : * Any other problem results in a LOG message. (missing_ok should be true at
3368 : : * the top level, since pgsql_tmp directories are not created until needed.)
3369 : : *
3370 : : * At the top level, this should be called with unlink_all = false, so that
3371 : : * only files matching the temporary name prefix will be unlinked. When
3372 : : * recursing it will be called with unlink_all = true to unlink everything
3373 : : * under a top-level temporary directory.
3374 : : *
3375 : : * (These two flags could be replaced by one, but it seems clearer to keep
3376 : : * them separate.)
3377 : : */
3378 : : void
2901 3379 : 919 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3380 : : {
3381 : : DIR *temp_dir;
3382 : : struct dirent *temp_de;
3383 : : char rm_path[MAXPGPATH * 2];
3384 : :
7658 3385 : 919 : temp_dir = AllocateDir(tmpdirname);
3386 : :
2901 3387 [ + + + - : 919 : if (temp_dir == NULL && errno == ENOENT && missing_ok)
+ - ]
3388 : 850 : return;
3389 : :
2935 3390 [ + + ]: 210 : while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3391 : : {
7658 3392 [ + + ]: 141 : if (strcmp(temp_de->d_name, ".") == 0 ||
3393 [ + + ]: 72 : strcmp(temp_de->d_name, "..") == 0)
3394 : 138 : continue;
3395 : :
3396 : 3 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3397 : 3 : tmpdirname, temp_de->d_name);
3398 : :
2938 andres@anarazel.de 3399 [ + - ]: 3 : if (unlink_all ||
3400 [ + - ]: 3 : strncmp(temp_de->d_name,
3401 : : PG_TEMP_FILE_PREFIX,
3402 : : strlen(PG_TEMP_FILE_PREFIX)) == 0)
3403 : 3 : {
1202 michael@paquier.xyz 3404 : 3 : PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3405 : :
3406 [ - + ]: 3 : if (type == PGFILETYPE_ERROR)
2938 andres@anarazel.de 3407 :UBC 0 : continue;
1202 michael@paquier.xyz 3408 [ + + ]:CBC 3 : else if (type == PGFILETYPE_DIR)
3409 : : {
3410 : : /* recursively remove contents, then directory itself */
2901 tgl@sss.pgh.pa.us 3411 : 1 : RemovePgTempFilesInDir(rm_path, false, true);
3412 : :
2935 3413 [ - + ]: 1 : if (rmdir(rm_path) < 0)
2935 tgl@sss.pgh.pa.us 3414 [ # # ]:UBC 0 : ereport(LOG,
3415 : : (errcode_for_file_access(),
3416 : : errmsg("could not remove directory \"%s\": %m",
3417 : : rm_path)));
3418 : : }
3419 : : else
3420 : : {
2935 tgl@sss.pgh.pa.us 3421 [ - + ]:CBC 2 : if (unlink(rm_path) < 0)
2935 tgl@sss.pgh.pa.us 3422 [ # # ]:UBC 0 : ereport(LOG,
3423 : : (errcode_for_file_access(),
3424 : : errmsg("could not remove file \"%s\": %m",
3425 : : rm_path)));
3426 : : }
3427 : : }
3428 : : else
3429 [ # # ]: 0 : ereport(LOG,
3430 : : (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3431 : : rm_path)));
3432 : : }
3433 : :
7658 tgl@sss.pgh.pa.us 3434 :CBC 69 : FreeDir(temp_dir);
3435 : : }
3436 : :
3437 : : /* Process one tablespace directory, look for per-DB subdirectories */
3438 : : static void
5605 rhaas@postgresql.org 3439 : 918 : RemovePgTempRelationFiles(const char *tsdirname)
3440 : : {
3441 : : DIR *ts_dir;
3442 : : struct dirent *de;
3443 : : char dbspace_path[MAXPGPATH * 2];
3444 : :
3445 : 918 : ts_dir = AllocateDir(tsdirname);
3446 : :
2935 tgl@sss.pgh.pa.us 3447 [ + + ]: 5709 : while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3448 : : {
3449 : : /*
3450 : : * We're only interested in the per-database directories, which have
3451 : : * numeric names. Note that this code will also (properly) ignore "."
3452 : : * and "..".
3453 : : */
3454 [ + + ]: 4791 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
5605 rhaas@postgresql.org 3455 : 1904 : continue;
3456 : :
3457 : 2887 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3458 : 2887 : tsdirname, de->d_name);
3459 : 2887 : RemovePgTempRelationFilesInDbspace(dbspace_path);
3460 : : }
3461 : :
3462 : 918 : FreeDir(ts_dir);
3463 : 918 : }
3464 : :
3465 : : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3466 : : static void
3467 : 2887 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3468 : : {
3469 : : DIR *dbspace_dir;
3470 : : struct dirent *de;
3471 : : char rm_path[MAXPGPATH * 2];
3472 : :
3473 : 2887 : dbspace_dir = AllocateDir(dbspacedirname);
3474 : :
2935 tgl@sss.pgh.pa.us 3475 [ + + ]: 875101 : while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3476 : : {
5605 rhaas@postgresql.org 3477 [ + + ]: 872214 : if (!looks_like_temp_rel_name(de->d_name))
3478 : 872200 : continue;
3479 : :
3480 : 14 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3481 : 14 : dbspacedirname, de->d_name);
3482 : :
2935 tgl@sss.pgh.pa.us 3483 [ - + ]: 14 : if (unlink(rm_path) < 0)
2935 tgl@sss.pgh.pa.us 3484 [ # # ]:UBC 0 : ereport(LOG,
3485 : : (errcode_for_file_access(),
3486 : : errmsg("could not remove file \"%s\": %m",
3487 : : rm_path)));
3488 : : }
3489 : :
5605 rhaas@postgresql.org 3490 :CBC 2887 : FreeDir(dbspace_dir);
3491 : 2887 : }
3492 : :
/*
 * looks_like_temp_rel_name -- does "name" have the shape of a temporary
 * relation file name?
 *
 * Accepted shapes are t<digits>_<digits> and t<digits>_<digits>_<forkname>,
 * each optionally followed by .<digits> (a segment number).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run_start;

	/* A leading "t" is mandatory. */
	if (*cur != 't')
		return false;
	cur++;

	/* Then at least one digit, terminated by an underscore. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start || *cur != '_')
		return false;
	cur++;

	/* Then a second nonempty run of digits. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start)
		return false;

	/* Optional "_forkname"; forkname_chars reports the matched length. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional ".segment", which needs at least one digit after the dot. */
	if (*cur == '.')
	{
		cur++;
		run_start = cur;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == run_start)
			return false;
	}

	/* Nothing may follow. */
	return *cur == '\0';
}
3541 : :
#ifdef HAVE_SYNCFS
/*
 * do_syncfs -- call syncfs() on the file system that contains "path".
 *
 * The path is opened read-only just to obtain a descriptor for syncfs().
 * A failure to open, sync or close is reported at LOG and is otherwise
 * ignored.  Startup progress is reported before the attempt.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}

	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));

	/*
	 * Don't drop a close failure on the floor; report it the same way
	 * pre_sync_fname and fsync_fname_ext do.
	 */
	if (CloseTransientFile(fd) != 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", path)));
}
#endif
3566 : :
3567 : : /*
3568 : : * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3569 : : * all potential filesystem, depending on recovery_init_sync_method setting.
3570 : : *
3571 : : * We fsync regular files and directories wherever they are, but we
3572 : : * follow symlinks only for pg_wal and immediately under pg_tblspc.
3573 : : * Other symlinks are presumed to point at files we're not responsible
3574 : : * for fsyncing, and might not have privileges to write at all.
3575 : : *
3576 : : * Errors are logged but not considered fatal; that's because this is used
3577 : : * only during database startup, to deal with the possibility that there are
3578 : : * issued-but-unsynced writes pending against the data directory. We want to
3579 : : * ensure that such writes reach disk before anything that's done in the new
3580 : : * run. However, aborting on error would result in failure to start for
3581 : : * harmless cases such as read-only files in the data directory, and that's
3582 : : * not good either.
3583 : : *
3584 : : * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3585 : : * rewriting all changes again during recovery.
3586 : : *
3587 : : * Note we assume we're chdir'd into PGDATA to begin with.
3588 : : */
3589 : : void
3856 tgl@sss.pgh.pa.us 3590 :CBC 172 : SyncDataDirectory(void)
3591 : : {
3592 : : bool xlog_is_symlink;
3593 : :
3594 : : /* We can skip this whole thing if fsync is disabled. */
3595 [ + - ]: 172 : if (!enableFsync)
3596 : 172 : return;
3597 : :
3598 : : /*
3599 : : * If pg_wal is a symlink, we'll need to recurse into it separately,
3600 : : * because the first walkdir below will ignore it.
3601 : : */
3856 tgl@sss.pgh.pa.us 3602 :UBC 0 : xlog_is_symlink = false;
3603 : :
3604 : : {
3605 : : struct stat st;
3606 : :
3345 rhaas@postgresql.org 3607 [ # # ]: 0 : if (lstat("pg_wal", &st) < 0)
3856 tgl@sss.pgh.pa.us 3608 [ # # ]: 0 : ereport(LOG,
3609 : : (errcode_for_file_access(),
3610 : : errmsg("could not stat file \"%s\": %m",
3611 : : "pg_wal")));
3612 [ # # ]: 0 : else if (S_ISLNK(st.st_mode))
3613 : 0 : xlog_is_symlink = true;
3614 : : }
3615 : :
3616 : : #ifdef HAVE_SYNCFS
833 nathan@postgresql.or 3617 [ # # ]: 0 : if (recovery_init_sync_method == DATA_DIR_SYNC_METHOD_SYNCFS)
3618 : : {
3619 : : DIR *dir;
3620 : : struct dirent *de;
3621 : :
3622 : : /*
3623 : : * On Linux, we don't have to open every single file one by one. We
3624 : : * can use syncfs() to sync whole filesystems. We only expect
3625 : : * filesystem boundaries to exist where we tolerate symlinks, namely
3626 : : * pg_wal and the tablespaces, so we call syncfs() for each of those
3627 : : * directories.
3628 : : */
3629 : :
3630 : : /* Prepare to report progress syncing the data directory via syncfs. */
1514 rhaas@postgresql.org 3631 : 0 : begin_startup_progress_phase();
3632 : :
3633 : : /* Sync the top level pgdata directory. */
1733 tmunro@postgresql.or 3634 : 0 : do_syncfs(".");
3635 : : /* If any tablespaces are configured, sync each of those. */
470 michael@paquier.xyz 3636 : 0 : dir = AllocateDir(PG_TBLSPC_DIR);
3637 [ # # ]: 0 : while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3638 : : {
3639 : : char path[MAXPGPATH];
3640 : :
1733 tmunro@postgresql.or 3641 [ # # # # ]: 0 : if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3642 : 0 : continue;
3643 : :
470 michael@paquier.xyz 3644 : 0 : snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
1733 tmunro@postgresql.or 3645 : 0 : do_syncfs(path);
3646 : : }
3647 : 0 : FreeDir(dir);
3648 : : /* If pg_wal is a symlink, process that too. */
3649 [ # # ]: 0 : if (xlog_is_symlink)
3650 : 0 : do_syncfs("pg_wal");
3651 : 0 : return;
3652 : : }
3653 : : #endif /* !HAVE_SYNCFS */
3654 : :
3655 : : #ifdef PG_FLUSH_DATA_WORKS
3656 : : /* Prepare to report progress of the pre-fsync phase. */
1514 rhaas@postgresql.org 3657 : 0 : begin_startup_progress_phase();
3658 : :
3659 : : /*
3660 : : * If possible, hint to the kernel that we're soon going to fsync the data
3661 : : * directory and its contents. Errors in this step are even less
3662 : : * interesting than normal, so log them only at DEBUG1.
3663 : : */
3856 tgl@sss.pgh.pa.us 3664 : 0 : walkdir(".", pre_sync_fname, false, DEBUG1);
3665 [ # # ]: 0 : if (xlog_is_symlink)
3345 rhaas@postgresql.org 3666 : 0 : walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
470 michael@paquier.xyz 3667 : 0 : walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3668 : : #endif
3669 : :
3670 : : /* Prepare to report progress syncing the data directory via fsync. */
1514 rhaas@postgresql.org 3671 : 0 : begin_startup_progress_phase();
3672 : :
3673 : : /*
3674 : : * Now we do the fsync()s in the same order.
3675 : : *
3676 : : * The main call ignores symlinks, so in addition to specially processing
3677 : : * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3678 : : * process_symlinks = true. Note that if there are any plain directories
3679 : : * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3680 : : * so we don't worry about optimizing it.
3681 : : */
3570 andres@anarazel.de 3682 : 0 : walkdir(".", datadir_fsync_fname, false, LOG);
3856 tgl@sss.pgh.pa.us 3683 [ # # ]: 0 : if (xlog_is_symlink)
3345 rhaas@postgresql.org 3684 : 0 : walkdir("pg_wal", datadir_fsync_fname, false, LOG);
470 michael@paquier.xyz 3685 : 0 : walkdir(PG_TBLSPC_DIR, datadir_fsync_fname, true, LOG);
3686 : : }
3687 : :
3688 : : /*
3689 : : * walkdir: recursively walk a directory, applying the action to each
3690 : : * regular file and directory (including the named directory itself).
3691 : : *
3692 : : * If process_symlinks is true, the action and recursion are also applied
3693 : : * to regular files and directories that are pointed to by symlinks in the
3694 : : * given directory; otherwise symlinks are ignored. Symlinks are always
3695 : : * ignored in subdirectories, ie we intentionally don't pass down the
3696 : : * process_symlinks flag to recursive calls.
3697 : : *
3698 : : * Errors are reported at level elevel, which might be ERROR or less.
3699 : : *
3700 : : * See also walkdir in file_utils.c, which is a frontend version of this
3701 : : * logic.
3702 : : */
3703 : : static void
3856 tgl@sss.pgh.pa.us 3704 :CBC 175 : walkdir(const char *path,
3705 : : void (*action) (const char *fname, bool isdir, int elevel),
3706 : : bool process_symlinks,
3707 : : int elevel)
3708 : : {
3709 : : DIR *dir;
3710 : : struct dirent *de;
3711 : :
3880 rhaas@postgresql.org 3712 : 175 : dir = AllocateDir(path);
3713 : :
3856 tgl@sss.pgh.pa.us 3714 [ + + ]: 1754 : while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3715 : : {
3716 : : char subpath[MAXPGPATH * 2];
3717 : :
3880 rhaas@postgresql.org 3718 [ - + ]: 1579 : CHECK_FOR_INTERRUPTS();
3719 : :
3720 [ + + ]: 1579 : if (strcmp(de->d_name, ".") == 0 ||
3721 [ + + ]: 1404 : strcmp(de->d_name, "..") == 0)
3722 : 350 : continue;
3723 : :
3172 peter_e@gmx.net 3724 : 1229 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3725 : :
1927 tmunro@postgresql.or 3726 [ + - - ]: 1229 : switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3727 : : {
3728 : 1229 : case PGFILETYPE_REG:
3729 : 1229 : (*action) (subpath, false, elevel);
3730 : 1229 : break;
1927 tmunro@postgresql.or 3731 :UBC 0 : case PGFILETYPE_DIR:
3732 : 0 : walkdir(subpath, action, false, elevel);
3733 : 0 : break;
3734 : 0 : default:
3735 : :
3736 : : /*
3737 : : * Errors are already reported directly by get_dirent_type(),
3738 : : * and any remaining symlinks and unknown file types are
3739 : : * ignored.
3740 : : */
3741 : 0 : break;
3742 : : }
3743 : : }
3744 : :
3856 tgl@sss.pgh.pa.us 3745 :CBC 175 : FreeDir(dir); /* we ignore any error here */
3746 : :
3747 : : /*
3748 : : * It's important to fsync the destination directory itself as individual
3749 : : * file fsyncs don't guarantee that the directory entry for the file is
3750 : : * synced. However, skip this if AllocateDir failed; the action function
3751 : : * might not be robust against that.
3752 : : */
2935 3753 [ + - ]: 175 : if (dir)
3754 : 175 : (*action) (path, true, elevel);
3856 3755 : 175 : }
3756 : :
3757 : :
/*
 * pre_sync_fname -- hint to the OS that it should get ready to fsync() this
 * file.
 *
 * Failure to open an unreadable file (EACCES) is ignored silently; other
 * errors are logged at the caller-specified level.  Directories are skipped.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			filedesc;

	/* Flushing a directory would likely just fail, so don't try. */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	filedesc = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (filedesc < 0)
	{
		/* Permission problems are not worth mentioning. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * This is only a hint, so it is fine that pg_flush_data() ignores
	 * errors.
	 */
	pg_flush_data(filedesc, 0, 0);

	if (CloseTransientFile(filedesc) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3803 : :
/*
 * datadir_fsync_fname -- walkdir action that fsyncs one file or directory
 * of the data directory, reporting startup progress first.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * Errors about unreadable files should be ignored silently, so ask
	 * fsync_fname_ext() for that via ignore_perm = true.  Its result is not
	 * examined here.
	 */
	fsync_fname_ext(fname, isdir, true, elevel);
}
3816 : :
/*
 * unlink_if_exists_fname -- remove one file or directory.
 *
 * The (fname, isdir, elevel) signature matches the action callback expected
 * by walkdir.  A directory that is already gone (ENOENT) is not an error;
 * any other rmdir failure is reported at elevel.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3833 : :
3834 : : /*
3835 : : * fsync_fname_ext -- Try to fsync a file or directory
3836 : : *
3837 : : * If ignore_perm is true, ignore errors upon trying to open unreadable
3838 : : * files. Logs other errors at a caller-specified level.
3839 : : *
3840 : : * Returns 0 if the operation succeeded, -1 otherwise.
3841 : : */
3842 : : int
3570 3843 : 40948 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3844 : : {
3845 : : int fd;
3846 : : int flags;
3847 : : int returncode;
3848 : :
3849 : : /*
3850 : : * Some OSs require directories to be opened read-only whereas other
3851 : : * systems don't allow us to fsync files opened read-only; so we need both
3852 : : * cases here. Using O_RDWR will cause us to fail to fsync files that are
3853 : : * not writable by our userid, but we assume that's OK.
3854 : : */
3856 tgl@sss.pgh.pa.us 3855 : 40948 : flags = PG_BINARY;
3856 [ + + ]: 40948 : if (!isdir)
3857 : 15280 : flags |= O_RDWR;
3858 : : else
3859 : 25668 : flags |= O_RDONLY;
3860 : :
3007 peter_e@gmx.net 3861 : 40948 : fd = OpenTransientFile(fname, flags);
3862 : :
3863 : : /*
3864 : : * Some OSs don't allow us to open directories at all (Windows returns
3865 : : * EACCES), just ignore the error in that case. If desired also silently
3866 : : * ignoring errors about unreadable files. Log others.
3867 : : */
3570 andres@anarazel.de 3868 [ - + - - : 40948 : if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
- - - - ]
3570 andres@anarazel.de 3869 :UBC 0 : return 0;
3570 andres@anarazel.de 3870 [ - + - - :CBC 40948 : else if (fd < 0 && ignore_perm && errno == EACCES)
- - ]
3570 andres@anarazel.de 3871 :UBC 0 : return 0;
3570 andres@anarazel.de 3872 [ - + ]:CBC 40948 : else if (fd < 0)
3873 : : {
3856 tgl@sss.pgh.pa.us 3874 [ # # ]:UBC 0 : ereport(elevel,
3875 : : (errcode_for_file_access(),
3876 : : errmsg("could not open file \"%s\": %m", fname)));
3570 andres@anarazel.de 3877 : 0 : return -1;
3878 : : }
3879 : :
3856 tgl@sss.pgh.pa.us 3880 :CBC 40948 : returncode = pg_fsync(fd);
3881 : :
3882 : : /*
3883 : : * Some OSes don't allow us to fsync directories at all, so we can ignore
3884 : : * those errors. Anything else needs to be logged.
3885 : : */
2488 tmunro@postgresql.or 3886 [ - + - - : 40948 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
- - - - ]
3887 : : {
3888 : : int save_errno;
3889 : :
3890 : : /* close file upon error, might not be in transaction context */
3570 andres@anarazel.de 3891 :UBC 0 : save_errno = errno;
3892 : 0 : (void) CloseTransientFile(fd);
3893 : 0 : errno = save_errno;
3894 : :
3856 tgl@sss.pgh.pa.us 3895 [ # # ]: 0 : ereport(elevel,
3896 : : (errcode_for_file_access(),
3897 : : errmsg("could not fsync file \"%s\": %m", fname)));
3570 andres@anarazel.de 3898 : 0 : return -1;
3899 : : }
3900 : :
2356 peter@eisentraut.org 3901 [ - + ]:CBC 40948 : if (CloseTransientFile(fd) != 0)
3902 : : {
2475 michael@paquier.xyz 3903 [ # # ]:UBC 0 : ereport(elevel,
3904 : : (errcode_for_file_access(),
3905 : : errmsg("could not close file \"%s\": %m", fname)));
3906 : 0 : return -1;
3907 : : }
3908 : :
3570 andres@anarazel.de 3909 :CBC 40948 : return 0;
3910 : : }
3911 : :
3912 : : /*
3913 : : * fsync_parent_path -- fsync the parent path of a file or directory
3914 : : *
3915 : : * This is aimed at making file operations persistent on disk in case of
3916 : : * an OS crash or power failure.
3917 : : */
3918 : : static int
3919 : 7539 : fsync_parent_path(const char *fname, int elevel)
3920 : : {
3921 : : char parentpath[MAXPGPATH];
3922 : :
3923 : 7539 : strlcpy(parentpath, fname, MAXPGPATH);
3924 : 7539 : get_parent_directory(parentpath);
3925 : :
3926 : : /*
3927 : : * get_parent_directory() returns an empty string if the input argument is
3928 : : * just a file name (see comments in path.c), so handle that as being the
3929 : : * current directory.
3930 : : */
3931 [ + + ]: 7539 : if (strlen(parentpath) == 0)
3932 : 208 : strlcpy(parentpath, ".", MAXPGPATH);
3933 : :
3934 [ - + ]: 7539 : if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3570 andres@anarazel.de 3935 :UBC 0 : return -1;
3936 : :
3570 andres@anarazel.de 3937 :CBC 7539 : return 0;
3938 : : }
3939 : :
3940 : : /*
3941 : : * Create a PostgreSQL data sub-directory
3942 : : *
3943 : : * The data directory itself, and most of its sub-directories, are created at
3944 : : * initdb time, but we do have some occasions when we create directories in
3945 : : * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3946 : : * make sure that those directories are created consistently. Today, that means
3947 : : * making sure that the created directory has the correct permissions, which is
3948 : : * what pg_dir_create_mode tracks for us.
3949 : : *
3950 : : * Note that we also set the umask() based on what we understand the correct
3951 : : * permissions to be (see file_perm.c).
3952 : : *
3953 : : * For permissions other than the default, mkdir() can be used directly, but
3954 : : * be sure to consider carefully such cases -- a sub-directory with incorrect
3955 : : * permissions in a PostgreSQL data directory could cause backups and other
3956 : : * processes to fail.
3957 : : */
3958 : : int
2811 sfrost@snowman.net 3959 : 1478 : MakePGDirectory(const char *directoryName)
3960 : : {
3961 : 1478 : return mkdir(directoryName, pg_dir_create_mode);
3962 : : }
3963 : :
3964 : : /*
3965 : : * Return the passed-in error level, or PANIC if data_sync_retry is off.
3966 : : *
3967 : : * Failure to fsync any data file is cause for immediate panic, unless
3968 : : * data_sync_retry is enabled. Data may have been written to the operating
3969 : : * system and removed from our buffer pool already, and if we are running on
3970 : : * an operating system that forgets dirty data on write-back failure, there
3971 : : * may be only one copy of the data remaining: in the WAL. A later attempt to
3972 : : * fsync again might falsely report success. Therefore we must not allow any
3973 : : * further checkpoints to be attempted. data_sync_retry can in theory be
3974 : : * enabled on systems known not to drop dirty buffered data on write-back
3975 : : * failure (with the likely outcome that checkpoints will continue to fail
3976 : : * until the underlying problem is fixed).
3977 : : *
3978 : : * Any code that reports a failure from fsync() or related functions should
3979 : : * filter the error level with this function.
3980 : : */
3981 : : int
2585 tmunro@postgresql.or 3982 : 20781 : data_sync_elevel(int elevel)
3983 : : {
3984 [ - + ]: 20781 : return data_sync_retry ? elevel : PANIC;
3985 : : }
3986 : :
3987 : : bool
848 peter@eisentraut.org 3988 : 1109 : check_debug_io_direct(char **newval, void **extra, GucSource source)
3989 : : {
984 tmunro@postgresql.or 3990 : 1109 : bool result = true;
3991 : : int flags;
3992 : :
3993 : : #if PG_O_DIRECT == 0
3994 : : if (strcmp(*newval, "") != 0)
3995 : : {
3996 : : GUC_check_errdetail("\"%s\" is not supported on this platform.",
3997 : : "debug_io_direct");
3998 : : result = false;
3999 : : }
4000 : : flags = 0;
4001 : : #else
4002 : : List *elemlist;
4003 : : ListCell *l;
4004 : : char *rawstring;
4005 : :
4006 : : /* Need a modifiable copy of string */
4007 : 1109 : rawstring = pstrdup(*newval);
4008 : :
4009 [ - + ]: 1109 : if (!SplitGUCList(rawstring, ',', &elemlist))
4010 : : {
385 alvherre@alvh.no-ip. 4011 :UBC 0 : GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4012 : : "debug_io_direct");
984 tmunro@postgresql.or 4013 : 0 : pfree(rawstring);
4014 : 0 : list_free(elemlist);
4015 : 0 : return false;
4016 : : }
4017 : :
984 tmunro@postgresql.or 4018 :CBC 1109 : flags = 0;
4019 [ + + + + : 1115 : foreach(l, elemlist)
+ + ]
4020 : : {
4021 : 6 : char *item = (char *) lfirst(l);
4022 : :
4023 [ + + ]: 6 : if (pg_strcasecmp(item, "data") == 0)
4024 : 2 : flags |= IO_DIRECT_DATA;
4025 [ + + ]: 4 : else if (pg_strcasecmp(item, "wal") == 0)
4026 : 2 : flags |= IO_DIRECT_WAL;
4027 [ + - ]: 2 : else if (pg_strcasecmp(item, "wal_init") == 0)
4028 : 2 : flags |= IO_DIRECT_WAL_INIT;
4029 : : else
4030 : : {
385 alvherre@alvh.no-ip. 4031 :UBC 0 : GUC_check_errdetail("Invalid option \"%s\".", item);
984 tmunro@postgresql.or 4032 : 0 : result = false;
4033 : 0 : break;
4034 : : }
4035 : : }
4036 : :
4037 : : /*
4038 : : * It's possible to configure block sizes smaller than our assumed I/O
4039 : : * alignment size, which could result in invalid I/O requests.
4040 : : */
4041 : : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4042 : : if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4043 : : {
4044 : : GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4045 : : "debug_io_direct", "XLOG_BLCKSZ");
4046 : : result = false;
4047 : : }
4048 : : #endif
4049 : : #if BLCKSZ < PG_IO_ALIGN_SIZE
4050 : : if (result && (flags & IO_DIRECT_DATA))
4051 : : {
4052 : : GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4053 : : "debug_io_direct", "BLCKSZ");
4054 : : result = false;
4055 : : }
4056 : : #endif
4057 : :
984 tmunro@postgresql.or 4058 :CBC 1109 : pfree(rawstring);
4059 : 1109 : list_free(elemlist);
4060 : : #endif
4061 : :
4062 [ - + ]: 1109 : if (!result)
984 tmunro@postgresql.or 4063 :UBC 0 : return result;
4064 : :
4065 : : /* Save the flags in *extra, for use by assign_debug_io_direct */
265 dgustafsson@postgres 4066 :CBC 1109 : *extra = guc_malloc(LOG, sizeof(int));
4067 [ - + ]: 1109 : if (!*extra)
265 dgustafsson@postgres 4068 :UBC 0 : return false;
984 tmunro@postgresql.or 4069 :CBC 1109 : *((int *) *extra) = flags;
4070 : :
4071 : 1109 : return result;
4072 : : }
4073 : :
4074 : : void
848 peter@eisentraut.org 4075 : 1109 : assign_debug_io_direct(const char *newval, void *extra)
4076 : : {
984 tmunro@postgresql.or 4077 : 1109 : int *flags = (int *) extra;
4078 : :
4079 : 1109 : io_direct_flags = *flags;
4080 : 1109 : }
4081 : :
4082 : : /* ResourceOwner callbacks */
4083 : :
4084 : : static void
770 heikki.linnakangas@i 4085 : 4 : ResOwnerReleaseFile(Datum res)
4086 : : {
4087 : 4 : File file = (File) DatumGetInt32(res);
4088 : : Vfd *vfdP;
4089 : :
4090 [ + - + - : 4 : Assert(FileIsValid(file));
- + ]
4091 : :
4092 : 4 : vfdP = &VfdCache[file];
4093 : 4 : vfdP->resowner = NULL;
4094 : :
4095 : 4 : FileClose(file);
4096 : 4 : }
4097 : :
4098 : : static char *
770 heikki.linnakangas@i 4099 :UBC 0 : ResOwnerPrintFile(Datum res)
4100 : : {
4101 : 0 : return psprintf("File %d", DatumGetInt32(res));
4102 : : }
|