Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * fd.c
4 : : * Virtual file descriptor code.
5 : : *
6 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : * IDENTIFICATION
10 : : * src/backend/storage/file/fd.c
11 : : *
12 : : * NOTES:
13 : : *
14 : : * This code manages a cache of 'virtual' file descriptors (VFDs).
15 : : * The server opens many file descriptors for a variety of reasons,
16 : : * including base tables, scratch files (e.g., sort and hash spool
17 : : * files), and random calls to C library routines like system(3); it
18 : : * is quite easy to exceed system limits on the number of open files a
19 : : * single process can have. (This is around 1024 on many modern
20 : : * operating systems, but may be lower on others.)
21 : : *
22 : : * VFDs are managed as an LRU pool, with actual OS file descriptors
23 : : * being opened and closed as needed. Obviously, if a routine is
24 : : * opened using these interfaces, all subsequent operations must also
25 : : * be through these interfaces (the File type is not a real file
26 : : * descriptor).
27 : : *
28 : : * For this scheme to work, most (if not all) routines throughout the
29 : : * server should use these interfaces instead of calling the C library
30 : : * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 : : * may find ourselves short of real file descriptors anyway.
32 : : *
33 : : * INTERFACE ROUTINES
34 : : *
35 : : * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 : : * A File opened with OpenTemporaryFile is automatically deleted when the
37 : : * File is closed, either explicitly or implicitly at end of transaction or
38 : : * process exit. PathNameOpenFile is intended for files that are held open
39 : : * for a long time, like relation files. It is the caller's responsibility
40 : : * to close them, there is no automatic mechanism in fd.c for that.
41 : : *
42 : : * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 : : * temporary files that have names so that they can be shared between
44 : : * backends. Such files are automatically closed and count against the
45 : : * temporary file limit of the backend that creates them, but unlike anonymous
46 : : * files they are not automatically deleted. See sharedfileset.c for a shared
47 : : * ownership mechanism that provides automatic cleanup for shared files when
48 : : * the last of a group of backends detaches.
49 : : *
50 : : * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 : : * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 : : * They behave like the corresponding native functions, except that the handle
53 : : * is registered with the current subtransaction, and will be automatically
54 : : * closed at abort. These are intended mainly for short operations like
55 : : * reading a configuration file; there is a limit on the number of files that
56 : : * can be opened using these functions at any one time.
57 : : *
58 : : * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 : : * release file descriptors in use by the virtual file descriptors if
60 : : * necessary. There is no automatic cleanup of file descriptors returned by
61 : : * BasicOpenFile, it is solely the caller's responsibility to close the file
62 : : * descriptor by calling close(2).
63 : : *
64 : : * If a non-virtual file descriptor needs to be held open for any length of
65 : : * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 : : * (and eventually ReleaseExternalFD), so that we can take it into account
67 : : * while deciding how many VFDs can be open. This applies to FDs obtained
68 : : * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 : : *
70 : : *-------------------------------------------------------------------------
71 : : */
72 : :
73 : : #include "postgres.h"
74 : :
75 : : #include <dirent.h>
76 : : #include <sys/file.h>
77 : : #include <sys/param.h>
78 : : #include <sys/resource.h> /* for getrlimit */
79 : : #include <sys/stat.h>
80 : : #include <sys/types.h>
81 : : #ifndef WIN32
82 : : #include <sys/mman.h>
83 : : #endif
84 : : #include <limits.h>
85 : : #include <unistd.h>
86 : : #include <fcntl.h>
87 : :
88 : : #include "access/xact.h"
89 : : #include "access/xlog.h"
90 : : #include "catalog/pg_tablespace.h"
91 : : #include "common/file_perm.h"
92 : : #include "common/file_utils.h"
93 : : #include "common/pg_prng.h"
94 : : #include "miscadmin.h"
95 : : #include "pgstat.h"
96 : : #include "postmaster/startup.h"
97 : : #include "storage/aio.h"
98 : : #include "storage/fd.h"
99 : : #include "storage/ipc.h"
100 : : #include "utils/guc.h"
101 : : #include "utils/guc_hooks.h"
102 : : #include "utils/resowner.h"
103 : : #include "utils/varlena.h"
104 : : #include "utils/wait_event.h"
105 : :
106 : : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
107 : : #if defined(HAVE_SYNC_FILE_RANGE)
108 : : #define PG_FLUSH_DATA_WORKS 1
109 : : #elif !defined(WIN32) && defined(MS_ASYNC)
110 : : #define PG_FLUSH_DATA_WORKS 1
111 : : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
112 : : #define PG_FLUSH_DATA_WORKS 1
113 : : #endif
114 : :
115 : : /*
116 : : * We must leave some file descriptors free for system(), the dynamic loader,
117 : : * and other code that tries to open files without consulting fd.c. This
118 : : * is the number left free. (While we try fairly hard to prevent EMFILE
119 : : * errors, there's never any guarantee that we won't get ENFILE due to
120 : : * other processes chewing up FDs. So it's a bad idea to try to open files
121 : : * without consulting fd.c. Nonetheless we cannot control all code.)
122 : : *
123 : : * Because this is just a fixed setting, we are effectively assuming that
124 : : * no such code will leave FDs open over the long term; otherwise the slop
125 : : * is likely to be insufficient. Note in particular that we expect that
126 : : * loading a shared library does not result in any permanent increase in
127 : : * the number of open files. (This appears to be true on most if not
128 : : * all platforms as of Feb 2004.)
129 : : */
130 : : #define NUM_RESERVED_FDS 10
131 : :
132 : : /*
133 : : * If we have fewer than this many usable FDs after allowing for the reserved
134 : : * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
135 : : * much less than that. Note that this value ensures numExternalFDs can be
136 : : * at least 16; as of this writing, the contrib/postgres_fdw regression tests
137 : : * will not pass unless that can grow to at least 14.)
138 : : */
139 : : #define FD_MINFREE 48
140 : :
141 : : /*
142 : : * A number of platforms allow individual processes to open many more files
143 : : * than they can really support when *many* processes do the same thing.
144 : : * This GUC parameter lets the DBA limit max_safe_fds to something less than
145 : : * what the postmaster's initial probe suggests will work.
146 : : */
147 : : int max_files_per_process = 1000;
148 : :
149 : : /*
150 : : * Maximum number of file descriptors to open for operations that fd.c knows
151 : : * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
152 : : * to a conservative value, and remains that way indefinitely in bootstrap or
153 : : * standalone-backend cases. In normal postmaster operation, the postmaster
154 : : * calls set_max_safe_fds() late in initialization to update the value, and
155 : : * that value is then inherited by forked subprocesses.
156 : : *
157 : : * Note: the value of max_files_per_process is taken into account while
158 : : * setting this variable, and so need not be tested separately.
159 : : */
160 : : int max_safe_fds = FD_MINFREE; /* default if not changed */
161 : :
162 : : /* Whether it is safe to continue running after fsync() fails. */
163 : : bool data_sync_retry = false;
164 : :
165 : : /* How SyncDataDirectory() should do its job. */
166 : : int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
167 : :
168 : : /* How data files should be bulk-extended with zeros. */
169 : : int file_extend_method = DEFAULT_FILE_EXTEND_METHOD;
170 : :
171 : : /* Which kinds of files should be opened with PG_O_DIRECT. */
172 : : int io_direct_flags;
173 : :
/*
 * Debugging support.
 *
 * DO_DB(A) wraps a debugging statement A.  When FDDEBUG is defined, A is
 * executed with errno saved beforehand and restored afterwards, so that the
 * debugging code cannot disturb the errno seen by the surrounding code.
 * When FDDEBUG is not defined, DO_DB(A) expands to a no-op expression and A
 * is not evaluated at all.
 */

#ifdef FDDEBUG
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#else
#define DO_DB(A) \
	((void) 0)
#endif
187 : :
/* Value stored in Vfd.fd when the VFD has no OS file descriptor assigned */
#define VFD_CLOSED (-1)

/*
 * A File value is valid if it is a positive index inside the current
 * VfdCache array (index 0 is excluded by the "> 0" test) and the slot has a
 * non-NULL fileName.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* True if the VFD currently has no OS file descriptor (fd == VFD_CLOSED) */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
194 : :
195 : : /* these are the assigned bits in fdstate below: */
196 : : #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
197 : : #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
198 : : #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
199 : :
/*
 * One virtual file descriptor.
 *
 * Holds the OS-level descriptor (if any), the state bits defined by the
 * FD_* flags, the links used for the free list and the recency-of-use list,
 * and the fileName/fileFlags/fileMode values that the field comments
 * describe as the name, open(2) flags and mode used to (re)open the file.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;
	pgoff_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	mode_t		fileMode;		/* mode to pass to open(2) */
} Vfd;
214 : :
215 : : /*
216 : : * Virtual File Descriptor array pointer and size. This grows as
217 : : * needed. 'File' values are indexes into this array.
218 : : * Note that VfdCache[0] is not a usable VFD, just a list header.
219 : : */
220 : : static Vfd *VfdCache;
221 : : static Size SizeVfdCache = 0;
222 : :
223 : : /*
224 : : * Number of file descriptors known to be in use by VFD entries.
225 : : */
226 : : static int nfile = 0;
227 : :
228 : : /*
229 : : * Flag to tell whether it's worth scanning VfdCache looking for temp files
230 : : * to close
231 : : */
232 : : static bool have_xact_temporary_files = false;
233 : :
234 : : /*
235 : : * Tracks the total size of all temporary files. Note: when temp_file_limit
236 : : * is being enforced, this cannot overflow since the limit cannot be more
237 : : * than INT_MAX kilobytes. When not enforcing, it could theoretically
238 : : * overflow, but we don't care.
239 : : */
240 : : static uint64 temporary_files_size = 0;
241 : :
242 : : /* Temporary file access initialized and not yet shut down? */
243 : : #ifdef USE_ASSERT_CHECKING
244 : : static bool temporary_files_allowed = false;
245 : : #endif
246 : :
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 *
 * AllocateDescKind tags which kind of handle an AllocateDesc entry holds.
 */
typedef enum
{
	AllocateDescFile,
	AllocateDescPipe,
	AllocateDescDir,
	AllocateDescRawFD,
} AllocateDescKind;

/*
 * One tracked handle.  Only one member of the 'desc' union is meaningful
 * for a given entry.
 * NOTE(review): presumably 'kind' selects the member (file for File/Pipe,
 * dir for Dir, fd for RawFD) -- confirm against the code that fills in and
 * frees these entries.
 */
typedef struct
{
	AllocateDescKind kind;		/* discriminator for 'desc' */
	SubTransactionId create_subid;	/* presumably the subtransaction that
									 * opened the handle -- TODO confirm */
	union
	{
		FILE	   *file;
		DIR		   *dir;
		int			fd;
	}			desc;
} AllocateDesc;
270 : :
271 : : static int numAllocatedDescs = 0;
272 : : static int maxAllocatedDescs = 0;
273 : : static AllocateDesc *allocatedDescs = NULL;
274 : :
275 : : /*
276 : : * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
277 : : */
278 : : static int numExternalFDs = 0;
279 : :
280 : : /*
281 : : * Number of temporary files opened during the current session;
282 : : * this is used in generation of tempfile names.
283 : : */
284 : : static long tempFileCounter = 0;
285 : :
286 : : /*
287 : : * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
288 : : * indicating that the current database's default tablespace should be used.)
289 : : * When numTempTableSpaces is -1, this has not been set in the current
290 : : * transaction.
291 : : */
292 : : static Oid *tempTableSpaces = NULL;
293 : : static int numTempTableSpaces = -1;
294 : : static int nextTempTableSpace = 0;
295 : :
296 : :
297 : : /*--------------------
298 : : *
299 : : * Private Routines
300 : : *
301 : : * Delete - delete a file from the Lru ring
302 : : * LruDelete - remove a file from the Lru ring and close its FD
303 : : * Insert - put a file at the front of the Lru ring
304 : : * LruInsert - put a file at the front of the Lru ring and open it
305 : : * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
306 : : * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
307 : : * AllocateVfd - grab a free (or new) file record (from VfdCache)
308 : : * FreeVfd - free a file record
309 : : *
310 : : * The Least Recently Used ring is a doubly linked list that begins and
311 : : * ends on element zero. Element zero is special -- it doesn't represent
312 : : * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
313 : : * anchor that shows us the beginning/end of the ring.
314 : : * Only VFD elements that are currently really open (have an FD assigned) are
315 : : * in the Lru ring. Elements that are "virtually" open can be recognized
316 : : * by having a non-null fileName field.
317 : : *
318 : : * example:
319 : : *
320 : : * /--less----\ /---------\
321 : : * v \ v \
322 : : * #0 --more---> LeastRecentlyUsed --more-\ \
323 : : * ^\ | |
324 : : * \\less--> MostRecentlyUsedFile <---/ |
325 : : * \more---/ \--less--/
326 : : *
327 : : *--------------------
328 : : */
329 : : static void Delete(File file);
330 : : static void LruDelete(File file);
331 : : static void Insert(File file);
332 : : static int LruInsert(File file);
333 : : static bool ReleaseLruFile(void);
334 : : static void ReleaseLruFiles(void);
335 : : static File AllocateVfd(void);
336 : : static void FreeVfd(File file);
337 : :
338 : : static int FileAccess(File file);
339 : : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
340 : : static bool reserveAllocatedDesc(void);
341 : : static int FreeDesc(AllocateDesc *desc);
342 : :
343 : : static void BeforeShmemExit_Files(int code, Datum arg);
344 : : static void CleanupTempFiles(bool isCommit, bool isProcExit);
345 : : static void RemovePgTempRelationFiles(const char *tsdirname);
346 : : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
347 : :
348 : : static void walkdir(const char *path,
349 : : void (*action) (const char *fname, bool isdir, int elevel),
350 : : bool process_symlinks,
351 : : int elevel);
352 : : #ifdef PG_FLUSH_DATA_WORKS
353 : : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
354 : : #endif
355 : : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
356 : : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
357 : :
358 : : static int fsync_parent_path(const char *fname, int elevel);
359 : :
360 : :
361 : : /* ResourceOwner callbacks to hold virtual file descriptors */
362 : : static void ResOwnerReleaseFile(Datum res);
363 : : static char *ResOwnerPrintFile(Datum res);
364 : :
/*
 * ResourceOwner descriptor for virtual file descriptors.
 *
 * Resources of this kind are named "File", are released in the
 * RESOURCE_RELEASE_AFTER_LOCKS phase with priority RELEASE_PRIO_FILES, and
 * use ResOwnerReleaseFile / ResOwnerPrintFile as their release and
 * debug-print callbacks.
 */
static const ResourceOwnerDesc file_resowner_desc =
{
	.name = "File",
	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
	.release_priority = RELEASE_PRIO_FILES,
	.ReleaseResource = ResOwnerReleaseFile,
	.DebugPrint = ResOwnerPrintFile
};
373 : :
374 : : /* Convenience wrappers over ResourceOwnerRemember/Forget */
375 : : static inline void
858 heikki.linnakangas@i 376 :CBC 5196 : ResourceOwnerRememberFile(ResourceOwner owner, File file)
377 : : {
378 : 5196 : ResourceOwnerRemember(owner, Int32GetDatum(file), &file_resowner_desc);
379 : 5196 : }
380 : : static inline void
381 : 5192 : ResourceOwnerForgetFile(ResourceOwner owner, File file)
382 : : {
383 : 5192 : ResourceOwnerForget(owner, Int32GetDatum(file), &file_resowner_desc);
384 : 5192 : }
385 : :
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Hands the descriptor to pg_fsync_writethrough() when that implementation
 * is compiled in and wal_sync_method selects it, and to
 * pg_fsync_no_writethrough() otherwise.  The result of the chosen routine
 * is returned unchanged.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * fsync() implementations differ in which access modes they require of
	 * the descriptor they are given, and the requirements for directories
	 * are not the same as those for plain files.
	 *
	 * Any descriptor that might end up here should therefore have been
	 * opened in a mode that every supported system accepts; code that
	 * happens to work on the current platform is not necessarily portable.
	 *
	 * So, in assert-enabled builds, verify that a plain file was opened
	 * with write access (anything but O_RDONLY) and that a directory was
	 * opened O_RDONLY.  The check runs even when fsync() is disabled.
	 *
	 * A failing fstat() is ignored; the fsync() below will report it.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		int			access_mode = fcntl(fd, F_GETFL) & O_ACCMODE;

		if (!S_ISDIR(statbuf.st_mode))
			Assert(access_mode != O_RDONLY);
		else
			Assert(access_mode == O_RDONLY);
	}

	/* don't let the checks above leak an errno value to the caller */
	errno = 0;
#endif

	/* the #if skips the wal_sync_method test where it cannot matter */
#if defined(HAVE_FSYNC_WRITETHROUGH)
	if (wal_sync_method == WAL_SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
435 : :
436 : :
437 : : /*
438 : : * pg_fsync_no_writethrough --- same as fsync except does nothing if
439 : : * enableFsync is off
440 : : */
441 : : int
7604 bruce@momjian.us 442 : 68657 : pg_fsync_no_writethrough(int fd)
443 : : {
444 : : int rc;
445 : :
1000 andres@anarazel.de 446 [ - + ]: 68657 : if (!enableFsync)
9228 tgl@sss.pgh.pa.us 447 : 68657 : return 0;
448 : :
1000 andres@anarazel.de 449 :UBC 0 : retry:
450 : 0 : rc = fsync(fd);
451 : :
452 [ # # # # ]: 0 : if (rc == -1 && errno == EINTR)
453 : 0 : goto retry;
454 : :
455 : 0 : return rc;
456 : : }
457 : :
458 : : /*
459 : : * pg_fsync_writethrough
460 : : */
461 : : int
7604 bruce@momjian.us 462 : 0 : pg_fsync_writethrough(int fd)
463 : : {
464 [ # # ]: 0 : if (enableFsync)
465 : : {
466 : : #if defined(F_FULLFSYNC)
467 : : return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
468 : : #else
5865 tgl@sss.pgh.pa.us 469 : 0 : errno = ENOSYS;
7604 bruce@momjian.us 470 : 0 : return -1;
471 : : #endif
472 : : }
473 : : else
474 : 0 : return 0;
475 : : }
476 : :
477 : : /*
478 : : * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
479 : : */
480 : : int
9156 tgl@sss.pgh.pa.us 481 : 0 : pg_fdatasync(int fd)
482 : : {
483 : : int rc;
484 : :
1000 andres@anarazel.de 485 [ # # ]: 0 : if (!enableFsync)
9156 tgl@sss.pgh.pa.us 486 : 0 : return 0;
487 : :
1000 andres@anarazel.de 488 : 0 : retry:
489 : 0 : rc = fdatasync(fd);
490 : :
491 [ # # # # ]: 0 : if (rc == -1 && errno == EINTR)
492 : 0 : goto retry;
493 : :
494 : 0 : return rc;
495 : : }
496 : :
497 : : /*
498 : : * pg_file_exists -- check that a file exists.
499 : : *
500 : : * This requires an absolute path to the file. Returns true if the file is
501 : : * not a directory, false otherwise.
502 : : */
503 : : bool
793 michael@paquier.xyz 504 :CBC 21751 : pg_file_exists(const char *name)
505 : : {
506 : : struct stat st;
507 : :
508 [ - + ]: 21751 : Assert(name != NULL);
509 : :
510 [ + + ]: 21751 : if (stat(name, &st) == 0)
511 : 11244 : return !S_ISDIR(st.st_mode);
512 [ - + - - : 10507 : else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
- - ]
793 michael@paquier.xyz 513 [ # # ]:UBC 0 : ereport(ERROR,
514 : : (errcode_for_file_access(),
515 : : errmsg("could not access file \"%s\": %m", name)));
516 : :
793 michael@paquier.xyz 517 :CBC 10507 : return false;
518 : : }
519 : :
520 : : /*
521 : : * pg_flush_data --- advise OS that the described dirty data should be flushed
522 : : *
523 : : * offset of 0 with nbytes 0 means that the entire file should be flushed
524 : : */
525 : : void
122 michael@paquier.xyz 526 :GNC 38536 : pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
527 : : {
528 : : /*
529 : : * Right now file flushing is primarily used to avoid making later
530 : : * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
531 : : * if fsyncs are disabled - that's a decision we might want to make
532 : : * configurable at some point.
533 : : */
3677 andres@anarazel.de 534 [ + - ]:CBC 38536 : if (!enableFsync)
535 : 38536 : return;
536 : :
537 : : /*
538 : : * We compile all alternatives that are supported on the current platform,
539 : : * to find portability problems more easily.
540 : : */
541 : : #if defined(HAVE_SYNC_FILE_RANGE)
542 : : {
543 : : int rc;
544 : : static bool not_implemented_by_kernel = false;
545 : :
2576 tmunro@postgresql.or 546 [ # # ]:UBC 0 : if (not_implemented_by_kernel)
547 : 0 : return;
548 : :
1000 andres@anarazel.de 549 : 0 : retry:
550 : :
551 : : /*
552 : : * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
553 : : * tells the OS that writeback for the specified blocks should be
554 : : * started, but that we don't want to wait for completion. Note that
555 : : * this call might block if too much dirty data exists in the range.
556 : : * This is the preferable method on OSs supporting it, as it works
557 : : * reliably when available (contrast to msync()) and doesn't flush out
558 : : * clean data (like FADV_DONTNEED).
559 : : */
3677 560 : 0 : rc = sync_file_range(fd, offset, nbytes,
561 : : SYNC_FILE_RANGE_WRITE);
562 [ # # ]: 0 : if (rc != 0)
563 : : {
564 : : int elevel;
565 : :
1000 566 [ # # ]: 0 : if (rc == EINTR)
567 : 0 : goto retry;
568 : :
569 : : /*
570 : : * For systems that don't have an implementation of
571 : : * sync_file_range() such as Windows WSL, generate only one
572 : : * warning and then suppress all further attempts by this process.
573 : : */
2576 tmunro@postgresql.or 574 [ # # ]: 0 : if (errno == ENOSYS)
575 : : {
576 : 0 : elevel = WARNING;
577 : 0 : not_implemented_by_kernel = true;
578 : : }
579 : : else
580 : 0 : elevel = data_sync_elevel(WARNING);
581 : :
582 [ # # ]: 0 : ereport(elevel,
583 : : (errcode_for_file_access(),
584 : : errmsg("could not flush dirty data: %m")));
585 : : }
586 : :
3677 andres@anarazel.de 587 : 0 : return;
588 : : }
589 : : #endif
590 : : #if !defined(WIN32) && defined(MS_ASYNC)
591 : : {
592 : : void *p;
593 : : static int pagesize = 0;
594 : :
595 : : /*
596 : : * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
597 : : * writeback. On linux it only does so if MS_SYNC is specified, but
598 : : * then it does the writeback synchronously. Luckily all common linux
599 : : * systems have sync_file_range(). This is preferable over
600 : : * FADV_DONTNEED because it doesn't flush out clean data.
601 : : *
602 : : * We map the file (mmap()), tell the kernel to sync back the contents
603 : : * (msync()), and then remove the mapping again (munmap()).
604 : : */
605 : :
606 : : /* mmap() needs actual length if we want to map whole file */
607 : : if (offset == 0 && nbytes == 0)
608 : : {
609 : : nbytes = lseek(fd, 0, SEEK_END);
610 : : if (nbytes < 0)
611 : : {
612 : : ereport(WARNING,
613 : : (errcode_for_file_access(),
614 : : errmsg("could not determine dirty data size: %m")));
615 : : return;
616 : : }
617 : : }
618 : :
619 : : /*
620 : : * Some platforms reject partial-page mmap() attempts. To deal with
621 : : * that, just truncate the request to a page boundary. If any extra
622 : : * bytes don't get flushed, well, it's only a hint anyway.
623 : : */
624 : :
625 : : /* fetch pagesize only once */
626 : : if (pagesize == 0)
627 : : pagesize = sysconf(_SC_PAGESIZE);
628 : :
629 : : /* align length to pagesize, dropping any fractional page */
630 : : if (pagesize > 0)
631 : : nbytes = (nbytes / pagesize) * pagesize;
632 : :
633 : : /* fractional-page request is a no-op */
634 : : if (nbytes <= 0)
635 : : return;
636 : :
637 : : /*
638 : : * mmap could well fail, particularly on 32-bit platforms where there
639 : : * may simply not be enough address space. If so, silently fall
640 : : * through to the next implementation.
641 : : */
642 : : if (nbytes <= (pgoff_t) SSIZE_MAX)
643 : : p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
644 : : else
645 : : p = MAP_FAILED;
646 : :
647 : : if (p != MAP_FAILED)
648 : : {
649 : : int rc;
650 : :
651 : : rc = msync(p, (size_t) nbytes, MS_ASYNC);
652 : : if (rc != 0)
653 : : {
654 : : ereport(data_sync_elevel(WARNING),
655 : : (errcode_for_file_access(),
656 : : errmsg("could not flush dirty data: %m")));
657 : : /* NB: need to fall through to munmap()! */
658 : : }
659 : :
660 : : rc = munmap(p, (size_t) nbytes);
661 : : if (rc != 0)
662 : : {
663 : : /* FATAL error because mapping would remain */
664 : : ereport(FATAL,
665 : : (errcode_for_file_access(),
666 : : errmsg("could not munmap() while flushing data: %m")));
667 : : }
668 : :
669 : : return;
670 : : }
671 : : }
672 : : #endif
673 : : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
674 : : {
675 : : int rc;
676 : :
677 : : /*
678 : : * Signal the kernel that the passed in range should not be cached
679 : : * anymore. This has the, desired, side effect of writing out dirty
680 : : * data, and the, undesired, side effect of likely discarding useful
681 : : * clean cached blocks. For the latter reason this is the least
682 : : * preferable method.
683 : : */
684 : :
685 : : rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
686 : :
687 : : if (rc != 0)
688 : : {
689 : : /* don't error out, this is just a performance optimization */
690 : : ereport(WARNING,
691 : : (errcode_for_file_access(),
692 : : errmsg("could not flush dirty data: %m")));
693 : : }
694 : :
695 : : return;
696 : : }
697 : : #endif
698 : : }
699 : :
700 : : /*
701 : : * Truncate an open file to a given length.
702 : : */
703 : : static int
122 michael@paquier.xyz 704 :GNC 546 : pg_ftruncate(int fd, pgoff_t length)
705 : : {
706 : : int ret;
707 : :
1000 andres@anarazel.de 708 :CBC 546 : retry:
709 : 546 : ret = ftruncate(fd, length);
710 : :
711 [ - + - - ]: 546 : if (ret == -1 && errno == EINTR)
1000 andres@anarazel.de 712 :UBC 0 : goto retry;
713 : :
1000 andres@anarazel.de 714 :CBC 546 : return ret;
715 : : }
716 : :
717 : : /*
718 : : * Truncate a file to a given length by name.
719 : : */
720 : : int
122 michael@paquier.xyz 721 :GNC 228497 : pg_truncate(const char *path, pgoff_t length)
722 : : {
723 : : int ret;
724 : : #ifdef WIN32
725 : : int save_errno;
726 : : int fd;
727 : :
728 : : fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
729 : : if (fd >= 0)
730 : : {
731 : : ret = pg_ftruncate(fd, length);
732 : : save_errno = errno;
733 : : CloseTransientFile(fd);
734 : : errno = save_errno;
735 : : }
736 : : else
737 : : ret = -1;
738 : : #else
739 : :
1000 andres@anarazel.de 740 :CBC 228497 : retry:
741 : 228497 : ret = truncate(path, length);
742 : :
743 [ + + - + ]: 228497 : if (ret == -1 && errno == EINTR)
1000 andres@anarazel.de 744 :UBC 0 : goto retry;
745 : : #endif
746 : :
1000 andres@anarazel.de 747 :CBC 228497 : return ret;
748 : : }
749 : :
750 : : /*
751 : : * fsync_fname -- fsync a file or directory, handling errors properly
752 : : *
753 : : * Try to fsync a file or directory. When doing the latter, ignore errors that
754 : : * indicate the OS just doesn't allow/require fsyncing directories.
755 : : */
756 : : void
3658 757 : 22488 : fsync_fname(const char *fname, bool isdir)
758 : : {
2673 tmunro@postgresql.or 759 : 22488 : fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
3658 andres@anarazel.de 760 : 22488 : }
761 : :
762 : : /*
763 : : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
764 : : *
765 : : * This routine ensures that, after returning, the effect of renaming file
766 : : * persists in case of a crash. A crash while this routine is running will
767 : : * leave you with either the pre-existing or the moved file in place of the
768 : : * new file; no mixed state or truncated files are possible.
769 : : *
770 : : * It does so by using fsync on the old filename and the possibly existing
771 : : * target filename before the rename, and the target file and directory after.
772 : : *
773 : : * Note that rename() cannot be used across arbitrary directories, as they
774 : : * might not be on the same filesystem. Therefore this routine does not
775 : : * support renaming across directories.
776 : : *
777 : : * Log errors with the caller specified severity.
778 : : *
779 : : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
780 : : * valid upon return.
781 : : */
782 : : int
783 : 6634 : durable_rename(const char *oldfile, const char *newfile, int elevel)
784 : : {
785 : : int fd;
786 : :
787 : : /*
788 : : * First fsync the old and target path (if it exists), to ensure that they
789 : : * are properly persistent on disk. Syncing the target file is not
790 : : * strictly necessary, but it makes it easier to reason about crashes;
791 : : * because it's then guaranteed that either source or target file exists
792 : : * after a crash.
793 : : */
794 [ - + ]: 6634 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
3658 andres@anarazel.de 795 :UBC 0 : return -1;
796 : :
3095 peter_e@gmx.net 797 :CBC 6634 : fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
3658 andres@anarazel.de 798 [ + + ]: 6634 : if (fd < 0)
799 : : {
800 [ - + ]: 4607 : if (errno != ENOENT)
801 : : {
3658 andres@anarazel.de 802 [ # # ]:UBC 0 : ereport(elevel,
803 : : (errcode_for_file_access(),
804 : : errmsg("could not open file \"%s\": %m", newfile)));
805 : 0 : return -1;
806 : : }
807 : : }
808 : : else
809 : : {
3658 andres@anarazel.de 810 [ - + ]:CBC 2027 : if (pg_fsync(fd) != 0)
811 : : {
812 : : int save_errno;
813 : :
814 : : /* close file upon error, might not be in transaction context */
3658 andres@anarazel.de 815 :UBC 0 : save_errno = errno;
816 : 0 : CloseTransientFile(fd);
817 : 0 : errno = save_errno;
818 : :
819 [ # # ]: 0 : ereport(elevel,
820 : : (errcode_for_file_access(),
821 : : errmsg("could not fsync file \"%s\": %m", newfile)));
822 : 0 : return -1;
823 : : }
824 : :
2444 peter@eisentraut.org 825 [ - + ]:CBC 2027 : if (CloseTransientFile(fd) != 0)
826 : : {
2563 michael@paquier.xyz 827 [ # # ]:UBC 0 : ereport(elevel,
828 : : (errcode_for_file_access(),
829 : : errmsg("could not close file \"%s\": %m", newfile)));
830 : 0 : return -1;
831 : : }
832 : : }
833 : :
834 : : /* Time to do the real deal... */
3658 andres@anarazel.de 835 [ - + ]:CBC 6634 : if (rename(oldfile, newfile) < 0)
836 : : {
3658 andres@anarazel.de 837 [ # # ]:UBC 0 : ereport(elevel,
838 : : (errcode_for_file_access(),
839 : : errmsg("could not rename file \"%s\" to \"%s\": %m",
840 : : oldfile, newfile)));
841 : 0 : return -1;
842 : : }
843 : :
844 : : /*
845 : : * To guarantee renaming the file is persistent, fsync the file with its
846 : : * new name, and its containing directory.
847 : : */
3658 andres@anarazel.de 848 [ - + ]:CBC 6634 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
3658 andres@anarazel.de 849 :UBC 0 : return -1;
850 : :
3658 andres@anarazel.de 851 [ - + ]:CBC 6634 : if (fsync_parent_path(newfile, elevel) != 0)
3658 andres@anarazel.de 852 :UBC 0 : return -1;
853 : :
3658 andres@anarazel.de 854 :CBC 6634 : return 0;
855 : : }
856 : :
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * The file is unlinked and then its parent directory is fsync'ed, so that
 * once this routine has returned successfully the removal persists across
 * a crash.  A crash while this routine is running does not leave the
 * system in a mixed state.
 *
 * Errors are logged with the severity specified by the caller (elevel).
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is
 * not valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			rc;

	rc = unlink(fname);
	if (rc < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * The removal is only guaranteed to be persistent once the directory
	 * entry change itself has reached disk, so sync the parent directory.
	 */
	rc = fsync_parent_path(fname, elevel);

	return (rc != 0) ? -1 : 0;
}
893 : :
894 : : /*
895 : : * InitFileAccess --- initialize this module during backend startup
896 : : *
897 : : * This is called during either normal or standalone backend start.
898 : : * It is *not* called in the postmaster.
899 : : *
900 : : * Note that this does not initialize temporary file access, that is
901 : : * separately initialized via InitTemporaryFileAccess().
902 : : */
903 : : void
7524 tgl@sss.pgh.pa.us 904 : 21553 : InitFileAccess(void)
905 : : {
7456 bruce@momjian.us 906 [ - + ]: 21553 : Assert(SizeVfdCache == 0); /* call me only once */
907 : :
908 : : /* initialize cache header entry */
7524 tgl@sss.pgh.pa.us 909 : 21553 : VfdCache = (Vfd *) malloc(sizeof(Vfd));
910 [ - + ]: 21553 : if (VfdCache == NULL)
7524 tgl@sss.pgh.pa.us 911 [ # # ]:UBC 0 : ereport(FATAL,
912 : : (errcode(ERRCODE_OUT_OF_MEMORY),
913 : : errmsg("out of memory")));
914 : :
396 peter@eisentraut.org 915 [ + - + - :CBC 172424 : MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
+ - + - +
+ ]
7524 tgl@sss.pgh.pa.us 916 : 21553 : VfdCache->fd = VFD_CLOSED;
917 : :
918 : 21553 : SizeVfdCache = 1;
1681 andres@anarazel.de 919 : 21553 : }
920 : :
921 : : /*
922 : : * InitTemporaryFileAccess --- initialize temporary file access during startup
923 : : *
924 : : * This is called during either normal or standalone backend start.
925 : : * It is *not* called in the postmaster.
926 : : *
927 : : * This is separate from InitFileAccess() because temporary file cleanup can
928 : : * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
929 : : * our reporting has to happen before that. Low level file access should be
930 : : * available for longer, hence the separate initialization / shutdown of
931 : : * temporary file handling.
932 : : */
933 : : void
934 : 21553 : InitTemporaryFileAccess(void)
935 : : {
1431 drowley@postgresql.o 936 [ - + ]: 21553 : Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
1681 andres@anarazel.de 937 [ - + ]: 21553 : Assert(!temporary_files_allowed); /* call me only once */
938 : :
939 : : /*
940 : : * Register before-shmem-exit hook to ensure temp files are dropped while
941 : : * we can still report stats.
942 : : */
943 : 21553 : before_shmem_exit(BeforeShmemExit_Files, 0);
944 : :
945 : : #ifdef USE_ASSERT_CHECKING
946 : 21553 : temporary_files_allowed = true;
947 : : #endif
7524 tgl@sss.pgh.pa.us 948 : 21553 : }
949 : :
950 : : /*
951 : : * count_usable_fds --- count how many FDs the system will let us open,
952 : : * and estimate how many are already open.
953 : : *
954 : : * We stop counting if usable_fds reaches max_to_probe. Note: a small
955 : : * value of max_to_probe might result in an underestimate of already_open;
956 : : * we must fill in any "gaps" in the set of used FDs before the calculation
957 : : * of already_open will give the right answer. In practice, max_to_probe
958 : : * of a couple of dozen should be enough to ensure good results.
959 : : *
960 : : * We assume stderr (FD 2) is available for dup'ing. While the calling
961 : : * script could theoretically close that, it would be a really bad idea,
962 : : * since then one risks loss of error messages from, e.g., libc.
963 : : */
964 : : static void
7525 965 : 1145 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
966 : : {
967 : : int *fd;
968 : : int size;
8056 969 : 1145 : int used = 0;
970 : 1145 : int highestfd = 0;
971 : : int j;
972 : :
973 : : #ifdef HAVE_GETRLIMIT
974 : : struct rlimit rlim;
975 : : int getrlimit_status;
976 : : #endif
977 : :
978 : 1145 : size = 1024;
979 : 1145 : fd = (int *) palloc(size * sizeof(int));
980 : :
981 : : #ifdef HAVE_GETRLIMIT
6220 peter_e@gmx.net 982 : 1145 : getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
983 [ - + ]: 1145 : if (getrlimit_status != 0)
6220 peter_e@gmx.net 984 [ # # ]:UBC 0 : ereport(WARNING, (errmsg("getrlimit failed: %m")));
985 : : #endif /* HAVE_GETRLIMIT */
986 : :
987 : : /* dup until failure or probe limit reached */
988 : : for (;;)
8056 tgl@sss.pgh.pa.us 989 :CBC 1143855 : {
990 : : int thisfd;
991 : :
992 : : #ifdef HAVE_GETRLIMIT
993 : :
994 : : /*
995 : : * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
996 : : * some platforms
997 : : */
6220 peter_e@gmx.net 998 [ + - - + ]: 1145000 : if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
6220 peter_e@gmx.net 999 :UBC 0 : break;
1000 : : #endif
1001 : :
1655 tgl@sss.pgh.pa.us 1002 :CBC 1145000 : thisfd = dup(2);
8056 1003 [ - + ]: 1145000 : if (thisfd < 0)
1004 : : {
1005 : : /* Expect EMFILE or ENFILE, else it's fishy */
8056 tgl@sss.pgh.pa.us 1006 [ # # # # ]:UBC 0 : if (errno != EMFILE && errno != ENFILE)
1655 1007 [ # # ]: 0 : elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
8056 1008 : 0 : break;
1009 : : }
1010 : :
8056 tgl@sss.pgh.pa.us 1011 [ - + ]:CBC 1145000 : if (used >= size)
1012 : : {
8056 tgl@sss.pgh.pa.us 1013 :UBC 0 : size *= 2;
1014 : 0 : fd = (int *) repalloc(fd, size * sizeof(int));
1015 : : }
8056 tgl@sss.pgh.pa.us 1016 :CBC 1145000 : fd[used++] = thisfd;
1017 : :
1018 [ + - ]: 1145000 : if (highestfd < thisfd)
1019 : 1145000 : highestfd = thisfd;
1020 : :
7525 1021 [ + + ]: 1145000 : if (used >= max_to_probe)
1022 : 1145 : break;
1023 : : }
1024 : :
1025 : : /* release the files we opened */
8056 1026 [ + + ]: 1146145 : for (j = 0; j < used; j++)
1027 : 1145000 : close(fd[j]);
1028 : :
1029 : 1145 : pfree(fd);
1030 : :
1031 : : /*
1032 : : * Return results. usable_fds is just the number of successful dups. We
1033 : : * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1034 : : * number) and so already_open is highestfd+1 - usable_fds.
1035 : : */
1036 : 1145 : *usable_fds = used;
7868 bruce@momjian.us 1037 : 1145 : *already_open = highestfd + 1 - used;
8056 tgl@sss.pgh.pa.us 1038 : 1145 : }
1039 : :
1040 : : /*
1041 : : * set_max_safe_fds
1042 : : * Determine number of file descriptors that fd.c is allowed to use
1043 : : */
1044 : : void
1045 : 1145 : set_max_safe_fds(void)
1046 : : {
1047 : : int usable_fds;
1048 : : int already_open;
1049 : :
1050 : : /*----------
1051 : : * We want to set max_safe_fds to
1052 : : * MIN(usable_fds, max_files_per_process)
1053 : : * less the slop factor for files that are opened without consulting
1054 : : * fd.c. This ensures that we won't allow to open more than
1055 : : * max_files_per_process, or the experimentally-determined EMFILE limit,
1056 : : * additional files.
1057 : : *----------
1058 : : */
7525 1059 : 1145 : count_usable_fds(max_files_per_process,
1060 : : &usable_fds, &already_open);
1061 : :
356 andres@anarazel.de 1062 : 1145 : max_safe_fds = Min(usable_fds, max_files_per_process);
1063 : :
1064 : : /*
1065 : : * Take off the FDs reserved for system() etc.
1066 : : */
8056 tgl@sss.pgh.pa.us 1067 : 1145 : max_safe_fds -= NUM_RESERVED_FDS;
1068 : :
1069 : : /*
1070 : : * Make sure we still have enough to get by.
1071 : : */
1072 [ - + ]: 1145 : if (max_safe_fds < FD_MINFREE)
8056 tgl@sss.pgh.pa.us 1073 [ # # ]:UBC 0 : ereport(FATAL,
1074 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1075 : : errmsg("insufficient file descriptors available to start server process"),
1076 : : errdetail("System allows %d, server needs at least %d, %d files are already open.",
1077 : : max_safe_fds + NUM_RESERVED_FDS,
1078 : : FD_MINFREE + NUM_RESERVED_FDS,
1079 : : already_open)));
1080 : :
8056 tgl@sss.pgh.pa.us 1081 [ + + ]:CBC 1145 : elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1082 : : max_safe_fds, usable_fds, already_open);
1083 : 1145 : }
1084 : :
1085 : : /*
1086 : : * Open a file with BasicOpenFilePerm() and pass default file mode for the
1087 : : * fileMode parameter.
1088 : : */
1089 : : int
3095 peter_e@gmx.net 1090 : 40607 : BasicOpenFile(const char *fileName, int fileFlags)
1091 : : {
2899 sfrost@snowman.net 1092 : 40607 : return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1093 : : }
1094 : :
1095 : : /*
1096 : : * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1097 : : *
1098 : : * This is exported for use by places that really want a plain kernel FD,
1099 : : * but need to be proof against running out of FDs. Once an FD has been
1100 : : * successfully returned, it is the caller's responsibility to ensure that
1101 : : * it will not be leaked on ereport()! Most users should *not* call this
1102 : : * routine directly, but instead use the VFD abstraction level, which
1103 : : * provides protection against descriptor leaks as well as management of
1104 : : * files that need to be open for more than a short period of time.
1105 : : *
1106 : : * Ideally this should be the *only* direct call of open() in the backend.
1107 : : * In practice, the postmaster calls open() directly, and there are some
1108 : : * direct open() calls done early in backend startup. Those are OK since
1109 : : * this module wouldn't have any open files to close at that point anyway.
1110 : : */
1111 : : int
3095 peter_e@gmx.net 1112 : 1800207 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1113 : : {
1114 : : int fd;
1115 : :
9417 tgl@sss.pgh.pa.us 1116 : 1800207 : tryAgain:
1117 : : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1118 : : fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1119 : : #else
1120 : 1800207 : fd = open(fileName, fileFlags, fileMode);
1121 : : #endif
1122 : :
1123 [ + + ]: 1800207 : if (fd >= 0)
1124 : : {
1125 : : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1126 : : if (fileFlags & PG_O_DIRECT)
1127 : : {
1128 : : if (fcntl(fd, F_NOCACHE, 1) < 0)
1129 : : {
1130 : : int save_errno = errno;
1131 : :
1132 : : close(fd);
1133 : : errno = save_errno;
1134 : : return -1;
1135 : : }
1136 : : }
1137 : : #endif
1138 : :
1139 : 1407406 : return fd; /* success! */
1140 : : }
1141 : :
9331 1142 [ + - - + ]: 392801 : if (errno == EMFILE || errno == ENFILE)
1143 : : {
9124 bruce@momjian.us 1144 :UBC 0 : int save_errno = errno;
1145 : :
8270 tgl@sss.pgh.pa.us 1146 [ # # ]: 0 : ereport(LOG,
1147 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1148 : : errmsg("out of file descriptors: %m; release and retry")));
9417 1149 : 0 : errno = 0;
9331 1150 [ # # ]: 0 : if (ReleaseLruFile())
1151 : 0 : goto tryAgain;
1152 : 0 : errno = save_errno;
1153 : : }
1154 : :
9417 tgl@sss.pgh.pa.us 1155 :CBC 392801 : return -1; /* failure */
1156 : : }
1157 : :
1158 : : /*
1159 : : * AcquireExternalFD - attempt to reserve an external file descriptor
1160 : : *
1161 : : * This should be used by callers that need to hold a file descriptor open
1162 : : * over more than a short interval, but cannot use any of the other facilities
1163 : : * provided by this module.
1164 : : *
1165 : : * The difference between this and the underlying ReserveExternalFD function
1166 : : * is that this will report failure (by setting errno and returning false)
1167 : : * if "too many" external FDs are already reserved. This should be used in
1168 : : * any code where the total number of FDs to be reserved is not predictable
1169 : : * and small.
1170 : : */
1171 : : bool
2211 1172 : 126032 : AcquireExternalFD(void)
1173 : : {
1174 : : /*
1175 : : * We don't want more than max_safe_fds / 3 FDs to be consumed for
1176 : : * "external" FDs.
1177 : : */
1178 [ + - ]: 126032 : if (numExternalFDs < max_safe_fds / 3)
1179 : : {
1180 : 126032 : ReserveExternalFD();
1181 : 126032 : return true;
1182 : : }
2211 tgl@sss.pgh.pa.us 1183 :UBC 0 : errno = EMFILE;
1184 : 0 : return false;
1185 : : }
1186 : :
1187 : : /*
1188 : : * ReserveExternalFD - report external consumption of a file descriptor
1189 : : *
1190 : : * This should be used by callers that need to hold a file descriptor open
1191 : : * over more than a short interval, but cannot use any of the other facilities
1192 : : * provided by this module. This just tracks the use of the FD and closes
1193 : : * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1194 : : *
1195 : : * Call this directly only in code where failure to reserve the FD would be
1196 : : * fatal; for example, the WAL-writing code does so, since the alternative is
1197 : : * session failure. Also, it's very unwise to do so in code that could
1198 : : * consume more than one FD per process.
1199 : : *
1200 : : * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1201 : : * available, it doesn't matter too much whether this is called before or
1202 : : * after actually opening the FD; but doing so beforehand reduces the risk of
1203 : : * an EMFILE failure if not everybody played nice. In any case, it's solely
1204 : : * caller's responsibility to keep the external-FD count in sync with reality.
1205 : : */
1206 : : void
2211 tgl@sss.pgh.pa.us 1207 :CBC 204156 : ReserveExternalFD(void)
1208 : : {
1209 : : /*
1210 : : * Release VFDs if needed to stay safe. Because we do this before
1211 : : * incrementing numExternalFDs, the final state will be as desired, i.e.,
1212 : : * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1213 : : */
1214 : 204156 : ReleaseLruFiles();
1215 : :
1216 : 204156 : numExternalFDs++;
1217 : 204156 : }
1218 : :
1219 : : /*
1220 : : * ReleaseExternalFD - report release of an external file descriptor
1221 : : *
1222 : : * This is guaranteed not to change errno, so it can be used in failure paths.
1223 : : */
1224 : : void
1225 : 182349 : ReleaseExternalFD(void)
1226 : : {
1227 [ - + ]: 182349 : Assert(numExternalFDs > 0);
1228 : 182349 : numExternalFDs--;
1229 : 182349 : }
1230 : :
1231 : :
1232 : : #if defined(FDDEBUG)
1233 : :
1234 : : static void
1235 : : _dump_lru(void)
1236 : : {
1237 : : int mru = VfdCache[0].lruLessRecently;
1238 : : Vfd *vfdP = &VfdCache[mru];
1239 : : char buf[2048];
1240 : :
1241 : : snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1242 : : while (mru != 0)
1243 : : {
1244 : : mru = vfdP->lruLessRecently;
1245 : : vfdP = &VfdCache[mru];
1246 : : snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1247 : : }
1248 : : snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1249 : : elog(LOG, "%s", buf);
1250 : : }
1251 : : #endif /* FDDEBUG */
1252 : :
1253 : : static void
10841 scrappy@hub.org 1254 : 1440606 : Delete(File file)
1255 : : {
1256 : : Vfd *vfdP;
1257 : :
9807 tgl@sss.pgh.pa.us 1258 [ - + ]: 1440606 : Assert(file != 0);
1259 : :
1260 : : DO_DB(elog(LOG, "Delete %d (%s)",
1261 : : file, VfdCache[file].fileName));
1262 : : DO_DB(_dump_lru());
1263 : :
1264 : 1440606 : vfdP = &VfdCache[file];
1265 : :
1266 : 1440606 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1267 : 1440606 : VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1268 : :
1269 : : DO_DB(_dump_lru());
10841 scrappy@hub.org 1270 : 1440606 : }
1271 : :
1272 : : static void
1273 : 3492 : LruDelete(File file)
1274 : : {
1275 : : Vfd *vfdP;
1276 : :
9807 tgl@sss.pgh.pa.us 1277 [ - + ]: 3492 : Assert(file != 0);
1278 : :
1279 : : DO_DB(elog(LOG, "LruDelete %d (%s)",
1280 : : file, VfdCache[file].fileName));
1281 : :
1282 : 3492 : vfdP = &VfdCache[file];
1283 : :
351 andres@anarazel.de 1284 : 3492 : pgaio_closing_fd(vfdP->fd);
1285 : :
1286 : : /*
1287 : : * Close the file. We aren't expecting this to fail; if it does, better
1288 : : * to leak the FD than to mess up our internal state.
1289 : : */
2444 peter@eisentraut.org 1290 [ - + ]: 3492 : if (close(vfdP->fd) != 0)
2673 tmunro@postgresql.or 1291 [ # # # # ]:UBC 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1292 : : "could not close file \"%s\": %m", vfdP->fileName);
9807 tgl@sss.pgh.pa.us 1293 :CBC 3492 : vfdP->fd = VFD_CLOSED;
3309 1294 : 3492 : --nfile;
1295 : :
1296 : : /* delete the vfd record from the LRU ring */
1297 : 3492 : Delete(file);
10841 scrappy@hub.org 1298 : 3492 : }
1299 : :
1300 : : static void
1301 : 1841826 : Insert(File file)
1302 : : {
1303 : : Vfd *vfdP;
1304 : :
9807 tgl@sss.pgh.pa.us 1305 [ - + ]: 1841826 : Assert(file != 0);
1306 : :
1307 : : DO_DB(elog(LOG, "Insert %d (%s)",
1308 : : file, VfdCache[file].fileName));
1309 : : DO_DB(_dump_lru());
1310 : :
10416 bruce@momjian.us 1311 : 1841826 : vfdP = &VfdCache[file];
1312 : :
1313 : 1841826 : vfdP->lruMoreRecently = 0;
1314 : 1841826 : vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1315 : 1841826 : VfdCache[0].lruLessRecently = file;
1316 : 1841826 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1317 : :
1318 : : DO_DB(_dump_lru());
10841 scrappy@hub.org 1319 : 1841826 : }
1320 : :
1321 : : /* returns 0 on success, -1 on re-open failure (with errno set) */
1322 : : static int
10416 bruce@momjian.us 1323 : 34 : LruInsert(File file)
1324 : : {
1325 : : Vfd *vfdP;
1326 : :
9807 tgl@sss.pgh.pa.us 1327 [ - + ]: 34 : Assert(file != 0);
1328 : :
1329 : : DO_DB(elog(LOG, "LruInsert %d (%s)",
1330 : : file, VfdCache[file].fileName));
1331 : :
10416 bruce@momjian.us 1332 : 34 : vfdP = &VfdCache[file];
1333 : :
1334 [ + - ]: 34 : if (FileIsNotOpen(file))
1335 : : {
1336 : : /* Close excess kernel FDs. */
4662 tgl@sss.pgh.pa.us 1337 : 34 : ReleaseLruFiles();
1338 : :
1339 : : /*
1340 : : * The open could still fail for lack of file descriptors, eg due to
1341 : : * overall system file table being full. So, be prepared to release
1342 : : * another FD if necessary...
1343 : : */
3095 peter_e@gmx.net 1344 : 34 : vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1345 : : vfdP->fileMode);
10416 bruce@momjian.us 1346 [ - + ]: 34 : if (vfdP->fd < 0)
1347 : : {
1348 : : DO_DB(elog(LOG, "re-open failed: %m"));
4686 tgl@sss.pgh.pa.us 1349 :UBC 0 : return -1;
1350 : : }
1351 : : else
1352 : : {
10416 bruce@momjian.us 1353 :CBC 34 : ++nfile;
1354 : : }
1355 : : }
1356 : :
1357 : : /*
1358 : : * put it at the head of the Lru ring
1359 : : */
1360 : :
1361 : 34 : Insert(file);
1362 : :
10057 1363 : 34 : return 0;
1364 : : }
1365 : :
1366 : : /*
1367 : : * Release one kernel FD by closing the least-recently-used VFD.
1368 : : */
1369 : : static bool
9331 tgl@sss.pgh.pa.us 1370 : 3342 : ReleaseLruFile(void)
1371 : : {
1372 : : DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1373 : :
1374 [ + - ]: 3342 : if (nfile > 0)
1375 : : {
1376 : : /*
1377 : : * There are opened files and so there should be at least one used vfd
1378 : : * in the ring.
1379 : : */
1380 [ - + ]: 3342 : Assert(VfdCache[0].lruMoreRecently != 0);
1381 : 3342 : LruDelete(VfdCache[0].lruMoreRecently);
1382 : 3342 : return true; /* freed a file */
1383 : : }
9331 tgl@sss.pgh.pa.us 1384 :UBC 0 : return false; /* no files available to free */
1385 : : }
1386 : :
1387 : : /*
1388 : : * Release kernel FDs as needed to get under the max_safe_fds limit.
1389 : : * After calling this, it's OK to try to open another file.
1390 : : */
1391 : : static void
4662 tgl@sss.pgh.pa.us 1392 :CBC 2098104 : ReleaseLruFiles(void)
1393 : : {
2211 1394 [ + + ]: 2101446 : while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1395 : : {
4662 1396 [ - + ]: 3342 : if (!ReleaseLruFile())
4662 tgl@sss.pgh.pa.us 1397 :UBC 0 : break;
1398 : : }
4662 tgl@sss.pgh.pa.us 1399 :CBC 2098104 : }
1400 : :
1401 : : static File
9331 1402 : 1340209 : AllocateVfd(void)
1403 : : {
1404 : : Index i;
1405 : : File file;
1406 : :
1407 : : DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1408 : :
7456 bruce@momjian.us 1409 [ - + ]: 1340209 : Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1410 : :
10416 1411 [ + + ]: 1340209 : if (VfdCache[0].nextFree == 0)
1412 : : {
1413 : : /*
1414 : : * The free list is empty so it is time to increase the size of the
1415 : : * array. We choose to double it each time this happens. However,
1416 : : * there's not much point in starting *real* small.
1417 : : */
9791 1418 : 25069 : Size newCacheSize = SizeVfdCache * 2;
1419 : : Vfd *newVfdCache;
1420 : :
9807 tgl@sss.pgh.pa.us 1421 [ + + ]: 25069 : if (newCacheSize < 32)
1422 : 18422 : newCacheSize = 32;
1423 : :
1424 : : /*
1425 : : * Be careful not to clobber VfdCache ptr if realloc fails.
1426 : : */
9112 1427 : 25069 : newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1428 [ - + ]: 25069 : if (newVfdCache == NULL)
8270 tgl@sss.pgh.pa.us 1429 [ # # ]:UBC 0 : ereport(ERROR,
1430 : : (errcode(ERRCODE_OUT_OF_MEMORY),
1431 : : errmsg("out of memory")));
9112 tgl@sss.pgh.pa.us 1432 :CBC 25069 : VfdCache = newVfdCache;
1433 : :
1434 : : /*
1435 : : * Initialize the new entries and link them into the free list.
1436 : : */
9807 1437 [ + + ]: 1201655 : for (i = SizeVfdCache; i < newCacheSize; i++)
1438 : : {
396 peter@eisentraut.org 1439 [ + - + - : 9412688 : MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
+ - + - +
+ ]
10416 bruce@momjian.us 1440 : 1176586 : VfdCache[i].nextFree = i + 1;
1441 : 1176586 : VfdCache[i].fd = VFD_CLOSED;
1442 : : }
9807 tgl@sss.pgh.pa.us 1443 : 25069 : VfdCache[newCacheSize - 1].nextFree = 0;
10416 bruce@momjian.us 1444 : 25069 : VfdCache[0].nextFree = SizeVfdCache;
1445 : :
1446 : : /*
1447 : : * Record the new size
1448 : : */
9807 tgl@sss.pgh.pa.us 1449 : 25069 : SizeVfdCache = newCacheSize;
1450 : : }
1451 : :
10416 bruce@momjian.us 1452 : 1340209 : file = VfdCache[0].nextFree;
1453 : :
1454 : 1340209 : VfdCache[0].nextFree = VfdCache[file].nextFree;
1455 : :
1456 : 1340209 : return file;
1457 : : }
1458 : :
1459 : : static void
10841 scrappy@hub.org 1460 : 935808 : FreeVfd(File file)
1461 : : {
9807 tgl@sss.pgh.pa.us 1462 : 935808 : Vfd *vfdP = &VfdCache[file];
1463 : :
1464 : : DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1465 : : file, vfdP->fileName ? vfdP->fileName : ""));
1466 : :
1467 [ + + ]: 935808 : if (vfdP->fileName != NULL)
1468 : : {
1469 : 549786 : free(vfdP->fileName);
1470 : 549786 : vfdP->fileName = NULL;
1471 : : }
9112 1472 : 935808 : vfdP->fdstate = 0x0;
1473 : :
9807 1474 : 935808 : vfdP->nextFree = VfdCache[0].nextFree;
10416 bruce@momjian.us 1475 : 935808 : VfdCache[0].nextFree = file;
10841 scrappy@hub.org 1476 : 935808 : }
1477 : :
1478 : : /* returns 0 on success, -1 on re-open failure (with errno set) */
1479 : : static int
1480 : 3262853 : FileAccess(File file)
1481 : : {
1482 : : int returnValue;
1483 : :
1484 : : DO_DB(elog(LOG, "FileAccess %d (%s)",
1485 : : file, VfdCache[file].fileName));
1486 : :
1487 : : /*
1488 : : * Is the file open? If not, open it and put it at the head of the LRU
1489 : : * ring (possibly closing the least recently used file to get an FD).
1490 : : */
1491 : :
10416 bruce@momjian.us 1492 [ + + ]: 3262853 : if (FileIsNotOpen(file))
1493 : : {
1494 : 34 : returnValue = LruInsert(file);
1495 [ - + ]: 34 : if (returnValue != 0)
10416 bruce@momjian.us 1496 :UBC 0 : return returnValue;
1497 : : }
9807 tgl@sss.pgh.pa.us 1498 [ + + ]:CBC 3262819 : else if (VfdCache[0].lruLessRecently != file)
1499 : : {
1500 : : /*
1501 : : * We now know that the file is open and that it is not the last one
1502 : : * accessed, so we need to move it to the head of the Lru ring.
1503 : : */
1504 : :
10416 bruce@momjian.us 1505 : 887621 : Delete(file);
1506 : 887621 : Insert(file);
1507 : : }
1508 : :
10057 1509 : 3262853 : return 0;
1510 : : }
1511 : :
1512 : : /*
1513 : : * Called whenever a temporary file is deleted to report its size.
1514 : : */
1515 : : static void
122 michael@paquier.xyz 1516 :GNC 3172 : ReportTemporaryFileUsage(const char *path, pgoff_t size)
1517 : : {
3026 andres@anarazel.de 1518 :CBC 3172 : pgstat_report_tempfile(size);
1519 : :
1520 [ + + ]: 3172 : if (log_temp_files >= 0)
1521 : : {
1522 [ + + ]: 1014 : if ((size / 1024) >= log_temp_files)
1523 [ + - ]: 122 : ereport(LOG,
1524 : : (errmsg("temporary file: path \"%s\", size %lu",
1525 : : path, (unsigned long) size)));
1526 : : }
1527 : 3172 : }
1528 : :
1529 : : /*
1530 : : * Called to register a temporary file for automatic close.
1531 : : * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1532 : : * before the file was opened.
1533 : : */
1534 : : static void
1535 : 5196 : RegisterTemporaryFile(File file)
1536 : : {
1537 : 5196 : ResourceOwnerRememberFile(CurrentResourceOwner, file);
1538 : 5196 : VfdCache[file].resowner = CurrentResourceOwner;
1539 : :
1540 : : /* Backup mechanism for closing at end of xact. */
1541 : 5196 : VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1542 : 5196 : have_xact_temporary_files = true;
1543 : 5196 : }
1544 : :
/*
 * FileInvalidate --- called when we get a shared invalidation message on
 *		some relation.
 *
 * If the VFD currently has a kernel FD, close it and drop the VFD from the
 * LRU ring; otherwise do nothing.  (Compiled only when NOT_USED is
 * defined.)
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1557 : :
1558 : : /*
1559 : : * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1560 : : * fileMode parameter.
1561 : : */
1562 : : File
3095 peter_e@gmx.net 1563 : 1340209 : PathNameOpenFile(const char *fileName, int fileFlags)
1564 : : {
2899 sfrost@snowman.net 1565 : 1340209 : return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1566 : : }
1567 : :
1568 : : /*
1569 : : * open a file in an arbitrary directory
1570 : : *
1571 : : * NB: if the passed pathname is relative (which it usually is),
1572 : : * it will be interpreted relative to the process' working directory
1573 : : * (which should always be $PGDATA when this code is running).
1574 : : */
1575 : : File
3095 peter_e@gmx.net 1576 : 1340209 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1577 : : {
1578 : : char *fnamecopy;
1579 : : File file;
1580 : : Vfd *vfdP;
1581 : :
1582 : : DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1583 : : fileName, fileFlags, fileMode));
1584 : :
1585 : : /*
1586 : : * We need a malloc'd copy of the file name; fail cleanly if no room.
1587 : : */
8270 tgl@sss.pgh.pa.us 1588 : 1340209 : fnamecopy = strdup(fileName);
1589 [ - + ]: 1340209 : if (fnamecopy == NULL)
8270 tgl@sss.pgh.pa.us 1590 [ # # ]:UBC 0 : ereport(ERROR,
1591 : : (errcode(ERRCODE_OUT_OF_MEMORY),
1592 : : errmsg("out of memory")));
1593 : :
10416 bruce@momjian.us 1594 :CBC 1340209 : file = AllocateVfd();
1595 : 1340209 : vfdP = &VfdCache[file];
1596 : :
1597 : : /* Close excess kernel FDs. */
4662 tgl@sss.pgh.pa.us 1598 : 1340209 : ReleaseLruFiles();
1599 : :
1600 : : /*
1601 : : * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1602 : : * client shouldn't be expected to know which kernel descriptors are
1603 : : * currently open, so it wouldn't make sense for them to be inherited by
1604 : : * executed subprograms.
1605 : : */
1108 tmunro@postgresql.or 1606 : 1340209 : fileFlags |= O_CLOEXEC;
1607 : :
3095 peter_e@gmx.net 1608 : 1340209 : vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1609 : :
10416 bruce@momjian.us 1610 [ + + ]: 1340209 : if (vfdP->fd < 0)
1611 : : {
4686 tgl@sss.pgh.pa.us 1612 : 386022 : int save_errno = errno;
1613 : :
10416 bruce@momjian.us 1614 : 386022 : FreeVfd(file);
8270 tgl@sss.pgh.pa.us 1615 : 386022 : free(fnamecopy);
4686 1616 : 386022 : errno = save_errno;
10416 bruce@momjian.us 1617 : 386022 : return -1;
1618 : : }
1619 : 954187 : ++nfile;
1620 : : DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1621 : : vfdP->fd));
1622 : :
8270 tgl@sss.pgh.pa.us 1623 : 954187 : vfdP->fileName = fnamecopy;
1624 : : /* Saved flags are adjusted to be OK for re-opening file */
9112 1625 : 954187 : vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
10416 bruce@momjian.us 1626 : 954187 : vfdP->fileMode = fileMode;
5355 tgl@sss.pgh.pa.us 1627 : 954187 : vfdP->fileSize = 0;
8622 1628 : 954187 : vfdP->fdstate = 0x0;
5946 heikki.linnakangas@i 1629 : 954187 : vfdP->resowner = NULL;
1630 : :
1945 tgl@sss.pgh.pa.us 1631 : 954187 : Insert(file);
1632 : :
10416 bruce@momjian.us 1633 : 954187 : return file;
1634 : : }
1635 : :
1636 : : /*
1637 : : * Create directory 'directory'. If necessary, create 'basedir', which must
1638 : : * be the directory above it. This is designed for creating the top-level
1639 : : * temporary directory on demand before creating a directory underneath it.
1640 : : * Do nothing if the directory already exists.
1641 : : *
1642 : : * Directories created within the top-level temporary directory should begin
1643 : : * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1644 : : * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1645 : : * that do not need any particular prefix.
1646 : : */
1647 : : void
3026 andres@anarazel.de 1648 : 200 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1649 : : {
2899 sfrost@snowman.net 1650 [ + + ]: 200 : if (MakePGDirectory(directory) < 0)
1651 : : {
3026 andres@anarazel.de 1652 [ + + ]: 17 : if (errno == EEXIST)
1653 : 4 : return;
1654 : :
1655 : : /*
1656 : : * Failed. Try to create basedir first in case it's missing. Tolerate
1657 : : * EEXIST to close a race against another process following the same
1658 : : * algorithm.
1659 : : */
2899 sfrost@snowman.net 1660 [ + + - + ]: 13 : if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
3026 andres@anarazel.de 1661 [ # # ]:UBC 0 : ereport(ERROR,
1662 : : (errcode_for_file_access(),
1663 : : errmsg("cannot create temporary directory \"%s\": %m",
1664 : : basedir)));
1665 : :
1666 : : /* Try again. */
2899 sfrost@snowman.net 1667 [ + + - + ]:CBC 13 : if (MakePGDirectory(directory) < 0 && errno != EEXIST)
3026 andres@anarazel.de 1668 [ # # ]:UBC 0 : ereport(ERROR,
1669 : : (errcode_for_file_access(),
1670 : : errmsg("cannot create temporary subdirectory \"%s\": %m",
1671 : : directory)));
1672 : : }
1673 : : }
1674 : :
1675 : : /*
1676 : : * Delete a directory and everything in it, if it exists.
1677 : : */
1678 : : void
3026 andres@anarazel.de 1679 :CBC 238 : PathNameDeleteTemporaryDir(const char *dirname)
1680 : : {
1681 : : struct stat statbuf;
1682 : :
1683 : : /* Silently ignore missing directory. */
1684 [ + + + - ]: 238 : if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1685 : 43 : return;
1686 : :
1687 : : /*
1688 : : * Currently, walkdir doesn't offer a way for our passed in function to
1689 : : * maintain state. Perhaps it should, so that we could tell the caller
1690 : : * whether this operation succeeded or failed. Since this operation is
1691 : : * used in a cleanup path, we wouldn't actually behave differently: we'll
1692 : : * just log failures.
1693 : : */
1694 : 195 : walkdir(dirname, unlink_if_exists_fname, false, LOG);
1695 : : }
1696 : :
1697 : : /*
1698 : : * Open a temporary file that will disappear when we close it.
1699 : : *
1700 : : * This routine takes care of generating an appropriate tempfile name.
1701 : : * There's no need to pass in fileFlags or fileMode either, since only
1702 : : * one setting makes any sense for a temp file.
1703 : : *
1704 : : * Unless interXact is true, the file is remembered by CurrentResourceOwner
1705 : : * to ensure it's closed and deleted when it's no longer needed, typically at
1706 : : * the end-of-transaction. In most cases, you don't want temporary files to
1707 : : * outlive the transaction that created them, so this should be false -- but
1708 : : * if you need "somewhat" temporary storage, this might be useful. In either
1709 : : * case, the file is removed when the File is explicitly closed.
1710 : : */
1711 : : File
6856 tgl@sss.pgh.pa.us 1712 : 1591 : OpenTemporaryFile(bool interXact)
1713 : : {
6860 1714 : 1591 : File file = 0;
1715 : :
1681 andres@anarazel.de 1716 [ - + ]: 1591 : Assert(temporary_files_allowed); /* check temp file access is up */
1717 : :
1718 : : /*
1719 : : * Make sure the current resource owner has space for this File before we
1720 : : * open it, if we'll be registering it below.
1721 : : */
3049 tgl@sss.pgh.pa.us 1722 [ + - ]: 1591 : if (!interXact)
858 heikki.linnakangas@i 1723 : 1591 : ResourceOwnerEnlarge(CurrentResourceOwner);
1724 : :
1725 : : /*
1726 : : * If some temp tablespace(s) have been given to us, try to use the next
1727 : : * one. If a given tablespace can't be found, we silently fall back to
1728 : : * the database's default tablespace.
1729 : : *
1730 : : * BUT: if the temp file is slated to outlive the current transaction,
1731 : : * force it into the database's default tablespace, so that it will not
1732 : : * pose a threat to possible tablespace drop attempts.
1733 : : */
6856 tgl@sss.pgh.pa.us 1734 [ + + + - ]: 1591 : if (numTempTableSpaces > 0 && !interXact)
1735 : : {
6695 bruce@momjian.us 1736 : 1 : Oid tblspcOid = GetNextTempTableSpace();
1737 : :
6856 tgl@sss.pgh.pa.us 1738 [ + - ]: 1 : if (OidIsValid(tblspcOid))
1739 : 1 : file = OpenTemporaryFileInTablespace(tblspcOid, false);
1740 : : }
1741 : :
1742 : : /*
1743 : : * If not, or if tablespace is bad, create in database's default
1744 : : * tablespace. MyDatabaseTableSpace should normally be set before we get
1745 : : * here, but just in case it isn't, fall back to pg_default tablespace.
1746 : : */
6860 1747 [ + + ]: 1591 : if (file <= 0)
1748 [ + + ]: 1590 : file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1749 : : MyDatabaseTableSpace :
1750 : : DEFAULTTABLESPACE_OID,
1751 : : true);
1752 : :
1753 : : /* Mark it for deletion at close and temporary file size limit */
3026 andres@anarazel.de 1754 : 1591 : VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1755 : :
1756 : : /* Register it with the current resource owner */
6860 tgl@sss.pgh.pa.us 1757 [ + - ]: 1591 : if (!interXact)
3026 andres@anarazel.de 1758 : 1591 : RegisterTemporaryFile(file);
1759 : :
6860 tgl@sss.pgh.pa.us 1760 : 1591 : return file;
1761 : : }
1762 : :
1763 : : /*
1764 : : * Return the path of the temp directory in a given tablespace.
1765 : : */
1766 : : void
3026 andres@anarazel.de 1767 : 10103 : TempTablespacePath(char *path, Oid tablespace)
1768 : : {
1769 : : /*
1770 : : * Identify the tempfile directory for this tablespace.
1771 : : *
1772 : : * If someone tries to specify pg_global, use pg_default instead.
1773 : : */
1774 [ + - + + ]: 10103 : if (tablespace == InvalidOid ||
1775 [ - + ]: 1 : tablespace == DEFAULTTABLESPACE_OID ||
1776 : : tablespace == GLOBALTABLESPACE_OID)
1777 : 10102 : snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1778 : : else
1779 : : {
1780 : : /* All other tablespaces are accessed via symlinks */
558 michael@paquier.xyz 1781 : 1 : snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1782 : : PG_TBLSPC_DIR, tablespace, TABLESPACE_VERSION_DIRECTORY,
1783 : : PG_TEMP_FILES_DIR);
1784 : : }
3026 andres@anarazel.de 1785 : 10103 : }
1786 : :
1787 : : /*
1788 : : * Open a temporary file in a specific tablespace.
1789 : : * Subroutine for OpenTemporaryFile, which see for details.
1790 : : */
1791 : : static File
1792 : 1591 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1793 : : {
1794 : : char tempdirpath[MAXPGPATH];
1795 : : char tempfilepath[MAXPGPATH];
1796 : : File file;
1797 : :
1798 : 1591 : TempTablespacePath(tempdirpath, tblspcOid);
1799 : :
1800 : : /*
1801 : : * Generate a tempfile name that should be unique within the current
1802 : : * database instance.
1803 : : */
6860 tgl@sss.pgh.pa.us 1804 : 1591 : snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1805 : : tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1806 : :
1807 : : /*
1808 : : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1809 : : * temp file that can be reused.
1810 : : */
1811 : 1591 : file = PathNameOpenFile(tempfilepath,
1812 : : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
9807 1813 [ + + ]: 1591 : if (file <= 0)
1814 : : {
1815 : : /*
1816 : : * We might need to create the tablespace's tempfile directory, if no
1817 : : * one has yet done so.
1818 : : *
1819 : : * Don't check for an error from MakePGDirectory; it could fail if
1820 : : * someone else just did the same thing. If it doesn't work then
1821 : : * we'll bomb out on the second create attempt, instead.
1822 : : */
2899 sfrost@snowman.net 1823 : 103 : (void) MakePGDirectory(tempdirpath);
1824 : :
6860 tgl@sss.pgh.pa.us 1825 : 103 : file = PathNameOpenFile(tempfilepath,
1826 : : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1827 [ - + - - ]: 103 : if (file <= 0 && rejectError)
8270 tgl@sss.pgh.pa.us 1828 [ # # ]:UBC 0 : elog(ERROR, "could not create temporary file \"%s\": %m",
1829 : : tempfilepath);
1830 : : }
1831 : :
9807 tgl@sss.pgh.pa.us 1832 :CBC 1591 : return file;
1833 : : }
1834 : :
1835 : :
1836 : : /*
1837 : : * Create a new file. The directory containing it must already exist. Files
1838 : : * created this way are subject to temp_file_limit and are automatically
1839 : : * closed at end of transaction, but are not automatically deleted on close
1840 : : * because they are intended to be shared between cooperating backends.
1841 : : *
1842 : : * If the file is inside the top-level temporary directory, its name should
1843 : : * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1844 : : * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1845 : : * inside a directory created with PathNameCreateTemporaryDir(), in which case
1846 : : * the prefix isn't needed.
1847 : : */
1848 : : File
3026 andres@anarazel.de 1849 : 1781 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1850 : : {
1851 : : File file;
1852 : :
1681 1853 [ - + ]: 1781 : Assert(temporary_files_allowed); /* check temp file access is up */
1854 : :
858 heikki.linnakangas@i 1855 : 1781 : ResourceOwnerEnlarge(CurrentResourceOwner);
1856 : :
1857 : : /*
1858 : : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1859 : : * temp file that can be reused.
1860 : : */
3026 andres@anarazel.de 1861 : 1781 : file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1862 [ + + ]: 1781 : if (file <= 0)
1863 : : {
1864 [ - + ]: 200 : if (error_on_failure)
3026 andres@anarazel.de 1865 [ # # ]:UBC 0 : ereport(ERROR,
1866 : : (errcode_for_file_access(),
1867 : : errmsg("could not create temporary file \"%s\": %m",
1868 : : path)));
1869 : : else
3026 andres@anarazel.de 1870 :CBC 200 : return file;
1871 : : }
1872 : :
1873 : : /* Mark it for temp_file_limit accounting. */
1874 : 1581 : VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1875 : :
1876 : : /* Register it for automatic close. */
1877 : 1581 : RegisterTemporaryFile(file);
1878 : :
1879 : 1581 : return file;
1880 : : }
1881 : :
1882 : : /*
1883 : : * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1884 : : * another backend. Files opened this way don't count against the
1885 : : * temp_file_limit of the caller, are automatically closed at the end of the
1886 : : * transaction but are not deleted on close.
1887 : : */
1888 : : File
2027 akapila@postgresql.o 1889 : 4321 : PathNameOpenTemporaryFile(const char *path, int mode)
1890 : : {
1891 : : File file;
1892 : :
1681 andres@anarazel.de 1893 [ - + ]: 4321 : Assert(temporary_files_allowed); /* check temp file access is up */
1894 : :
858 heikki.linnakangas@i 1895 : 4321 : ResourceOwnerEnlarge(CurrentResourceOwner);
1896 : :
2027 akapila@postgresql.o 1897 : 4321 : file = PathNameOpenFile(path, mode | PG_BINARY);
1898 : :
1899 : : /* If no such file, then we don't raise an error. */
3026 andres@anarazel.de 1900 [ + + - + ]: 4321 : if (file <= 0 && errno != ENOENT)
3026 andres@anarazel.de 1901 [ # # ]:UBC 0 : ereport(ERROR,
1902 : : (errcode_for_file_access(),
1903 : : errmsg("could not open temporary file \"%s\": %m",
1904 : : path)));
1905 : :
3026 andres@anarazel.de 1906 [ + + ]:CBC 4321 : if (file > 0)
1907 : : {
1908 : : /* Register it for automatic close. */
1909 : 2024 : RegisterTemporaryFile(file);
1910 : : }
1911 : :
1912 : 4321 : return file;
1913 : : }
1914 : :
1915 : : /*
1916 : : * Delete a file by pathname. Return true if the file existed, false if
1917 : : * didn't.
1918 : : */
1919 : : bool
1920 : 3514 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1921 : : {
1922 : : struct stat filestats;
1923 : : int stat_errno;
1924 : :
1925 : : /* Get the final size for pgstat reporting. */
1926 [ + + ]: 3514 : if (stat(path, &filestats) != 0)
1927 : 1933 : stat_errno = errno;
1928 : : else
1929 : 1581 : stat_errno = 0;
1930 : :
1931 : : /*
1932 : : * Unlike FileClose's automatic file deletion code, we tolerate
1933 : : * non-existence to support BufFileDeleteFileSet which doesn't know how
1934 : : * many segments it has to delete until it runs out.
1935 : : */
1936 [ + + ]: 3514 : if (stat_errno == ENOENT)
1937 : 1933 : return false;
1938 : :
1939 [ - + ]: 1581 : if (unlink(path) < 0)
1940 : : {
3026 andres@anarazel.de 1941 [ # # ]:UBC 0 : if (errno != ENOENT)
1942 [ # # # # ]: 0 : ereport(error_on_failure ? ERROR : LOG,
1943 : : (errcode_for_file_access(),
1944 : : errmsg("could not unlink temporary file \"%s\": %m",
1945 : : path)));
1946 : 0 : return false;
1947 : : }
1948 : :
3026 andres@anarazel.de 1949 [ + - ]:CBC 1581 : if (stat_errno == 0)
1950 : 1581 : ReportTemporaryFileUsage(path, filestats.st_size);
1951 : : else
1952 : : {
3026 andres@anarazel.de 1953 :UBC 0 : errno = stat_errno;
1954 [ # # ]: 0 : ereport(LOG,
1955 : : (errcode_for_file_access(),
1956 : : errmsg("could not stat file \"%s\": %m", path)));
1957 : : }
1958 : :
3026 andres@anarazel.de 1959 :CBC 1581 : return true;
1960 : : }
1961 : :
1962 : : /*
1963 : : * close a file when done with it
1964 : : */
1965 : : void
10841 scrappy@hub.org 1966 : 549786 : FileClose(File file)
1967 : : {
1968 : : Vfd *vfdP;
1969 : :
9807 tgl@sss.pgh.pa.us 1970 [ + - + - : 549786 : Assert(FileIsValid(file));
- + ]
1971 : :
1972 : : DO_DB(elog(LOG, "FileClose: %d (%s)",
1973 : : file, VfdCache[file].fileName));
1974 : :
8799 1975 : 549786 : vfdP = &VfdCache[file];
1976 : :
10416 bruce@momjian.us 1977 [ + + ]: 549786 : if (!FileIsNotOpen(file))
1978 : : {
351 andres@anarazel.de 1979 : 549495 : pgaio_closing_fd(vfdP->fd);
1980 : :
1981 : : /* close the file */
2444 peter@eisentraut.org 1982 [ - + ]: 549495 : if (close(vfdP->fd) != 0)
1983 : : {
1984 : : /*
1985 : : * We may need to panic on failure to close non-temporary files;
1986 : : * see LruDelete.
1987 : : */
2673 tmunro@postgresql.or 1988 [ # # # # ]:UBC 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1989 : : "could not close file \"%s\": %m", vfdP->fileName);
1990 : : }
1991 : :
10416 bruce@momjian.us 1992 :CBC 549495 : --nfile;
8799 tgl@sss.pgh.pa.us 1993 : 549495 : vfdP->fd = VFD_CLOSED;
1994 : :
1995 : : /* remove the file from the lru ring */
3309 1996 : 549495 : Delete(file);
1997 : : }
1998 : :
3026 andres@anarazel.de 1999 [ + + ]: 549786 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2000 : : {
2001 : : /* Subtract its size from current usage (do first in case of error) */
2002 : 3172 : temporary_files_size -= vfdP->fileSize;
2003 : 3172 : vfdP->fileSize = 0;
2004 : : }
2005 : :
2006 : : /*
2007 : : * Delete the file if it was temporary, and make a log entry if wanted
2008 : : */
2009 [ + + ]: 549786 : if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2010 : : {
2011 : : struct stat filestats;
2012 : : int stat_errno;
2013 : :
2014 : : /*
2015 : : * If we get an error, as could happen within the ereport/elog calls,
2016 : : * we'll come right back here during transaction abort. Reset the
2017 : : * flag to ensure that we can't get into an infinite loop. This code
2018 : : * is arranged to ensure that the worst-case consequence is failing to
2019 : : * emit log message(s), not failing to attempt the unlink.
2020 : : */
2021 : 1591 : vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2022 : :
2023 : :
2024 : : /* first try the stat() */
5162 magnus@hagander.net 2025 [ - + ]: 1591 : if (stat(vfdP->fileName, &filestats))
5162 magnus@hagander.net 2026 :UBC 0 : stat_errno = errno;
2027 : : else
5162 magnus@hagander.net 2028 :CBC 1591 : stat_errno = 0;
2029 : :
2030 : : /* in any case do the unlink */
2031 [ - + ]: 1591 : if (unlink(vfdP->fileName))
1927 peter@eisentraut.org 2032 [ # # ]:UBC 0 : ereport(LOG,
2033 : : (errcode_for_file_access(),
2034 : : errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2035 : :
2036 : : /* and last report the stat results */
5162 magnus@hagander.net 2037 [ + - ]:CBC 1591 : if (stat_errno == 0)
3026 andres@anarazel.de 2038 : 1591 : ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2039 : : else
2040 : : {
5160 magnus@hagander.net 2041 :UBC 0 : errno = stat_errno;
1927 peter@eisentraut.org 2042 [ # # ]: 0 : ereport(LOG,
2043 : : (errcode_for_file_access(),
2044 : : errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2045 : : }
2046 : : }
2047 : :
2048 : : /* Unregister it from the resource owner */
5946 heikki.linnakangas@i 2049 [ + + ]:CBC 549786 : if (vfdP->resowner)
2050 : 5192 : ResourceOwnerForgetFile(vfdP->resowner, file);
2051 : :
2052 : : /*
2053 : : * Return the Vfd slot to the free list
2054 : : */
9807 tgl@sss.pgh.pa.us 2055 : 549786 : FreeVfd(file);
10841 scrappy@hub.org 2056 : 549786 : }
2057 : :
2058 : : /*
2059 : : * FilePrefetch - initiate asynchronous read of a given range of the file.
2060 : : *
2061 : : * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2062 : : *
2063 : : * posix_fadvise() is the simplest standardized interface that accomplishes
2064 : : * this.
2065 : : */
2066 : : int
122 michael@paquier.xyz 2067 :GNC 8678 : FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2068 : : {
6271 tgl@sss.pgh.pa.us 2069 [ + - + - :CBC 8678 : Assert(FileIsValid(file));
- + ]
2070 : :
2071 : : DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2072 : : file, VfdCache[file].fileName,
2073 : : (int64) offset, (int64) amount));
2074 : :
2075 : : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2076 : : {
2077 : : int returnCode;
2078 : :
564 peter@eisentraut.org 2079 : 8678 : returnCode = FileAccess(file);
2080 [ + - ]: 8678 : if (returnCode < 0)
564 peter@eisentraut.org 2081 :UBC 0 : return returnCode;
2082 : :
1000 andres@anarazel.de 2083 :CBC 8678 : retry:
564 peter@eisentraut.org 2084 : 8678 : pgstat_report_wait_start(wait_event_info);
2085 : 8678 : returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2086 : : POSIX_FADV_WILLNEED);
2087 : 8678 : pgstat_report_wait_end();
2088 : :
2089 [ - + ]: 8678 : if (returnCode == EINTR)
564 peter@eisentraut.org 2090 :UBC 0 : goto retry;
2091 : :
564 peter@eisentraut.org 2092 :CBC 8678 : return returnCode;
2093 : : }
2094 : : #elif defined(__darwin__)
2095 : : {
2096 : : struct radvisory
2097 : : {
2098 : : off_t ra_offset; /* offset into the file */
2099 : : int ra_count; /* size of the read */
2100 : : } ra;
2101 : : int returnCode;
2102 : :
2103 : : returnCode = FileAccess(file);
2104 : : if (returnCode < 0)
2105 : : return returnCode;
2106 : :
2107 : : ra.ra_offset = offset;
2108 : : ra.ra_count = amount;
2109 : : pgstat_report_wait_start(wait_event_info);
2110 : : returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
2111 : : pgstat_report_wait_end();
2112 : : if (returnCode != -1)
2113 : : return 0;
2114 : : else
2115 : : return errno;
2116 : : }
2117 : : #else
2118 : : return 0;
2119 : : #endif
2120 : : }
2121 : :
2122 : : void
122 michael@paquier.xyz 2123 :UNC 0 : FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
2124 : : {
2125 : : int returnCode;
2126 : :
3677 andres@anarazel.de 2127 [ # # # # :UBC 0 : Assert(FileIsValid(file));
# # ]
2128 : :
2129 : : DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2130 : : file, VfdCache[file].fileName,
2131 : : (int64) offset, (int64) nbytes));
2132 : :
3623 tgl@sss.pgh.pa.us 2133 [ # # ]: 0 : if (nbytes <= 0)
2134 : 0 : return;
2135 : :
1072 tmunro@postgresql.or 2136 [ # # ]: 0 : if (VfdCache[file].fileFlags & PG_O_DIRECT)
2137 : 0 : return;
2138 : :
3677 andres@anarazel.de 2139 : 0 : returnCode = FileAccess(file);
2140 [ # # ]: 0 : if (returnCode < 0)
2141 : 0 : return;
2142 : :
3284 rhaas@postgresql.org 2143 : 0 : pgstat_report_wait_start(wait_event_info);
3623 tgl@sss.pgh.pa.us 2144 : 0 : pg_flush_data(VfdCache[file].fd, offset, nbytes);
3284 rhaas@postgresql.org 2145 : 0 : pgstat_report_wait_end();
2146 : : }
2147 : :
2148 : : ssize_t
122 michael@paquier.xyz 2149 :GNC 411221 : FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2150 : : uint32 wait_event_info)
2151 : : {
2152 : : ssize_t returnCode;
2153 : : Vfd *vfdP;
2154 : :
9807 tgl@sss.pgh.pa.us 2155 [ + - + - :CBC 411221 : Assert(FileIsValid(file));
- + ]
2156 : :
2157 : : DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2158 : : file, VfdCache[file].fileName,
2159 : : (int64) offset,
2160 : : iovcnt));
2161 : :
7958 2162 : 411221 : returnCode = FileAccess(file);
2163 [ - + ]: 411221 : if (returnCode < 0)
7958 tgl@sss.pgh.pa.us 2164 :UBC 0 : return returnCode;
2165 : :
3309 tgl@sss.pgh.pa.us 2166 :CBC 411221 : vfdP = &VfdCache[file];
2167 : :
7409 2168 : 411221 : retry:
3284 rhaas@postgresql.org 2169 : 411221 : pgstat_report_wait_start(wait_event_info);
824 tmunro@postgresql.or 2170 : 411221 : returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
3284 rhaas@postgresql.org 2171 : 411221 : pgstat_report_wait_end();
2172 : :
2685 tmunro@postgresql.or 2173 [ - + ]: 411221 : if (returnCode < 0)
2174 : : {
2175 : : /*
2176 : : * Windows may run out of kernel buffers and return "Insufficient
2177 : : * system resources" error. Wait a bit and retry to solve it.
2178 : : *
2179 : : * It is rumored that EINTR is also possible on some Unix filesystems,
2180 : : * in which case immediate retry is indicated.
2181 : : */
2182 : : #ifdef WIN32
2183 : : DWORD error = GetLastError();
2184 : :
2185 : : switch (error)
2186 : : {
2187 : : case ERROR_NO_SYSTEM_RESOURCES:
2188 : : pg_usleep(1000L);
2189 : : errno = EINTR;
2190 : : break;
2191 : : default:
2192 : : _dosmaperr(error);
2193 : : break;
2194 : : }
2195 : : #endif
2196 : : /* OK to retry if interrupted */
7409 tgl@sss.pgh.pa.us 2197 [ # # ]:UBC 0 : if (errno == EINTR)
2198 : 0 : goto retry;
2199 : : }
2200 : :
10416 bruce@momjian.us 2201 :CBC 411221 : return returnCode;
2202 : : }
2203 : :
2204 : : int
351 andres@anarazel.de 2205 : 1336685 : FileStartReadV(PgAioHandle *ioh, File file,
2206 : : int iovcnt, pgoff_t offset,
2207 : : uint32 wait_event_info)
2208 : : {
2209 : : int returnCode;
2210 : : Vfd *vfdP;
2211 : :
2212 [ + - + - : 1336685 : Assert(FileIsValid(file));
- + ]
2213 : :
2214 : : DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2215 : : file, VfdCache[file].fileName,
2216 : : (int64) offset,
2217 : : iovcnt));
2218 : :
2219 : 1336685 : returnCode = FileAccess(file);
2220 [ - + ]: 1336685 : if (returnCode < 0)
351 andres@anarazel.de 2221 :UBC 0 : return returnCode;
2222 : :
351 andres@anarazel.de 2223 :CBC 1336685 : vfdP = &VfdCache[file];
2224 : :
2225 : 1336685 : pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2226 : :
2227 : 1336685 : return 0;
2228 : : }
2229 : :
2230 : : ssize_t
122 michael@paquier.xyz 2231 :GNC 797215 : FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2232 : : uint32 wait_event_info)
2233 : : {
2234 : : ssize_t returnCode;
2235 : : Vfd *vfdP;
2236 : :
9807 tgl@sss.pgh.pa.us 2237 [ + - + - :CBC 797215 : Assert(FileIsValid(file));
- + ]
2238 : :
2239 : : DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2240 : : file, VfdCache[file].fileName,
2241 : : (int64) offset,
2242 : : iovcnt));
2243 : :
7958 2244 : 797215 : returnCode = FileAccess(file);
2245 [ - + ]: 797215 : if (returnCode < 0)
7958 tgl@sss.pgh.pa.us 2246 :UBC 0 : return returnCode;
2247 : :
3309 tgl@sss.pgh.pa.us 2248 :CBC 797215 : vfdP = &VfdCache[file];
2249 : :
2250 : : /*
2251 : : * If enforcing temp_file_limit and it's a temp file, check to see if the
2252 : : * write would overrun temp_file_limit, and throw error if so. Note: it's
2253 : : * really a modularity violation to throw error here; we should set errno
2254 : : * and return -1. However, there's no way to report a suitable error
2255 : : * message if we do that. All current callers would just throw error
2256 : : * immediately anyway, so this is safe at present.
2257 : : */
3026 andres@anarazel.de 2258 [ + - - - ]: 797215 : if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2259 : : {
122 michael@paquier.xyz 2260 :UNC 0 : pgoff_t past_write = offset;
2261 : :
824 tmunro@postgresql.or 2262 [ # # ]:UBC 0 : for (int i = 0; i < iovcnt; ++i)
2263 : 0 : past_write += iov[i].iov_len;
2264 : :
2685 2265 [ # # ]: 0 : if (past_write > vfdP->fileSize)
2266 : : {
5026 bruce@momjian.us 2267 : 0 : uint64 newTotal = temporary_files_size;
2268 : :
2685 tmunro@postgresql.or 2269 : 0 : newTotal += past_write - vfdP->fileSize;
5355 tgl@sss.pgh.pa.us 2270 [ # # ]: 0 : if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2271 [ # # ]: 0 : ereport(ERROR,
2272 : : (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2273 : : errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2274 : : temp_file_limit)));
2275 : : }
2276 : : }
2277 : :
7409 tgl@sss.pgh.pa.us 2278 :CBC 797215 : retry:
3284 rhaas@postgresql.org 2279 : 797215 : pgstat_report_wait_start(wait_event_info);
824 tmunro@postgresql.or 2280 : 797215 : returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
3284 rhaas@postgresql.org 2281 : 797215 : pgstat_report_wait_end();
2282 : :
7409 tgl@sss.pgh.pa.us 2283 [ + - ]: 797215 : if (returnCode >= 0)
2284 : : {
2285 : : /*
2286 : : * Some callers expect short writes to set errno, and traditionally we
2287 : : * have assumed that they imply disk space shortage. We don't want to
2288 : : * waste CPU cycles adding up the total size here, so we'll just set
2289 : : * it for all successful writes in case such a caller determines that
2290 : : * the write was short and ereports "%m".
2291 : : */
824 tmunro@postgresql.or 2292 : 797215 : errno = ENOSPC;
2293 : :
2294 : : /*
2295 : : * Maintain fileSize and temporary_files_size if it's a temp file.
2296 : : */
3026 andres@anarazel.de 2297 [ + + ]: 797215 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2298 : : {
122 michael@paquier.xyz 2299 :GNC 59955 : pgoff_t past_write = offset + returnCode;
2300 : :
2685 tmunro@postgresql.or 2301 [ + + ]:CBC 59955 : if (past_write > vfdP->fileSize)
2302 : : {
2303 : 43248 : temporary_files_size += past_write - vfdP->fileSize;
2304 : 43248 : vfdP->fileSize = past_write;
2305 : : }
2306 : : }
2307 : : }
2308 : : else
2309 : : {
2310 : : /*
2311 : : * See comments in FileReadV()
2312 : : */
2313 : : #ifdef WIN32
2314 : : DWORD error = GetLastError();
2315 : :
2316 : : switch (error)
2317 : : {
2318 : : case ERROR_NO_SYSTEM_RESOURCES:
2319 : : pg_usleep(1000L);
2320 : : errno = EINTR;
2321 : : break;
2322 : : default:
2323 : : _dosmaperr(error);
2324 : : break;
2325 : : }
2326 : : #endif
2327 : : /* OK to retry if interrupted */
7409 tgl@sss.pgh.pa.us 2328 [ # # ]:UBC 0 : if (errno == EINTR)
2329 : 0 : goto retry;
2330 : : }
2331 : :
10416 bruce@momjian.us 2332 :CBC 797215 : return returnCode;
2333 : : }
2334 : :
2335 : : int
3284 rhaas@postgresql.org 2336 : 284 : FileSync(File file, uint32 wait_event_info)
2337 : : {
2338 : : int returnCode;
2339 : :
7958 tgl@sss.pgh.pa.us 2340 [ + - + - : 284 : Assert(FileIsValid(file));
- + ]
2341 : :
2342 : : DO_DB(elog(LOG, "FileSync: %d (%s)",
2343 : : file, VfdCache[file].fileName));
2344 : :
2345 : 284 : returnCode = FileAccess(file);
2346 [ - + ]: 284 : if (returnCode < 0)
7958 tgl@sss.pgh.pa.us 2347 :UBC 0 : return returnCode;
2348 : :
3284 rhaas@postgresql.org 2349 :CBC 284 : pgstat_report_wait_start(wait_event_info);
2350 : 284 : returnCode = pg_fsync(VfdCache[file].fd);
2351 : 284 : pgstat_report_wait_end();
2352 : :
2353 : 284 : return returnCode;
2354 : : }
2355 : :
2356 : : /*
2357 : : * Zero a region of the file.
2358 : : *
2359 : : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2360 : : * appropriate error.
2361 : : */
2362 : : int
122 michael@paquier.xyz 2363 :GNC 220651 : FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2364 : : {
2365 : : int returnCode;
2366 : : ssize_t written;
2367 : :
1075 andres@anarazel.de 2368 [ + - + - :CBC 220651 : Assert(FileIsValid(file));
- + ]
2369 : :
2370 : : DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2371 : : file, VfdCache[file].fileName,
2372 : : (int64) offset, (int64) amount));
2373 : :
2374 : 220651 : returnCode = FileAccess(file);
2375 [ - + ]: 220651 : if (returnCode < 0)
1075 andres@anarazel.de 2376 :UBC 0 : return returnCode;
2377 : :
1075 andres@anarazel.de 2378 :CBC 220651 : pgstat_report_wait_start(wait_event_info);
2379 : 220651 : written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2380 : 220651 : pgstat_report_wait_end();
2381 : :
2382 [ - + ]: 220651 : if (written < 0)
1075 andres@anarazel.de 2383 :UBC 0 : return -1;
1075 andres@anarazel.de 2384 [ - + ]:CBC 220651 : else if (written != amount)
2385 : : {
2386 : : /* if errno is unset, assume problem is no disk space */
1075 andres@anarazel.de 2387 [ # # ]:UBC 0 : if (errno == 0)
2388 : 0 : errno = ENOSPC;
2389 : 0 : return -1;
2390 : : }
2391 : :
1075 andres@anarazel.de 2392 :CBC 220651 : return 0;
2393 : : }
2394 : :
2395 : : /*
2396 : : * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2397 : : * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2398 : : * use FileZero() instead.
2399 : : *
2400 : : * Note that at least glibc() implements posix_fallocate() in userspace if not
2401 : : * implemented by the filesystem. That's not the case for all environments
2402 : : * though.
2403 : : *
2404 : : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2405 : : * appropriate error.
2406 : : */
2407 : : int
122 michael@paquier.xyz 2408 :GNC 524 : FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2409 : : {
2410 : : #ifdef HAVE_POSIX_FALLOCATE
2411 : : int returnCode;
2412 : :
1075 andres@anarazel.de 2413 [ + - + - :CBC 524 : Assert(FileIsValid(file));
- + ]
2414 : :
2415 : : DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2416 : : file, VfdCache[file].fileName,
2417 : : (int64) offset, (int64) amount));
2418 : :
2419 : 524 : returnCode = FileAccess(file);
2420 [ + - ]: 524 : if (returnCode < 0)
1075 andres@anarazel.de 2421 :UBC 0 : return -1;
2422 : :
1000 andres@anarazel.de 2423 :CBC 524 : retry:
1075 2424 : 524 : pgstat_report_wait_start(wait_event_info);
2425 : 524 : returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2426 : 524 : pgstat_report_wait_end();
2427 : :
2428 [ + - ]: 524 : if (returnCode == 0)
2429 : 524 : return 0;
1000 andres@anarazel.de 2430 [ # # ]:UBC 0 : else if (returnCode == EINTR)
2431 : 0 : goto retry;
2432 : :
2433 : : /* for compatibility with %m printing etc */
1075 2434 : 0 : errno = returnCode;
2435 : :
2436 : : /*
2437 : : * Return in cases of a "real" failure, if fallocate is not supported,
2438 : : * fall through to the FileZero() backed implementation.
2439 : : */
2440 [ # # # # ]: 0 : if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2441 : 0 : return -1;
2442 : : #endif
2443 : :
2444 : 0 : return FileZero(file, offset, amount, wait_event_info);
2445 : : }
2446 : :
2447 : : pgoff_t
2685 tmunro@postgresql.or 2448 :CBC 3964488 : FileSize(File file)
2449 : : {
9807 tgl@sss.pgh.pa.us 2450 [ + - + - : 3964488 : Assert(FileIsValid(file));
- + ]
2451 : :
2452 : : DO_DB(elog(LOG, "FileSize %d (%s)",
2453 : : file, VfdCache[file].fileName));
2454 : :
10416 bruce@momjian.us 2455 [ + + ]: 3964488 : if (FileIsNotOpen(file))
2456 : : {
2685 tmunro@postgresql.or 2457 [ - + ]: 26 : if (FileAccess(file) < 0)
122 michael@paquier.xyz 2458 :UNC 0 : return (pgoff_t) -1;
2459 : : }
2460 : :
2685 tmunro@postgresql.or 2461 :CBC 3964488 : return lseek(VfdCache[file].fd, 0, SEEK_END);
2462 : : }
2463 : :
2464 : : int
122 michael@paquier.xyz 2465 :GNC 546 : FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
2466 : : {
2467 : : int returnCode;
2468 : :
9807 tgl@sss.pgh.pa.us 2469 [ + - + - :CBC 546 : Assert(FileIsValid(file));
- + ]
2470 : :
2471 : : DO_DB(elog(LOG, "FileTruncate %d (%s)",
2472 : : file, VfdCache[file].fileName));
2473 : :
7958 2474 : 546 : returnCode = FileAccess(file);
2475 [ - + ]: 546 : if (returnCode < 0)
7958 tgl@sss.pgh.pa.us 2476 :UBC 0 : return returnCode;
2477 : :
3284 rhaas@postgresql.org 2478 :CBC 546 : pgstat_report_wait_start(wait_event_info);
1000 andres@anarazel.de 2479 : 546 : returnCode = pg_ftruncate(VfdCache[file].fd, offset);
3284 rhaas@postgresql.org 2480 : 546 : pgstat_report_wait_end();
2481 : :
5355 tgl@sss.pgh.pa.us 2482 [ + - - + ]: 546 : if (returnCode == 0 && VfdCache[file].fileSize > offset)
2483 : : {
2484 : : /* adjust our state for truncation of a temp file */
3026 andres@anarazel.de 2485 [ # # ]:UBC 0 : Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
5355 tgl@sss.pgh.pa.us 2486 : 0 : temporary_files_size -= VfdCache[file].fileSize - offset;
2487 : 0 : VfdCache[file].fileSize = offset;
2488 : : }
2489 : :
10057 bruce@momjian.us 2490 :CBC 546 : return returnCode;
2491 : : }
2492 : :
2493 : : /*
2494 : : * Return the pathname associated with an open file.
2495 : : *
2496 : : * The returned string points to an internal buffer, which is valid until
2497 : : * the file is closed.
2498 : : */
2499 : : char *
6066 heikki.linnakangas@i 2500 : 32 : FilePathName(File file)
2501 : : {
2502 [ + - + - : 32 : Assert(FileIsValid(file));
- + ]
2503 : :
2504 : 32 : return VfdCache[file].fileName;
2505 : : }
2506 : :
2507 : : /*
2508 : : * Return the raw file descriptor of an opened file.
2509 : : *
2510 : : * The returned file descriptor will be valid until the file is closed, but
2511 : : * there are a lot of things that can make that happen. So the caller should
2512 : : * be careful not to do much of anything else before it finishes using the
2513 : : * returned file descriptor.
2514 : : */
2515 : : int
3659 rhaas@postgresql.org 2516 : 487023 : FileGetRawDesc(File file)
2517 : : {
2518 : : int returnCode;
2519 : :
351 andres@anarazel.de 2520 : 487023 : returnCode = FileAccess(file);
2521 [ - + ]: 487023 : if (returnCode < 0)
351 andres@anarazel.de 2522 :UBC 0 : return returnCode;
2523 : :
3659 rhaas@postgresql.org 2524 [ + - + - :CBC 487023 : Assert(FileIsValid(file));
- + ]
2525 : 487023 : return VfdCache[file].fd;
2526 : : }
2527 : :
2528 : : /*
2529 : : * FileGetRawFlags - returns the file flags on open(2)
2530 : : */
2531 : : int
3659 rhaas@postgresql.org 2532 :UBC 0 : FileGetRawFlags(File file)
2533 : : {
2534 [ # # # # : 0 : Assert(FileIsValid(file));
# # ]
2535 : 0 : return VfdCache[file].fileFlags;
2536 : : }
2537 : :
2538 : : /*
2539 : : * FileGetRawMode - returns the mode bitmask passed to open(2)
2540 : : */
2541 : : mode_t
2542 : 0 : FileGetRawMode(File file)
2543 : : {
2544 [ # # # # : 0 : Assert(FileIsValid(file));
# # ]
2545 : 0 : return VfdCache[file].fileMode;
2546 : : }
2547 : :
2548 : : /*
2549 : : * Make room for another allocatedDescs[] array entry if needed and possible.
2550 : : * Returns true if an array element is available.
2551 : : */
2552 : : static bool
4662 tgl@sss.pgh.pa.us 2553 :CBC 553701 : reserveAllocatedDesc(void)
2554 : : {
2555 : : AllocateDesc *newDescs;
2556 : : int newMax;
2557 : :
2558 : : /* Quick out if array already has a free slot. */
2559 [ + + ]: 553701 : if (numAllocatedDescs < maxAllocatedDescs)
2560 : 552541 : return true;
2561 : :
2562 : : /*
2563 : : * If the array hasn't yet been created in the current process, initialize
2564 : : * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2565 : : * we will ever need, anyway. We don't want to look at max_safe_fds
2566 : : * immediately because set_max_safe_fds() may not have run yet.
2567 : : */
2568 [ + - ]: 1160 : if (allocatedDescs == NULL)
2569 : : {
2211 2570 : 1160 : newMax = FD_MINFREE / 3;
4662 2571 : 1160 : newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2572 : : /* Out of memory already? Treat as fatal error. */
2573 [ - + ]: 1160 : if (newDescs == NULL)
4662 tgl@sss.pgh.pa.us 2574 [ # # ]:UBC 0 : ereport(ERROR,
2575 : : (errcode(ERRCODE_OUT_OF_MEMORY),
2576 : : errmsg("out of memory")));
4662 tgl@sss.pgh.pa.us 2577 :CBC 1160 : allocatedDescs = newDescs;
2578 : 1160 : maxAllocatedDescs = newMax;
2579 : 1160 : return true;
2580 : : }
2581 : :
2582 : : /*
2583 : : * Consider enlarging the array beyond the initial allocation used above.
2584 : : * By the time this happens, max_safe_fds should be known accurately.
2585 : : *
2586 : : * We mustn't let allocated descriptors hog all the available FDs, and in
2587 : : * practice we'd better leave a reasonable number of FDs for VFD use. So
2588 : : * set the maximum to max_safe_fds / 3. (This should certainly be at
2589 : : * least as large as the initial size, FD_MINFREE / 3, so we aren't
2590 : : * tightening the restriction here.) Recall that "external" FDs are
2591 : : * allowed to consume another third of max_safe_fds.
2592 : : */
2211 tgl@sss.pgh.pa.us 2593 :UBC 0 : newMax = max_safe_fds / 3;
4662 2594 [ # # ]: 0 : if (newMax > maxAllocatedDescs)
2595 : : {
2596 : 0 : newDescs = (AllocateDesc *) realloc(allocatedDescs,
2597 : : newMax * sizeof(AllocateDesc));
2598 : : /* Treat out-of-memory as a non-fatal error. */
2599 [ # # ]: 0 : if (newDescs == NULL)
2600 : 0 : return false;
2601 : 0 : allocatedDescs = newDescs;
2602 : 0 : maxAllocatedDescs = newMax;
2603 : 0 : return true;
2604 : : }
2605 : :
2606 : : /* Can't enlarge allocatedDescs[] any more. */
2607 : 0 : return false;
2608 : : }
2609 : :
2610 : : /*
2611 : : * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2612 : : * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2613 : : * necessary to open the file. When done, call FreeFile rather than fclose.
2614 : : *
2615 : : * Note that files that will be open for any significant length of time
2616 : : * should NOT be handled this way, since they cannot share kernel file
2617 : : * descriptors with other files; there is grave risk of running out of FDs
2618 : : * if anyone locks down too many FDs. Most callers of this routine are
2619 : : * simply reading a config file that they will read and close immediately.
2620 : : *
2621 : : * fd.c will automatically close all files opened with AllocateFile at
2622 : : * transaction commit or abort; this prevents FD leakage if a routine
2623 : : * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2624 : : *
2625 : : * Ideally this should be the *only* direct call of fopen() in the backend.
2626 : : */
2627 : : FILE *
7316 tgl@sss.pgh.pa.us 2628 :CBC 88155 : AllocateFile(const char *name, const char *mode)
2629 : : {
2630 : : FILE *file;
2631 : :
2632 : : DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2633 : : numAllocatedDescs, name));
2634 : :
2635 : : /* Can we allocate another non-virtual FD? */
4662 2636 [ - + ]: 88155 : if (!reserveAllocatedDesc())
4662 tgl@sss.pgh.pa.us 2637 [ # # ]:UBC 0 : ereport(ERROR,
2638 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2639 : : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2640 : : maxAllocatedDescs, name)));
2641 : :
2642 : : /* Close excess kernel FDs. */
4662 tgl@sss.pgh.pa.us 2643 :CBC 88155 : ReleaseLruFiles();
2644 : :
10436 bruce@momjian.us 2645 : 88155 : TryAgain:
9331 tgl@sss.pgh.pa.us 2646 [ + + ]: 88155 : if ((file = fopen(name, mode)) != NULL)
2647 : : {
7900 2648 : 81001 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2649 : :
2650 : 81001 : desc->kind = AllocateDescFile;
2651 : 81001 : desc->desc.file = file;
7850 2652 : 81001 : desc->create_subid = GetCurrentSubTransactionId();
7900 2653 : 81001 : numAllocatedDescs++;
2654 : 81001 : return desc->desc.file;
2655 : : }
2656 : :
9331 2657 [ + - - + ]: 7154 : if (errno == EMFILE || errno == ENFILE)
2658 : : {
9124 bruce@momjian.us 2659 :UBC 0 : int save_errno = errno;
2660 : :
8270 tgl@sss.pgh.pa.us 2661 [ # # ]: 0 : ereport(LOG,
2662 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2663 : : errmsg("out of file descriptors: %m; release and retry")));
9331 2664 : 0 : errno = 0;
2665 [ # # ]: 0 : if (ReleaseLruFile())
10416 bruce@momjian.us 2666 : 0 : goto TryAgain;
9331 tgl@sss.pgh.pa.us 2667 : 0 : errno = save_errno;
2668 : : }
2669 : :
9331 tgl@sss.pgh.pa.us 2670 :CBC 7154 : return NULL;
2671 : : }
2672 : :
2673 : : /*
2674 : : * Open a file with OpenTransientFilePerm() and pass default file mode for
2675 : : * the fileMode parameter.
2676 : : */
2677 : : int
3095 peter_e@gmx.net 2678 : 419249 : OpenTransientFile(const char *fileName, int fileFlags)
2679 : : {
2899 sfrost@snowman.net 2680 : 419249 : return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2681 : : }
2682 : :
2683 : : /*
2684 : : * Like AllocateFile, but returns an unbuffered fd like open(2)
2685 : : */
2686 : : int
3095 peter_e@gmx.net 2687 : 419255 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2688 : : {
2689 : : int fd;
2690 : :
2691 : : DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2692 : : numAllocatedDescs, fileName));
2693 : :
2694 : : /* Can we allocate another non-virtual FD? */
4662 tgl@sss.pgh.pa.us 2695 [ - + ]: 419255 : if (!reserveAllocatedDesc())
4662 tgl@sss.pgh.pa.us 2696 [ # # ]:UBC 0 : ereport(ERROR,
2697 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2698 : : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2699 : : maxAllocatedDescs, fileName)));
2700 : :
2701 : : /* Close excess kernel FDs. */
4662 tgl@sss.pgh.pa.us 2702 :CBC 419255 : ReleaseLruFiles();
2703 : :
3095 peter_e@gmx.net 2704 : 419255 : fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2705 : :
4856 heikki.linnakangas@i 2706 [ + + ]: 419255 : if (fd >= 0)
2707 : : {
2708 : 414144 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2709 : :
2710 : 414144 : desc->kind = AllocateDescRawFD;
2711 : 414144 : desc->desc.fd = fd;
2712 : 414144 : desc->create_subid = GetCurrentSubTransactionId();
2713 : 414144 : numAllocatedDescs++;
2714 : :
2715 : 414144 : return fd;
2716 : : }
2717 : :
2718 : 5111 : return -1; /* failure */
2719 : : }
2720 : :
2721 : : /*
2722 : : * Routines that want to initiate a pipe stream should use OpenPipeStream
2723 : : * rather than plain popen(). This lets fd.c deal with freeing FDs if
2724 : : * necessary. When done, call ClosePipeStream rather than pclose.
2725 : : *
2726 : : * This function also ensures that the popen'd program is run with default
2727 : : * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2728 : : * uses. This ensures desirable response to, eg, closing a read pipe early.
2729 : : */
2730 : : FILE *
4764 2731 : 61 : OpenPipeStream(const char *command, const char *mode)
2732 : : {
2733 : : FILE *file;
2734 : : int save_errno;
2735 : :
2736 : : DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2737 : : numAllocatedDescs, command));
2738 : :
2739 : : /* Can we allocate another non-virtual FD? */
4662 tgl@sss.pgh.pa.us 2740 [ - + ]: 61 : if (!reserveAllocatedDesc())
4662 tgl@sss.pgh.pa.us 2741 [ # # ]:UBC 0 : ereport(ERROR,
2742 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2743 : : errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2744 : : maxAllocatedDescs, command)));
2745 : :
2746 : : /* Close excess kernel FDs. */
4662 tgl@sss.pgh.pa.us 2747 :CBC 61 : ReleaseLruFiles();
2748 : :
4764 heikki.linnakangas@i 2749 : 61 : TryAgain:
1294 tgl@sss.pgh.pa.us 2750 : 61 : fflush(NULL);
2673 2751 : 61 : pqsignal(SIGPIPE, SIG_DFL);
4764 heikki.linnakangas@i 2752 : 61 : errno = 0;
2673 tgl@sss.pgh.pa.us 2753 : 61 : file = popen(command, mode);
2754 : 61 : save_errno = errno;
2755 : 61 : pqsignal(SIGPIPE, SIG_IGN);
2756 : 61 : errno = save_errno;
2757 [ + - ]: 61 : if (file != NULL)
2758 : : {
4764 heikki.linnakangas@i 2759 : 61 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2760 : :
2761 : 61 : desc->kind = AllocateDescPipe;
2762 : 61 : desc->desc.file = file;
2763 : 61 : desc->create_subid = GetCurrentSubTransactionId();
2764 : 61 : numAllocatedDescs++;
2765 : 61 : return desc->desc.file;
2766 : : }
2767 : :
4764 heikki.linnakangas@i 2768 [ # # # # ]:UBC 0 : if (errno == EMFILE || errno == ENFILE)
2769 : : {
2770 [ # # ]: 0 : ereport(LOG,
2771 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2772 : : errmsg("out of file descriptors: %m; release and retry")));
2773 [ # # ]: 0 : if (ReleaseLruFile())
2774 : 0 : goto TryAgain;
2775 : 0 : errno = save_errno;
2776 : : }
2777 : :
2778 : 0 : return NULL;
2779 : : }
2780 : :
2781 : : /*
2782 : : * Free an AllocateDesc of any type.
2783 : : *
2784 : : * The argument *must* point into the allocatedDescs[] array.
2785 : : */
2786 : : static int
7900 tgl@sss.pgh.pa.us 2787 :CBC 540493 : FreeDesc(AllocateDesc *desc)
2788 : : {
2789 : : int result;
2790 : :
2791 : : /* Close the underlying object */
2792 [ + + + + : 540493 : switch (desc->kind)
- ]
2793 : : {
2794 : 81001 : case AllocateDescFile:
2795 : 81001 : result = fclose(desc->desc.file);
2796 : 81001 : break;
4764 heikki.linnakangas@i 2797 : 61 : case AllocateDescPipe:
2798 : 61 : result = pclose(desc->desc.file);
2799 : 61 : break;
7900 tgl@sss.pgh.pa.us 2800 : 45287 : case AllocateDescDir:
2801 : 45287 : result = closedir(desc->desc.dir);
2802 : 45287 : break;
4856 heikki.linnakangas@i 2803 : 414144 : case AllocateDescRawFD:
351 andres@anarazel.de 2804 : 414144 : pgaio_closing_fd(desc->desc.fd);
4856 heikki.linnakangas@i 2805 : 414144 : result = close(desc->desc.fd);
2806 : 414144 : break;
7900 tgl@sss.pgh.pa.us 2807 :UBC 0 : default:
2808 [ # # ]: 0 : elog(ERROR, "AllocateDesc kind not recognized");
2809 : : result = 0; /* keep compiler quiet */
2810 : : break;
2811 : : }
2812 : :
2813 : : /* Compact storage in the allocatedDescs array */
7900 tgl@sss.pgh.pa.us 2814 :CBC 540493 : numAllocatedDescs--;
2815 : 540493 : *desc = allocatedDescs[numAllocatedDescs];
2816 : :
2817 : 540493 : return result;
2818 : : }
2819 : :
2820 : : /*
2821 : : * Close a file returned by AllocateFile.
2822 : : *
2823 : : * Note we do not check fclose's return value --- it is up to the caller
2824 : : * to handle close errors.
2825 : : */
2826 : : int
10415 bruce@momjian.us 2827 : 80985 : FreeFile(FILE *file)
2828 : : {
2829 : : int i;
2830 : :
2831 : : DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2832 : :
2833 : : /* Remove file from list of allocated files, if it's present */
7900 tgl@sss.pgh.pa.us 2834 [ + - ]: 80988 : for (i = numAllocatedDescs; --i >= 0;)
2835 : : {
2836 : 80988 : AllocateDesc *desc = &allocatedDescs[i];
2837 : :
2838 [ + - + + ]: 80988 : if (desc->kind == AllocateDescFile && desc->desc.file == file)
2839 : 80985 : return FreeDesc(desc);
2840 : : }
2841 : :
2842 : : /* Only get here if someone passes us a file not in allocatedDescs */
7900 tgl@sss.pgh.pa.us 2843 [ # # ]:UBC 0 : elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2844 : :
8084 2845 : 0 : return fclose(file);
2846 : : }
2847 : :
2848 : : /*
2849 : : * Close a file returned by OpenTransientFile.
2850 : : *
2851 : : * Note we do not check close's return value --- it is up to the caller
2852 : : * to handle close errors.
2853 : : */
2854 : : int
4856 heikki.linnakangas@i 2855 :CBC 414143 : CloseTransientFile(int fd)
2856 : : {
2857 : : int i;
2858 : :
2859 : : DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2860 : :
2861 : : /* Remove fd from list of allocated files, if it's present */
2862 [ + - ]: 414143 : for (i = numAllocatedDescs; --i >= 0;)
2863 : : {
2864 : 414143 : AllocateDesc *desc = &allocatedDescs[i];
2865 : :
2866 [ + - + - ]: 414143 : if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2867 : 414143 : return FreeDesc(desc);
2868 : : }
2869 : :
2870 : : /* Only get here if someone passes us a file not in allocatedDescs */
4856 heikki.linnakangas@i 2871 [ # # ]:UBC 0 : elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2872 : :
351 andres@anarazel.de 2873 : 0 : pgaio_closing_fd(fd);
2874 : :
4856 heikki.linnakangas@i 2875 : 0 : return close(fd);
2876 : : }
2877 : :
2878 : : /*
2879 : : * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2880 : : * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2881 : : * necessary to open the directory, and with closing it after an elog.
2882 : : * When done, call FreeDir rather than closedir.
2883 : : *
2884 : : * Returns NULL, with errno set, on failure. Note that failure detection
2885 : : * is commonly left to the following call of ReadDir or ReadDirExtended;
2886 : : * see the comments for ReadDir.
2887 : : *
2888 : : * Ideally this should be the *only* direct call of opendir() in the backend.
2889 : : */
2890 : : DIR *
8056 tgl@sss.pgh.pa.us 2891 :CBC 46230 : AllocateDir(const char *dirname)
2892 : : {
2893 : : DIR *dir;
2894 : :
2895 : : DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2896 : : numAllocatedDescs, dirname));
2897 : :
2898 : : /* Can we allocate another non-virtual FD? */
4662 2899 [ - + ]: 46230 : if (!reserveAllocatedDesc())
4662 tgl@sss.pgh.pa.us 2900 [ # # ]:UBC 0 : ereport(ERROR,
2901 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2902 : : errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2903 : : maxAllocatedDescs, dirname)));
2904 : :
2905 : : /* Close excess kernel FDs. */
4662 tgl@sss.pgh.pa.us 2906 :CBC 46230 : ReleaseLruFiles();
2907 : :
8056 2908 : 46230 : TryAgain:
2909 [ + + ]: 46230 : if ((dir = opendir(dirname)) != NULL)
2910 : : {
7900 2911 : 45287 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2912 : :
2913 : 45287 : desc->kind = AllocateDescDir;
2914 : 45287 : desc->desc.dir = dir;
7850 2915 : 45287 : desc->create_subid = GetCurrentSubTransactionId();
7900 2916 : 45287 : numAllocatedDescs++;
2917 : 45287 : return desc->desc.dir;
2918 : : }
2919 : :
8056 2920 [ + - - + ]: 943 : if (errno == EMFILE || errno == ENFILE)
2921 : : {
8056 tgl@sss.pgh.pa.us 2922 :UBC 0 : int save_errno = errno;
2923 : :
2924 [ # # ]: 0 : ereport(LOG,
2925 : : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2926 : : errmsg("out of file descriptors: %m; release and retry")));
2927 : 0 : errno = 0;
2928 [ # # ]: 0 : if (ReleaseLruFile())
2929 : 0 : goto TryAgain;
2930 : 0 : errno = save_errno;
2931 : : }
2932 : :
8056 tgl@sss.pgh.pa.us 2933 :CBC 943 : return NULL;
2934 : : }
2935 : :
2936 : : /*
2937 : : * Read a directory opened with AllocateDir, ereport'ing any error.
2938 : : *
2939 : : * This is easier to use than raw readdir() since it takes care of some
2940 : : * otherwise rather tedious and error-prone manipulation of errno. Also,
2941 : : * if you are happy with a generic error message for AllocateDir failure,
2942 : : * you can just do
2943 : : *
2944 : : * dir = AllocateDir(path);
2945 : : * while ((dirent = ReadDir(dir, path)) != NULL)
2946 : : * process dirent;
2947 : : * FreeDir(dir);
2948 : : *
2949 : : * since a NULL dir parameter is taken as indicating AllocateDir failed.
2950 : : * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2951 : : * use this shortcut.)
2952 : : *
2953 : : * The pathname passed to AllocateDir must be passed to this routine too,
2954 : : * but it is only used for error reporting.
2955 : : */
2956 : : struct dirent *
7574 2957 : 1362627 : ReadDir(DIR *dir, const char *dirname)
2958 : : {
3944 2959 : 1362627 : return ReadDirExtended(dir, dirname, ERROR);
2960 : : }
2961 : :
2962 : : /*
2963 : : * Alternate version of ReadDir that allows caller to specify the elevel
2964 : : * for any error report (whether it's reporting an initial failure of
2965 : : * AllocateDir or a subsequent directory read failure).
2966 : : *
2967 : : * If elevel < ERROR, returns NULL after any error. With the normal coding
2968 : : * pattern, this will result in falling out of the loop immediately as
2969 : : * though the directory contained no (more) entries.
2970 : : */
2971 : : struct dirent *
2972 : 2862636 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2973 : : {
2974 : : struct dirent *dent;
2975 : :
2976 : : /* Give a generic message for AllocateDir failure, if caller didn't */
7574 2977 [ + + ]: 2862636 : if (dir == NULL)
2978 : : {
3944 2979 [ + - ]: 3 : ereport(elevel,
2980 : : (errcode_for_file_access(),
2981 : : errmsg("could not open directory \"%s\": %m",
2982 : : dirname)));
3944 tgl@sss.pgh.pa.us 2983 :UBC 0 : return NULL;
2984 : : }
2985 : :
7574 tgl@sss.pgh.pa.us 2986 :CBC 2862633 : errno = 0;
2987 [ + + ]: 2862633 : if ((dent = readdir(dir)) != NULL)
2988 : 2828927 : return dent;
2989 : :
2990 [ - + ]: 33706 : if (errno)
3944 tgl@sss.pgh.pa.us 2991 [ # # ]:UBC 0 : ereport(elevel,
2992 : : (errcode_for_file_access(),
2993 : : errmsg("could not read directory \"%s\": %m",
2994 : : dirname)));
7574 tgl@sss.pgh.pa.us 2995 :CBC 33706 : return NULL;
2996 : : }
2997 : :
2998 : : /*
2999 : : * Close a directory opened with AllocateDir.
3000 : : *
3001 : : * Returns closedir's return value (with errno set if it's not 0).
3002 : : * Note we do not check the return value --- it is up to the caller
3003 : : * to handle close errors if wanted.
3004 : : *
3005 : : * Does nothing if dir == NULL; we assume that directory open failure was
3006 : : * already reported if desired.
3007 : : */
3008 : : int
8056 3009 : 45154 : FreeDir(DIR *dir)
3010 : : {
3011 : : int i;
3012 : :
3013 : : /* Nothing to do if AllocateDir failed */
3023 3014 [ - + ]: 45154 : if (dir == NULL)
3023 tgl@sss.pgh.pa.us 3015 :UBC 0 : return 0;
3016 : :
3017 : : DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3018 : :
3019 : : /* Remove dir from list of allocated dirs, if it's present */
7900 tgl@sss.pgh.pa.us 3020 [ + - ]:CBC 45154 : for (i = numAllocatedDescs; --i >= 0;)
3021 : : {
3022 : 45154 : AllocateDesc *desc = &allocatedDescs[i];
3023 : :
3024 [ + - + - ]: 45154 : if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3025 : 45154 : return FreeDesc(desc);
3026 : : }
3027 : :
3028 : : /* Only get here if someone passes us a dir not in allocatedDescs */
7900 tgl@sss.pgh.pa.us 3029 [ # # ]:UBC 0 : elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3030 : :
8056 3031 : 0 : return closedir(dir);
3032 : : }
3033 : :
3034 : :
3035 : : /*
3036 : : * Close a pipe stream returned by OpenPipeStream.
3037 : : */
3038 : : int
4764 heikki.linnakangas@i 3039 :CBC 61 : ClosePipeStream(FILE *file)
3040 : : {
3041 : : int i;
3042 : :
3043 : : DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3044 : :
3045 : : /* Remove file from list of allocated files, if it's present */
3046 [ + - ]: 61 : for (i = numAllocatedDescs; --i >= 0;)
3047 : : {
3048 : 61 : AllocateDesc *desc = &allocatedDescs[i];
3049 : :
3050 [ + - + - ]: 61 : if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3051 : 61 : return FreeDesc(desc);
3052 : : }
3053 : :
3054 : : /* Only get here if someone passes us a file not in allocatedDescs */
4764 heikki.linnakangas@i 3055 [ # # ]:UBC 0 : elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3056 : :
3057 : 0 : return pclose(file);
3058 : : }
3059 : :
3060 : : /*
3061 : : * closeAllVfds
3062 : : *
3063 : : * Force all VFDs into the physically-closed state, so that the fewest
3064 : : * possible number of kernel file descriptors are in use. There is no
3065 : : * change in the logical state of the VFDs.
3066 : : */
3067 : : void
9331 tgl@sss.pgh.pa.us 3068 :CBC 40 : closeAllVfds(void)
3069 : : {
3070 : : Index i;
3071 : :
9807 3072 [ + - ]: 40 : if (SizeVfdCache > 0)
3073 : : {
3189 3074 [ - + ]: 40 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
9807 3075 [ + + ]: 1280 : for (i = 1; i < SizeVfdCache; i++)
3076 : : {
3077 [ + + ]: 1240 : if (!FileIsNotOpen(i))
3078 : 150 : LruDelete(i);
3079 : : }
3080 : : }
3081 : 40 : }
3082 : :
3083 : :
3084 : : /*
3085 : : * SetTempTablespaces
3086 : : *
3087 : : * Define a list (actually an array) of OIDs of tablespaces to use for
3088 : : * temporary files. This list will be used until end of transaction,
3089 : : * unless this function is called again before then. It is caller's
3090 : : * responsibility that the passed-in array has adequate lifespan (typically
3091 : : * it'd be allocated in TopTransactionContext).
3092 : : *
3093 : : * Some entries of the array may be InvalidOid, indicating that the current
3094 : : * database's default tablespace should be used.
3095 : : */
3096 : : void
6856 3097 : 3370 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3098 : : {
3099 [ - + ]: 3370 : Assert(numSpaces >= 0);
3100 : 3370 : tempTableSpaces = tableSpaces;
3101 : 3370 : numTempTableSpaces = numSpaces;
3102 : :
3103 : : /*
3104 : : * Select a random starting point in the list. This is to minimize
3105 : : * conflicts between backends that are most likely sharing the same list
3106 : : * of temp tablespaces. Note that if we create multiple temp files in the
3107 : : * same transaction, we'll advance circularly through the list --- this
3108 : : * ensures that large temporary sort files are nicely spread across all
3109 : : * available tablespaces.
3110 : : */
3111 [ - + ]: 3370 : if (numSpaces > 1)
1568 tgl@sss.pgh.pa.us 3112 :UBC 0 : nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
3113 : 0 : 0, numSpaces - 1);
3114 : : else
6856 tgl@sss.pgh.pa.us 3115 :CBC 3370 : nextTempTableSpace = 0;
3116 : 3370 : }
3117 : :
3118 : : /*
3119 : : * TempTablespacesAreSet
3120 : : *
3121 : : * Returns true if SetTempTablespaces has been called in current transaction.
3122 : : * (This is just so that tablespaces.c doesn't need its own per-transaction
3123 : : * state.)
3124 : : */
3125 : : bool
3126 : 4941 : TempTablespacesAreSet(void)
3127 : : {
3128 : 4941 : return (numTempTableSpaces >= 0);
3129 : : }
3130 : :
3131 : : /*
3132 : : * GetTempTablespaces
3133 : : *
3134 : : * Populate an array with the OIDs of the tablespaces that should be used for
3135 : : * temporary files. (Some entries may be InvalidOid, indicating that the
3136 : : * current database's default tablespace should be used.) At most numSpaces
3137 : : * entries will be filled.
3138 : : * Returns the number of OIDs that were copied into the output array.
3139 : : */
3140 : : int
3026 andres@anarazel.de 3141 : 214 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3142 : : {
3143 : : int i;
3144 : :
3145 [ - + ]: 214 : Assert(TempTablespacesAreSet());
3146 [ - + - - ]: 214 : for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3026 andres@anarazel.de 3147 :UBC 0 : tableSpaces[i] = tempTableSpaces[i];
3148 : :
3026 andres@anarazel.de 3149 :CBC 214 : return i;
3150 : : }
3151 : :
3152 : : /*
3153 : : * GetNextTempTableSpace
3154 : : *
3155 : : * Select the next temp tablespace to use. A result of InvalidOid means
3156 : : * to use the current database's default tablespace.
3157 : : */
3158 : : Oid
6856 tgl@sss.pgh.pa.us 3159 : 2283 : GetNextTempTableSpace(void)
3160 : : {
3161 [ + + ]: 2283 : if (numTempTableSpaces > 0)
3162 : : {
3163 : : /* Advance nextTempTableSpace counter with wraparound */
3164 [ + - ]: 1 : if (++nextTempTableSpace >= numTempTableSpaces)
3165 : 1 : nextTempTableSpace = 0;
3166 : 1 : return tempTableSpaces[nextTempTableSpace];
3167 : : }
3168 : 2282 : return InvalidOid;
3169 : : }
3170 : :
3171 : :
3172 : : /*
3173 : : * AtEOSubXact_Files
3174 : : *
3175 : : * Take care of subtransaction commit/abort. At abort, we close AllocateDescs
3176 : : * that the subtransaction may have opened. At commit, we reassign them to
3177 : : * the parent subtransaction. (Temporary files are tracked by ResourceOwners
3178 : : * instead.)
3179 : : */
3180 : : void
7850 3181 : 11685 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3182 : : SubTransactionId parentSubid)
3183 : : {
3184 : : Index i;
3185 : :
7900 3186 [ - + ]: 11685 : for (i = 0; i < numAllocatedDescs; i++)
3187 : : {
7850 tgl@sss.pgh.pa.us 3188 [ # # ]:UBC 0 : if (allocatedDescs[i].create_subid == mySubid)
3189 : : {
7900 3190 [ # # ]: 0 : if (isCommit)
7850 3191 : 0 : allocatedDescs[i].create_subid = parentSubid;
3192 : : else
3193 : : {
3194 : : /* have to recheck the item after FreeDesc (ugly) */
7900 3195 : 0 : FreeDesc(&allocatedDescs[i--]);
3196 : : }
3197 : : }
3198 : : }
7900 tgl@sss.pgh.pa.us 3199 :CBC 11685 : }
3200 : :
3201 : : /*
3202 : : * AtEOXact_Files
3203 : : *
3204 : : * This routine is called during transaction commit or abort. All still-open
3205 : : * per-transaction temporary file VFDs are closed, which also causes the
3206 : : * underlying files to be deleted (although they should've been closed already
3207 : : * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3208 : : * closed. We also forget any transaction-local temp tablespace list.
3209 : : *
3210 : : * The isCommit flag is used only to decide whether to emit warnings about
3211 : : * unclosed files.
3212 : : */
3213 : : void
2878 3214 : 337798 : AtEOXact_Files(bool isCommit)
3215 : : {
3216 : 337798 : CleanupTempFiles(isCommit, false);
6856 3217 : 337798 : tempTableSpaces = NULL;
3218 : 337798 : numTempTableSpaces = -1;
8356 3219 : 337798 : }
3220 : :
3221 : : /*
3222 : : * BeforeShmemExit_Files
3223 : : *
3224 : : * before_shmem_exit hook to clean up temp files during backend shutdown.
3225 : : * Here, we want to clean up *all* temp files including interXact ones.
3226 : : */
3227 : : static void
1681 andres@anarazel.de 3228 : 21554 : BeforeShmemExit_Files(int code, Datum arg)
3229 : : {
2878 tgl@sss.pgh.pa.us 3230 : 21554 : CleanupTempFiles(false, true);
3231 : :
3232 : : /* prevent further temp files from being created */
3233 : : #ifdef USE_ASSERT_CHECKING
1681 andres@anarazel.de 3234 : 21554 : temporary_files_allowed = false;
3235 : : #endif
8356 tgl@sss.pgh.pa.us 3236 : 21554 : }
3237 : :
3238 : : /*
3239 : : * Close temporary files and delete their underlying files.
3240 : : *
3241 : : * isCommit: if true, this is normal transaction commit, and we don't
3242 : : * expect any remaining files; warn if there are some.
3243 : : *
3244 : : * isProcExit: if true, this is being called as the backend process is
3245 : : * exiting. If that's the case, we should remove all temporary files; if
3246 : : * that's not the case, we are being called for transaction commit/abort
3247 : : * and should only remove transaction-local temp files. In either case,
3248 : : * also clean up "allocated" stdio files, dirs and fds.
3249 : : */
3250 : : static void
2878 3251 : 359352 : CleanupTempFiles(bool isCommit, bool isProcExit)
3252 : : {
3253 : : Index i;
3254 : :
3255 : : /*
3256 : : * Careful here: at proc_exit we need extra cleanup, not just
3257 : : * xact_temporary files.
3258 : : */
4897 3259 [ + + + + ]: 359352 : if (isProcExit || have_xact_temporary_files)
3260 : : {
3189 3261 [ - + ]: 22407 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
9807 3262 [ + + ]: 1240252 : for (i = 1; i < SizeVfdCache; i++)
3263 : : {
8356 3264 : 1217845 : unsigned short fdstate = VfdCache[i].fdstate;
3265 : :
3026 andres@anarazel.de 3266 [ + + - + ]: 1217845 : if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3267 [ + - ]: 4 : VfdCache[i].fileName != NULL)
3268 : : {
3269 : : /*
3270 : : * If we're in the process of exiting a backend process, close
3271 : : * all temporary files. Otherwise, only close temporary files
3272 : : * local to the current transaction. They should be closed by
3273 : : * the ResourceOwner mechanism already, so this is just a
3274 : : * debugging cross-check.
3275 : : */
4897 tgl@sss.pgh.pa.us 3276 [ + - ]: 4 : if (isProcExit)
3277 : 4 : FileClose(i);
3026 andres@anarazel.de 3278 [ # # ]:UBC 0 : else if (fdstate & FD_CLOSE_AT_EOXACT)
3279 : : {
4897 tgl@sss.pgh.pa.us 3280 [ # # ]: 0 : elog(WARNING,
3281 : : "temporary file %s not closed at end-of-transaction",
3282 : : VfdCache[i].fileName);
3283 : 0 : FileClose(i);
3284 : : }
3285 : : }
3286 : : }
3287 : :
4897 tgl@sss.pgh.pa.us 3288 :CBC 22407 : have_xact_temporary_files = false;
3289 : : }
3290 : :
3291 : : /* Complain if any allocated files remain open at commit. */
2878 3292 [ + + - + ]: 359352 : if (isCommit && numAllocatedDescs > 0)
2878 tgl@sss.pgh.pa.us 3293 [ # # ]:UBC 0 : elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3294 : : numAllocatedDescs);
3295 : :
3296 : : /* Clean up "allocated" stdio files, dirs and fds. */
7900 tgl@sss.pgh.pa.us 3297 [ + + ]:CBC 359502 : while (numAllocatedDescs > 0)
3298 : 150 : FreeDesc(&allocatedDescs[0]);
9807 3299 : 359352 : }
3300 : :
3301 : :
3302 : : /*
3303 : : * Remove temporary and temporary relation files left over from a prior
3304 : : * postmaster session
3305 : : *
3306 : : * This should be called during postmaster startup. It will forcibly
3307 : : * remove any leftover files created by OpenTemporaryFile and any leftover
3308 : : * temporary relation files created by mdcreate.
3309 : : *
3310 : : * During post-backend-crash restart cycle, this routine is called when
3311 : : * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3312 : : * queries are using temp files could result in useless storage usage that can
3313 : : * only be reclaimed by a service restart. The argument against enabling it is
3314 : : * that someone might want to examine the temporary files for debugging
3315 : : * purposes. This does however mean that OpenTemporaryFile had better allow for
3316 : : * collision with an existing temp file name.
3317 : : *
3318 : : * NOTE: this function and its subroutines generally report syscall failures
3319 : : * with ereport(LOG) and keep going. Removing temp files is not so critical
3320 : : * that we should fail to start the database when we can't do it.
3321 : : */
3322 : : void
9043 3323 : 926 : RemovePgTempFiles(void)
3324 : : {
3325 : : char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3326 : : DIR *spc_dir;
3327 : : struct dirent *spc_de;
3328 : :
3329 : : /*
3330 : : * First process temp files in pg_default ($PGDATA/base)
3331 : : */
6860 3332 : 926 : snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2989 3333 : 926 : RemovePgTempFilesInDir(temp_path, true, false);
5693 rhaas@postgresql.org 3334 : 926 : RemovePgTempRelationFiles("base");
3335 : :
3336 : : /*
3337 : : * Cycle through temp directories for all non-default tablespaces.
3338 : : */
558 michael@paquier.xyz 3339 : 926 : spc_dir = AllocateDir(PG_TBLSPC_DIR);
3340 : :
3341 [ + + ]: 2851 : while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3342 : : {
6860 tgl@sss.pgh.pa.us 3343 [ + + ]: 1925 : if (strcmp(spc_de->d_name, ".") == 0 ||
3344 [ + + ]: 999 : strcmp(spc_de->d_name, "..") == 0)
7746 3345 : 1852 : continue;
3346 : :
558 michael@paquier.xyz 3347 : 73 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3348 : 73 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY,
3349 : : PG_TEMP_FILES_DIR);
2989 tgl@sss.pgh.pa.us 3350 : 73 : RemovePgTempFilesInDir(temp_path, true, false);
3351 : :
558 michael@paquier.xyz 3352 : 73 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3353 : 73 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
5693 rhaas@postgresql.org 3354 : 73 : RemovePgTempRelationFiles(temp_path);
3355 : : }
3356 : :
6860 tgl@sss.pgh.pa.us 3357 : 926 : FreeDir(spc_dir);
3358 : :
3359 : : /*
3360 : : * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3361 : : * DataDir as well. However, that is *not* cleaned here because doing so
3362 : : * would create a race condition. It's done separately, earlier in
3363 : : * postmaster startup.
3364 : : */
7746 3365 : 926 : }
3366 : :
3367 : : /*
3368 : : * Process one pgsql_tmp directory for RemovePgTempFiles.
3369 : : *
3370 : : * If missing_ok is true, it's all right for the named directory to not exist.
3371 : : * Any other problem results in a LOG message. (missing_ok should be true at
3372 : : * the top level, since pgsql_tmp directories are not created until needed.)
3373 : : *
3374 : : * At the top level, this should be called with unlink_all = false, so that
3375 : : * only files matching the temporary name prefix will be unlinked. When
3376 : : * recursing it will be called with unlink_all = true to unlink everything
3377 : : * under a top-level temporary directory.
3378 : : *
3379 : : * (These two flags could be replaced by one, but it seems clearer to keep
3380 : : * them separate.)
3381 : : */
3382 : : void
2989 3383 : 1000 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3384 : : {
3385 : : DIR *temp_dir;
3386 : : struct dirent *temp_de;
3387 : : char rm_path[MAXPGPATH * 2];
3388 : :
7746 3389 : 1000 : temp_dir = AllocateDir(tmpdirname);
3390 : :
2989 3391 [ + + + - : 1000 : if (temp_dir == NULL && errno == ENOENT && missing_ok)
+ - ]
3392 : 923 : return;
3393 : :
3023 3394 [ + + ]: 234 : while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3395 : : {
7746 3396 [ + + ]: 157 : if (strcmp(temp_de->d_name, ".") == 0 ||
3397 [ + + ]: 80 : strcmp(temp_de->d_name, "..") == 0)
3398 : 154 : continue;
3399 : :
3400 : 3 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3401 : 3 : tmpdirname, temp_de->d_name);
3402 : :
3026 andres@anarazel.de 3403 [ + - ]: 3 : if (unlink_all ||
3404 [ + - ]: 3 : strncmp(temp_de->d_name,
3405 : : PG_TEMP_FILE_PREFIX,
3406 : : strlen(PG_TEMP_FILE_PREFIX)) == 0)
3407 : 3 : {
1290 michael@paquier.xyz 3408 : 3 : PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3409 : :
3410 [ - + ]: 3 : if (type == PGFILETYPE_ERROR)
3026 andres@anarazel.de 3411 :UBC 0 : continue;
1290 michael@paquier.xyz 3412 [ + + ]:CBC 3 : else if (type == PGFILETYPE_DIR)
3413 : : {
3414 : : /* recursively remove contents, then directory itself */
2989 tgl@sss.pgh.pa.us 3415 : 1 : RemovePgTempFilesInDir(rm_path, false, true);
3416 : :
3023 3417 [ - + ]: 1 : if (rmdir(rm_path) < 0)
3023 tgl@sss.pgh.pa.us 3418 [ # # ]:UBC 0 : ereport(LOG,
3419 : : (errcode_for_file_access(),
3420 : : errmsg("could not remove directory \"%s\": %m",
3421 : : rm_path)));
3422 : : }
3423 : : else
3424 : : {
3023 tgl@sss.pgh.pa.us 3425 [ - + ]:CBC 2 : if (unlink(rm_path) < 0)
3023 tgl@sss.pgh.pa.us 3426 [ # # ]:UBC 0 : ereport(LOG,
3427 : : (errcode_for_file_access(),
3428 : : errmsg("could not remove file \"%s\": %m",
3429 : : rm_path)));
3430 : : }
3431 : : }
3432 : : else
3433 [ # # ]: 0 : ereport(LOG,
3434 : : (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3435 : : rm_path)));
3436 : : }
3437 : :
7746 tgl@sss.pgh.pa.us 3438 :CBC 77 : FreeDir(temp_dir);
3439 : : }
3440 : :
3441 : : /* Process one tablespace directory, look for per-DB subdirectories */
3442 : : static void
5693 rhaas@postgresql.org 3443 : 999 : RemovePgTempRelationFiles(const char *tsdirname)
3444 : : {
3445 : : DIR *ts_dir;
3446 : : struct dirent *de;
3447 : : char dbspace_path[MAXPGPATH * 2];
3448 : :
3449 : 999 : ts_dir = AllocateDir(tsdirname);
3450 : :
3023 tgl@sss.pgh.pa.us 3451 [ + + ]: 6218 : while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3452 : : {
3453 : : /*
3454 : : * We're only interested in the per-database directories, which have
3455 : : * numeric names. Note that this code will also (properly) ignore "."
3456 : : * and "..".
3457 : : */
3458 [ + + ]: 5219 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
5693 rhaas@postgresql.org 3459 : 2074 : continue;
3460 : :
3461 : 3145 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3462 : 3145 : tsdirname, de->d_name);
3463 : 3145 : RemovePgTempRelationFilesInDbspace(dbspace_path);
3464 : : }
3465 : :
3466 : 999 : FreeDir(ts_dir);
3467 : 999 : }
3468 : :
3469 : : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3470 : : static void
3471 : 3145 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3472 : : {
3473 : : DIR *dbspace_dir;
3474 : : struct dirent *de;
3475 : : char rm_path[MAXPGPATH * 2];
3476 : :
3477 : 3145 : dbspace_dir = AllocateDir(dbspacedirname);
3478 : :
3023 tgl@sss.pgh.pa.us 3479 [ + + ]: 953456 : while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3480 : : {
5693 rhaas@postgresql.org 3481 [ + + ]: 950311 : if (!looks_like_temp_rel_name(de->d_name))
3482 : 950307 : continue;
3483 : :
3484 : 4 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3485 : 4 : dbspacedirname, de->d_name);
3486 : :
3023 tgl@sss.pgh.pa.us 3487 [ - + ]: 4 : if (unlink(rm_path) < 0)
3023 tgl@sss.pgh.pa.us 3488 [ # # ]:UBC 0 : ereport(LOG,
3489 : : (errcode_for_file_access(),
3490 : : errmsg("could not remove file \"%s\": %m",
3491 : : rm_path)));
3492 : : }
3493 : :
5693 rhaas@postgresql.org 3494 :CBC 3145 : FreeDir(dbspace_dir);
3495 : 3145 : }
3496 : :
/*
 * Does "name" look like the file name of a temporary relation?
 *
 * Accepted shapes are t<digits>_<digits>, optionally followed by
 * _<forkname>, optionally followed by .<digits> (a segment number).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run_start;

	/* Leading "t" is mandatory. */
	if (*cur != 't')
		return false;
	cur++;

	/* First number: at least one digit, terminated by an underscore. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start || *cur != '_')
		return false;
	cur++;

	/* Second number: again at least one digit. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start)
		return false;

	/* Optional "_forkname" part. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		/* step over the underscore plus the fork name itself */
		cur += forklen + 1;
	}

	/* Optional ".segment" part, which needs at least one digit. */
	if (*cur == '.')
	{
		cur++;
		run_start = cur;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == run_start)
			return false;
	}

	/* Anything left over means the name doesn't match. */
	return *cur == '\0';
}
3545 : :
3546 : : #ifdef HAVE_SYNCFS
3547 : : static void
1821 tmunro@postgresql.or 3548 :UBC 0 : do_syncfs(const char *path)
3549 : : {
3550 : : int fd;
3551 : :
1602 rhaas@postgresql.org 3552 [ # # # # ]: 0 : ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
3553 : : path);
3554 : :
1821 tmunro@postgresql.or 3555 : 0 : fd = OpenTransientFile(path, O_RDONLY);
3556 [ # # ]: 0 : if (fd < 0)
3557 : : {
3558 [ # # ]: 0 : ereport(LOG,
3559 : : (errcode_for_file_access(),
3560 : : errmsg("could not open file \"%s\": %m", path)));
3561 : 0 : return;
3562 : : }
3563 [ # # ]: 0 : if (syncfs(fd) < 0)
3564 [ # # ]: 0 : ereport(LOG,
3565 : : (errcode_for_file_access(),
3566 : : errmsg("could not synchronize file system for file \"%s\": %m", path)));
3567 : 0 : CloseTransientFile(fd);
3568 : : }
3569 : : #endif
3570 : :
3571 : : /*
3572 : : * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3573                 :                :                :  * all potential filesystems, depending on recovery_init_sync_method setting.
3574 : : *
3575 : : * We fsync regular files and directories wherever they are, but we
3576 : : * follow symlinks only for pg_wal and immediately under pg_tblspc.
3577 : : * Other symlinks are presumed to point at files we're not responsible
3578 : : * for fsyncing, and might not have privileges to write at all.
3579 : : *
3580 : : * Errors are logged but not considered fatal; that's because this is used
3581 : : * only during database startup, to deal with the possibility that there are
3582 : : * issued-but-unsynced writes pending against the data directory. We want to
3583 : : * ensure that such writes reach disk before anything that's done in the new
3584 : : * run. However, aborting on error would result in failure to start for
3585 : : * harmless cases such as read-only files in the data directory, and that's
3586 : : * not good either.
3587 : : *
3588 : : * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3589 : : * rewriting all changes again during recovery.
3590 : : *
3591 : : * Note we assume we're chdir'd into PGDATA to begin with.
3592 : : */
3593 : : void
3944 tgl@sss.pgh.pa.us 3594 :CBC 186 : SyncDataDirectory(void)
3595 : : {
3596 : : bool xlog_is_symlink;
3597 : :
3598 : : /* We can skip this whole thing if fsync is disabled. */
3599 [ + - ]: 186 : if (!enableFsync)
3600 : 186 : return;
3601 : :
3602 : : /*
3603 : : * If pg_wal is a symlink, we'll need to recurse into it separately,
3604 : : * because the first walkdir below will ignore it.
3605 : : */
3944 tgl@sss.pgh.pa.us 3606 :UBC 0 : xlog_is_symlink = false;
3607 : :
3608 : : {
3609 : : struct stat st;
3610 : :
3433 rhaas@postgresql.org 3611 [ # # ]: 0 : if (lstat("pg_wal", &st) < 0)
3944 tgl@sss.pgh.pa.us 3612 [ # # ]: 0 : ereport(LOG,
3613 : : (errcode_for_file_access(),
3614 : : errmsg("could not stat file \"%s\": %m",
3615 : : "pg_wal")));
3616 [ # # ]: 0 : else if (S_ISLNK(st.st_mode))
3617 : 0 : xlog_is_symlink = true;
3618 : : }
3619 : :
3620 : : #ifdef HAVE_SYNCFS
921 nathan@postgresql.or 3621 [ # # ]: 0 : if (recovery_init_sync_method == DATA_DIR_SYNC_METHOD_SYNCFS)
3622 : : {
3623 : : DIR *dir;
3624 : : struct dirent *de;
3625 : :
3626 : : /*
3627 : : * On Linux, we don't have to open every single file one by one. We
3628 : : * can use syncfs() to sync whole filesystems. We only expect
3629 : : * filesystem boundaries to exist where we tolerate symlinks, namely
3630 : : * pg_wal and the tablespaces, so we call syncfs() for each of those
3631 : : * directories.
3632 : : */
3633 : :
3634 : : /* Prepare to report progress syncing the data directory via syncfs. */
1602 rhaas@postgresql.org 3635 : 0 : begin_startup_progress_phase();
3636 : :
3637 : : /* Sync the top level pgdata directory. */
1821 tmunro@postgresql.or 3638 : 0 : do_syncfs(".");
3639 : : /* If any tablespaces are configured, sync each of those. */
558 michael@paquier.xyz 3640 : 0 : dir = AllocateDir(PG_TBLSPC_DIR);
3641 [ # # ]: 0 : while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3642 : : {
3643 : : char path[MAXPGPATH];
3644 : :
1821 tmunro@postgresql.or 3645 [ # # # # ]: 0 : if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3646 : 0 : continue;
3647 : :
558 michael@paquier.xyz 3648 : 0 : snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
1821 tmunro@postgresql.or 3649 : 0 : do_syncfs(path);
3650 : : }
3651 : 0 : FreeDir(dir);
3652 : : /* If pg_wal is a symlink, process that too. */
3653 [ # # ]: 0 : if (xlog_is_symlink)
3654 : 0 : do_syncfs("pg_wal");
3655 : 0 : return;
3656 : : }
3657 : : #endif /* !HAVE_SYNCFS */
3658 : :
3659 : : #ifdef PG_FLUSH_DATA_WORKS
3660 : : /* Prepare to report progress of the pre-fsync phase. */
1602 rhaas@postgresql.org 3661 : 0 : begin_startup_progress_phase();
3662 : :
3663 : : /*
3664 : : * If possible, hint to the kernel that we're soon going to fsync the data
3665 : : * directory and its contents. Errors in this step are even less
3666 : : * interesting than normal, so log them only at DEBUG1.
3667 : : */
3944 tgl@sss.pgh.pa.us 3668 : 0 : walkdir(".", pre_sync_fname, false, DEBUG1);
3669 [ # # ]: 0 : if (xlog_is_symlink)
3433 rhaas@postgresql.org 3670 : 0 : walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
558 michael@paquier.xyz 3671 : 0 : walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3672 : : #endif
3673 : :
3674 : : /* Prepare to report progress syncing the data directory via fsync. */
1602 rhaas@postgresql.org 3675 : 0 : begin_startup_progress_phase();
3676 : :
3677 : : /*
3678 : : * Now we do the fsync()s in the same order.
3679 : : *
3680 : : * The main call ignores symlinks, so in addition to specially processing
3681 : : * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3682 : : * process_symlinks = true. Note that if there are any plain directories
3683 : : * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3684 : : * so we don't worry about optimizing it.
3685 : : */
3658 andres@anarazel.de 3686 : 0 : walkdir(".", datadir_fsync_fname, false, LOG);
3944 tgl@sss.pgh.pa.us 3687 [ # # ]: 0 : if (xlog_is_symlink)
3433 rhaas@postgresql.org 3688 : 0 : walkdir("pg_wal", datadir_fsync_fname, false, LOG);
558 michael@paquier.xyz 3689 : 0 : walkdir(PG_TBLSPC_DIR, datadir_fsync_fname, true, LOG);
3690 : : }
3691 : :
3692 : : /*
3693 : : * walkdir: recursively walk a directory, applying the action to each
3694 : : * regular file and directory (including the named directory itself).
3695 : : *
3696 : : * If process_symlinks is true, the action and recursion are also applied
3697 : : * to regular files and directories that are pointed to by symlinks in the
3698 : : * given directory; otherwise symlinks are ignored. Symlinks are always
3699 : : * ignored in subdirectories, ie we intentionally don't pass down the
3700 : : * process_symlinks flag to recursive calls.
3701 : : *
3702 : : * Errors are reported at level elevel, which might be ERROR or less.
3703 : : *
3704 : : * See also walkdir in file_utils.c, which is a frontend version of this
3705 : : * logic.
3706 : : */
3707 : : static void
3944 tgl@sss.pgh.pa.us 3708 :CBC 195 : walkdir(const char *path,
3709 : : void (*action) (const char *fname, bool isdir, int elevel),
3710 : : bool process_symlinks,
3711 : : int elevel)
3712 : : {
3713 : : DIR *dir;
3714 : : struct dirent *de;
3715 : :
3968 rhaas@postgresql.org 3716 : 195 : dir = AllocateDir(path);
3717 : :
3944 tgl@sss.pgh.pa.us 3718 [ + + ]: 2127 : while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3719 : : {
3720 : : char subpath[MAXPGPATH * 2];
3721 : :
3968 rhaas@postgresql.org 3722 [ - + ]: 1932 : CHECK_FOR_INTERRUPTS();
3723 : :
3724 [ + + ]: 1932 : if (strcmp(de->d_name, ".") == 0 ||
3725 [ + + ]: 1737 : strcmp(de->d_name, "..") == 0)
3726 : 390 : continue;
3727 : :
3260 peter_e@gmx.net 3728 : 1542 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3729 : :
2015 tmunro@postgresql.or 3730 [ + - - ]: 1542 : switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3731 : : {
3732 : 1542 : case PGFILETYPE_REG:
3733 : 1542 : (*action) (subpath, false, elevel);
3734 : 1542 : break;
2015 tmunro@postgresql.or 3735 :UBC 0 : case PGFILETYPE_DIR:
3736 : 0 : walkdir(subpath, action, false, elevel);
3737 : 0 : break;
3738 : 0 : default:
3739 : :
3740 : : /*
3741 : : * Errors are already reported directly by get_dirent_type(),
3742 : : * and any remaining symlinks and unknown file types are
3743 : : * ignored.
3744 : : */
3745 : 0 : break;
3746 : : }
3747 : : }
3748 : :
3944 tgl@sss.pgh.pa.us 3749 :CBC 195 : FreeDir(dir); /* we ignore any error here */
3750 : :
3751 : : /*
3752 : : * It's important to fsync the destination directory itself as individual
3753 : : * file fsyncs don't guarantee that the directory entry for the file is
3754 : : * synced. However, skip this if AllocateDir failed; the action function
3755 : : * might not be robust against that.
3756 : : */
3023 3757 [ + - ]: 195 : if (dir)
3758 : 195 : (*action) (path, true, elevel);
3944 3759 : 195 : }
3760 : :
3761 : :
3762 : : /*
3763 : : * Hint to the OS that it should get ready to fsync() this file.
3764 : : *
3765 : : * Ignores errors trying to open unreadable files, and logs other errors at a
3766 : : * caller-specified level.
3767 : : */
3768 : : #ifdef PG_FLUSH_DATA_WORKS
3769 : :
3770 : : static void
3944 tgl@sss.pgh.pa.us 3771 :UBC 0 : pre_sync_fname(const char *fname, bool isdir, int elevel)
3772 : : {
3773 : : int fd;
3774 : :
3775 : : /* Don't try to flush directories, it'll likely just fail */
3623 3776 [ # # ]: 0 : if (isdir)
3777 : 0 : return;
3778 : :
1602 rhaas@postgresql.org 3779 [ # # # # ]: 0 : ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
3780 : : fname);
3781 : :
3095 peter_e@gmx.net 3782 : 0 : fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
3783 : :
3944 tgl@sss.pgh.pa.us 3784 [ # # ]: 0 : if (fd < 0)
3785 : : {
3623 3786 [ # # ]: 0 : if (errno == EACCES)
3944 3787 : 0 : return;
3788 [ # # ]: 0 : ereport(elevel,
3789 : : (errcode_for_file_access(),
3790 : : errmsg("could not open file \"%s\": %m", fname)));
3791 : 0 : return;
3792 : : }
3793 : :
3794 : : /*
3795 : : * pg_flush_data() ignores errors, which is ok because this is only a
3796 : : * hint.
3797 : : */
3677 andres@anarazel.de 3798 : 0 : pg_flush_data(fd, 0, 0);
3799 : :
2444 peter@eisentraut.org 3800 [ # # ]: 0 : if (CloseTransientFile(fd) != 0)
2563 michael@paquier.xyz 3801 [ # # ]: 0 : ereport(elevel,
3802 : : (errcode_for_file_access(),
3803 : : errmsg("could not close file \"%s\": %m", fname)));
3804 : : }
3805 : :
3806 : : #endif /* PG_FLUSH_DATA_WORKS */
3807 : :
/*
 * walkdir action used by SyncDataDirectory: report startup progress for
 * "fname", then fsync it via fsync_fname_ext().
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * Errors about unreadable files should be silently ignored here, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.  Its result
	 * is not needed.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3820 : :
/*
 * walkdir-style action that removes "fname".
 *
 * Directories are removed with rmdir(), tolerating ENOENT; other rmdir
 * failures are reported at "elevel".  Regular files are handed to
 * PathNameDeleteTemporaryFile().
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* An already-missing directory is not worth complaining about. */
	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3837 : :
3838 : : /*
3839 : : * fsync_fname_ext -- Try to fsync a file or directory
3840 : : *
3841 : : * If ignore_perm is true, ignore errors upon trying to open unreadable
3842 : : * files. Logs other errors at a caller-specified level.
3843 : : *
3844 : : * Returns 0 if the operation succeeded, -1 otherwise.
3845 : : */
3846 : : int
3658 3847 : 43636 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3848 : : {
3849 : : int fd;
3850 : : int flags;
3851 : : int returncode;
3852 : :
3853 : : /*
3854 : : * Some OSs require directories to be opened read-only whereas other
3855 : : * systems don't allow us to fsync files opened read-only; so we need both
3856 : : * cases here. Using O_RDWR will cause us to fail to fsync files that are
3857 : : * not writable by our userid, but we assume that's OK.
3858 : : */
3944 tgl@sss.pgh.pa.us 3859 : 43636 : flags = PG_BINARY;
3860 [ + + ]: 43636 : if (!isdir)
3861 : 16241 : flags |= O_RDWR;
3862 : : else
3863 : 27395 : flags |= O_RDONLY;
3864 : :
3095 peter_e@gmx.net 3865 : 43636 : fd = OpenTransientFile(fname, flags);
3866 : :
3867 : : /*
3868 : : * Some OSs don't allow us to open directories at all (Windows returns
3869 : : * EACCES), just ignore the error in that case. If desired also silently
3870 : : * ignoring errors about unreadable files. Log others.
3871 : : */
3658 andres@anarazel.de 3872 [ - + - - : 43636 : if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
- - - - ]
3658 andres@anarazel.de 3873 :UBC 0 : return 0;
3658 andres@anarazel.de 3874 [ - + - - :CBC 43636 : else if (fd < 0 && ignore_perm && errno == EACCES)
- - ]
3658 andres@anarazel.de 3875 :UBC 0 : return 0;
3658 andres@anarazel.de 3876 [ - + ]:CBC 43636 : else if (fd < 0)
3877 : : {
3944 tgl@sss.pgh.pa.us 3878 [ # # ]:UBC 0 : ereport(elevel,
3879 : : (errcode_for_file_access(),
3880 : : errmsg("could not open file \"%s\": %m", fname)));
3658 andres@anarazel.de 3881 : 0 : return -1;
3882 : : }
3883 : :
3944 tgl@sss.pgh.pa.us 3884 :CBC 43636 : returncode = pg_fsync(fd);
3885 : :
3886 : : /*
3887 : : * Some OSes don't allow us to fsync directories at all, so we can ignore
3888 : : * those errors. Anything else needs to be logged.
3889 : : */
2576 tmunro@postgresql.or 3890 [ - + - - : 43636 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
- - - - ]
3891 : : {
3892 : : int save_errno;
3893 : :
3894 : : /* close file upon error, might not be in transaction context */
3658 andres@anarazel.de 3895 :UBC 0 : save_errno = errno;
3896 : 0 : (void) CloseTransientFile(fd);
3897 : 0 : errno = save_errno;
3898 : :
3944 tgl@sss.pgh.pa.us 3899 [ # # ]: 0 : ereport(elevel,
3900 : : (errcode_for_file_access(),
3901 : : errmsg("could not fsync file \"%s\": %m", fname)));
3658 andres@anarazel.de 3902 : 0 : return -1;
3903 : : }
3904 : :
2444 peter@eisentraut.org 3905 [ - + ]:CBC 43636 : if (CloseTransientFile(fd) != 0)
3906 : : {
2563 michael@paquier.xyz 3907 [ # # ]:UBC 0 : ereport(elevel,
3908 : : (errcode_for_file_access(),
3909 : : errmsg("could not close file \"%s\": %m", fname)));
3910 : 0 : return -1;
3911 : : }
3912 : :
3658 andres@anarazel.de 3913 :CBC 43636 : return 0;
3914 : : }
3915 : :
3916 : : /*
3917 : : * fsync_parent_path -- fsync the parent path of a file or directory
3918 : : *
3919 : : * This is aimed at making file operations persistent on disk in case of
3920 : : * an OS crash or power failure.
3921 : : */
3922 : : static int
3923 : 7880 : fsync_parent_path(const char *fname, int elevel)
3924 : : {
3925 : : char parentpath[MAXPGPATH];
3926 : :
3927 : 7880 : strlcpy(parentpath, fname, MAXPGPATH);
3928 : 7880 : get_parent_directory(parentpath);
3929 : :
3930 : : /*
3931 : : * get_parent_directory() returns an empty string if the input argument is
3932 : : * just a file name (see comments in path.c), so handle that as being the
3933 : : * current directory.
3934 : : */
3935 [ + + ]: 7880 : if (strlen(parentpath) == 0)
3936 : 230 : strlcpy(parentpath, ".", MAXPGPATH);
3937 : :
3938 [ - + ]: 7880 : if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3658 andres@anarazel.de 3939 :UBC 0 : return -1;
3940 : :
3658 andres@anarazel.de 3941 :CBC 7880 : return 0;
3942 : : }
3943 : :
3944 : : /*
3945 : : * Create a PostgreSQL data sub-directory
3946 : : *
3947 : : * The data directory itself, and most of its sub-directories, are created at
3948 : : * initdb time, but we do have some occasions when we create directories in
3949 : : * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3950 : : * make sure that those directories are created consistently. Today, that means
3951 : : * making sure that the created directory has the correct permissions, which is
3952 : : * what pg_dir_create_mode tracks for us.
3953 : : *
3954 : : * Note that we also set the umask() based on what we understand the correct
3955 : : * permissions to be (see file_perm.c).
3956 : : *
3957 : : * For permissions other than the default, mkdir() can be used directly, but
3958 : : * be sure to consider carefully such cases -- a sub-directory with incorrect
3959 : : * permissions in a PostgreSQL data directory could cause backups and other
3960 : : * processes to fail.
3961 : : */
3962 : : int
2899 sfrost@snowman.net 3963 : 1606 : MakePGDirectory(const char *directoryName)
3964 : : {
3965 : 1606 : return mkdir(directoryName, pg_dir_create_mode);
3966 : : }
3967 : :
3968 : : /*
3969 : : * Return the passed-in error level, or PANIC if data_sync_retry is off.
3970 : : *
3971 : : * Failure to fsync any data file is cause for immediate panic, unless
3972 : : * data_sync_retry is enabled. Data may have been written to the operating
3973 : : * system and removed from our buffer pool already, and if we are running on
3974 : : * an operating system that forgets dirty data on write-back failure, there
3975 : : * may be only one copy of the data remaining: in the WAL. A later attempt to
3976 : : * fsync again might falsely report success. Therefore we must not allow any
3977 : : * further checkpoints to be attempted. data_sync_retry can in theory be
3978 : : * enabled on systems known not to drop dirty buffered data on write-back
3979 : : * failure (with the likely outcome that checkpoints will continue to fail
3980 : : * until the underlying problem is fixed).
3981 : : *
3982 : : * Any code that reports a failure from fsync() or related functions should
3983 : : * filter the error level with this function.
3984 : : */
3985 : : int
2673 tmunro@postgresql.or 3986 : 22488 : data_sync_elevel(int elevel)
3987 : : {
3988 [ - + ]: 22488 : return data_sync_retry ? elevel : PANIC;
3989 : : }
3990 : :
3991 : : bool
936 peter@eisentraut.org 3992 : 1186 : check_debug_io_direct(char **newval, void **extra, GucSource source)
3993 : : {
1072 tmunro@postgresql.or 3994 : 1186 : bool result = true;
3995 : : int flags;
3996 : :
3997 : : #if PG_O_DIRECT == 0
3998 : : if (strcmp(*newval, "") != 0)
3999 : : {
4000 : : GUC_check_errdetail("\"%s\" is not supported on this platform.",
4001 : : "debug_io_direct");
4002 : : result = false;
4003 : : }
4004 : : flags = 0;
4005 : : #else
4006 : : List *elemlist;
4007 : : ListCell *l;
4008 : : char *rawstring;
4009 : :
4010 : : /* Need a modifiable copy of string */
4011 : 1186 : rawstring = pstrdup(*newval);
4012 : :
4013 [ - + ]: 1186 : if (!SplitGUCList(rawstring, ',', &elemlist))
4014 : : {
473 alvherre@alvh.no-ip. 4015 :UBC 0 : GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4016 : : "debug_io_direct");
1072 tmunro@postgresql.or 4017 : 0 : pfree(rawstring);
4018 : 0 : list_free(elemlist);
4019 : 0 : return false;
4020 : : }
4021 : :
1072 tmunro@postgresql.or 4022 :CBC 1186 : flags = 0;
4023 [ + + + + : 1192 : foreach(l, elemlist)
+ + ]
4024 : : {
4025 : 6 : char *item = (char *) lfirst(l);
4026 : :
4027 [ + + ]: 6 : if (pg_strcasecmp(item, "data") == 0)
4028 : 2 : flags |= IO_DIRECT_DATA;
4029 [ + + ]: 4 : else if (pg_strcasecmp(item, "wal") == 0)
4030 : 2 : flags |= IO_DIRECT_WAL;
4031 [ + - ]: 2 : else if (pg_strcasecmp(item, "wal_init") == 0)
4032 : 2 : flags |= IO_DIRECT_WAL_INIT;
4033 : : else
4034 : : {
473 alvherre@alvh.no-ip. 4035 :UBC 0 : GUC_check_errdetail("Invalid option \"%s\".", item);
1072 tmunro@postgresql.or 4036 : 0 : result = false;
4037 : 0 : break;
4038 : : }
4039 : : }
4040 : :
4041 : : /*
4042 : : * It's possible to configure block sizes smaller than our assumed I/O
4043 : : * alignment size, which could result in invalid I/O requests.
4044 : : */
4045 : : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4046 : : if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4047 : : {
4048 : : GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4049 : : "debug_io_direct", "XLOG_BLCKSZ");
4050 : : result = false;
4051 : : }
4052 : : #endif
4053 : : #if BLCKSZ < PG_IO_ALIGN_SIZE
4054 : : if (result && (flags & IO_DIRECT_DATA))
4055 : : {
4056 : : GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4057 : : "debug_io_direct", "BLCKSZ");
4058 : : result = false;
4059 : : }
4060 : : #endif
4061 : :
1072 tmunro@postgresql.or 4062 :CBC 1186 : pfree(rawstring);
4063 : 1186 : list_free(elemlist);
4064 : : #endif
4065 : :
4066 [ - + ]: 1186 : if (!result)
1072 tmunro@postgresql.or 4067 :UBC 0 : return result;
4068 : :
4069 : : /* Save the flags in *extra, for use by assign_debug_io_direct */
353 dgustafsson@postgres 4070 :CBC 1186 : *extra = guc_malloc(LOG, sizeof(int));
4071 [ - + ]: 1186 : if (!*extra)
353 dgustafsson@postgres 4072 :UBC 0 : return false;
1072 tmunro@postgresql.or 4073 :CBC 1186 : *((int *) *extra) = flags;
4074 : :
4075 : 1186 : return result;
4076 : : }
4077 : :
4078 : : void
936 peter@eisentraut.org 4079 : 1186 : assign_debug_io_direct(const char *newval, void *extra)
4080 : : {
1072 tmunro@postgresql.or 4081 : 1186 : int *flags = (int *) extra;
4082 : :
4083 : 1186 : io_direct_flags = *flags;
4084 : 1186 : }
4085 : :
4086 : : /* ResourceOwner callbacks */
4087 : :
4088 : : static void
858 heikki.linnakangas@i 4089 : 4 : ResOwnerReleaseFile(Datum res)
4090 : : {
4091 : 4 : File file = (File) DatumGetInt32(res);
4092 : : Vfd *vfdP;
4093 : :
4094 [ + - + - : 4 : Assert(FileIsValid(file));
- + ]
4095 : :
4096 : 4 : vfdP = &VfdCache[file];
4097 : 4 : vfdP->resowner = NULL;
4098 : :
4099 : 4 : FileClose(file);
4100 : 4 : }
4101 : :
4102 : : static char *
858 heikki.linnakangas@i 4103 :UBC 0 : ResOwnerPrintFile(Datum res)
4104 : : {
4105 : 0 : return psprintf("File %d", DatumGetInt32(res));
4106 : : }
|