Thanks to visit codestin.com
Credit goes to doxygen.postgresql.org

PostgreSQL Source Code git master
fd.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
21 *
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a routine is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
26 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them; there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
58 * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 * release file descriptors in use by the virtual file descriptors if
60 * necessary. There is no automatic cleanup of file descriptors returned by
61 * BasicOpenFile; it is solely the caller's responsibility to close the file
62 * descriptor by calling close(2).
63 *
64 * If a non-virtual file descriptor needs to be held open for any length of
65 * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 * (and eventually ReleaseExternalFD), so that we can take it into account
67 * while deciding how many VFDs can be open. This applies to FDs obtained
68 * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 *
70 *-------------------------------------------------------------------------
71 */
72
73#include "postgres.h"
74
75#include <dirent.h>
76#include <sys/file.h>
77#include <sys/param.h>
78#include <sys/resource.h> /* for getrlimit */
79#include <sys/stat.h>
80#include <sys/types.h>
81#ifndef WIN32
82#include <sys/mman.h>
83#endif
84#include <limits.h>
85#include <unistd.h>
86#include <fcntl.h>
87
88#include "access/xact.h"
89#include "access/xlog.h"
91#include "common/file_perm.h"
92#include "common/file_utils.h"
93#include "common/pg_prng.h"
94#include "miscadmin.h"
95#include "pgstat.h"
96#include "postmaster/startup.h"
97#include "storage/aio.h"
98#include "storage/fd.h"
99#include "storage/ipc.h"
100#include "utils/guc.h"
101#include "utils/guc_hooks.h"
102#include "utils/resowner.h"
103#include "utils/varlena.h"
104
105/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
106#if defined(HAVE_SYNC_FILE_RANGE)
107#define PG_FLUSH_DATA_WORKS 1
108#elif !defined(WIN32) && defined(MS_ASYNC)
109#define PG_FLUSH_DATA_WORKS 1
110#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
111#define PG_FLUSH_DATA_WORKS 1
112#endif
113
114/*
115 * We must leave some file descriptors free for system(), the dynamic loader,
116 * and other code that tries to open files without consulting fd.c. This
117 * is the number left free. (While we try fairly hard to prevent EMFILE
118 * errors, there's never any guarantee that we won't get ENFILE due to
119 * other processes chewing up FDs. So it's a bad idea to try to open files
120 * without consulting fd.c. Nonetheless we cannot control all code.)
121 *
122 * Because this is just a fixed setting, we are effectively assuming that
123 * no such code will leave FDs open over the long term; otherwise the slop
124 * is likely to be insufficient. Note in particular that we expect that
125 * loading a shared library does not result in any permanent increase in
126 * the number of open files. (This appears to be true on most if not
127 * all platforms as of Feb 2004.)
128 */
129#define NUM_RESERVED_FDS 10
130
131/*
132 * If we have fewer than this many usable FDs after allowing for the reserved
133 * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134 * much less than that. Note that this value ensures numExternalFDs can be
135 * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136 * will not pass unless that can grow to at least 14.)
137 */
138#define FD_MINFREE 48
139
140/*
141 * A number of platforms allow individual processes to open many more files
142 * than they can really support when *many* processes do the same thing.
143 * This GUC parameter lets the DBA limit max_safe_fds to something less than
144 * what the postmaster's initial probe suggests will work.
145 */
147
148/*
149 * Maximum number of file descriptors to open for operations that fd.c knows
150 * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151 * to a conservative value, and remains that way indefinitely in bootstrap or
152 * standalone-backend cases. In normal postmaster operation, the postmaster
153 * calls set_max_safe_fds() late in initialization to update the value, and
154 * that value is then inherited by forked subprocesses.
155 *
156 * Note: the value of max_files_per_process is taken into account while
157 * setting this variable, and so need not be tested separately.
158 */
159int max_safe_fds = FD_MINFREE; /* default if not changed */
160
161/* Whether it is safe to continue running after fsync() fails. */
162bool data_sync_retry = false;
163
164/* How SyncDataDirectory() should do its job. */
166
167/* Which kinds of files should be opened with PG_O_DIRECT. */
169
170/* Debugging.... */
171
172#ifdef FDDEBUG
173#define DO_DB(A) \
174 do { \
175 int _do_db_save_errno = errno; \
176 A; \
177 errno = _do_db_save_errno; \
178 } while (0)
179#else
180#define DO_DB(A) \
181 ((void) 0)
182#endif
183
184#define VFD_CLOSED (-1)
185
186#define FileIsValid(file) \
187 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
188
189#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
190
191/* these are the assigned bits in fdstate below: */
192#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
193#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
194#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
195
196typedef struct vfd
197{
198 int fd; /* current FD, or VFD_CLOSED if none */
199 unsigned short fdstate; /* bitflags for VFD's state */
200 ResourceOwner resowner; /* owner, for automatic cleanup */
201 File nextFree; /* link to next free VFD, if in freelist */
202 File lruMoreRecently; /* doubly linked recency-of-use list */
204 off_t fileSize; /* current size of file (0 if not temporary) */
205 char *fileName; /* name of file, or NULL for unused VFD */
206 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
207 int fileFlags; /* open(2) flags for (re)opening the file */
208 mode_t fileMode; /* mode to pass to open(2) */
210
211/*
212 * Virtual File Descriptor array pointer and size. This grows as
213 * needed. 'File' values are indexes into this array.
214 * Note that VfdCache[0] is not a usable VFD, just a list header.
215 */
216static Vfd *VfdCache;
218
219/*
220 * Number of file descriptors known to be in use by VFD entries.
221 */
222static int nfile = 0;
223
224/*
225 * Flag to tell whether it's worth scanning VfdCache looking for temp files
226 * to close
227 */
228static bool have_xact_temporary_files = false;
229
230/*
231 * Tracks the total size of all temporary files. Note: when temp_file_limit
232 * is being enforced, this cannot overflow since the limit cannot be more
233 * than INT_MAX kilobytes. When not enforcing, it could theoretically
234 * overflow, but we don't care.
235 */
237
238/* Temporary file access initialized and not yet shut down? */
239#ifdef USE_ASSERT_CHECKING
240static bool temporary_files_allowed = false;
241#endif
242
243/*
244 * List of OS handles opened with AllocateFile, AllocateDir and
245 * OpenTransientFile.
246 */
247typedef enum
248{
254
255typedef struct
256{
259 union
260 {
261 FILE *file;
263 int fd;
264 } desc;
266
267static int numAllocatedDescs = 0;
268static int maxAllocatedDescs = 0;
270
271/*
272 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273 */
274static int numExternalFDs = 0;
275
276/*
277 * Number of temporary files opened during the current session;
278 * this is used in generation of tempfile names.
279 */
280static long tempFileCounter = 0;
281
282/*
283 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284 * indicating that the current database's default tablespace should be used.)
285 * When numTempTableSpaces is -1, this has not been set in the current
286 * transaction.
287 */
288static Oid *tempTableSpaces = NULL;
289static int numTempTableSpaces = -1;
290static int nextTempTableSpace = 0;
291
292
293/*--------------------
294 *
295 * Private Routines
296 *
297 * Delete - delete a file from the Lru ring
298 * LruDelete - remove a file from the Lru ring and close its FD
299 * Insert - put a file at the front of the Lru ring
300 * LruInsert - put a file at the front of the Lru ring and open it
301 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303 * AllocateVfd - grab a free (or new) file record (from VfdCache)
304 * FreeVfd - free a file record
305 *
306 * The Least Recently Used ring is a doubly linked list that begins and
307 * ends on element zero. Element zero is special -- it doesn't represent
308 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309 * anchor that shows us the beginning/end of the ring.
310 * Only VFD elements that are currently really open (have an FD assigned) are
311 * in the Lru ring. Elements that are "virtually" open can be recognized
312 * by having a non-null fileName field.
313 *
314 * example:
315 *
316 * /--less----\ /---------\
317 * v \ v \
318 * #0 --more---> LeastRecentlyUsed --more-\ \
319 * ^\ | |
320 * \\less--> MostRecentlyUsedFile <---/ |
321 * \more---/ \--less--/
322 *
323 *--------------------
324 */
325static void Delete(File file);
326static void LruDelete(File file);
327static void Insert(File file);
328static int LruInsert(File file);
329static bool ReleaseLruFile(void);
330static void ReleaseLruFiles(void);
331static File AllocateVfd(void);
332static void FreeVfd(File file);
333
334static int FileAccess(File file);
335static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
336static bool reserveAllocatedDesc(void);
337static int FreeDesc(AllocateDesc *desc);
338
339static void BeforeShmemExit_Files(int code, Datum arg);
340static void CleanupTempFiles(bool isCommit, bool isProcExit);
341static void RemovePgTempRelationFiles(const char *tsdirname);
342static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
343
344static void walkdir(const char *path,
345 void (*action) (const char *fname, bool isdir, int elevel),
346 bool process_symlinks,
347 int elevel);
348#ifdef PG_FLUSH_DATA_WORKS
349static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350#endif
351static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353
354static int fsync_parent_path(const char *fname, int elevel);
355
356
357/* ResourceOwner callbacks to hold virtual file descriptors */
358static void ResOwnerReleaseFile(Datum res);
359static char *ResOwnerPrintFile(Datum res);
360
362{
363 .name = "File",
364 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
365 .release_priority = RELEASE_PRIO_FILES,
366 .ReleaseResource = ResOwnerReleaseFile,
367 .DebugPrint = ResOwnerPrintFile
368};
369
370/* Convenience wrappers over ResourceOwnerRemember/Forget */
371static inline void
373{
375}
376static inline void
378{
380}
381
382/*
383 * pg_fsync --- do fsync with or without writethrough
384 */
385int
387{
388#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
389 struct stat st;
390
391 /*
392 * Some operating system implementations of fsync() have requirements
393 * about the file access modes that were used when their file descriptor
394 * argument was opened, and these requirements differ depending on whether
395 * the file descriptor is for a directory.
396 *
397 * For any file descriptor that may eventually be handed to fsync(), we
398 * should have opened it with access modes that are compatible with
399 * fsync() on all supported systems, otherwise the code may not be
400 * portable, even if it runs ok on the current system.
401 *
402 * We assert here that a descriptor for a file was opened with write
403 * permissions (i.e., not O_RDONLY) and for a directory without write
404 * permissions (O_RDONLY). Notice that the assertion check is made even
405 * if fsync() is disabled.
406 *
407 * If fstat() fails, ignore it and let the follow-up fsync() complain.
408 */
409 if (fstat(fd, &st) == 0)
410 {
411 int desc_flags = fcntl(fd, F_GETFL);
412
413 desc_flags &= O_ACCMODE;
414
415 if (S_ISDIR(st.st_mode))
416 Assert(desc_flags == O_RDONLY);
417 else
418 Assert(desc_flags != O_RDONLY);
419 }
420 errno = 0;
421#endif
422
423 /* #if is to skip the wal_sync_method test if there's no need for it */
424#if defined(HAVE_FSYNC_WRITETHROUGH)
427 else
428#endif
430}
431
432
433/*
434 * pg_fsync_no_writethrough --- same as fsync except does nothing if
435 * enableFsync is off
436 */
437int
439{
440 int rc;
441
442 if (!enableFsync)
443 return 0;
444
445retry:
446 rc = fsync(fd);
447
448 if (rc == -1 && errno == EINTR)
449 goto retry;
450
451 return rc;
452}
453
454/*
455 * pg_fsync_writethrough
456 */
457int
459{
460 if (enableFsync)
461 {
462#if defined(F_FULLFSYNC)
463 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
464#else
465 errno = ENOSYS;
466 return -1;
467#endif
468 }
469 else
470 return 0;
471}
472
473/*
474 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
475 */
476int
478{
479 int rc;
480
481 if (!enableFsync)
482 return 0;
483
484retry:
485 rc = fdatasync(fd);
486
487 if (rc == -1 && errno == EINTR)
488 goto retry;
489
490 return rc;
491}
492
493/*
494 * pg_file_exists -- check that a file exists.
495 *
496 * This requires an absolute path to the file. Returns true if the file is
497 * not a directory, false otherwise.
498 */
499bool
501{
502 struct stat st;
503
504 Assert(name != NULL);
505
506 if (stat(name, &st) == 0)
507 return !S_ISDIR(st.st_mode);
508 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
511 errmsg("could not access file \"%s\": %m", name)));
512
513 return false;
514}
515
516/*
517 * pg_flush_data --- advise OS that the described dirty data should be flushed
518 *
519 * offset of 0 with nbytes 0 means that the entire file should be flushed
520 */
521void
522pg_flush_data(int fd, off_t offset, off_t nbytes)
523{
524 /*
525 * Right now file flushing is primarily used to avoid making later
526 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
527 * if fsyncs are disabled - that's a decision we might want to make
528 * configurable at some point.
529 */
530 if (!enableFsync)
531 return;
532
533 /*
534 * We compile all alternatives that are supported on the current platform,
535 * to find portability problems more easily.
536 */
537#if defined(HAVE_SYNC_FILE_RANGE)
538 {
539 int rc;
540 static bool not_implemented_by_kernel = false;
541
542 if (not_implemented_by_kernel)
543 return;
544
545retry:
546
547 /*
548 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
549 * tells the OS that writeback for the specified blocks should be
550 * started, but that we don't want to wait for completion. Note that
551 * this call might block if too much dirty data exists in the range.
552 * This is the preferable method on OSs supporting it, as it works
553 * reliably when available (contrast to msync()) and doesn't flush out
554 * clean data (like FADV_DONTNEED).
555 */
556 rc = sync_file_range(fd, offset, nbytes,
557 SYNC_FILE_RANGE_WRITE);
558 if (rc != 0)
559 {
560 int elevel;
561
562 if (rc == EINTR)
563 goto retry;
564
565 /*
566 * For systems that don't have an implementation of
567 * sync_file_range() such as Windows WSL, generate only one
568 * warning and then suppress all further attempts by this process.
569 */
570 if (errno == ENOSYS)
571 {
572 elevel = WARNING;
573 not_implemented_by_kernel = true;
574 }
575 else
576 elevel = data_sync_elevel(WARNING);
577
578 ereport(elevel,
580 errmsg("could not flush dirty data: %m")));
581 }
582
583 return;
584 }
585#endif
586#if !defined(WIN32) && defined(MS_ASYNC)
587 {
588 void *p;
589 static int pagesize = 0;
590
591 /*
592 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
593 * writeback. On linux it only does so if MS_SYNC is specified, but
594 * then it does the writeback synchronously. Luckily all common linux
595 * systems have sync_file_range(). This is preferable over
596 * FADV_DONTNEED because it doesn't flush out clean data.
597 *
598 * We map the file (mmap()), tell the kernel to sync back the contents
599 * (msync()), and then remove the mapping again (munmap()).
600 */
601
602 /* mmap() needs actual length if we want to map whole file */
603 if (offset == 0 && nbytes == 0)
604 {
605 nbytes = lseek(fd, 0, SEEK_END);
606 if (nbytes < 0)
607 {
610 errmsg("could not determine dirty data size: %m")));
611 return;
612 }
613 }
614
615 /*
616 * Some platforms reject partial-page mmap() attempts. To deal with
617 * that, just truncate the request to a page boundary. If any extra
618 * bytes don't get flushed, well, it's only a hint anyway.
619 */
620
621 /* fetch pagesize only once */
622 if (pagesize == 0)
623 pagesize = sysconf(_SC_PAGESIZE);
624
625 /* align length to pagesize, dropping any fractional page */
626 if (pagesize > 0)
627 nbytes = (nbytes / pagesize) * pagesize;
628
629 /* fractional-page request is a no-op */
630 if (nbytes <= 0)
631 return;
632
633 /*
634 * mmap could well fail, particularly on 32-bit platforms where there
635 * may simply not be enough address space. If so, silently fall
636 * through to the next implementation.
637 */
638 if (nbytes <= (off_t) SSIZE_MAX)
639 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
640 else
641 p = MAP_FAILED;
642
643 if (p != MAP_FAILED)
644 {
645 int rc;
646
647 rc = msync(p, (size_t) nbytes, MS_ASYNC);
648 if (rc != 0)
649 {
652 errmsg("could not flush dirty data: %m")));
653 /* NB: need to fall through to munmap()! */
654 }
655
656 rc = munmap(p, (size_t) nbytes);
657 if (rc != 0)
658 {
659 /* FATAL error because mapping would remain */
662 errmsg("could not munmap() while flushing data: %m")));
663 }
664
665 return;
666 }
667 }
668#endif
669#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
670 {
671 int rc;
672
673 /*
674 * Signal the kernel that the passed in range should not be cached
675 * anymore. This has the, desired, side effect of writing out dirty
676 * data, and the, undesired, side effect of likely discarding useful
677 * clean cached blocks. For the latter reason this is the least
678 * preferable method.
679 */
680
681 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
682
683 if (rc != 0)
684 {
685 /* don't error out, this is just a performance optimization */
688 errmsg("could not flush dirty data: %m")));
689 }
690
691 return;
692 }
693#endif
694}
695
/*
 * pg_ftruncate --- set the length of an already-open file
 *
 * Calls ftruncate(2) on fd with the requested length, reissuing the call for
 * as long as it fails with EINTR.  The result of the last ftruncate() call
 * is returned unchanged, with errno as that call left it.
 */
static int
pg_ftruncate(int fd, off_t length)
{
	int			rc;

	do
	{
		rc = ftruncate(fd, length);
	} while (rc == -1 && errno == EINTR);

	return rc;
}
712
/*
 * pg_truncate --- truncate a file to a given length by name
 *
 * path names the file and length is the size it should have afterwards.
 *
 * On WIN32 the file is opened with OpenTransientFile(), truncated through
 * pg_ftruncate(), and closed again; elsewhere truncate(2) is called
 * directly and reissued for as long as it fails with EINTR.
 *
 * Returns the result of the truncation call, or -1 if the WIN32 open fails.
 * On failure errno describes the failed open or truncation.
 */
int
pg_truncate(const char *path, off_t length)
{
	int			ret;
#ifdef WIN32
	int			save_errno;
	int			fd;

	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		ret = pg_ftruncate(fd, length);

		/*
		 * Release the descriptor whether or not the truncation worked, and
		 * keep the truncation's errno from being clobbered by the close.
		 */
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;
#else

retry:
	ret = truncate(path, length);

	if (ret == -1 && errno == EINTR)
		goto retry;
#endif

	return ret;
}
745
746/*
747 * fsync_fname -- fsync a file or directory, handling errors properly
748 *
749 * Try to fsync a file or directory. When doing the latter, ignore errors that
750 * indicate the OS just doesn't allow/require fsyncing directories.
751 */
752void
753fsync_fname(const char *fname, bool isdir)
754{
755 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
756}
757
758/*
759 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
760 *
761 * This routine ensures that, after returning, the effect of renaming file
762 * persists in case of a crash. A crash while this routine is running will
763 * leave you with either the pre-existing or the moved file in place of the
764 * new file; no mixed state or truncated files are possible.
765 *
766 * It does so by using fsync on the old filename and the possibly existing
767 * target filename before the rename, and the target file and directory after.
768 *
769 * Note that rename() cannot be used across arbitrary directories, as they
770 * might not be on the same filesystem. Therefore this routine does not
771 * support renaming across directories.
772 *
773 * Log errors with the caller specified severity.
774 *
775 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
776 * valid upon return.
777 */
778int
779durable_rename(const char *oldfile, const char *newfile, int elevel)
780{
781 int fd;
782
783 /*
784 * First fsync the old and target path (if it exists), to ensure that they
785 * are properly persistent on disk. Syncing the target file is not
786 * strictly necessary, but it makes it easier to reason about crashes;
787 * because it's then guaranteed that either source or target file exists
788 * after a crash.
789 */
790 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
791 return -1;
792
793 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
794 if (fd < 0)
795 {
796 if (errno != ENOENT)
797 {
798 ereport(elevel,
800 errmsg("could not open file \"%s\": %m", newfile)));
801 return -1;
802 }
803 }
804 else
805 {
806 if (pg_fsync(fd) != 0)
807 {
808 int save_errno;
809
810 /* close file upon error, might not be in transaction context */
811 save_errno = errno;
813 errno = save_errno;
814
815 ereport(elevel,
817 errmsg("could not fsync file \"%s\": %m", newfile)));
818 return -1;
819 }
820
821 if (CloseTransientFile(fd) != 0)
822 {
823 ereport(elevel,
825 errmsg("could not close file \"%s\": %m", newfile)));
826 return -1;
827 }
828 }
829
830 /* Time to do the real deal... */
831 if (rename(oldfile, newfile) < 0)
832 {
833 ereport(elevel,
835 errmsg("could not rename file \"%s\" to \"%s\": %m",
836 oldfile, newfile)));
837 return -1;
838 }
839
840 /*
841 * To guarantee renaming the file is persistent, fsync the file with its
842 * new name, and its containing directory.
843 */
844 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
845 return -1;
846
847 if (fsync_parent_path(newfile, elevel) != 0)
848 return -1;
849
850 return 0;
851}
852
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash.  A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * fname is the file to remove; errors are logged with the severity
 * specified by caller, elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
889
890/*
891 * InitFileAccess --- initialize this module during backend startup
892 *
893 * This is called during either normal or standalone backend start.
894 * It is *not* called in the postmaster.
895 *
896 * Note that this does not initialize temporary file access, that is
897 * separately initialized via InitTemporaryFileAccess().
898 */
899void
901{
902 Assert(SizeVfdCache == 0); /* call me only once */
903
904 /* initialize cache header entry */
905 VfdCache = (Vfd *) malloc(sizeof(Vfd));
906 if (VfdCache == NULL)
908 (errcode(ERRCODE_OUT_OF_MEMORY),
909 errmsg("out of memory")));
910
911 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
913
914 SizeVfdCache = 1;
915}
916
917/*
918 * InitTemporaryFileAccess --- initialize temporary file access during startup
919 *
920 * This is called during either normal or standalone backend start.
921 * It is *not* called in the postmaster.
922 *
923 * This is separate from InitFileAccess() because temporary file cleanup can
924 * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
925 * our reporting has to happen before that. Low level file access should be
926 * available for longer, hence the separate initialization / shutdown of
927 * temporary file handling.
928 */
929void
931{
932 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
933 Assert(!temporary_files_allowed); /* call me only once */
934
935 /*
936 * Register before-shmem-exit hook to ensure temp files are dropped while
937 * we can still report stats.
938 */
940
941#ifdef USE_ASSERT_CHECKING
942 temporary_files_allowed = true;
943#endif
944}
945
946/*
947 * count_usable_fds --- count how many FDs the system will let us open,
948 * and estimate how many are already open.
949 *
950 * We stop counting if usable_fds reaches max_to_probe. Note: a small
951 * value of max_to_probe might result in an underestimate of already_open;
952 * we must fill in any "gaps" in the set of used FDs before the calculation
953 * of already_open will give the right answer. In practice, max_to_probe
954 * of a couple of dozen should be enough to ensure good results.
955 *
956 * We assume stderr (FD 2) is available for dup'ing. While the calling
957 * script could theoretically close that, it would be a really bad idea,
958 * since then one risks loss of error messages from, e.g., libc.
959 */
960static void
961count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
962{
963 int *fd;
964 int size;
965 int used = 0;
966 int highestfd = 0;
967 int j;
968
969#ifdef HAVE_GETRLIMIT
970 struct rlimit rlim;
971 int getrlimit_status;
972#endif
973
974 size = 1024;
975 fd = (int *) palloc(size * sizeof(int));
976
977#ifdef HAVE_GETRLIMIT
978 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
979 if (getrlimit_status != 0)
980 ereport(WARNING, (errmsg("getrlimit failed: %m")));
981#endif /* HAVE_GETRLIMIT */
982
983 /* dup until failure or probe limit reached */
984 for (;;)
985 {
986 int thisfd;
987
988#ifdef HAVE_GETRLIMIT
989
990 /*
991 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
992 * some platforms
993 */
994 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
995 break;
996#endif
997
998 thisfd = dup(2);
999 if (thisfd < 0)
1000 {
1001 /* Expect EMFILE or ENFILE, else it's fishy */
1002 if (errno != EMFILE && errno != ENFILE)
1003 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1004 break;
1005 }
1006
1007 if (used >= size)
1008 {
1009 size *= 2;
1010 fd = (int *) repalloc(fd, size * sizeof(int));
1011 }
1012 fd[used++] = thisfd;
1013
1014 if (highestfd < thisfd)
1015 highestfd = thisfd;
1016
1017 if (used >= max_to_probe)
1018 break;
1019 }
1020
1021 /* release the files we opened */
1022 for (j = 0; j < used; j++)
1023 close(fd[j]);
1024
1025 pfree(fd);
1026
1027 /*
1028 * Return results. usable_fds is just the number of successful dups. We
1029 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1030 * number) and so already_open is highestfd+1 - usable_fds.
1031 */
1032 *usable_fds = used;
1033 *already_open = highestfd + 1 - used;
1034}
1035
1036/*
1037 * set_max_safe_fds
1038 * Determine number of file descriptors that fd.c is allowed to use
1039 */
1040void
1042{
1043 int usable_fds;
1044 int already_open;
1045
1046 /*----------
1047 * We want to set max_safe_fds to
1048 * MIN(usable_fds, max_files_per_process)
1049 * less the slop factor for files that are opened without consulting
1050 * fd.c. This ensures that we never allow more than
1051 * max_files_per_process (or the experimentally-determined EMFILE limit)
1052 * additional files to be opened.
1053 *----------
1054 */
1056 &usable_fds, &already_open);
1057
1058 max_safe_fds = Min(usable_fds, max_files_per_process);
1059
1060 /*
1061 * Take off the FDs reserved for system() etc.
1062 */
1064
1065 /*
1066 * Make sure we still have enough to get by.
1067 */
1069 ereport(FATAL,
1070 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1071 errmsg("insufficient file descriptors available to start server process"),
1072 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1075 already_open)));
1076
1077 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1078 max_safe_fds, usable_fds, already_open);
1079}
1080
1081/*
1082 * Open a file with BasicOpenFilePerm() and pass default file mode for the
1083 * fileMode parameter.
1084 */
1085int
1086BasicOpenFile(const char *fileName, int fileFlags)
1087{
1088 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1089}
1090
1091/*
1092 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1093 *
1094 * This is exported for use by places that really want a plain kernel FD,
1095 * but need to be proof against running out of FDs. Once an FD has been
1096 * successfully returned, it is the caller's responsibility to ensure that
1097 * it will not be leaked on ereport()! Most users should *not* call this
1098 * routine directly, but instead use the VFD abstraction level, which
1099 * provides protection against descriptor leaks as well as management of
1100 * files that need to be open for more than a short period of time.
1101 *
1102 * Ideally this should be the *only* direct call of open() in the backend.
1103 * In practice, the postmaster calls open() directly, and there are some
1104 * direct open() calls done early in backend startup. Those are OK since
1105 * this module wouldn't have any open files to close at that point anyway.
1106 */
1107int
1108BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1109{
1110 int fd;
1111
 /* Control comes back here after a VFD was closed in response to EMFILE/ENFILE. */
1112tryAgain:
1113#ifdef PG_O_DIRECT_USE_F_NOCACHE
1114
1115 /*
1116 * The value we defined to stand in for O_DIRECT when simulating it with
1117 * F_NOCACHE had better not collide with any of the standard flags.
1118 */
1120 (O_APPEND |
1121 O_CLOEXEC |
1122 O_CREAT |
1123 O_DSYNC |
1124 O_EXCL |
1125 O_RDWR |
1126 O_RDONLY |
1127 O_SYNC |
1128 O_TRUNC |
1129 O_WRONLY)) == 0,
1130 "PG_O_DIRECT value collides with standard flag");
 /* The stand-in bit is masked out before open(); it is applied via fcntl below. */
1131 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1132#else
1133 fd = open(fileName, fileFlags, fileMode);
1134#endif
1135
1136 if (fd >= 0)
1137 {
1138#ifdef PG_O_DIRECT_USE_F_NOCACHE
1139 if (fileFlags & PG_O_DIRECT)
1140 {
1141 if (fcntl(fd, F_NOCACHE, 1) < 0)
1142 {
 /* Report fcntl's errno to the caller, not whatever close() leaves behind. */
1143 int save_errno = errno;
1144
1145 close(fd);
1146 errno = save_errno;
1147 return -1;
1148 }
1149 }
1150#endif
1151
1152 return fd; /* success! */
1153 }
1154
1155 if (errno == EMFILE || errno == ENFILE)
1156 {
 /* Keep open()'s errno; it is restored below if no VFD could be released. */
1157 int save_errno = errno;
1158
1159 ereport(LOG,
1160 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1161 errmsg("out of file descriptors: %m; release and retry")));
1162 errno = 0;
1163 if (ReleaseLruFile())
1164 goto tryAgain;
1165 errno = save_errno;
1166 }
1167
1168 return -1; /* failure */
1169}
1170
1171/*
1172 * AcquireExternalFD - attempt to reserve an external file descriptor
1173 *
1174 * This should be used by callers that need to hold a file descriptor open
1175 * over more than a short interval, but cannot use any of the other facilities
1176 * provided by this module.
1177 *
1178 * The difference between this and the underlying ReserveExternalFD function
1179 * is that this will report failure (by setting errno and returning false)
1180 * if "too many" external FDs are already reserved. This should be used in
1181 * any code where the total number of FDs to be reserved is not predictable
1182 * and small.
1183 */
1184bool
1186{
1187 /*
1188 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1189 * "external" FDs.
1190 */
1191 if (numExternalFDs < max_safe_fds / 3)
1192 {
1194 return true;
1195 }
 /* Cap reached: report failure through errno (EMFILE) and a false result. */
1196 errno = EMFILE;
1197 return false;
1198}
1199
1200/*
1201 * ReserveExternalFD - report external consumption of a file descriptor
1202 *
1203 * This should be used by callers that need to hold a file descriptor open
1204 * over more than a short interval, but cannot use any of the other facilities
1205 * provided by this module. This just tracks the use of the FD and closes
1206 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1207 *
1208 * Call this directly only in code where failure to reserve the FD would be
1209 * fatal; for example, the WAL-writing code does so, since the alternative is
1210 * session failure. Also, it's very unwise to do so in code that could
1211 * consume more than one FD per process.
1212 *
1213 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1214 * available, it doesn't matter too much whether this is called before or
1215 * after actually opening the FD; but doing so beforehand reduces the risk of
1216 * an EMFILE failure if not everybody played nice. In any case, it's solely
1217 * caller's responsibility to keep the external-FD count in sync with reality.
1218 */
1219void
1221{
1222 /*
1223 * Release VFDs if needed to stay safe. Because we do this before
1224 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1225 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1226 */
 /*
  * NOTE(review): the statements the comment above describes are not present
  * in this listing (its embedded numbering skips 1227 and 1229) -- confirm
  * against the real source.
  */
1228
1230}
1231
1232/*
1233 * ReleaseExternalFD - report release of an external file descriptor
1234 *
1235 * This is guaranteed not to change errno, so it can be used in failure paths.
1236 */
1237void
1239{
 /*
  * NOTE(review): the body is not present in this listing (embedded numbering
  * skips 1240-1241); presumably it undoes the count taken at reservation --
  * confirm against the real source.
  */
1242}
1243
1244
1245#if defined(FDDEBUG)
1246
1247static void
1248_dump_lru(void)
1249{
1250 int mru = VfdCache[0].lruLessRecently;
1251 Vfd *vfdP = &VfdCache[mru];
1252 char buf[2048];
1253
1254 snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1255 while (mru != 0)
1256 {
1257 mru = vfdP->lruLessRecently;
1258 vfdP = &VfdCache[mru];
1259 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1260 }
1261 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1262 elog(LOG, "%s", buf);
1263}
1264#endif /* FDDEBUG */
1265
/*
 * Delete (name taken from the debug trace below): operates on the VFD slot
 * "file", which must not be the anchor slot 0.
 *
 * NOTE(review): presumably this unlinks the slot from the LRU ring, but the
 * link updates are not present in this listing (embedded numbering skips
 * 1279-1280) -- confirm against the real source.
 */
1266static void
1268{
1269 Vfd *vfdP;
1270
1271 Assert(file != 0);
1272
1273 DO_DB(elog(LOG, "Delete %d (%s)",
1274 file, VfdCache[file].fileName));
1275 DO_DB(_dump_lru());
1276
1277 vfdP = &VfdCache[file];
1278
1281
1282 DO_DB(_dump_lru());
1283}
1284
/*
 * LruDelete (name taken from the debug trace below): close the kernel FD
 * held by VFD slot "file", mark the slot VFD_CLOSED, decrement nfile, and
 * remove the slot from the LRU ring via Delete(). "file" must not be 0.
 */
1285static void
1287{
1288 Vfd *vfdP;
1289
1290 Assert(file != 0);
1291
1292 DO_DB(elog(LOG, "LruDelete %d (%s)",
1293 file, VfdCache[file].fileName));
1294
1295 vfdP = &VfdCache[file];
1296
 /* Tell the AIO layer about the descriptor before it is closed. */
1297 pgaio_closing_fd(vfdP->fd);
1298
1299 /*
1300 * Close the file. We aren't expecting this to fail; if it does, better
1301 * to leak the FD than to mess up our internal state.
1302 */
1303 if (close(vfdP->fd) != 0)
1305 "could not close file \"%s\": %m", vfdP->fileName);
1306 vfdP->fd = VFD_CLOSED;
1307 --nfile;
1308
1309 /* delete the vfd record from the LRU ring */
1310 Delete(file);
1311}
1312
/*
 * Insert (name taken from the debug trace below): link VFD slot "file" into
 * the LRU ring next to the anchor slot 0, so that it becomes the entry
 * reachable from VfdCache[0].lruLessRecently. "file" must not be 0.
 *
 * NOTE(review): two of the four link updates are not present in this listing
 * (embedded numbering skips 1327 and 1329).
 */
1313static void
1315{
1316 Vfd *vfdP;
1317
1318 Assert(file != 0);
1319
1320 DO_DB(elog(LOG, "Insert %d (%s)",
1321 file, VfdCache[file].fileName));
1322 DO_DB(_dump_lru());
1323
1324 vfdP = &VfdCache[file];
1325
1326 vfdP->lruMoreRecently = 0;
1328 VfdCache[0].lruLessRecently = file;
1330
1331 DO_DB(_dump_lru());
1332}
1333
1334/* returns 0 on success, -1 on re-open failure (with errno set) */
1335static int
1337{
1338 Vfd *vfdP;
1339
1340 Assert(file != 0);
1341
1342 DO_DB(elog(LOG, "LruInsert %d (%s)",
1343 file, VfdCache[file].fileName));
1344
1345 vfdP = &VfdCache[file];
1346
 /* Only a slot without a kernel FD needs to be re-opened first. */
1347 if (FileIsNotOpen(file))
1348 {
1349 /* Close excess kernel FDs. */
1351
1352 /*
1353 * The open could still fail for lack of file descriptors, eg due to
1354 * overall system file table being full. So, be prepared to release
1355 * another FD if necessary...
1356 */
 /* Re-open with the name, flags and mode remembered in the slot. */
1357 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1358 vfdP->fileMode);
1359 if (vfdP->fd < 0)
1360 {
1361 DO_DB(elog(LOG, "re-open failed: %m"));
1362 return -1;
1363 }
1364 else
1365 {
 /* One more kernel FD is now held by the VFD layer. */
1366 ++nfile;
1367 }
1368 }
1369
1370 /*
1371 * put it at the head of the Lru ring
1372 */
1373
1374 Insert(file);
1375
1376 return 0;
1377}
1378
1379/*
1380 * Release one kernel FD by closing the least-recently-used VFD.
1381 */
1382static bool
1384{
1385 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1386
1387 if (nfile > 0)
1388 {
1389 /*
1390 * There are opened files and so there should be at least one used vfd
1391 * in the ring.
1392 */
1393 Assert(VfdCache[0].lruMoreRecently != 0);
 /* The victim is the slot linked from the anchor's lruMoreRecently field. */
1394 LruDelete(VfdCache[0].lruMoreRecently);
1395 return true; /* freed a file */
1396 }
1397 return false; /* no files available to free */
1398}
1399
1400/*
1401 * Release kernel FDs as needed to get under the max_safe_fds limit.
1402 * After calling this, it's OK to try to open another file.
1403 */
1404static void
1406{
 /*
  * NOTE(review): the loop header is not present in this listing (embedded
  * numbering skips 1407). The visible body stops as soon as ReleaseLruFile()
  * reports that nothing could be closed.
  */
1408 {
1409 if (!ReleaseLruFile())
1410 break;
1411 }
1412}
1413
/*
 * AllocateVfd (name taken from the debug trace below): return the index of a
 * free VFD slot, taken from the free list headed by VfdCache[0].nextFree.
 * When that list is empty the array is grown with realloc() first; an
 * allocation failure raises ERROR and leaves VfdCache untouched.
 */
1414static File
1416{
1417 Index i;
1418 File file;
1419
1420 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1421
1422 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1423
 /* A nextFree of 0 in the anchor slot means the free list is empty. */
1424 if (VfdCache[0].nextFree == 0)
1425 {
1426 /*
1427 * The free list is empty so it is time to increase the size of the
1428 * array. We choose to double it each time this happens. However,
1429 * there's not much point in starting *real* small.
1430 */
1431 Size newCacheSize = SizeVfdCache * 2;
1432 Vfd *newVfdCache;
1433
1434 if (newCacheSize < 32)
1435 newCacheSize = 32;
1436
1437 /*
1438 * Be careful not to clobber VfdCache ptr if realloc fails.
1439 */
1440 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1441 if (newVfdCache == NULL)
1442 ereport(ERROR,
1443 (errcode(ERRCODE_OUT_OF_MEMORY),
1444 errmsg("out of memory")));
1445 VfdCache = newVfdCache;
1446
1447 /*
1448 * Initialize the new entries and link them into the free list.
1449 */
1450 for (i = SizeVfdCache; i < newCacheSize; i++)
1451 {
1452 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1453 VfdCache[i].nextFree = i + 1;
1455 }
 /* Terminate the chain: the last new slot points back at 0 (end of list). */
1456 VfdCache[newCacheSize - 1].nextFree = 0;
1458
1459 /*
1460 * Record the new size
1461 */
1462 SizeVfdCache = newCacheSize;
1463 }
1464
 /* Hand out the slot at the head of the free list. */
1465 file = VfdCache[0].nextFree;
1466
1468
1469 return file;
1470}
1471
/*
 * FreeVfd (name taken from the debug trace below): release VFD slot "file".
 * Frees the malloc'd file name if one is set, clears fdstate, and pushes the
 * slot onto the front of the free list headed by VfdCache[0].nextFree.
 */
1472static void
1474{
1475 Vfd *vfdP = &VfdCache[file];
1476
1477 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1478 file, vfdP->fileName ? vfdP->fileName : ""));
1479
1480 if (vfdP->fileName != NULL)
1481 {
1482 free(vfdP->fileName);
 /* Reset the pointer so the freed name cannot be reused or freed twice. */
1483 vfdP->fileName = NULL;
1484 }
1485 vfdP->fdstate = 0x0;
1486
 /* Push onto the front of the free list. */
1487 vfdP->nextFree = VfdCache[0].nextFree;
1488 VfdCache[0].nextFree = file;
1489}
1490
1491/* returns 0 on success, -1 on re-open failure (with errno set) */
1492static int
1494{
1495 int returnValue;
1496
1497 DO_DB(elog(LOG, "FileAccess %d (%s)",
1498 file, VfdCache[file].fileName));
1499
1500 /*
1501 * Is the file open? If not, open it and put it at the head of the LRU
1502 * ring (possibly closing the least recently used file to get an FD).
1503 */
1504
1505 if (FileIsNotOpen(file))
1506 {
1507 returnValue = LruInsert(file);
 /* Pass a re-open failure straight back to the caller. */
1508 if (returnValue != 0)
1509 return returnValue;
1510 }
 /* Already at the head of the ring?  Then there is nothing to move. */
1511 else if (VfdCache[0].lruLessRecently != file)
1512 {
1513 /*
1514 * We now know that the file is open and that it is not the last one
1515 * accessed, so we need to move it to the head of the Lru ring.
1516 */
1517
1518 Delete(file);
1519 Insert(file);
1520 }
1521
1522 return 0;
1523}
1524
1525/*
1526 * Called whenever a temporary file is deleted to report its size.
1527 */
1528static void
1529ReportTemporaryFileUsage(const char *path, off_t size)
1530{
1532
 /*
  * A negative log_temp_files disables logging. Otherwise the file is logged
  * when its size in kilobytes (size / 1024) reaches the setting.
  */
1533 if (log_temp_files >= 0)
1534 {
1535 if ((size / 1024) >= log_temp_files)
1536 ereport(LOG,
1537 (errmsg("temporary file: path \"%s\", size %lu",
1538 path, (unsigned long) size)));
1539 }
1540}
1541
1542/*
1543 * Called to register a temporary file for automatic close.
1544 * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1545 * before the file was opened.
1546 */
1547static void
1549{
 /*
  * NOTE(review): the statements of this function are not present in this
  * listing (embedded numbering skips 1550-1551 and 1554-1555); only the
  * original comment below survives.
  */
1552
1553 /* Backup mechanism for closing at end of xact. */
1556}
1557
1558/*
1559 * Called when we get a shared invalidation message on some relation.
1560 */
1561#ifdef NOT_USED
1562void
1563FileInvalidate(File file)
1564{
1565 Assert(FileIsValid(file));
1566 if (!FileIsNotOpen(file))
1567 LruDelete(file);
1568}
1569#endif
1570
1571/*
1572 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1573 * fileMode parameter.
1574 */
1575File
1576PathNameOpenFile(const char *fileName, int fileFlags)
1577{
1578 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1579}
1580
1581/*
1582 * open a file in an arbitrary directory
1583 *
1584 * NB: if the passed pathname is relative (which it usually is),
1585 * it will be interpreted relative to the process' working directory
1586 * (which should always be $PGDATA when this code is running).
1587 */
1588File
1589PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1590{
1591 char *fnamecopy;
1592 File file;
1593 Vfd *vfdP;
1594
1595 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1596 fileName, fileFlags, fileMode));
1597
1598 /*
1599 * We need a malloc'd copy of the file name; fail cleanly if no room.
1600 */
1601 fnamecopy = strdup(fileName);
1602 if (fnamecopy == NULL)
1603 ereport(ERROR,
1604 (errcode(ERRCODE_OUT_OF_MEMORY),
1605 errmsg("out of memory")));
1606
1607 file = AllocateVfd();
1608 vfdP = &VfdCache[file];
1609
1610 /* Close excess kernel FDs. */
1612
1613 /*
1614 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1615 * client shouldn't be expected to know which kernel descriptors are
1616 * currently open, so it wouldn't make sense for them to be inherited by
1617 * executed subprograms.
1618 */
1619 fileFlags |= O_CLOEXEC;
1620
1621 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1622
1623 if (vfdP->fd < 0)
1624 {
 /* Give back the slot and the name copy, but report the open's errno. */
1625 int save_errno = errno;
1626
1627 FreeVfd(file);
1628 free(fnamecopy);
1629 errno = save_errno;
1630 return -1;
1631 }
1632 ++nfile;
1633 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1634 vfdP->fd));
1635
 /* The slot takes ownership of the name copy; FreeVfd() frees it later. */
1636 vfdP->fileName = fnamecopy;
1637 /* Saved flags are adjusted to be OK for re-opening file */
1638 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1639 vfdP->fileMode = fileMode;
1640 vfdP->fileSize = 0;
1641 vfdP->fdstate = 0x0;
1642 vfdP->resowner = NULL;
1643
 /* The file is open now, so enter it into the LRU ring. */
1644 Insert(file);
1645
1646 return file;
1647}
1648
1649/*
1650 * Create directory 'directory'. If necessary, create 'basedir', which must
1651 * be the directory above it. This is designed for creating the top-level
1652 * temporary directory on demand before creating a directory underneath it.
1653 * Do nothing if the directory already exists.
1654 *
1655 * Directories created within the top-level temporary directory should begin
1656 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1657 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1658 * that do not need any particular prefix.
1659*/
1660void
1662{
1663 if (MakePGDirectory(directory) < 0)
1664 {
 /* Already present: nothing to do. */
1665 if (errno == EEXIST)
1666 return;
1667
1668 /*
1669 * Failed. Try to create basedir first in case it's missing. Tolerate
1670 * EEXIST to close a race against another process following the same
1671 * algorithm.
1672 */
1673 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1674 ereport(ERROR,
1676 errmsg("cannot create temporary directory \"%s\": %m",
1677 basedir)));
1678
1679 /* Try again. */
 /* EEXIST is tolerated on the retry as well. */
1680 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1681 ereport(ERROR,
1683 errmsg("cannot create temporary subdirectory \"%s\": %m",
1684 directory)));
1685 }
1686}
1687
1688/*
1689 * Delete a directory and everything in it, if it exists.
1690 */
1691void
1692PathNameDeleteTemporaryDir(const char *dirname)
1693{
1694 struct stat statbuf;
1695
1696 /* Silently ignore missing directory. */
1697 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1698 return;
1699
1700 /*
1701 * Currently, walkdir doesn't offer a way for our passed in function to
1702 * maintain state. Perhaps it should, so that we could tell the caller
1703 * whether this operation succeeded or failed. Since this operation is
1704 * used in a cleanup path, we wouldn't actually behave differently: we'll
1705 * just log failures.
1706 */
1707 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1708}
1709
1710/*
1711 * Open a temporary file that will disappear when we close it.
1712 *
1713 * This routine takes care of generating an appropriate tempfile name.
1714 * There's no need to pass in fileFlags or fileMode either, since only
1715 * one setting makes any sense for a temp file.
1716 *
1717 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1718 * to ensure it's closed and deleted when it's no longer needed, typically at
1719 * the end-of-transaction. In most cases, you don't want temporary files to
1720 * outlive the transaction that created them, so this should be false -- but
1721 * if you need "somewhat" temporary storage, this might be useful. In either
1722 * case, the file is removed when the File is explicitly closed.
1723 */
1724File
1725OpenTemporaryFile(bool interXact)
1726{
 /* Starts at 0 so the default-tablespace fallback below runs if nothing was opened. */
1727 File file = 0;
1728
1729 Assert(temporary_files_allowed); /* check temp file access is up */
1730
1731 /*
1732 * Make sure the current resource owner has space for this File before we
1733 * open it, if we'll be registering it below.
1734 */
1735 if (!interXact)
1737
1738 /*
1739 * If some temp tablespace(s) have been given to us, try to use the next
1740 * one. If a given tablespace can't be found, we silently fall back to
1741 * the database's default tablespace.
1742 *
1743 * BUT: if the temp file is slated to outlive the current transaction,
1744 * force it into the database's default tablespace, so that it will not
1745 * pose a threat to possible tablespace drop attempts.
1746 */
1747 if (numTempTableSpaces > 0 && !interXact)
1748 {
1749 Oid tblspcOid = GetNextTempTableSpace();
1750
 /* rejectError = false: a failure here falls through to the fallback. */
1751 if (OidIsValid(tblspcOid))
1752 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1753 }
1754
1755 /*
1756 * If not, or if tablespace is bad, create in database's default
1757 * tablespace. MyDatabaseTableSpace should normally be set before we get
1758 * here, but just in case it isn't, fall back to pg_default tablespace.
1759 */
1760 if (file <= 0)
1763 DEFAULTTABLESPACE_OID,
1764 true);
1765
1766 /* Mark it for deletion at close and temporary file size limit */
1768
1769 /* Register it with the current resource owner */
1770 if (!interXact)
1772
1773 return file;
1774}
1775
1776/*
1777 * Return the path of the temp directory in a given tablespace.
1778 */
1779void
1781{
1782 /*
1783 * Identify the tempfile directory for this tablespace.
1784 *
1785 * If someone tries to specify pg_global, use pg_default instead.
1786 */
 /* Both branches write at most MAXPGPATH bytes into path. */
1787 if (tablespace == InvalidOid ||
1788 tablespace == DEFAULTTABLESPACE_OID ||
1789 tablespace == GLOBALTABLESPACE_OID)
1790 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1791 else
1792 {
1793 /* All other tablespaces are accessed via symlinks */
1794 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1797 }
1798}
1799
1800/*
1801 * Open a temporary file in a specific tablespace.
1802 * Subroutine for OpenTemporaryFile, which see for details.
1803 */
1804static File
1805OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1806{
1807 char tempdirpath[MAXPGPATH];
1808 char tempfilepath[MAXPGPATH];
1809 File file;
1810
1811 TempTablespacePath(tempdirpath, tblspcOid);
1812
1813 /*
1814 * Generate a tempfile name that should be unique within the current
1815 * database instance.
1816 */
1817 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1819
1820 /*
1821 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1822 * temp file that can be reused.
1823 */
1824 file = PathNameOpenFile(tempfilepath,
1825 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1826 if (file <= 0)
1827 {
1828 /*
1829 * We might need to create the tablespace's tempfile directory, if no
1830 * one has yet done so.
1831 *
1832 * Don't check for an error from MakePGDirectory; it could fail if
1833 * someone else just did the same thing. If it doesn't work then
1834 * we'll bomb out on the second create attempt, instead.
1835 */
1836 (void) MakePGDirectory(tempdirpath);
1837
1838 file = PathNameOpenFile(tempfilepath,
1839 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
 /*
  * With rejectError set, a second failure raises ERROR; otherwise the
  * non-positive File is returned for the caller to handle.
  */
1840 if (file <= 0 && rejectError)
1841 elog(ERROR, "could not create temporary file \"%s\": %m",
1842 tempfilepath);
1843 }
1844
1845 return file;
1846}
1847
1848
1849/*
1850 * Create a new file. The directory containing it must already exist. Files
1851 * created this way are subject to temp_file_limit and are automatically
1852 * closed at end of transaction, but are not automatically deleted on close
1853 * because they are intended to be shared between cooperating backends.
1854 *
1855 * If the file is inside the top-level temporary directory, its name should
1856 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1857 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1858 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1859 * the prefix isn't needed.
1860 */
1861File
1862PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1863{
1864 File file;
1865
1866 Assert(temporary_files_allowed); /* check temp file access is up */
1867
1869
1870 /*
1871 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1872 * temp file that can be reused.
1873 */
1874 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1875 if (file <= 0)
1876 {
 /* Either raise ERROR or hand the non-positive File back, as the caller asked. */
1877 if (error_on_failure)
1878 ereport(ERROR,
1880 errmsg("could not create temporary file \"%s\": %m",
1881 path)));
1882 else
1883 return file;
1884 }
1885
1886 /* Mark it for temp_file_limit accounting. */
1888
1889 /* Register it for automatic close. */
1891
1892 return file;
1893}
1894
1895/*
1896 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1897 * another backend. Files opened this way don't count against the
1898 * temp_file_limit of the caller, are automatically closed at the end of the
1899 * transaction but are not deleted on close.
1900 */
1901File
1902PathNameOpenTemporaryFile(const char *path, int mode)
1903{
1904 File file;
1905
1906 Assert(temporary_files_allowed); /* check temp file access is up */
1907
1909
 /* "mode" is passed as the open flags, with PG_BINARY added. */
1910 file = PathNameOpenFile(path, mode | PG_BINARY);
1911
1912 /* If no such file, then we don't raise an error. */
1913 if (file <= 0 && errno != ENOENT)
1914 ereport(ERROR,
1916 errmsg("could not open temporary file \"%s\": %m",
1917 path)));
1918
1919 if (file > 0)
1920 {
1921 /* Register it for automatic close. */
1923 }
1924
 /* A missing file (ENOENT) is returned as the non-positive File. */
1925 return file;
1926}
1927
1928/*
1929 * Delete a file by pathname. Return true if the file existed, false if
1930 * it didn't.
1931 */
1932bool
1933PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1934{
1935 struct stat filestats;
1936 int stat_errno;
1937
1938 /* Get the final size for pgstat reporting. */
 /* stat's errno is captured now, before unlink() can overwrite it. */
1939 if (stat(path, &filestats) != 0)
1940 stat_errno = errno;
1941 else
1942 stat_errno = 0;
1943
1944 /*
1945 * Unlike FileClose's automatic file deletion code, we tolerate
1946 * non-existence to support BufFileDeleteFileSet which doesn't know how
1947 * many segments it has to delete until it runs out.
1948 */
1949 if (stat_errno == ENOENT)
1950 return false;
1951
1952 if (unlink(path) < 0)
1953 {
 /* ENOENT here is silent; other errors are ERROR or LOG per error_on_failure. */
1954 if (errno != ENOENT)
1955 ereport(error_on_failure ? ERROR : LOG,
1957 errmsg("could not unlink temporary file \"%s\": %m",
1958 path)));
1959 return false;
1960 }
1961
 /* The file is gone; report its size, or log why stat() could not supply it. */
1962 if (stat_errno == 0)
1963 ReportTemporaryFileUsage(path, filestats.st_size);
1964 else
1965 {
1966 errno = stat_errno;
1967 ereport(LOG,
1969 errmsg("could not stat file \"%s\": %m", path)));
1970 }
1971
1972 return true;
1973}
1974
1975/*
1976 * close a file when done with it
1977 */
1978void
1980{
1981 Vfd *vfdP;
1982
1983 Assert(FileIsValid(file));
1984
1985 DO_DB(elog(LOG, "FileClose: %d (%s)",
1986 file, VfdCache[file].fileName));
1987
1988 vfdP = &VfdCache[file];
1989
 /* Only a VFD that currently holds a kernel FD needs close() and ring removal. */
1990 if (!FileIsNotOpen(file))
1991 {
1992 pgaio_closing_fd(vfdP->fd);
1993
1994 /* close the file */
1995 if (close(vfdP->fd) != 0)
1996 {
1997 /*
1998 * We may need to panic on failure to close non-temporary files;
1999 * see LruDelete.
2000 */
2002 "could not close file \"%s\": %m", vfdP->fileName);
2003 }
2004
2005 --nfile;
2006 vfdP->fd = VFD_CLOSED;
2007
2008 /* remove the file from the lru ring */
2009 Delete(file);
2010 }
2011
2012 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2013 {
2014 /* Subtract its size from current usage (do first in case of error) */
2016 vfdP->fileSize = 0;
2017 }
2018
2019 /*
2020 * Delete the file if it was temporary, and make a log entry if wanted
2021 */
2022 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2023 {
2024 struct stat filestats;
2025 int stat_errno;
2026
2027 /*
2028 * If we get an error, as could happen within the ereport/elog calls,
2029 * we'll come right back here during transaction abort. Reset the
2030 * flag to ensure that we can't get into an infinite loop. This code
2031 * is arranged to ensure that the worst-case consequence is failing to
2032 * emit log message(s), not failing to attempt the unlink.
2033 */
2034 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2035
2036
2037 /* first try the stat() */
 /* Keep stat's errno; the unlink() and ereport calls below may overwrite it. */
2038 if (stat(vfdP->fileName, &filestats))
2039 stat_errno = errno;
2040 else
2041 stat_errno = 0;
2042
2043 /* in any case do the unlink */
2044 if (unlink(vfdP->fileName))
2045 ereport(LOG,
2047 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2048
2049 /* and last report the stat results */
2050 if (stat_errno == 0)
2051 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2052 else
2053 {
 /* Restore stat's errno so that %m describes the stat() failure. */
2054 errno = stat_errno;
2055 ereport(LOG,
2057 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2058 }
2059 }
2060
2061 /* Unregister it from the resource owner */
2062 if (vfdP->resowner)
2063 ResourceOwnerForgetFile(vfdP->resowner, file);
2064
2065 /*
2066 * Return the Vfd slot to the free list
2067 */
2068 FreeVfd(file);
2069}
2070
2071/*
2072 * FilePrefetch - initiate asynchronous read of a given range of the file.
2073 *
2074 * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2075 *
2076 * posix_fadvise() is the simplest standardized interface that accomplishes
2077 * this.
2078 */
2079int
2080FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2081{
2082 Assert(FileIsValid(file));
2083
2084 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2085 file, VfdCache[file].fileName,
2086 (int64) offset, (int64) amount));
2087
2088#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2089 {
2090 int returnCode;
2091
 /*
  * NOTE(review): on failure this returns FileAccess's negative result,
  * not an errno-style code as the function header describes.
  */
2092 returnCode = FileAccess(file);
2093 if (returnCode < 0)
2094 return returnCode;
2095
2096retry:
2097 pgstat_report_wait_start(wait_event_info);
2098 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2099 POSIX_FADV_WILLNEED);
2101
 /* The error code is the return value itself, so EINTR is tested there. */
2102 if (returnCode == EINTR)
2103 goto retry;
2104
2105 return returnCode;
2106 }
2107#elif defined(__darwin__)
2108 {
2109 struct radvisory
2110 {
2111 off_t ra_offset; /* offset into the file */
2112 int ra_count; /* size of the read */
2113 } ra;
2114 int returnCode;
2115
2116 returnCode = FileAccess(file);
2117 if (returnCode < 0)
2118 return returnCode;
2119
2120 ra.ra_offset = offset;
 /* ra_count is an int, so the off_t amount is narrowed by this assignment. */
2121 ra.ra_count = amount;
2122 pgstat_report_wait_start(wait_event_info);
2123 returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
 /* fcntl signals failure with -1; convert that to an errno-style result. */
2125 if (returnCode != -1)
2126 return 0;
2127 else
2128 return errno;
2129 }
2130#else
 /* No prefetch facility was compiled in: report success without doing anything. */
2131 return 0;
2132#endif
2133}
2134
/*
 * FileWriteback - pass the range [offset, offset + nbytes) of the file to
 * pg_flush_data().
 *
 * Returns without doing anything when nbytes <= 0, when the file's saved
 * flags include PG_O_DIRECT, or when FileAccess() fails. No status is
 * reported to the caller.
 */
2135void
2136FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2137{
2138 int returnCode;
2139
2140 Assert(FileIsValid(file));
2141
2142 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2143 file, VfdCache[file].fileName,
2144 (int64) offset, (int64) nbytes));
2145
2146 if (nbytes <= 0)
2147 return;
2148
2149 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2150 return;
2151
 /* A failure to (re)open the file is silently ignored here. */
2152 returnCode = FileAccess(file);
2153 if (returnCode < 0)
2154 return;
2155
2156 pgstat_report_wait_start(wait_event_info);
2157 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2159}
2160
/*
 * FileReadV - vectored read from "file" at "offset" using pg_preadv().
 *
 * Returns pg_preadv()'s result, or FileAccess()'s negative result if the
 * file could not be made open. A failure with errno == EINTR is retried.
 */
2161ssize_t
2162FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2163 uint32 wait_event_info)
2164{
2165 ssize_t returnCode;
2166 Vfd *vfdP;
2167
2168 Assert(FileIsValid(file));
2169
2170 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2171 file, VfdCache[file].fileName,
2172 (int64) offset,
2173 iovcnt));
2174
2175 returnCode = FileAccess(file);
2176 if (returnCode < 0)
2177 return returnCode;
2178
2179 vfdP = &VfdCache[file];
2180
2181retry:
2182 pgstat_report_wait_start(wait_event_info);
2183 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2185
2186 if (returnCode < 0)
2187 {
2188 /*
2189 * Windows may run out of kernel buffers and return "Insufficient
2190 * system resources" error. Wait a bit and retry to solve it.
2191 *
2192 * It is rumored that EINTR is also possible on some Unix filesystems,
2193 * in which case immediate retry is indicated.
2194 */
2195#ifdef WIN32
2196 DWORD error = GetLastError();
2197
2198 switch (error)
2199 {
2200 case ERROR_NO_SYSTEM_RESOURCES:
 /* Pause briefly, then reuse the EINTR retry path below. */
2201 pg_usleep(1000L);
2202 errno = EINTR;
2203 break;
2204 default:
2206 break;
2207 }
2208#endif
2209 /* OK to retry if interrupted */
2210 if (errno == EINTR)
2211 goto retry;
2212 }
2213
2214 return returnCode;
2215}
2216
/*
 * FileStartReadV (name taken from the debug trace below): make sure the VFD
 * has a kernel FD, then pass that FD with iovcnt and offset to
 * pgaio_io_start_readv().
 *
 * Returns 0 once the call has been made, or FileAccess()'s negative result.
 */
2217int
2219 int iovcnt, off_t offset,
2220 uint32 wait_event_info)
2221{
2222 int returnCode;
2223 Vfd *vfdP;
2224
2225 Assert(FileIsValid(file));
2226
2227 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2228 file, VfdCache[file].fileName,
2229 (int64) offset,
2230 iovcnt));
2231
2232 returnCode = FileAccess(file);
2233 if (returnCode < 0)
2234 return returnCode;
2235
2236 vfdP = &VfdCache[file];
2237
2238 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2239
2240 return 0;
2241}
2242
/*
 * FileWriteV - vectored write to "file" at "offset" using pg_pwritev().
 *
 * Returns pg_pwritev()'s result, or FileAccess()'s negative result if the
 * file could not be made open. For files flagged FD_TEMP_FILE_LIMIT, a write
 * that would exceed temp_file_limit raises ERROR before any data is written,
 * and fileSize / temporary_files_size are updated after a successful write.
 * A failure with errno == EINTR is retried. On any non-negative result
 * errno is set to ENOSPC (see the comment in the body).
 */
2243ssize_t
2244FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2245 uint32 wait_event_info)
2246{
2247 ssize_t returnCode;
2248 Vfd *vfdP;
2249
2250 Assert(FileIsValid(file));
2251
2252 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2253 file, VfdCache[file].fileName,
2254 (int64) offset,
2255 iovcnt));
2256
2257 returnCode = FileAccess(file);
2258 if (returnCode < 0)
2259 return returnCode;
2260
2261 vfdP = &VfdCache[file];
2262
2263 /*
2264 * If enforcing temp_file_limit and it's a temp file, check to see if the
2265 * write would overrun temp_file_limit, and throw error if so. Note: it's
2266 * really a modularity violation to throw error here; we should set errno
2267 * and return -1. However, there's no way to report a suitable error
2268 * message if we do that. All current callers would just throw error
2269 * immediately anyway, so this is safe at present.
2270 */
2271 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2272 {
 /* Offset just past the last byte this request would write. */
2273 off_t past_write = offset;
2274
2275 for (int i = 0; i < iovcnt; ++i)
2276 past_write += iov[i].iov_len;
2277
 /* Only growth beyond the recorded file size counts against the limit. */
2278 if (past_write > vfdP->fileSize)
2279 {
2280 uint64 newTotal = temporary_files_size;
2281
2282 newTotal += past_write - vfdP->fileSize;
 /* temp_file_limit is multiplied by 1024 to compare in bytes. */
2283 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2284 ereport(ERROR,
2285 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2286 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2287 temp_file_limit)));
2288 }
2289 }
2290
2291retry:
2292 pgstat_report_wait_start(wait_event_info);
2293 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2295
2296 if (returnCode >= 0)
2297 {
2298 /*
2299 * Some callers expect short writes to set errno, and traditionally we
2300 * have assumed that they imply disk space shortage. We don't want to
2301 * waste CPU cycles adding up the total size here, so we'll just set
2302 * it for all successful writes in case such a caller determines that
2303 * the write was short and ereports "%m".
2304 */
2305 errno = ENOSPC;
2306
2307 /*
2308 * Maintain fileSize and temporary_files_size if it's a temp file.
2309 */
2310 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2311 {
 /* Based on the bytes actually written, not the bytes requested. */
2312 off_t past_write = offset + returnCode;
2313
2314 if (past_write > vfdP->fileSize)
2315 {
2316 temporary_files_size += past_write - vfdP->fileSize;
2317 vfdP->fileSize = past_write;
2318 }
2319 }
2320 }
2321 else
2322 {
2323 /*
2324 * See comments in FileReadV()
2325 */
2326#ifdef WIN32
2327 DWORD error = GetLastError();
2328
2329 switch (error)
2330 {
2331 case ERROR_NO_SYSTEM_RESOURCES:
2332 pg_usleep(1000L);
2333 errno = EINTR;
2334 break;
2335 default:
2337 break;
2338 }
2339#endif
2340 /* OK to retry if interrupted */
2341 if (errno == EINTR)
2342 goto retry;
2343 }
2344
2345 return returnCode;
2346}
2347
2348int
2349FileSync(File file, uint32 wait_event_info)
2350{
2351 int returnCode;
2352
2353 Assert(FileIsValid(file));
2354
2355 DO_DB(elog(LOG, "FileSync: %d (%s)",
2356 file, VfdCache[file].fileName));
2357
2358 returnCode = FileAccess(file);
2359 if (returnCode < 0)
2360 return returnCode;
2361
2362 pgstat_report_wait_start(wait_event_info);
2363 returnCode = pg_fsync(VfdCache[file].fd);
2365
2366 return returnCode;
2367}
2368
2369/*
2370 * Zero a region of the file.
2371 *
2372 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2373 * appropriate error.
2374 */
2375int
2376FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2377{
2378 int returnCode;
2379 ssize_t written;
2380
2381 Assert(FileIsValid(file));
2382
2383 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2384 file, VfdCache[file].fileName,
2385 (int64) offset, (int64) amount));
2386
2387 returnCode = FileAccess(file);
2388 if (returnCode < 0)
2389 return returnCode;
2390
2391 pgstat_report_wait_start(wait_event_info);
2392 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2394
2395 if (written < 0)
2396 return -1;
2397 else if (written != amount)
2398 {
2399 /* if errno is unset, assume problem is no disk space */
2400 if (errno == 0)
2401 errno = ENOSPC;
2402 return -1;
2403 }
2404
2405 return 0;
2406}
2407
2408/*
2409 * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2410 * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2411 * use FileZero() instead.
2412 *
2413 * Note that at least glibc() implements posix_fallocate() in userspace if not
2414 * implemented by the filesystem. That's not the case for all environments
2415 * though.
2416 *
2417 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2418 * appropriate error.
2419 */
2420int
2421FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2422{
2423#ifdef HAVE_POSIX_FALLOCATE
2424 int returnCode;
2425
2426 Assert(FileIsValid(file));
2427
2428 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2429 file, VfdCache[file].fileName,
2430 (int64) offset, (int64) amount));
2431
2432 returnCode = FileAccess(file);
2433 if (returnCode < 0)
2434 return -1;
2435
2436retry:
2437 pgstat_report_wait_start(wait_event_info);
2438 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2440
2441 if (returnCode == 0)
2442 return 0;
2443 else if (returnCode == EINTR)
2444 goto retry;
2445
2446 /* for compatibility with %m printing etc */
2447 errno = returnCode;
2448
2449 /*
2450 * Return in cases of a "real" failure, if fallocate is not supported,
2451 * fall through to the FileZero() backed implementation.
2452 */
2453 if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2454 return -1;
2455#endif
2456
2457 return FileZero(file, offset, amount, wait_event_info);
2458}
2459
2460off_t
2462{
2463 Assert(FileIsValid(file));
2464
2465 DO_DB(elog(LOG, "FileSize %d (%s)",
2466 file, VfdCache[file].fileName));
2467
2468 if (FileIsNotOpen(file))
2469 {
2470 if (FileAccess(file) < 0)
2471 return (off_t) -1;
2472 }
2473
2474 return lseek(VfdCache[file].fd, 0, SEEK_END);
2475}
2476
2477int
2478FileTruncate(File file, off_t offset, uint32 wait_event_info)
2479{
2480 int returnCode;
2481
2482 Assert(FileIsValid(file));
2483
2484 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2485 file, VfdCache[file].fileName));
2486
2487 returnCode = FileAccess(file);
2488 if (returnCode < 0)
2489 return returnCode;
2490
2491 pgstat_report_wait_start(wait_event_info);
2492 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2494
2495 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2496 {
2497 /* adjust our state for truncation of a temp file */
2498 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2499 temporary_files_size -= VfdCache[file].fileSize - offset;
2500 VfdCache[file].fileSize = offset;
2501 }
2502
2503 return returnCode;
2504}
2505
2506/*
2507 * Return the pathname associated with an open file.
2508 *
2509 * The returned string points to an internal buffer, which is valid until
2510 * the file is closed.
2511 */
2512char *
2514{
2515 Assert(FileIsValid(file));
2516
2517 return VfdCache[file].fileName;
2518}
2519
2520/*
2521 * Return the raw file descriptor of an opened file.
2522 *
2523 * The returned file descriptor will be valid until the file is closed, but
2524 * there are a lot of things that can make that happen. So the caller should
2525 * be careful not to do much of anything else before it finishes using the
2526 * returned file descriptor.
2527 */
2528int
2530{
2531 int returnCode;
2532
2533 returnCode = FileAccess(file);
2534 if (returnCode < 0)
2535 return returnCode;
2536
2537 Assert(FileIsValid(file));
2538 return VfdCache[file].fd;
2539}
2540
2541/*
2542 * FileGetRawFlags - returns the file flags on open(2)
2543 */
2544int
2546{
2547 Assert(FileIsValid(file));
2548 return VfdCache[file].fileFlags;
2549}
2550
2551/*
2552 * FileGetRawMode - returns the mode bitmask passed to open(2)
2553 */
2554mode_t
2556{
2557 Assert(FileIsValid(file));
2558 return VfdCache[file].fileMode;
2559}
2560
2561/*
2562 * Make room for another allocatedDescs[] array entry if needed and possible.
2563 * Returns true if an array element is available.
2564 */
2565static bool
2567{
2568 AllocateDesc *newDescs;
2569 int newMax;
2570
2571 /* Quick out if array already has a free slot. */
2573 return true;
2574
2575 /*
2576 * If the array hasn't yet been created in the current process, initialize
2577 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2578 * we will ever need, anyway. We don't want to look at max_safe_fds
2579 * immediately because set_max_safe_fds() may not have run yet.
2580 */
2581 if (allocatedDescs == NULL)
2582 {
2583 newMax = FD_MINFREE / 3;
2584 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2585 /* Out of memory already? Treat as fatal error. */
2586 if (newDescs == NULL)
2587 ereport(ERROR,
2588 (errcode(ERRCODE_OUT_OF_MEMORY),
2589 errmsg("out of memory")));
2590 allocatedDescs = newDescs;
2591 maxAllocatedDescs = newMax;
2592 return true;
2593 }
2594
2595 /*
2596 * Consider enlarging the array beyond the initial allocation used above.
2597 * By the time this happens, max_safe_fds should be known accurately.
2598 *
2599 * We mustn't let allocated descriptors hog all the available FDs, and in
2600 * practice we'd better leave a reasonable number of FDs for VFD use. So
2601 * set the maximum to max_safe_fds / 3. (This should certainly be at
2602 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2603 * tightening the restriction here.) Recall that "external" FDs are
2604 * allowed to consume another third of max_safe_fds.
2605 */
2606 newMax = max_safe_fds / 3;
2607 if (newMax > maxAllocatedDescs)
2608 {
2609 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2610 newMax * sizeof(AllocateDesc));
2611 /* Treat out-of-memory as a non-fatal error. */
2612 if (newDescs == NULL)
2613 return false;
2614 allocatedDescs = newDescs;
2615 maxAllocatedDescs = newMax;
2616 return true;
2617 }
2618
2619 /* Can't enlarge allocatedDescs[] any more. */
2620 return false;
2621}
2622
2623/*
2624 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2625 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2626 * necessary to open the file. When done, call FreeFile rather than fclose.
2627 *
2628 * Note that files that will be open for any significant length of time
2629 * should NOT be handled this way, since they cannot share kernel file
2630 * descriptors with other files; there is grave risk of running out of FDs
2631 * if anyone locks down too many FDs. Most callers of this routine are
2632 * simply reading a config file that they will read and close immediately.
2633 *
2634 * fd.c will automatically close all files opened with AllocateFile at
2635 * transaction commit or abort; this prevents FD leakage if a routine
2636 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2637 *
2638 * Ideally this should be the *only* direct call of fopen() in the backend.
2639 */
2640FILE *
2641AllocateFile(const char *name, const char *mode)
2642{
2643 FILE *file;
2644
2645 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2647
2648 /* Can we allocate another non-virtual FD? */
2649 if (!reserveAllocatedDesc())
2650 ereport(ERROR,
2651 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2652 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2654
2655 /* Close excess kernel FDs. */
2657
2658TryAgain:
2659 if ((file = fopen(name, mode)) != NULL)
2660 {
2662
2663 desc->kind = AllocateDescFile;
2664 desc->desc.file = file;
2667 return desc->desc.file;
2668 }
2669
2670 if (errno == EMFILE || errno == ENFILE)
2671 {
2672 int save_errno = errno;
2673
2674 ereport(LOG,
2675 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2676 errmsg("out of file descriptors: %m; release and retry")));
2677 errno = 0;
2678 if (ReleaseLruFile())
2679 goto TryAgain;
2680 errno = save_errno;
2681 }
2682
2683 return NULL;
2684}
2685
2686/*
2687 * Open a file with OpenTransientFilePerm() and pass default file mode for
2688 * the fileMode parameter.
2689 */
2690int
2691OpenTransientFile(const char *fileName, int fileFlags)
2692{
2693 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2694}
2695
2696/*
2697 * Like AllocateFile, but returns an unbuffered fd like open(2)
2698 */
2699int
2700OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2701{
2702 int fd;
2703
2704 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2705 numAllocatedDescs, fileName));
2706
2707 /* Can we allocate another non-virtual FD? */
2708 if (!reserveAllocatedDesc())
2709 ereport(ERROR,
2710 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2711 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2712 maxAllocatedDescs, fileName)));
2713
2714 /* Close excess kernel FDs. */
2716
2717 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2718
2719 if (fd >= 0)
2720 {
2722
2723 desc->kind = AllocateDescRawFD;
2724 desc->desc.fd = fd;
2727
2728 return fd;
2729 }
2730
2731 return -1; /* failure */
2732}
2733
2734/*
2735 * Routines that want to initiate a pipe stream should use OpenPipeStream
2736 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2737 * necessary. When done, call ClosePipeStream rather than pclose.
2738 *
2739 * This function also ensures that the popen'd program is run with default
2740 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2741 * uses. This ensures desirable response to, eg, closing a read pipe early.
2742 */
2743FILE *
2744OpenPipeStream(const char *command, const char *mode)
2745{
2746 FILE *file;
2747 int save_errno;
2748
2749 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2750 numAllocatedDescs, command));
2751
2752 /* Can we allocate another non-virtual FD? */
2753 if (!reserveAllocatedDesc())
2754 ereport(ERROR,
2755 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2756 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2757 maxAllocatedDescs, command)));
2758
2759 /* Close excess kernel FDs. */
2761
2762TryAgain:
2763 fflush(NULL);
2764 pqsignal(SIGPIPE, SIG_DFL);
2765 errno = 0;
2766 file = popen(command, mode);
2767 save_errno = errno;
2768 pqsignal(SIGPIPE, SIG_IGN);
2769 errno = save_errno;
2770 if (file != NULL)
2771 {
2773
2774 desc->kind = AllocateDescPipe;
2775 desc->desc.file = file;
2778 return desc->desc.file;
2779 }
2780
2781 if (errno == EMFILE || errno == ENFILE)
2782 {
2783 ereport(LOG,
2784 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2785 errmsg("out of file descriptors: %m; release and retry")));
2786 if (ReleaseLruFile())
2787 goto TryAgain;
2788 errno = save_errno;
2789 }
2790
2791 return NULL;
2792}
2793
2794/*
2795 * Free an AllocateDesc of any type.
2796 *
2797 * The argument *must* point into the allocatedDescs[] array.
2798 */
2799static int
2801{
2802 int result;
2803
2804 /* Close the underlying object */
2805 switch (desc->kind)
2806 {
2807 case AllocateDescFile:
2808 result = fclose(desc->desc.file);
2809 break;
2810 case AllocateDescPipe:
2811 result = pclose(desc->desc.file);
2812 break;
2813 case AllocateDescDir:
2814 result = closedir(desc->desc.dir);
2815 break;
2816 case AllocateDescRawFD:
2817 pgaio_closing_fd(desc->desc.fd);
2818 result = close(desc->desc.fd);
2819 break;
2820 default:
2821 elog(ERROR, "AllocateDesc kind not recognized");
2822 result = 0; /* keep compiler quiet */
2823 break;
2824 }
2825
2826 /* Compact storage in the allocatedDescs array */
2829
2830 return result;
2831}
2832
2833/*
2834 * Close a file returned by AllocateFile.
2835 *
2836 * Note we do not check fclose's return value --- it is up to the caller
2837 * to handle close errors.
2838 */
2839int
2840FreeFile(FILE *file)
2841{
2842 int i;
2843
2844 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2845
2846 /* Remove file from list of allocated files, if it's present */
2847 for (i = numAllocatedDescs; --i >= 0;)
2848 {
2849 AllocateDesc *desc = &allocatedDescs[i];
2850
2851 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2852 return FreeDesc(desc);
2853 }
2854
2855 /* Only get here if someone passes us a file not in allocatedDescs */
2856 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2857
2858 return fclose(file);
2859}
2860
2861/*
2862 * Close a file returned by OpenTransientFile.
2863 *
2864 * Note we do not check close's return value --- it is up to the caller
2865 * to handle close errors.
2866 */
2867int
2869{
2870 int i;
2871
2872 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2873
2874 /* Remove fd from list of allocated files, if it's present */
2875 for (i = numAllocatedDescs; --i >= 0;)
2876 {
2877 AllocateDesc *desc = &allocatedDescs[i];
2878
2879 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2880 return FreeDesc(desc);
2881 }
2882
2883 /* Only get here if someone passes us a file not in allocatedDescs */
2884 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2885
2887
2888 return close(fd);
2889}
2890
2891/*
2892 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2893 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2894 * necessary to open the directory, and with closing it after an elog.
2895 * When done, call FreeDir rather than closedir.
2896 *
2897 * Returns NULL, with errno set, on failure. Note that failure detection
2898 * is commonly left to the following call of ReadDir or ReadDirExtended;
2899 * see the comments for ReadDir.
2900 *
2901 * Ideally this should be the *only* direct call of opendir() in the backend.
2902 */
2903DIR *
2904AllocateDir(const char *dirname)
2905{
2906 DIR *dir;
2907
2908 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2909 numAllocatedDescs, dirname));
2910
2911 /* Can we allocate another non-virtual FD? */
2912 if (!reserveAllocatedDesc())
2913 ereport(ERROR,
2914 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2915 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2916 maxAllocatedDescs, dirname)));
2917
2918 /* Close excess kernel FDs. */
2920
2921TryAgain:
2922 if ((dir = opendir(dirname)) != NULL)
2923 {
2925
2926 desc->kind = AllocateDescDir;
2927 desc->desc.dir = dir;
2930 return desc->desc.dir;
2931 }
2932
2933 if (errno == EMFILE || errno == ENFILE)
2934 {
2935 int save_errno = errno;
2936
2937 ereport(LOG,
2938 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2939 errmsg("out of file descriptors: %m; release and retry")));
2940 errno = 0;
2941 if (ReleaseLruFile())
2942 goto TryAgain;
2943 errno = save_errno;
2944 }
2945
2946 return NULL;
2947}
2948
2949/*
2950 * Read a directory opened with AllocateDir, ereport'ing any error.
2951 *
2952 * This is easier to use than raw readdir() since it takes care of some
2953 * otherwise rather tedious and error-prone manipulation of errno. Also,
2954 * if you are happy with a generic error message for AllocateDir failure,
2955 * you can just do
2956 *
2957 * dir = AllocateDir(path);
2958 * while ((dirent = ReadDir(dir, path)) != NULL)
2959 * process dirent;
2960 * FreeDir(dir);
2961 *
2962 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2963 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2964 * use this shortcut.)
2965 *
2966 * The pathname passed to AllocateDir must be passed to this routine too,
2967 * but it is only used for error reporting.
2968 */
2969struct dirent *
2970ReadDir(DIR *dir, const char *dirname)
2971{
2972 return ReadDirExtended(dir, dirname, ERROR);
2973}
2974
2975/*
2976 * Alternate version of ReadDir that allows caller to specify the elevel
2977 * for any error report (whether it's reporting an initial failure of
2978 * AllocateDir or a subsequent directory read failure).
2979 *
2980 * If elevel < ERROR, returns NULL after any error. With the normal coding
2981 * pattern, this will result in falling out of the loop immediately as
2982 * though the directory contained no (more) entries.
2983 */
2984struct dirent *
2985ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2986{
2987 struct dirent *dent;
2988
2989 /* Give a generic message for AllocateDir failure, if caller didn't */
2990 if (dir == NULL)
2991 {
2992 ereport(elevel,
2994 errmsg("could not open directory \"%s\": %m",
2995 dirname)));
2996 return NULL;
2997 }
2998
2999 errno = 0;
3000 if ((dent = readdir(dir)) != NULL)
3001 return dent;
3002
3003 if (errno)
3004 ereport(elevel,
3006 errmsg("could not read directory \"%s\": %m",
3007 dirname)));
3008 return NULL;
3009}
3010
3011/*
3012 * Close a directory opened with AllocateDir.
3013 *
3014 * Returns closedir's return value (with errno set if it's not 0).
3015 * Note we do not check the return value --- it is up to the caller
3016 * to handle close errors if wanted.
3017 *
3018 * Does nothing if dir == NULL; we assume that directory open failure was
3019 * already reported if desired.
3020 */
3021int
3023{
3024 int i;
3025
3026 /* Nothing to do if AllocateDir failed */
3027 if (dir == NULL)
3028 return 0;
3029
3030 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3031
3032 /* Remove dir from list of allocated dirs, if it's present */
3033 for (i = numAllocatedDescs; --i >= 0;)
3034 {
3035 AllocateDesc *desc = &allocatedDescs[i];
3036
3037 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3038 return FreeDesc(desc);
3039 }
3040
3041 /* Only get here if someone passes us a dir not in allocatedDescs */
3042 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3043
3044 return closedir(dir);
3045}
3046
3047
3048/*
3049 * Close a pipe stream returned by OpenPipeStream.
3050 */
3051int
3053{
3054 int i;
3055
3056 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3057
3058 /* Remove file from list of allocated files, if it's present */
3059 for (i = numAllocatedDescs; --i >= 0;)
3060 {
3061 AllocateDesc *desc = &allocatedDescs[i];
3062
3063 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3064 return FreeDesc(desc);
3065 }
3066
3067 /* Only get here if someone passes us a file not in allocatedDescs */
3068 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3069
3070 return pclose(file);
3071}
3072
3073/*
3074 * closeAllVfds
3075 *
3076 * Force all VFDs into the physically-closed state, so that the fewest
3077 * possible number of kernel file descriptors are in use. There is no
3078 * change in the logical state of the VFDs.
3079 */
3080void
3082{
3083 Index i;
3084
3085 if (SizeVfdCache > 0)
3086 {
3087 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3088 for (i = 1; i < SizeVfdCache; i++)
3089 {
3090 if (!FileIsNotOpen(i))
3091 LruDelete(i);
3092 }
3093 }
3094}
3095
3096
3097/*
3098 * SetTempTablespaces
3099 *
3100 * Define a list (actually an array) of OIDs of tablespaces to use for
3101 * temporary files. This list will be used until end of transaction,
3102 * unless this function is called again before then. It is caller's
3103 * responsibility that the passed-in array has adequate lifespan (typically
3104 * it'd be allocated in TopTransactionContext).
3105 *
3106 * Some entries of the array may be InvalidOid, indicating that the current
3107 * database's default tablespace should be used.
3108 */
3109void
3110SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3111{
3112 Assert(numSpaces >= 0);
3113 tempTableSpaces = tableSpaces;
3114 numTempTableSpaces = numSpaces;
3115
3116 /*
3117 * Select a random starting point in the list. This is to minimize
3118 * conflicts between backends that are most likely sharing the same list
3119 * of temp tablespaces. Note that if we create multiple temp files in the
3120 * same transaction, we'll advance circularly through the list --- this
3121 * ensures that large temporary sort files are nicely spread across all
3122 * available tablespaces.
3123 */
3124 if (numSpaces > 1)
3126 0, numSpaces - 1);
3127 else
3129}
3130
3131/*
3132 * TempTablespacesAreSet
3133 *
3134 * Returns true if SetTempTablespaces has been called in current transaction.
3135 * (This is just so that tablespaces.c doesn't need its own per-transaction
3136 * state.)
3137 */
3138bool
3140{
3141 return (numTempTableSpaces >= 0);
3142}
3143
3144/*
3145 * GetTempTablespaces
3146 *
3147 * Populate an array with the OIDs of the tablespaces that should be used for
3148 * temporary files. (Some entries may be InvalidOid, indicating that the
3149 * current database's default tablespace should be used.) At most numSpaces
3150 * entries will be filled.
3151 * Returns the number of OIDs that were copied into the output array.
3152 */
3153int
3154GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3155{
3156 int i;
3157
3159 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3160 tableSpaces[i] = tempTableSpaces[i];
3161
3162 return i;
3163}
3164
3165/*
3166 * GetNextTempTableSpace
3167 *
3168 * Select the next temp tablespace to use. A result of InvalidOid means
3169 * to use the current database's default tablespace.
3170 */
3171Oid
3173{
3174 if (numTempTableSpaces > 0)
3175 {
3176 /* Advance nextTempTableSpace counter with wraparound */
3180 }
3181 return InvalidOid;
3182}
3183
3184
3185/*
3186 * AtEOSubXact_Files
3187 *
3188 * Take care of subtransaction commit/abort. At abort, we close temp files
3189 * that the subtransaction may have opened. At commit, we reassign the
3190 * files that were opened to the parent subtransaction.
3191 */
3192void
3194 SubTransactionId parentSubid)
3195{
3196 Index i;
3197
3198 for (i = 0; i < numAllocatedDescs; i++)
3199 {
3200 if (allocatedDescs[i].create_subid == mySubid)
3201 {
3202 if (isCommit)
3203 allocatedDescs[i].create_subid = parentSubid;
3204 else
3205 {
3206 /* have to recheck the item after FreeDesc (ugly) */
3208 }
3209 }
3210 }
3211}
3212
3213/*
3214 * AtEOXact_Files
3215 *
3216 * This routine is called during transaction commit or abort. All still-open
3217 * per-transaction temporary file VFDs are closed, which also causes the
3218 * underlying files to be deleted (although they should've been closed already
3219 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3220 * closed. We also forget any transaction-local temp tablespace list.
3221 *
3222 * The isCommit flag is used only to decide whether to emit warnings about
3223 * unclosed files.
3224 */
3225void
3226AtEOXact_Files(bool isCommit)
3227{
3228 CleanupTempFiles(isCommit, false);
3229 tempTableSpaces = NULL;
3230 numTempTableSpaces = -1;
3231}
3232
3233/*
3234 * BeforeShmemExit_Files
3235 *
3236 * before_shmem_exit hook to clean up temp files during backend shutdown.
3237 * Here, we want to clean up *all* temp files including interXact ones.
3238 */
3239static void
3241{
3242 CleanupTempFiles(false, true);
3243
3244 /* prevent further temp files from being created */
3245#ifdef USE_ASSERT_CHECKING
3246 temporary_files_allowed = false;
3247#endif
3248}
3249
3250/*
3251 * Close temporary files and delete their underlying files.
3252 *
3253 * isCommit: if true, this is normal transaction commit, and we don't
3254 * expect any remaining files; warn if there are some.
3255 *
3256 * isProcExit: if true, this is being called as the backend process is
3257 * exiting. If that's the case, we should remove all temporary files; if
3258 * that's not the case, we are being called for transaction commit/abort
3259 * and should only remove transaction-local temp files. In either case,
3260 * also clean up "allocated" stdio files, dirs and fds.
3261 */
3262static void
3263CleanupTempFiles(bool isCommit, bool isProcExit)
3264{
3265 Index i;
3266
3267 /*
3268 * Careful here: at proc_exit we need extra cleanup, not just
3269 * xact_temporary files.
3270 */
3271 if (isProcExit || have_xact_temporary_files)
3272 {
3273 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3274 for (i = 1; i < SizeVfdCache; i++)
3275 {
3276 unsigned short fdstate = VfdCache[i].fdstate;
3277
3278 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3279 VfdCache[i].fileName != NULL)
3280 {
3281 /*
3282 * If we're in the process of exiting a backend process, close
3283 * all temporary files. Otherwise, only close temporary files
3284 * local to the current transaction. They should be closed by
3285 * the ResourceOwner mechanism already, so this is just a
3286 * debugging cross-check.
3287 */
3288 if (isProcExit)
3289 FileClose(i);
3290 else if (fdstate & FD_CLOSE_AT_EOXACT)
3291 {
3292 elog(WARNING,
3293 "temporary file %s not closed at end-of-transaction",
3294 VfdCache[i].fileName);
3295 FileClose(i);
3296 }
3297 }
3298 }
3299
3301 }
3302
3303 /* Complain if any allocated files remain open at commit. */
3304 if (isCommit && numAllocatedDescs > 0)
3305 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3307
3308 /* Clean up "allocated" stdio files, dirs and fds. */
3309 while (numAllocatedDescs > 0)
3311}
3312
3313
3314/*
3315 * Remove temporary and temporary relation files left over from a prior
3316 * postmaster session
3317 *
3318 * This should be called during postmaster startup. It will forcibly
3319 * remove any leftover files created by OpenTemporaryFile and any leftover
3320 * temporary relation files created by mdcreate.
3321 *
3322 * During post-backend-crash restart cycle, this routine is called when
3323 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3324 * queries are using temp files could result in useless storage usage that can
3325 * only be reclaimed by a service restart. The argument against enabling it is
3326 * that someone might want to examine the temporary files for debugging
3327 * purposes. This does however mean that OpenTemporaryFile had better allow for
3328 * collision with an existing temp file name.
3329 *
3330 * NOTE: this function and its subroutines generally report syscall failures
3331 * with ereport(LOG) and keep going. Removing temp files is not so critical
3332 * that we should fail to start the database when we can't do it.
3333 */
3334void
3336{
3337 char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3338 DIR *spc_dir;
3339 struct dirent *spc_de;
3340
3341 /*
3342 * First process temp files in pg_default ($PGDATA/base)
3343 */
3344 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3345 RemovePgTempFilesInDir(temp_path, true, false);
3347
3348 /*
3349 * Cycle through temp directories for all non-default tablespaces.
3350 */
3351 spc_dir = AllocateDir(PG_TBLSPC_DIR);
3352
3353 while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3354 {
3355 if (strcmp(spc_de->d_name, ".") == 0 ||
3356 strcmp(spc_de->d_name, "..") == 0)
3357 continue;
3358
3359 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3362 RemovePgTempFilesInDir(temp_path, true, false);
3363
3364 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3366 RemovePgTempRelationFiles(temp_path);
3367 }
3368
3369 FreeDir(spc_dir);
3370
3371 /*
3372 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3373 * DataDir as well. However, that is *not* cleaned here because doing so
3374 * would create a race condition. It's done separately, earlier in
3375 * postmaster startup.
3376 */
3377}
3378
3379/*
3380 * Process one pgsql_tmp directory for RemovePgTempFiles.
3381 *
3382 * If missing_ok is true, it's all right for the named directory to not exist.
3383 * Any other problem results in a LOG message. (missing_ok should be true at
3384 * the top level, since pgsql_tmp directories are not created until needed.)
3385 *
3386 * At the top level, this should be called with unlink_all = false, so that
3387 * only files matching the temporary name prefix will be unlinked. When
3388 * recursing it will be called with unlink_all = true to unlink everything
3389 * under a top-level temporary directory.
3390 *
3391 * (These two flags could be replaced by one, but it seems clearer to keep
3392 * them separate.)
3393 */
3394void
3395RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3396{
3397 DIR *temp_dir;
3398 struct dirent *temp_de;
3399 char rm_path[MAXPGPATH * 2];
3400
3401 temp_dir = AllocateDir(tmpdirname);
3402
3403 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3404 return;
3405
3406 while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3407 {
3408 if (strcmp(temp_de->d_name, ".") == 0 ||
3409 strcmp(temp_de->d_name, "..") == 0)
3410 continue;
3411
3412 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3413 tmpdirname, temp_de->d_name);
3414
3415 if (unlink_all ||
3416 strncmp(temp_de->d_name,
3418 strlen(PG_TEMP_FILE_PREFIX)) == 0)
3419 {
3420 PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3421
3422 if (type == PGFILETYPE_ERROR)
3423 continue;
3424 else if (type == PGFILETYPE_DIR)
3425 {
3426 /* recursively remove contents, then directory itself */
3427 RemovePgTempFilesInDir(rm_path, false, true);
3428
3429 if (rmdir(rm_path) < 0)
3430 ereport(LOG,
3432 errmsg("could not remove directory \"%s\": %m",
3433 rm_path)));
3434 }
3435 else
3436 {
3437 if (unlink(rm_path) < 0)
3438 ereport(LOG,
3440 errmsg("could not remove file \"%s\": %m",
3441 rm_path)));
3442 }
3443 }
3444 else
3445 ereport(LOG,
3446 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3447 rm_path)));
3448 }
3449
3450 FreeDir(temp_dir);
3451}
3452
3453/* Process one tablespace directory, look for per-DB subdirectories */
3454static void
3455RemovePgTempRelationFiles(const char *tsdirname)
3456{
3457 DIR *ts_dir;
3458 struct dirent *de;
3459 char dbspace_path[MAXPGPATH * 2];
3460
3461 ts_dir = AllocateDir(tsdirname);
3462
3463 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3464 {
3465 /*
3466 * We're only interested in the per-database directories, which have
3467 * numeric names. Note that this code will also (properly) ignore "."
3468 * and "..".
3469 */
3470 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3471 continue;
3472
3473 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3474 tsdirname, de->d_name);
3476 }
3477
3478 FreeDir(ts_dir);
3479}
3480
3481/* Process one per-dbspace directory for RemovePgTempRelationFiles */
3482static void
3483RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3484{
3485 DIR *dbspace_dir;
3486 struct dirent *de;
3487 char rm_path[MAXPGPATH * 2];
3488
3489 dbspace_dir = AllocateDir(dbspacedirname);
3490
3491 while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3492 {
3494 continue;
3495
3496 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3497 dbspacedirname, de->d_name);
3498
3499 if (unlink(rm_path) < 0)
3500 ereport(LOG,
3502 errmsg("could not remove file \"%s\": %m",
3503 rm_path)));
3504 }
3505
3506 FreeDir(dbspace_dir);
3507}
3508
/*
 * Does "name" look like the file name of a temporary relation?
 *
 * Accepted shapes are t<digits>_<digits>, optionally followed by
 * _<forkname>, optionally followed by .<digits> (a segment number), and
 * nothing else after that.  Returns true on a match, false otherwise.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		/* A non-positive result is treated as "no valid fork name here". */
		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;	/* skip the underscore plus the fork name */
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the dot plus the digits that follow it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;		/* a dot with no digits after it */
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3557
#ifdef HAVE_SYNCFS
/*
 * Call syncfs() on a descriptor opened for "path".
 *
 * A startup-progress report naming the path is emitted first.  Failure to
 * open the path or to syncfs() it is reported at LOG level; neither is
 * raised as an error.  The descriptor is always closed before returning.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));

	/* Release the transient descriptor whether or not syncfs() succeeded. */
	CloseTransientFile(fd);
}
#endif
3582
3583/*
3584 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3585 * all potential filesystem, depending on recovery_init_sync_method setting.
3586 *
3587 * We fsync regular files and directories wherever they are, but we
3588 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3589 * Other symlinks are presumed to point at files we're not responsible
3590 * for fsyncing, and might not have privileges to write at all.
3591 *
3592 * Errors are logged but not considered fatal; that's because this is used
3593 * only during database startup, to deal with the possibility that there are
3594 * issued-but-unsynced writes pending against the data directory. We want to
3595 * ensure that such writes reach disk before anything that's done in the new
3596 * run. However, aborting on error would result in failure to start for
3597 * harmless cases such as read-only files in the data directory, and that's
3598 * not good either.
3599 *
3600 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3601 * rewriting all changes again during recovery.
3602 *
3603 * Note we assume we're chdir'd into PGDATA to begin with.
3604 */
3605void
3607{
3608 bool xlog_is_symlink;
3609
3610 /* We can skip this whole thing if fsync is disabled. */
3611 if (!enableFsync)
3612 return;
3613
3614 /*
3615 * If pg_wal is a symlink, we'll need to recurse into it separately,
3616 * because the first walkdir below will ignore it.
3617 */
3618 xlog_is_symlink = false;
3619
3620 {
3621 struct stat st;
3622
3623 if (lstat("pg_wal", &st) < 0)
3624 ereport(LOG,
3626 errmsg("could not stat file \"%s\": %m",
3627 "pg_wal")));
3628 else if (S_ISLNK(st.st_mode))
3629 xlog_is_symlink = true;
3630 }
3631
3632#ifdef HAVE_SYNCFS
3634 {
3635 DIR *dir;
3636 struct dirent *de;
3637
3638 /*
3639 * On Linux, we don't have to open every single file one by one. We
3640 * can use syncfs() to sync whole filesystems. We only expect
3641 * filesystem boundaries to exist where we tolerate symlinks, namely
3642 * pg_wal and the tablespaces, so we call syncfs() for each of those
3643 * directories.
3644 */
3645
3646 /* Prepare to report progress syncing the data directory via syncfs. */
3648
3649 /* Sync the top level pgdata directory. */
3650 do_syncfs(".");
3651 /* If any tablespaces are configured, sync each of those. */
3653 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3654 {
3655 char path[MAXPGPATH];
3656
3657 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3658 continue;
3659
3660 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3661 do_syncfs(path);
3662 }
3663 FreeDir(dir);
3664 /* If pg_wal is a symlink, process that too. */
3665 if (xlog_is_symlink)
3666 do_syncfs("pg_wal");
3667 return;
3668 }
3669#endif /* !HAVE_SYNCFS */
3670
3671#ifdef PG_FLUSH_DATA_WORKS
3672 /* Prepare to report progress of the pre-fsync phase. */
3674
3675 /*
3676 * If possible, hint to the kernel that we're soon going to fsync the data
3677 * directory and its contents. Errors in this step are even less
3678 * interesting than normal, so log them only at DEBUG1.
3679 */
3680 walkdir(".", pre_sync_fname, false, DEBUG1);
3681 if (xlog_is_symlink)
3682 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3683 walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3684#endif
3685
3686 /* Prepare to report progress syncing the data directory via fsync. */
3688
3689 /*
3690 * Now we do the fsync()s in the same order.
3691 *
3692 * The main call ignores symlinks, so in addition to specially processing
3693 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3694 * process_symlinks = true. Note that if there are any plain directories
3695 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3696 * so we don't worry about optimizing it.
3697 */
3698 walkdir(".", datadir_fsync_fname, false, LOG);
3699 if (xlog_is_symlink)
3700 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3702}
3703
3704/*
3705 * walkdir: recursively walk a directory, applying the action to each
3706 * regular file and directory (including the named directory itself).
3707 *
3708 * If process_symlinks is true, the action and recursion are also applied
3709 * to regular files and directories that are pointed to by symlinks in the
3710 * given directory; otherwise symlinks are ignored. Symlinks are always
3711 * ignored in subdirectories, ie we intentionally don't pass down the
3712 * process_symlinks flag to recursive calls.
3713 *
3714 * Errors are reported at level elevel, which might be ERROR or less.
3715 *
3716 * See also walkdir in file_utils.c, which is a frontend version of this
3717 * logic.
3718 */
3719static void
3720walkdir(const char *path,
3721 void (*action) (const char *fname, bool isdir, int elevel),
3722 bool process_symlinks,
3723 int elevel)
3724{
3725 DIR *dir;
3726 struct dirent *de;
3727
3728 dir = AllocateDir(path);
3729
3730 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3731 {
3732 char subpath[MAXPGPATH * 2];
3733
3735
3736 if (strcmp(de->d_name, ".") == 0 ||
3737 strcmp(de->d_name, "..") == 0)
3738 continue;
3739
3740 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3741
3742 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3743 {
3744 case PGFILETYPE_REG:
3745 (*action) (subpath, false, elevel);
3746 break;
3747 case PGFILETYPE_DIR:
3748 walkdir(subpath, action, false, elevel);
3749 break;
3750 default:
3751
3752 /*
3753 * Errors are already reported directly by get_dirent_type(),
3754 * and any remaining symlinks and unknown file types are
3755 * ignored.
3756 */
3757 break;
3758 }
3759 }
3760
3761 FreeDir(dir); /* we ignore any error here */
3762
3763 /*
3764 * It's important to fsync the destination directory itself as individual
3765 * file fsyncs don't guarantee that the directory entry for the file is
3766 * synced. However, skip this if AllocateDir failed; the action function
3767 * might not be robust against that.
3768 */
3769 if (dir)
3770 (*action) (path, true, elevel);
3771}
3772
3773
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 *
 * The signature matches the action callback taken by walkdir(): fname is the
 * path to process, isdir tells whether it is a directory (directories are
 * skipped), and elevel is the level used for error reports.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are silently skipped. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3819
/*
 * fsync one file or directory on behalf of the data-directory sync.
 *
 * Emits a startup-progress report naming fname, then delegates to
 * fsync_fname_ext().  isdir and elevel are passed through unchanged; the
 * result of fsync_fname_ext() is not examined.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * We want to silently ignore errors about unreadable files, so ask
	 * fsync_fname_ext() for that with ignore_perm = true.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3832
/*
 * Remove one file or directory, tolerating its absence in the directory case.
 *
 * For a directory, rmdir() is attempted and any failure other than ENOENT is
 * reported at elevel.  For a non-directory, removal is delegated to
 * PathNameDeleteTemporaryFile() with error_on_failure = false; elevel is not
 * used on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3849
3850/*
3851 * fsync_fname_ext -- Try to fsync a file or directory
3852 *
3853 * If ignore_perm is true, ignore errors upon trying to open unreadable
3854 * files. Logs other errors at a caller-specified level.
3855 *
3856 * Returns 0 if the operation succeeded, -1 otherwise.
3857 */
3858int
3859fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3860{
3861 int fd;
3862 int flags;
3863 int returncode;
3864
3865 /*
3866 * Some OSs require directories to be opened read-only whereas other
3867 * systems don't allow us to fsync files opened read-only; so we need both
3868 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3869 * not writable by our userid, but we assume that's OK.
3870 */
3871 flags = PG_BINARY;
3872 if (!isdir)
3873 flags |= O_RDWR;
3874 else
3875 flags |= O_RDONLY;
3876
3877 fd = OpenTransientFile(fname, flags);
3878
3879 /*
3880 * Some OSs don't allow us to open directories at all (Windows returns
3881 * EACCES), just ignore the error in that case. If desired also silently
3882 * ignoring errors about unreadable files. Log others.
3883 */
3884 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3885 return 0;
3886 else if (fd < 0 && ignore_perm && errno == EACCES)
3887 return 0;
3888 else if (fd < 0)
3889 {
3890 ereport(elevel,
3892 errmsg("could not open file \"%s\": %m", fname)));
3893 return -1;
3894 }
3895
3896 returncode = pg_fsync(fd);
3897
3898 /*
3899 * Some OSes don't allow us to fsync directories at all, so we can ignore
3900 * those errors. Anything else needs to be logged.
3901 */
3902 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3903 {
3904 int save_errno;
3905
3906 /* close file upon error, might not be in transaction context */
3907 save_errno = errno;
3908 (void) CloseTransientFile(fd);
3909 errno = save_errno;
3910
3911 ereport(elevel,
3913 errmsg("could not fsync file \"%s\": %m", fname)));
3914 return -1;
3915 }
3916
3917 if (CloseTransientFile(fd) != 0)
3918 {
3919 ereport(elevel,
3921 errmsg("could not close file \"%s\": %m", fname)));
3922 return -1;
3923 }
3924
3925 return 0;
3926}
3927
3928/*
3929 * fsync_parent_path -- fsync the parent path of a file or directory
3930 *
3931 * This is aimed at making file operations persistent on disk in case of
3932 * an OS crash or power failure.
3933 */
3934static int
3935fsync_parent_path(const char *fname, int elevel)
3936{
3937 char parentpath[MAXPGPATH];
3938
3939 strlcpy(parentpath, fname, MAXPGPATH);
3940 get_parent_directory(parentpath);
3941
3942 /*
3943 * get_parent_directory() returns an empty string if the input argument is
3944 * just a file name (see comments in path.c), so handle that as being the
3945 * current directory.
3946 */
3947 if (strlen(parentpath) == 0)
3948 strlcpy(parentpath, ".", MAXPGPATH);
3949
3950 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3951 return -1;
3952
3953 return 0;
3954}
3955
3956/*
3957 * Create a PostgreSQL data sub-directory
3958 *
3959 * The data directory itself, and most of its sub-directories, are created at
3960 * initdb time, but we do have some occasions when we create directories in
3961 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3962 * make sure that those directories are created consistently. Today, that means
3963 * making sure that the created directory has the correct permissions, which is
3964 * what pg_dir_create_mode tracks for us.
3965 *
3966 * Note that we also set the umask() based on what we understand the correct
3967 * permissions to be (see file_perm.c).
3968 *
3969 * For permissions other than the default, mkdir() can be used directly, but
3970 * be sure to consider carefully such cases -- a sub-directory with incorrect
3971 * permissions in a PostgreSQL data directory could cause backups and other
3972 * processes to fail.
3973 */
3974int
3975MakePGDirectory(const char *directoryName)
3976{
3977 return mkdir(directoryName, pg_dir_create_mode);
3978}
3979
3980/*
3981 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3982 *
3983 * Failure to fsync any data file is cause for immediate panic, unless
3984 * data_sync_retry is enabled. Data may have been written to the operating
3985 * system and removed from our buffer pool already, and if we are running on
3986 * an operating system that forgets dirty data on write-back failure, there
3987 * may be only one copy of the data remaining: in the WAL. A later attempt to
3988 * fsync again might falsely report success. Therefore we must not allow any
3989 * further checkpoints to be attempted. data_sync_retry can in theory be
3990 * enabled on systems known not to drop dirty buffered data on write-back
3991 * failure (with the likely outcome that checkpoints will continue to fail
3992 * until the underlying problem is fixed).
3993 *
3994 * Any code that reports a failure from fsync() or related functions should
3995 * filter the error level with this function.
3996 */
3997int
3999{
4000 return data_sync_retry ? elevel : PANIC;
4001}
4002
4003bool
4005{
4006 bool result = true;
4007 int flags;
4008
4009#if PG_O_DIRECT == 0
4010 if (strcmp(*newval, "") != 0)
4011 {
4012 GUC_check_errdetail("\"%s\" is not supported on this platform.",
4013 "debug_io_direct");
4014 result = false;
4015 }
4016 flags = 0;
4017#else
4018 List *elemlist;
4019 ListCell *l;
4020 char *rawstring;
4021
4022 /* Need a modifiable copy of string */
4023 rawstring = pstrdup(*newval);
4024
4025 if (!SplitGUCList(rawstring, ',', &elemlist))
4026 {
4027 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4028 "debug_io_direct");
4029 pfree(rawstring);
4030 list_free(elemlist);
4031 return false;
4032 }
4033
4034 flags = 0;
4035 foreach(l, elemlist)
4036 {
4037 char *item = (char *) lfirst(l);
4038
4039 if (pg_strcasecmp(item, "data") == 0)
4040 flags |= IO_DIRECT_DATA;
4041 else if (pg_strcasecmp(item, "wal") == 0)
4042 flags |= IO_DIRECT_WAL;
4043 else if (pg_strcasecmp(item, "wal_init") == 0)
4044 flags |= IO_DIRECT_WAL_INIT;
4045 else
4046 {
4047 GUC_check_errdetail("Invalid option \"%s\".", item);
4048 result = false;
4049 break;
4050 }
4051 }
4052
4053 /*
4054 * It's possible to configure block sizes smaller than our assumed I/O
4055 * alignment size, which could result in invalid I/O requests.
4056 */
4057#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4058 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4059 {
4060 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4061 "debug_io_direct", "XLOG_BLCKSZ");
4062 result = false;
4063 }
4064#endif
4065#if BLCKSZ < PG_IO_ALIGN_SIZE
4066 if (result && (flags & IO_DIRECT_DATA))
4067 {
4068 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4069 "debug_io_direct", "BLCKSZ");
4070 result = false;
4071 }
4072#endif
4073
4074 pfree(rawstring);
4075 list_free(elemlist);
4076#endif
4077
4078 if (!result)
4079 return result;
4080
4081 /* Save the flags in *extra, for use by assign_debug_io_direct */
4082 *extra = guc_malloc(LOG, sizeof(int));
4083 if (!*extra)
4084 return false;
4085 *((int *) *extra) = flags;
4086
4087 return result;
4088}
4089
4090void
4091assign_debug_io_direct(const char *newval, void *extra)
4092{
4093 int *flags = (int *) extra;
4094
4095 io_direct_flags = *flags;
4096}
4097
4098/* ResourceOwner callbacks */
4099
4100static void
4102{
4103 File file = (File) DatumGetInt32(res);
4104 Vfd *vfdP;
4105
4106 Assert(FileIsValid(file));
4107
4108 vfdP = &VfdCache[file];
4109 vfdP->resowner = NULL;
4110
4111 FileClose(file);
4112}
4113
4114static char *
4116{
4117 return psprintf("File %d", DatumGetInt32(res));
4118}
void pgaio_closing_fd(int fd)
Definition: aio.c:1217
void pgaio_io_start_readv(PgAioHandle *ioh, int fd, int iovcnt, uint64 offset)
Definition: aio_io.c:78
void begin_startup_progress_phase(void)
Definition: startup.c:343
#define Min(x, y)
Definition: c.h:1004
uint32 SubTransactionId
Definition: c.h:662
#define INT64_FORMAT
Definition: c.h:557
int64_t int64
Definition: c.h:536
#define PG_BINARY
Definition: c.h:1273
uint64_t uint64
Definition: c.h:540
uint32_t uint32
Definition: c.h:539
unsigned int Index
Definition: c.h:620
#define MemSet(start, val, len)
Definition: c.h:1020
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:938
int fdatasync(int fildes)
#define OidIsValid(objectId)
Definition: c.h:775
size_t Size
Definition: c.h:611
int closedir(DIR *)
Definition: dirent.c:127
struct dirent * readdir(DIR *)
Definition: dirent.c:78
DIR * opendir(const char *)
Definition: dirent.c:33
int errcode_for_file_access(void)
Definition: elog.c:877
int errdetail(const char *fmt,...)
Definition: elog.c:1207
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
static int pg_ftruncate(int fd, off_t length)
Definition: fd.c:700
int max_files_per_process
Definition: fd.c:146
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:522
int FileGetRawDesc(File file)
Definition: fd.c:2529
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3975
int FreeDir(DIR *dir)
Definition: fd.c:3022
int recovery_init_sync_method
Definition: fd.c:165
static const ResourceOwnerDesc file_resowner_desc
Definition: fd.c:361
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2136
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:438
#define FD_MINFREE
Definition: fd.c:138
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2744
static int numTempTableSpaces
Definition: fd.c:289
static bool ReleaseLruFile(void)
Definition: fd.c:1383
int io_direct_flags
Definition: fd.c:168
#define FD_DELETE_AT_CLOSE
Definition: fd.c:192
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1108
static int maxAllocatedDescs
Definition: fd.c:268
static void Delete(File file)
Definition: fd.c:1267
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2800
static long tempFileCounter
Definition: fd.c:280
static char * ResOwnerPrintFile(Datum res)
Definition: fd.c:4115
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:779
char * FilePathName(File file)
Definition: fd.c:2513
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: fd.c:377
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3154
static int numAllocatedDescs
Definition: fd.c:267
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1902
static void LruDelete(File file)
Definition: fd.c:1286
int pg_fdatasync(int fd)
Definition: fd.c:477
#define FileIsValid(file)
Definition: fd.c:186
void assign_debug_io_direct(const char *newval, void *extra)
Definition: fd.c:4091
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2349
static int nfile
Definition: fd.c:222
int CloseTransientFile(int fd)
Definition: fd.c:2868
#define DO_DB(A)
Definition: fd.c:180
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1086
void closeAllVfds(void)
Definition: fd.c:3081
int max_safe_fds
Definition: fd.c:159
static File AllocateVfd(void)
Definition: fd.c:1415
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1862
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1692
int ClosePipeStream(FILE *file)
Definition: fd.c:3052
void AtEOXact_Files(bool isCommit)
Definition: fd.c:3226
int FileGetRawFlags(File file)
Definition: fd.c:2545
static Size SizeVfdCache
Definition: fd.c:217
static int nextTempTableSpace
Definition: fd.c:290
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:193
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3859
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3834
static void ResOwnerReleaseFile(Datum res)
Definition: fd.c:4101
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3455
int FreeFile(FILE *file)
Definition: fd.c:2840
mode_t FileGetRawMode(File file)
Definition: fd.c:2555
static AllocateDesc * allocatedDescs
Definition: fd.c:269
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2985
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:961
static int FileAccess(File file)
Definition: fd.c:1493
static void FreeVfd(File file)
Definition: fd.c:1473
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition: fd.c:458
void FileClose(File file)
Definition: fd.c:1979
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2218
void ReleaseExternalFD(void)
Definition: fd.c:1238
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:194
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3395
bool pg_file_exists(const char *name)
Definition: fd.c:500
void RemovePgTempFiles(void)
Definition: fd.c:3335
#define FileIsNotOpen(file)
Definition: fd.c:189
bool TempTablespacesAreSet(void)
Definition: fd.c:3139
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:753
int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2421
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2080
int data_sync_elevel(int elevel)
Definition: fd.c:3998
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1576
static void Insert(File file)
Definition: fd.c:1314
AllocateDescKind
Definition: fd.c:248
@ AllocateDescDir
Definition: fd.c:251
@ AllocateDescPipe
Definition: fd.c:250
@ AllocateDescFile
Definition: fd.c:249
@ AllocateDescRawFD
Definition: fd.c:252
Oid GetNextTempTableSpace(void)
Definition: fd.c:3172
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1589
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3821
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1529
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1805
bool AcquireExternalFD(void)
Definition: fd.c:1185
static void RegisterTemporaryFile(File file)
Definition: fd.c:1548
#define NUM_RESERVED_FDS
Definition: fd.c:129
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2904
static Oid * tempTableSpaces
Definition: fd.c:288
static bool reserveAllocatedDesc(void)
Definition: fd.c:2566
void InitFileAccess(void)
Definition: fd.c:900
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3483
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1725
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:869
static uint64 temporary_files_size
Definition: fd.c:236
void ReserveExternalFD(void)
Definition: fd.c:1220
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2970
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3511
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1933
void set_max_safe_fds(void)
Definition: fd.c:1041
int pg_fsync(int fd)
Definition: fd.c:386
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3263
#define VFD_CLOSED
Definition: fd.c:184
static bool have_xact_temporary_files
Definition: fd.c:228
static int LruInsert(File file)
Definition: fd.c:1336
static int numExternalFDs
Definition: fd.c:274
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3935
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1661
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2641
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:3193
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2691
void InitTemporaryFileAccess(void)
Definition: fd.c:930
static Vfd * VfdCache
Definition: fd.c:216
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2700
bool data_sync_retry
Definition: fd.c:162
static void ReleaseLruFiles(void)
Definition: fd.c:1405
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2244
void SyncDataDirectory(void)
Definition: fd.c:3606
int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2376
off_t FileSize(File file)
Definition: fd.c:2461
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2162
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2478
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition: fd.c:4004
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: fd.c:372
static void BeforeShmemExit_Files(int code, Datum arg)
Definition: fd.c:3240
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3720
int pg_truncate(const char *path, off_t length)
Definition: fd.c:717
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3110
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1780
#define IO_DIRECT_WAL
Definition: fd.h:55
#define IO_DIRECT_DATA
Definition: fd.h:54
#define IO_DIRECT_WAL_INIT
Definition: fd.h:56
int File
Definition: fd.h:51
#define PG_O_DIRECT
Definition: fd.h:97
int pg_file_create_mode
Definition: file_perm.c:19
int pg_dir_create_mode
Definition: file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset)
Definition: file_utils.c:709
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:547
#define PG_TEMP_FILES_DIR
Definition: file_utils.h:63
#define PG_TEMP_FILE_PREFIX
Definition: file_utils.h:64
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition: file_utils.h:30
@ DATA_DIR_SYNC_METHOD_FSYNC
Definition: file_utils.h:29
int MyProcPid
Definition: globals.c:47
bool enableFsync
Definition: globals.c:129
Oid MyDatabaseTableSpace
Definition: globals.c:96
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:639
#define newval
#define GUC_check_errdetail
Definition: guc.h:505
GucSource
Definition: guc.h:112
int temp_file_limit
Definition: guc_tables.c:551
int log_temp_files
Definition: guc_tables.c:546
Assert(PointerIsAligned(start, uint64))
#define realloc(a, b)
Definition: header.h:60
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
#define close(a)
Definition: win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:337
int j
Definition: isn.c:78
int i
Definition: isn.c:77
void list_free(List *list)
Definition: list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:311
char * pstrdup(const char *in)
Definition: mcxt.c:1759
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
#define MAP_FAILED
Definition: mem.h:45
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void * arg
static char * basedir
static PgChecksumMode mode
Definition: pg_checksums.c:55
#define MAXPGPATH
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:93
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:54
#define lfirst(lc)
Definition: pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition: pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:72
static char * tablespace
Definition: pgbench.c:217
void pgstat_report_tempfile(size_t filesize)
#define pqsignal
Definition: port.h:531
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
void get_parent_directory(char *path)
Definition: path.c:1068
#define snprintf
Definition: port.h:239
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
uint64_t Datum
Definition: postgres.h:70
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:222
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
static int fd(const char *x, int i)
Definition: preproc-init.c:105
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
#define PG_TBLSPC_DIR
Definition: relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:33
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:561
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:521
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:449
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition: resowner.h:56
#define RELEASE_PRIO_FILES
Definition: resowner.h:76
void pg_usleep(long microsec)
Definition: signal.c:53
static void error(void)
Definition: sql-dyntest.c:147
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
SubTransactionId create_subid
Definition: fd.c:258
DIR * dir
Definition: fd.c:262
FILE * file
Definition: fd.c:261
int fd
Definition: fd.c:263
union AllocateDesc::@20 desc
AllocateDescKind kind
Definition: fd.c:257
Definition: dirent.c:26
Definition: pg_list.h:54
const char * name
Definition: resowner.h:93
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
__int64 st_size
Definition: win32_port.h:263
unsigned short st_mode
Definition: win32_port.h:258
Definition: fd.c:197
int fd
Definition: fd.c:198
int fileFlags
Definition: fd.c:207
File lruLessRecently
Definition: fd.c:203
File lruMoreRecently
Definition: fd.c:202
char * fileName
Definition: fd.c:205
ResourceOwner resowner
Definition: fd.c:200
unsigned short fdstate
Definition: fd.c:199
File nextFree
Definition: fd.c:201
mode_t fileMode
Definition: fd.c:208
off_t fileSize
Definition: fd.c:204
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition: varlena.c:2992
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:69
static void pgstat_report_wait_end(void)
Definition: wait_event.h:85
const char * type
const char * name
#define fsync(fd)
Definition: win32_port.h:83
#define stat
Definition: win32_port.h:274
#define EINTR
Definition: win32_port.h:364
#define EOPNOTSUPP
Definition: win32_port.h:388
#define SIGPIPE
Definition: win32_port.h:163
#define lstat(path, sb)
Definition: win32_port.h:275
#define S_ISDIR(m)
Definition: win32_port.h:315
void _dosmaperr(unsigned long)
Definition: win32error.c:177
#define S_ISLNK(m)
Definition: win32_port.h:334
#define mkdir(a, b)
Definition: win32_port.h:80
#define fstat
Definition: win32_port.h:273
#define O_CLOEXEC
Definition: win32_port.h:349
#define O_DSYNC
Definition: win32_port.h:342
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:791
int wal_sync_method
Definition: xlog.c:131
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:27
static const char * directory
Definition: zic.c:634