PostgreSQL Source Code git master
bufmgr.c
1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
47#include "catalog/storage_xlog.h"
48#include "executor/instrument.h"
49#include "lib/binaryheap.h"
50#include "miscadmin.h"
51#include "pg_trace.h"
52#include "pgstat.h"
53#include "postmaster/bgwriter.h"
54#include "storage/aio.h"
55#include "storage/buf_internals.h"
56#include "storage/bufmgr.h"
57#include "storage/fd.h"
58#include "storage/ipc.h"
59#include "storage/lmgr.h"
60#include "storage/proc.h"
61#include "storage/read_stream.h"
62#include "storage/smgr.h"
63#include "storage/standby.h"
64#include "utils/memdebug.h"
65#include "utils/ps_status.h"
66#include "utils/rel.h"
67#include "utils/resowner.h"
68#include "utils/timestamp.h"
69
70
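/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): typical
 * caller-side use of the entry points listed in the file header comment
 * above. Assumes a valid, already-opened Relation "rel" and an existing
 * block number "blkno"; the helper name and the guard macro are hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
sketch_touch_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	/* Find or create a buffer holding the block, and pin it. */
	buf = ReadBuffer(rel, blkno);

	/* Lock the content before inspecting or modifying the page. */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);
	(void) page;				/* ... modify the page inside a critical
								 * section and WAL-log the change ... */

	/* Record that the contents changed; the disk write itself is deferred. */
	MarkBufferDirty(buf);

	/* Drop the content lock and the pin. */
	UnlockReleaseBuffer(buf);
}
#endif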
71/* Note: these two macros only work on shared buffers, not local ones! */
72#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
73#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
74
75/* Note: this macro only works on local buffers, not shared ones! */
76#define LocalBufHdrGetBlock(bufHdr) \
77 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
78
79/* Bits in SyncOneBuffer's return value */
80#define BUF_WRITTEN 0x01
81#define BUF_REUSABLE 0x02
82
83#define RELS_BSEARCH_THRESHOLD 20
84
85/*
86 * This is the size (in the number of blocks) above which we scan the
 87 * entire buffer pool to remove the buffers for all the pages of the relation
 88 * being dropped. For relations with size below this threshold, we find
 89 * the buffers by doing lookups in the BufMapping table.
90 */
91#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
92
93typedef struct PrivateRefCountEntry
94{
95 Buffer buffer;
96 int32 refcount;
97} PrivateRefCountEntry;
98
99/* 64 bytes, about the size of a cache line on common systems */
100#define REFCOUNT_ARRAY_ENTRIES 8
101
102/*
103 * Status of buffers to checkpoint for a particular tablespace, used
104 * internally in BufferSync.
105 */
106typedef struct CkptTsStatus
107{
108 /* oid of the tablespace */
109 Oid tsId;
110
111 /*
112 * Checkpoint progress for this tablespace. To make progress comparable
113 * between tablespaces the progress is, for each tablespace, measured as a
114 * number between 0 and the total number of to-be-checkpointed pages. Each
115 * page checkpointed in this tablespace increments this space's progress
116 * by progress_slice.
117 */
118 float8 progress;
119 float8 progress_slice;
120
121 /* number of to-be-checkpointed pages in this tablespace */
122 int num_to_scan;
123 /* already processed pages in this tablespace */
124 int num_scanned;
125
126 /* current offset in CkptBufferIds for this tablespace */
127 int index;
128} CkptTsStatus;
129
130/*
131 * Type for array used to sort SMgrRelations
132 *
133 * FlushRelationsAllBuffers shares the same comparator function with
134 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
135 * compatible.
136 */
137typedef struct SMgrSortArray
138{
139 RelFileLocator rlocator; /* This must be the first member */
140 SMgrRelation srel;
141} SMgrSortArray;
142
143/* GUC variables */
144bool zero_damaged_pages = false;
145int bgwriter_lru_maxpages = 100;
146double bgwriter_lru_multiplier = 2.0;
147bool track_io_timing = false;
148
149/*
150 * How many buffers PrefetchBuffer callers should try to stay ahead of their
151 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
152 * for buffers not belonging to tablespaces that have their
153 * effective_io_concurrency parameter set.
154 */
155int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
156
157/*
158 * Like effective_io_concurrency, but used by maintenance code paths that might
159 * benefit from a higher setting because they work on behalf of many sessions.
160 * Overridden by the tablespace setting of the same name.
161 */
162int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
163
164/*
165 * Limit on how many blocks should be handled in single I/O operations.
166 * StartReadBuffers() callers should respect it, as should other operations
167 * that call smgr APIs directly. It is computed as the minimum of underlying
168 * GUCs io_combine_limit_guc and io_max_combine_limit.
169 */
170int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
171int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
172int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
173
174/*
175 * GUC variables about triggering kernel writeback for buffers written; OS
176 * dependent defaults are set via the GUC mechanism.
177 */
178int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
179int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
180int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
181
182/* local state for LockBufferForCleanup */
183static BufferDesc *PinCountWaitBuf = NULL;
184
185/*
186 * Backend-Private refcount management:
187 *
188 * Each buffer also has a private refcount that keeps track of the number of
189 * times the buffer is pinned in the current process. This is so that the
190 * shared refcount needs to be modified only once if a buffer is pinned more
191 * than once by an individual backend. It's also used to check that no buffers
192 * are still pinned at the end of transactions and when exiting.
193 *
194 *
195 * To avoid - as we used to - requiring an array with NBuffers entries to keep
196 * track of local buffers, we use a small sequentially searched array
197 * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
198 * keep track of backend local pins.
199 *
200 * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
201 * refcounts are kept track of in the array; after that, new array entries
202 * displace old ones into the hash table. That way a frequently used entry
203 * can't get "stuck" in the hashtable while infrequent ones clog the array.
204 *
205 * Note that in most scenarios the number of pinned buffers will not exceed
206 * REFCOUNT_ARRAY_ENTRIES.
207 *
208 *
209 * To enter a buffer into the refcount tracking mechanism first reserve a free
210 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
211 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
212 * memory allocations in NewPrivateRefCountEntry() which can be important
213 * because in some scenarios it's called with a spinlock held...
214 */
215static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
216static HTAB *PrivateRefCountHash = NULL;
217static int32 PrivateRefCountOverflowed = 0;
218static uint32 PrivateRefCountClock = 0;
219static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
220
222
223static void ReservePrivateRefCountEntry(void);
224static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
225static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
226static inline int32 GetPrivateRefCount(Buffer buffer);
227static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
228
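/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * reserve-then-fill protocol described in the comment above, as a caller
 * inside this file would use it. The reservation happens before any buffer
 * header spinlock is taken, so NewPrivateRefCountEntry() never has to search
 * or allocate while a lock is held. The actual manipulation of the shared
 * refcount is omitted here; the helper name and guard macro are hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
sketch_track_new_pin(BufferDesc *buf_hdr)
{
	PrivateRefCountEntry *ref;
	Buffer		b = BufferDescriptorGetBuffer(buf_hdr);

	/* May search the array/hash and even move an entry; no locks held yet. */
	ReservePrivateRefCountEntry();

	/* ... pin the buffer under the buffer header spinlock here ... */

	/* Cheap: just fills the entry reserved above. */
	ref = NewPrivateRefCountEntry(b);
	ref->refcount++;
}
#endif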
229/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
230static void ResOwnerReleaseBufferIO(Datum res);
231static char *ResOwnerPrintBufferIO(Datum res);
232static void ResOwnerReleaseBufferPin(Datum res);
233static char *ResOwnerPrintBufferPin(Datum res);
234
235static const ResourceOwnerDesc buffer_io_resowner_desc =
236{
237 .name = "buffer io",
238 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
239 .release_priority = RELEASE_PRIO_BUFFER_IOS,
240 .ReleaseResource = ResOwnerReleaseBufferIO,
241 .DebugPrint = ResOwnerPrintBufferIO
242};
243
244static const ResourceOwnerDesc buffer_pin_resowner_desc =
245{
246 .name = "buffer pin",
247 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
248 .release_priority = RELEASE_PRIO_BUFFER_PINS,
249 .ReleaseResource = ResOwnerReleaseBufferPin,
250 .DebugPrint = ResOwnerPrintBufferPin
251};
252
253/*
254 * Ensure that the PrivateRefCountArray has sufficient space to store one more
255 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
256 * a new entry - but it's perfectly fine to not use a reserved entry.
257 */
258static void
259ReservePrivateRefCountEntry(void)
260{
261 /* Already reserved (or freed), nothing to do */
262 if (ReservedRefCountEntry != NULL)
263 return;
264
265 /*
266 * First search for a free entry in the array; that'll be sufficient in the
267 * majority of cases.
268 */
269 {
270 int i;
271
272 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
273 {
275
276 res = &PrivateRefCountArray[i];
277
278 if (res->buffer == InvalidBuffer)
279 {
281 return;
282 }
283 }
284 }
285
286 /*
287 * No luck. All array entries are full. Move one array entry into the hash
288 * table.
289 */
290 {
291 /*
292 * Move entry from the current clock position in the array into the
293 * hashtable. Use that slot.
294 */
295 PrivateRefCountEntry *hashent;
296 bool found;
297
298 /* select victim slot */
301
302 /* Better be used, otherwise we shouldn't get here. */
304
305 /* enter victim array entry into hashtable */
309 &found);
310 Assert(!found);
312
313 /* clear the now free array slot */
316
318 }
319}
320
321/*
322 * Fill a previously reserved refcount entry.
323 */
324static PrivateRefCountEntry *
325NewPrivateRefCountEntry(Buffer buffer)
326{
328
329 /* only allowed to be called when a reservation has been made */
331
332 /* use up the reserved entry */
335
336 /* and fill it */
337 res->buffer = buffer;
338 res->refcount = 0;
339
340 return res;
341}
342
343/*
344 * Return the PrivateRefCount entry for the passed buffer.
345 *
346 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
347 * do_move is true, and the entry resides in the hashtable the entry is
348 * optimized for frequent access by moving it to the array.
349 */
350static PrivateRefCountEntry *
351GetPrivateRefCountEntry(Buffer buffer, bool do_move)
352{
354 int i;
355
358
359 /*
360 * First search for references in the array, that'll be sufficient in the
361 * majority of cases.
362 */
363 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
364 {
365 res = &PrivateRefCountArray[i];
366
367 if (res->buffer == buffer)
368 return res;
369 }
370
371 /*
372 * By here we know that the buffer, if already pinned, isn't residing in
373 * the array.
374 *
375 * Only look up the buffer in the hashtable if we've previously overflowed
376 * into it.
377 */
379 return NULL;
380
382
383 if (res == NULL)
384 return NULL;
385 else if (!do_move)
386 {
387 /* caller doesn't want us to move the hash entry into the array */
388 return res;
389 }
390 else
391 {
392 /* move buffer from hashtable into the free array slot */
393 bool found;
395
396 /* Ensure there's a free array slot */
398
399 /* Use up the reserved slot */
403 Assert(free->buffer == InvalidBuffer);
404
405 /* and fill it */
406 free->buffer = buffer;
407 free->refcount = res->refcount;
408
409 /* delete from hashtable */
411 Assert(found);
414
415 return free;
416 }
417}
418
419/*
420 * Returns how many times the passed buffer is pinned by this backend.
421 *
422 * Only works for shared memory buffers!
423 */
424static inline int32
425GetPrivateRefCount(Buffer buffer)
426{
428
431
432 /*
433 * Not moving the entry - that's ok for the current users, but we might
434 * want to change this one day.
435 */
436 ref = GetPrivateRefCountEntry(buffer, false);
437
438 if (ref == NULL)
439 return 0;
440 return ref->refcount;
441}
442
443/*
444 * Release resources used to track the reference count of a buffer which we no
445 * longer have pinned and don't want to pin again immediately.
446 */
447static void
448ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
449{
450 Assert(ref->refcount == 0);
451
452 if (ref >= &PrivateRefCountArray[0] &&
454 {
455 ref->buffer = InvalidBuffer;
456
457 /*
458 * Mark the just used entry as reserved - in many scenarios that
459 * allows us to avoid ever having to search the array/hash for free
460 * entries.
461 */
463 }
464 else
465 {
466 bool found;
467 Buffer buffer = ref->buffer;
468
470 Assert(found);
473 }
474}
475
476/*
477 * BufferIsPinned
478 * True iff the buffer is pinned (also checks for valid buffer number).
479 *
480 * NOTE: what we check here is that *this* backend holds a pin on
481 * the buffer. We do not care whether some other backend does.
482 */
483#define BufferIsPinned(bufnum) \
484( \
485 !BufferIsValid(bufnum) ? \
486 false \
487 : \
488 BufferIsLocal(bufnum) ? \
489 (LocalRefCount[-(bufnum) - 1] > 0) \
490 : \
491 (GetPrivateRefCount(bufnum) > 0) \
492)
493
494
495static Buffer ReadBuffer_common(Relation rel,
496 SMgrRelation smgr, char smgr_persistence,
497 ForkNumber forkNum, BlockNumber blockNum,
498 ReadBufferMode mode, BufferAccessStrategy strategy);
499static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
500 ForkNumber fork,
501 BufferAccessStrategy strategy,
502 uint32 flags,
503 uint32 extend_by,
504 BlockNumber extend_upto,
505 Buffer *buffers,
506 uint32 *extended_by);
507static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
508 ForkNumber fork,
509 BufferAccessStrategy strategy,
510 uint32 flags,
511 uint32 extend_by,
512 BlockNumber extend_upto,
513 Buffer *buffers,
514 uint32 *extended_by);
515static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
516static void PinBuffer_Locked(BufferDesc *buf);
517static void UnpinBuffer(BufferDesc *buf);
518static void UnpinBufferNoOwner(BufferDesc *buf);
519static void BufferSync(int flags);
521static int SyncOneBuffer(int buf_id, bool skip_recently_used,
522 WritebackContext *wb_context);
523static void WaitIO(BufferDesc *buf);
524static void AbortBufferIO(Buffer buffer);
525static void shared_buffer_write_error_callback(void *arg);
526static void local_buffer_write_error_callback(void *arg);
527static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
528 char relpersistence,
529 ForkNumber forkNum,
530 BlockNumber blockNum,
531 BufferAccessStrategy strategy,
532 bool *foundPtr, IOContext io_context);
533static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
534static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
535static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
536static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
537 IOObject io_object, IOContext io_context);
538static void FindAndDropRelationBuffers(RelFileLocator rlocator,
539 ForkNumber forkNum,
540 BlockNumber nForkBlock,
541 BlockNumber firstDelBlock);
542static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
543 RelFileLocator dstlocator,
544 ForkNumber forkNum, bool permanent);
545static void AtProcExit_Buffers(int code, Datum arg);
546static void CheckForBufferLeaks(void);
547#ifdef USE_ASSERT_CHECKING
548static void AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
549 void *unused_context);
550#endif
551static int rlocator_comparator(const void *p1, const void *p2);
552static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
553static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
554static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
555
556
557/*
558 * Implementation of PrefetchBuffer() for shared buffers.
559 */
560PrefetchBufferResult
561PrefetchSharedBuffer(SMgrRelation smgr_reln,
562 ForkNumber forkNum,
563 BlockNumber blockNum)
564{
565 PrefetchBufferResult result = {InvalidBuffer, false};
566 BufferTag newTag; /* identity of requested block */
567 uint32 newHash; /* hash value for newTag */
568 LWLock *newPartitionLock; /* buffer partition lock for it */
569 int buf_id;
570
571 Assert(BlockNumberIsValid(blockNum));
572
573 /* create a tag so we can lookup the buffer */
574 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
575 forkNum, blockNum);
576
577 /* determine its hash code and partition lock ID */
578 newHash = BufTableHashCode(&newTag);
579 newPartitionLock = BufMappingPartitionLock(newHash);
580
581 /* see if the block is in the buffer pool already */
582 LWLockAcquire(newPartitionLock, LW_SHARED);
583 buf_id = BufTableLookup(&newTag, newHash);
584 LWLockRelease(newPartitionLock);
585
586 /* If not in buffers, initiate prefetch */
587 if (buf_id < 0)
588 {
589#ifdef USE_PREFETCH
590 /*
591 * Try to initiate an asynchronous read. This returns false in
592 * recovery if the relation file doesn't exist.
593 */
594 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
595 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
596 {
597 result.initiated_io = true;
598 }
599#endif /* USE_PREFETCH */
600 }
601 else
602 {
603 /*
604 * Report the buffer it was in at that time. The caller may be able
605 * to avoid a buffer table lookup, but it's not pinned and it must be
606 * rechecked!
607 */
608 result.recent_buffer = buf_id + 1;
609 }
610
611 /*
612 * If the block *is* in buffers, we do nothing. This is not really ideal:
613 * the block might be just about to be evicted, which would be stupid
614 * since we know we are going to need it soon. But the only easy answer
615 * is to bump the usage_count, which does not seem like a great solution:
616 * when the caller does ultimately touch the block, usage_count would get
617 * bumped again, resulting in too much favoritism for blocks that are
618 * involved in a prefetch sequence. A real fix would involve some
619 * additional per-buffer state, and it's not clear that there's enough of
620 * a problem to justify that.
621 */
622
623 return result;
624}
625
626/*
627 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
628 *
629 * This is named by analogy to ReadBuffer but doesn't actually allocate a
630 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
631 * block will not be delayed by the I/O. Prefetching is optional.
632 *
633 * There are three possible outcomes:
634 *
635 * 1. If the block is already cached, the result includes a valid buffer that
636 * could be used by the caller to avoid the need for a later buffer lookup, but
637 * it's not pinned, so the caller must recheck it.
638 *
639 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
640 * true. Currently there is no way to know if the data was already cached by
641 * the kernel and therefore didn't really initiate I/O, and no way to know when
642 * the I/O completes other than using synchronous ReadBuffer().
643 *
644 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and one of
645 * the following holds: USE_PREFETCH is not defined (this build doesn't
646 * support prefetching due to lack of a kernel facility), direct I/O is
647 * enabled, or the underlying relation file wasn't found and we are in
648 * recovery. (If the relation file wasn't found and we are not in recovery, an error is raised).
649 */
650PrefetchBufferResult
651PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
652{
653 Assert(RelationIsValid(reln));
654 Assert(BlockNumberIsValid(blockNum));
655
656 if (RelationUsesLocalBuffers(reln))
657 {
658 /* see comments in ReadBufferExtended */
659 if (RELATION_IS_OTHER_TEMP(reln))
660 ereport(ERROR,
661 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
662 errmsg("cannot access temporary tables of other sessions")));
663
664 /* pass it off to localbuf.c */
665 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
666 }
667 else
668 {
669 /* pass it to the shared buffer version */
670 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
671 }
672}
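/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): issuing
 * prefetch requests a fixed distance ahead of a simple sequential scan, as
 * described in the comment above. The distance of 8 blocks is arbitrary;
 * real callers derive it from effective_io_concurrency or use read_stream.c.
 * The helper name and guard macro are hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
sketch_scan_with_prefetch(Relation rel, BlockNumber nblocks)
{
	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		/* Hint that we will need a block a little further ahead. */
		if (blkno + 8 < nblocks)
			(void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno + 8);

		/* With luck, this read is no longer delayed by the I/O. */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
		/* ... process the page under a content lock ... */
		ReleaseBuffer(buf);
	}
}
#endif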
673
674/*
675 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
676 *
677 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
678 * successful. Return true if the buffer is valid and still has the expected
679 * tag. In that case, the buffer is pinned and the usage count is bumped.
680 */
681bool
682ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
683 Buffer recent_buffer)
684{
685 BufferDesc *bufHdr;
686 BufferTag tag;
687 uint32 buf_state;
688 bool have_private_ref;
689
690 Assert(BufferIsValid(recent_buffer));
691
694 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
695
696 if (BufferIsLocal(recent_buffer))
697 {
698 int b = -recent_buffer - 1;
699
700 bufHdr = GetLocalBufferDescriptor(b);
701 buf_state = pg_atomic_read_u32(&bufHdr->state);
702
703 /* Is it still valid and holding the right tag? */
704 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
705 {
706 PinLocalBuffer(bufHdr, true);
707
709
710 return true;
711 }
712 }
713 else
714 {
715 bufHdr = GetBufferDescriptor(recent_buffer - 1);
716 have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
717
718 /*
719 * Do we already have this buffer pinned with a private reference? If
720 * so, it must be valid and it is safe to check the tag without
721 * locking. If not, we have to lock the header first and then check.
722 */
723 if (have_private_ref)
724 buf_state = pg_atomic_read_u32(&bufHdr->state);
725 else
726 buf_state = LockBufHdr(bufHdr);
727
728 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
729 {
730 /*
731 * It's now safe to pin the buffer. We can't pin first and ask
732 * questions later, because it might confuse code paths like
733 * InvalidateBuffer() if we pinned a random non-matching buffer.
734 */
735 if (have_private_ref)
736 PinBuffer(bufHdr, NULL); /* bump pin count */
737 else
738 PinBuffer_Locked(bufHdr); /* pin for first time */
739
741
742 return true;
743 }
744
745 /* If we locked the header above, now unlock. */
746 if (!have_private_ref)
747 UnlockBufHdr(bufHdr, buf_state);
748 }
749
750 return false;
751}
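/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): consuming
 * the recent_buffer reported by PrefetchBuffer(). Because the hinted buffer
 * is not pinned, ReadRecentBuffer() may find it recycled and return false,
 * in which case the caller falls back to a normal lookup. The helper name
 * and guard macro are hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static Buffer
sketch_read_with_hint(Relation rel, BlockNumber blkno,
					  PrefetchBufferResult hint)
{
	if (BufferIsValid(hint.recent_buffer) &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno,
						 hint.recent_buffer))
		return hint.recent_buffer;	/* pinned, tag re-verified */

	/* Hint was stale (or absent): do the regular mapping-table lookup. */
	return ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
}
#endif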
752
753/*
754 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
755 * fork with RBM_NORMAL mode and default strategy.
756 */
757Buffer
758ReadBuffer(Relation reln, BlockNumber blockNum)
759{
760 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
761}
762
763/*
764 * ReadBufferExtended -- returns a buffer containing the requested
765 * block of the requested relation. If the blknum
766 * requested is P_NEW, extend the relation file and
767 * allocate a new block. (Caller is responsible for
768 * ensuring that only one backend tries to extend a
769 * relation at the same time!)
770 *
771 * Returns: the buffer number for the buffer containing
772 * the block read. The returned buffer has been pinned.
773 * Does not return on error --- elog's instead.
774 *
775 * Assume when this function is called, that reln has been opened already.
776 *
777 * In RBM_NORMAL mode, the page is read from disk, and the page header is
778 * validated. An error is thrown if the page header is not valid. (But
779 * note that an all-zero page is considered "valid"; see
780 * PageIsVerified().)
781 *
782 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
783 * valid, the page is zeroed instead of throwing an error. This is intended
784 * for non-critical data, where the caller is prepared to repair errors.
785 *
786 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
787 * filled with zeros instead of reading it from disk. Useful when the caller
788 * is going to fill the page from scratch, since this saves I/O and avoids
789 * unnecessary failure if the page-on-disk has corrupt page headers.
790 * The page is returned locked to ensure that the caller has a chance to
791 * initialize the page before it's made visible to others.
792 * Caution: do not use this mode to read a page that is beyond the relation's
793 * current physical EOF; that is likely to cause problems in md.c when
794 * the page is modified and written out. P_NEW is OK, though.
795 *
796 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
797 * a cleanup-strength lock on the page.
798 *
799 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
800 *
801 * If strategy is not NULL, a nondefault buffer access strategy is used.
802 * See buffer/README for details.
803 */
804inline Buffer
805ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
806 ReadBufferMode mode, BufferAccessStrategy strategy)
807{
808 Buffer buf;
809
810 /*
811 * Reject attempts to read non-local temporary relations; we would be
812 * likely to get wrong data since we have no visibility into the owning
813 * session's local buffers.
814 */
815 if (RELATION_IS_OTHER_TEMP(reln))
816 ereport(ERROR,
817 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
818 errmsg("cannot access temporary tables of other sessions")));
819
820 /*
821 * Read the buffer, and update pgstat counters to reflect a cache hit or
822 * miss.
823 */
824 buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
825 forkNum, blockNum, mode, strategy);
826
827 return buf;
828}
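/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): choosing a
 * ReadBufferMode, per the modes documented above. RBM_ZERO_AND_LOCK skips
 * the disk read entirely and returns an exclusively locked, zero-filled page
 * for the caller to initialize; RBM_NORMAL would validate the page header
 * instead. The helper name and guard macro are hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static Buffer
sketch_get_page_for_init(Relation rel, BlockNumber blkno)
{
	/* No I/O and no header validation; the page is rebuilt from scratch. */
	return ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
							  RBM_ZERO_AND_LOCK, NULL);
}
#endif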
829
830
831/*
832 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
833 * a relcache entry for the relation.
834 *
835 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
836 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
837 * cannot be used for temporary relations (and making that work might be
838 * difficult, unless we only want to read temporary relations for our own
839 * ProcNumber).
840 */
841Buffer
842ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
843 BlockNumber blockNum, ReadBufferMode mode,
844 BufferAccessStrategy strategy, bool permanent)
845{
846 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
847
848 return ReadBuffer_common(NULL, smgr,
849 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
850 forkNum, blockNum,
851 mode, strategy);
852}
853
854/*
855 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
856 */
857Buffer
858ExtendBufferedRel(BufferManagerRelation bmr,
859 ForkNumber forkNum,
860 BufferAccessStrategy strategy,
861 uint32 flags)
862{
863 Buffer buf;
864 uint32 extend_by = 1;
865
866 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
867 &buf, &extend_by);
868
869 return buf;
870}
871
872/*
873 * Extend relation by multiple blocks.
874 *
875 * Tries to extend the relation by extend_by blocks. Depending on the
876 * availability of resources the relation may end up being extended by a
877 * smaller number of pages (unless an error is thrown, always by at least one
878 * page). *extended_by is updated to the number of pages the relation has been
879 * extended to.
880 *
881 * buffers needs to be an array that is at least extend_by long. Upon
882 * completion, the first extend_by array elements will point to a pinned
883 * buffer.
884 *
885 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
886 * locked. This is useful for callers that want a buffer that is guaranteed to
887 * be empty.
888 */
889BlockNumber
890ExtendBufferedRelBy(BufferManagerRelation bmr,
891 ForkNumber fork,
892 BufferAccessStrategy strategy,
893 uint32 flags,
894 uint32 extend_by,
895 Buffer *buffers,
896 uint32 *extended_by)
897{
898 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
899 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
900 Assert(extend_by > 0);
901
902 if (bmr.smgr == NULL)
903 {
904 bmr.smgr = RelationGetSmgr(bmr.rel);
905 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
906 }
907
908 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
909 extend_by, InvalidBlockNumber,
910 buffers, extended_by);
911}
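/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): bulk
 * extension with ExtendBufferedRelBy(), per the comment above. The relation
 * may end up extended by fewer pages than requested, so callers look at
 * *extended_by. With EB_LOCK_FIRST only the first returned buffer is locked;
 * all of them are pinned. The helper name and guard macro are hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
sketch_extend_by_pages(Relation rel, uint32 npages)
{
	Buffer		new_buffers[64];
	uint32		extended_by = 0;

	Assert(npages > 0 && npages <= lengthof(new_buffers));

	(void) ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL,
							   EB_LOCK_FIRST, npages,
							   new_buffers, &extended_by);

	for (uint32 i = 0; i < extended_by; i++)
	{
		if (i == 0)
			UnlockReleaseBuffer(new_buffers[i]);
		else
			ReleaseBuffer(new_buffers[i]);
	}
}
#endif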
912
913/*
914 * Extend the relation so it is at least extend_to blocks large, return buffer
915 * (extend_to - 1).
916 *
917 * This is useful for callers that want to write a specific page, regardless
918 * of the current size of the relation (e.g. useful for visibilitymap and for
919 * crash recovery).
920 */
921Buffer
922ExtendBufferedRelTo(BufferManagerRelation bmr,
923 ForkNumber fork,
924 BufferAccessStrategy strategy,
925 uint32 flags,
926 BlockNumber extend_to,
927 ReadBufferMode mode)
928{
929 BlockNumber current_size;
930 uint32 extended_by = 0;
931 Buffer buffer = InvalidBuffer;
932 Buffer buffers[64];
933
934 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
935 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
936 Assert(extend_to != InvalidBlockNumber && extend_to > 0);
937
938 if (bmr.smgr == NULL)
939 {
940 bmr.smgr = RelationGetSmgr(bmr.rel);
941 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
942 }
943
944 /*
945 * If desired, create the file if it doesn't exist. If
946 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
947 * an smgrexists call.
948 */
949 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
950 (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
952 !smgrexists(bmr.smgr, fork))
953 {
955
956 /* recheck, fork might have been created concurrently */
957 if (!smgrexists(bmr.smgr, fork))
958 smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
959
961 }
962
963 /*
964 * If requested, invalidate size cache, so that smgrnblocks asks the
965 * kernel.
966 */
967 if (flags & EB_CLEAR_SIZE_CACHE)
969
970 /*
971 * Estimate how many pages we'll need to extend by. This avoids acquiring
972 * unnecessarily many victim buffers.
973 */
974 current_size = smgrnblocks(bmr.smgr, fork);
975
976 /*
977 * Since no-one else can be looking at the page contents yet, there is no
978 * difference between an exclusive lock and a cleanup-strength lock. Note
979 * that we pass the original mode to ReadBuffer_common() below, when
980 * falling back to reading the buffer due to a concurrent relation extension.
981 */
982 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
983 flags |= EB_LOCK_TARGET;
984
985 while (current_size < extend_to)
986 {
987 uint32 num_pages = lengthof(buffers);
988 BlockNumber first_block;
989
990 if ((uint64) current_size + num_pages > extend_to)
991 num_pages = extend_to - current_size;
992
993 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
994 num_pages, extend_to,
995 buffers, &extended_by);
996
997 current_size = first_block + extended_by;
998 Assert(num_pages != 0 || current_size >= extend_to);
999
1000 for (uint32 i = 0; i < extended_by; i++)
1001 {
1002 if (first_block + i != extend_to - 1)
1003 ReleaseBuffer(buffers[i]);
1004 else
1005 buffer = buffers[i];
1006 }
1007 }
1008
1009 /*
1010 * It's possible that another backend concurrently extended the relation.
1011 * In that case read the buffer.
1012 *
1013 * XXX: Should we control this via a flag?
1014 */
1015 if (buffer == InvalidBuffer)
1016 {
1017 Assert(extended_by == 0);
1018 buffer = ReadBuffer_common(bmr.rel, bmr.smgr, bmr.relpersistence,
1019 fork, extend_to - 1, mode, strategy);
1020 }
1021
1022 return buffer;
1023}
1024
1025/*
1026 * Lock and optionally zero a buffer, as part of the implementation of
1027 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1028 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1029 */
1030static void
1031ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
1032{
1033 BufferDesc *bufHdr;
1034 bool need_to_zero;
1035 bool isLocalBuf = BufferIsLocal(buffer);
1036
1038
1039 if (already_valid)
1040 {
1041 /*
1042 * If the caller already knew the buffer was valid, we can skip some
1043 * header interaction. The caller just wants to lock the buffer.
1044 */
1045 need_to_zero = false;
1046 }
1047 else if (isLocalBuf)
1048 {
1049 /* Simple case for non-shared buffers. */
1050 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1051 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1052 }
1053 else
1054 {
1055 /*
1056 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1057 * concurrently. Even though we aren't doing I/O, that ensures that
1058 * we don't zero a page that someone else has pinned. An exclusive
1059 * content lock wouldn't be enough, because readers are allowed to
1060 * drop the content lock after determining that a tuple is visible
1061 * (see buffer access rules in README).
1062 */
1063 bufHdr = GetBufferDescriptor(buffer - 1);
1064 need_to_zero = StartBufferIO(bufHdr, true, false);
1065 }
1066
1067 if (need_to_zero)
1068 {
1069 memset(BufferGetPage(buffer), 0, BLCKSZ);
1070
1071 /*
1072 * Grab the buffer content lock before marking the page as valid, to
1073 * make sure that no other backend sees the zeroed page before the
1074 * caller has had a chance to initialize it.
1075 *
1076 * Since no-one else can be looking at the page contents yet, there is
1077 * no difference between an exclusive lock and a cleanup-strength
1078 * lock. (Note that we cannot use LockBuffer() or
1079 * LockBufferForCleanup() here, because they assert that the buffer is
1080 * already valid.)
1081 */
1082 if (!isLocalBuf)
1083 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
1084
1085 /* Set BM_VALID, terminate IO, and wake up any waiters */
1086 if (isLocalBuf)
1087 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1088 else
1089 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1090 }
1091 else if (!isLocalBuf)
1092 {
1093 /*
1094 * The buffer is valid, so we can't zero it. The caller still expects
1095 * the page to be locked on return.
1096 */
1097 if (mode == RBM_ZERO_AND_LOCK)
1098 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1099 else
1100 LockBufferForCleanup(buffer);
1101 }
1102}
1103
1104/*
1105 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1106 * already present, or false if more work is required to either read it in or
1107 * zero it.
1108 */
1109static pg_attribute_always_inline Buffer
1110PinBufferForBlock(Relation rel,
1111 SMgrRelation smgr,
1112 char persistence,
1113 ForkNumber forkNum,
1114 BlockNumber blockNum,
1115 BufferAccessStrategy strategy,
1116 bool *foundPtr)
1117{
1118 BufferDesc *bufHdr;
1119 IOContext io_context;
1120 IOObject io_object;
1121
1122 Assert(blockNum != P_NEW);
1123
1124 /* Persistence should be set before */
1125 Assert((persistence == RELPERSISTENCE_TEMP ||
1126 persistence == RELPERSISTENCE_PERMANENT ||
1127 persistence == RELPERSISTENCE_UNLOGGED));
1128
1129 if (persistence == RELPERSISTENCE_TEMP)
1130 {
1131 io_context = IOCONTEXT_NORMAL;
1132 io_object = IOOBJECT_TEMP_RELATION;
1133 }
1134 else
1135 {
1136 io_context = IOContextForStrategy(strategy);
1137 io_object = IOOBJECT_RELATION;
1138 }
1139
1140 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1144 smgr->smgr_rlocator.backend);
1145
1146 if (persistence == RELPERSISTENCE_TEMP)
1147 {
1148 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1149 if (*foundPtr)
1151 }
1152 else
1153 {
1154 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1155 strategy, foundPtr, io_context);
1156 if (*foundPtr)
1158 }
1159 if (rel)
1160 {
1161 /*
1162 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1163 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1164 * zeroed instead), the per-relation stats always count them.
1165 */
1167 if (*foundPtr)
1169 }
1170 if (*foundPtr)
1171 {
1172 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1173 if (VacuumCostActive)
1175
1176 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1180 smgr->smgr_rlocator.backend,
1181 true);
1182 }
1183
1184 return BufferDescriptorGetBuffer(bufHdr);
1185}
1186
1187/*
1188 * ReadBuffer_common -- common logic for all ReadBuffer variants
1189 *
1190 * smgr is required, rel is optional unless using P_NEW.
1191 */
1193ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1194 ForkNumber forkNum,
1196 BufferAccessStrategy strategy)
1197{
1198 ReadBuffersOperation operation;
1199 Buffer buffer;
1200 int flags;
1201 char persistence;
1202
1203 /*
1204 * Backward compatibility path, most code should use ExtendBufferedRel()
1205 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1206 * scales a lot better.
1207 */
1208 if (unlikely(blockNum == P_NEW))
1209 {
1211
1212 /*
1213 * Since no-one else can be looking at the page contents yet, there is
1214 * no difference between an exclusive lock and a cleanup-strength
1215 * lock.
1216 */
1217 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1218 flags |= EB_LOCK_FIRST;
1219
1220 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1221 }
1222
1223 if (rel)
1224 persistence = rel->rd_rel->relpersistence;
1225 else
1226 persistence = smgr_persistence;
1227
1230 {
1231 bool found;
1232
1233 buffer = PinBufferForBlock(rel, smgr, persistence,
1234 forkNum, blockNum, strategy, &found);
1235 ZeroAndLockBuffer(buffer, mode, found);
1236 return buffer;
1237 }
1238
1239 /*
1240 * Signal that we are going to immediately wait. If we're immediately
1241 * waiting, there is no benefit in actually executing the IO
1242 * asynchronously, it would just add dispatch overhead.
1243 */
1244 flags = READ_BUFFERS_SYNCHRONOUSLY;
1245 if (mode == RBM_ZERO_ON_ERROR)
1246 flags |= READ_BUFFERS_ZERO_ON_ERROR;
1247 operation.smgr = smgr;
1248 operation.rel = rel;
1249 operation.persistence = persistence;
1250 operation.forknum = forkNum;
1251 operation.strategy = strategy;
1252 if (StartReadBuffer(&operation,
1253 &buffer,
1254 blockNum,
1255 flags))
1256 WaitReadBuffers(&operation);
1257
1258 return buffer;
1259}
1260
1261static pg_attribute_always_inline bool
1262StartReadBuffersImpl(ReadBuffersOperation *operation,
1263 Buffer *buffers,
1264 BlockNumber blockNum,
1265 int *nblocks,
1266 int flags,
1267 bool allow_forwarding)
1268{
1269 int actual_nblocks = *nblocks;
1270 int maxcombine = 0;
1271 bool did_start_io;
1272
1273 Assert(*nblocks == 1 || allow_forwarding);
1274 Assert(*nblocks > 0);
1275 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1276
1277 for (int i = 0; i < actual_nblocks; ++i)
1278 {
1279 bool found;
1280
1281 if (allow_forwarding && buffers[i] != InvalidBuffer)
1282 {
1283 BufferDesc *bufHdr;
1284
1285 /*
1286 * This is a buffer that was pinned by an earlier call to
1287 * StartReadBuffers(), but couldn't be handled in one operation at
1288 * that time. The operation was split, and the caller has passed
1289 * an already pinned buffer back to us to handle the rest of the
1290 * operation. It must continue at the expected block number.
1291 */
1292 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1293
1294 /*
1295 * It might be an already valid buffer (a hit) that followed the
1296 * final contiguous block of an earlier I/O (a miss) marking the
1297 * end of it, or a buffer that some other backend has since made
1298 * valid by performing the I/O for us, in which case we can handle
1299 * it as a hit now. It is safe to check for a BM_VALID flag with
1300 * a relaxed load, because we got a fresh view of it while pinning
1301 * it in the previous call.
1302 *
1303 * On the other hand if we don't see BM_VALID yet, it must be an
1304 * I/O that was split by the previous call and we need to try to
1305 * start a new I/O from this block. We're also racing against any
1306 * other backend that might start the I/O or even manage to mark
1307 * it BM_VALID after this check, but StartBufferIO() will handle
1308 * those cases.
1309 */
1310 if (BufferIsLocal(buffers[i]))
1311 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1312 else
1313 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1315 found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1316 }
1317 else
1318 {
1319 buffers[i] = PinBufferForBlock(operation->rel,
1320 operation->smgr,
1321 operation->persistence,
1322 operation->forknum,
1323 blockNum + i,
1324 operation->strategy,
1325 &found);
1326 }
1327
1328 if (found)
1329 {
1330 /*
1331 * We have a hit. If it's the first block in the requested range,
1332 * we can return it immediately and report that WaitReadBuffers()
1333 * does not need to be called. If the initial value of *nblocks
1334 * was larger, the caller will have to call again for the rest.
1335 */
1336 if (i == 0)
1337 {
1338 *nblocks = 1;
1339
1340#ifdef USE_ASSERT_CHECKING
1341
1342 /*
1343 * Initialize enough of ReadBuffersOperation to make
1344 * CheckReadBuffersOperation() work. Outside of assertions
1345 * that's not necessary when no IO is issued.
1346 */
1347 operation->buffers = buffers;
1348 operation->blocknum = blockNum;
1349 operation->nblocks = 1;
1350 operation->nblocks_done = 1;
1351 CheckReadBuffersOperation(operation, true);
1352#endif
1353 return false;
1354 }
1355
1356 /*
1357 * Otherwise we already have an I/O to perform, but this block
1358 * can't be included as it is already valid. Split the I/O here.
1359 * There may or may not be more blocks requiring I/O after this
1360 * one, we haven't checked, but they can't be contiguous with this
1361 * one in the way. We'll leave this buffer pinned, forwarding it
1362 * to the next call, avoiding the need to unpin it here and re-pin
1363 * it in the next call.
1364 */
1365 actual_nblocks = i;
1366 break;
1367 }
1368 else
1369 {
1370 /*
1371 * Check how many blocks we can cover with the same IO. The smgr
1372 * implementation might e.g. be limited due to a segment boundary.
1373 */
1374 if (i == 0 && actual_nblocks > 1)
1375 {
1376 maxcombine = smgrmaxcombine(operation->smgr,
1377 operation->forknum,
1378 blockNum);
1379 if (unlikely(maxcombine < actual_nblocks))
1380 {
1381 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1382 blockNum, actual_nblocks, maxcombine);
1383 actual_nblocks = maxcombine;
1384 }
1385 }
1386 }
1387 }
1388 *nblocks = actual_nblocks;
1389
1390 /* Populate information needed for I/O. */
1391 operation->buffers = buffers;
1392 operation->blocknum = blockNum;
1393 operation->flags = flags;
1394 operation->nblocks = actual_nblocks;
1395 operation->nblocks_done = 0;
1396 pgaio_wref_clear(&operation->io_wref);
1397
1398 /*
1399 * When using AIO, start the IO in the background. If not, issue prefetch
1400 * requests if desired by the caller.
1401 *
1402 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1403 * de-risk the introduction of AIO somewhat. It's a large architectural
1404 * change, with lots of chances for unanticipated performance effects.
1405 *
1406 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1407 * asynchronously, but without the check here we'd execute IO earlier than
1408 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1409 */
1410 if (io_method != IOMETHOD_SYNC)
1411 {
1412 /*
1413 * Try to start IO asynchronously. It's possible that no IO needs to
1414 * be started, if another backend already performed the IO.
1415 *
1416 * Note that if an IO is started, it might not cover the entire
1417 * requested range, e.g. because an intermediary block has been read
1418 * in by another backend. In that case any "trailing" buffers we
1419 * already pinned above will be "forwarded" by read_stream.c to the
1420 * next call to StartReadBuffers().
1421 *
1422 * This is signalled to the caller by decrementing *nblocks *and*
1423 * reducing operation->nblocks. The latter is done here, but not below
1424 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1425 * overall read size anymore, we need to retry until done in its
1426 * entirety or until failed.
1427 */
1428 did_start_io = AsyncReadBuffers(operation, nblocks);
1429
1430 operation->nblocks = *nblocks;
1431 }
1432 else
1433 {
1434 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1435
1436 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1437 {
1438 /*
1439 * In theory we should only do this if PinBufferForBlock() had to
1440 * allocate new buffers above. That way, if two calls to
1441 * StartReadBuffers() were made for the same blocks before
1442 * WaitReadBuffers(), only the first would issue the advice.
1443 * That'd be a better simulation of true asynchronous I/O, which
1444 * would only start the I/O once, but isn't done here for
1445 * simplicity.
1446 */
1447 smgrprefetch(operation->smgr,
1448 operation->forknum,
1449 blockNum,
1450 actual_nblocks);
1451 }
1452
1453 /*
1454 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1455 * will initiate the necessary IO.
1456 */
1457 did_start_io = true;
1458 }
1459
1460 CheckReadBuffersOperation(operation, !did_start_io);
1461
1462 return did_start_io;
1463}
1464
1465/*
1466 * Begin reading a range of blocks beginning at blockNum and extending for
1467 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1468 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1469 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1470 * and is now being continued. On return, *nblocks holds the number of blocks
1471 * accepted by this operation. If it is less than the original number then
1472 * this operation has been split, but buffer elements up to the original
1473 * requested size may hold forwarded buffers to be used for a continuing
1474 * operation. The caller must either start a new I/O beginning at the block
1475 * immediately following the blocks accepted by this call and pass those
1476 * buffers back in, or release them if it chooses not to. It shouldn't make
1477 * any other use of or assumptions about forwarded buffers.
1478 *
1479 * If false is returned, no I/O is necessary and the buffers covered by
1480 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1481 * an I/O has been started, and WaitReadBuffers() must be called with the same
1482 * operation object before the buffers covered by *nblocks on exit can be
1483 * accessed. Along with the operation object, the caller-supplied array of
1484 * buffers must remain valid until WaitReadBuffers() is called, and any
1485 * forwarded buffers must also be preserved for a continuing call unless
1486 * they are explicitly released.
1487 */
1488bool
1489StartReadBuffers(ReadBuffersOperation *operation,
1490 Buffer *buffers,
1491 BlockNumber blockNum,
1492 int *nblocks,
1493 int flags)
1494{
1495 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1496 true /* expect forwarded buffers */ );
1497}
1498
1499/*
1500 * Single block version of the StartReadBuffers(). This might save a few
1501 * instructions when called from another translation unit, because it is
1502 * specialized for nblocks == 1.
1503 *
1504 * This version does not support "forwarded" buffers: they cannot be created
1505 * by reading only one block and *buffer is ignored on entry.
1506 */
1507bool
1508StartReadBuffer(ReadBuffersOperation *operation,
1509 Buffer *buffer,
1510 BlockNumber blocknum,
1511 int flags)
1512{
1513 int nblocks = 1;
1514 bool result;
1515
1516 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1517 false /* single block, no forwarding */ );
1518 Assert(nblocks == 1); /* single block can't be short */
1519
1520 return result;
1521}
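/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * two-step read API documented above. It mirrors what ReadBuffer_common()
 * does; the operation object and the buffer variable must remain valid until
 * WaitReadBuffers() returns. The helper name and guard macro are
 * hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static Buffer
sketch_two_step_read(Relation rel, BlockNumber blkno)
{
	ReadBuffersOperation op = {0};
	Buffer		buf;

	op.rel = rel;
	op.smgr = RelationGetSmgr(rel);
	op.persistence = rel->rd_rel->relpersistence;
	op.forknum = MAIN_FORKNUM;
	op.strategy = NULL;

	/* Returns false on a cache hit; then no wait step is needed. */
	if (StartReadBuffer(&op, &buf, blkno, 0))
		WaitReadBuffers(&op);

	return buf;					/* pinned and valid */
}
#endif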
1522
1523/*
1524 * Perform sanity checks on the ReadBuffersOperation.
1525 */
1526static void
1527CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
1528{
1529#ifdef USE_ASSERT_CHECKING
1530 Assert(operation->nblocks_done <= operation->nblocks);
1531 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1532
1533 for (int i = 0; i < operation->nblocks; i++)
1534 {
1535 Buffer buffer = operation->buffers[i];
1536 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1539
1540 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1542
1543 if (i < operation->nblocks_done)
1545 }
1546#endif
1547}
1548
1549/* helper for ReadBuffersCanStartIO(), to avoid repetition */
1550static inline bool
1551ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
1552{
1553 if (BufferIsLocal(buffer))
1555 true, nowait);
1556 else
1557 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1558}
1559
1560/*
1561 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1562 */
1563static inline bool
1564ReadBuffersCanStartIO(Buffer buffer, bool nowait)
1565{
1566 /*
1567 * If this backend currently has staged IO, we need to submit the pending
1568 * IO before waiting for the right to issue IO, to avoid the potential for
1569 * deadlocks (and, more commonly, unnecessary delays for other backends).
1570 */
1571 if (!nowait && pgaio_have_staged())
1572 {
1574 return true;
1575
1576 /*
1577 * Unfortunately StartBufferIO() returning false doesn't allow us to
1578 * distinguish between the buffer already being valid and IO already
1579 * being in progress. Since IO already being in progress is quite
1580 * rare, this approach seems fine.
1581 */
1583 }
1584
1585 return ReadBuffersCanStartIOOnce(buffer, nowait);
1586}
1587
1588/*
1589 * Helper for WaitReadBuffers() that processes the results of a readv
1590 * operation, raising an error if necessary.
1591 */
1592static void
1593ProcessReadBuffersResult(ReadBuffersOperation *operation)
1594{
1595 PgAioReturn *aio_ret = &operation->io_return;
1596 PgAioResultStatus rs = aio_ret->result.status;
1597 int newly_read_blocks = 0;
1598
1599 Assert(pgaio_wref_valid(&operation->io_wref));
1600 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1601
1602 /*
1603 * SMGR reports the number of blocks successfully read as the result of
1604 * the IO operation. Thus we can simply add that to ->nblocks_done.
1605 */
1606
1607 if (likely(rs != PGAIO_RS_ERROR))
1608 newly_read_blocks = aio_ret->result.result;
1609
1610 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1611 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1612 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1613 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1614 {
1615 /*
1616 * We'll retry, so we just emit a debug message to the server log (or
1617 * not even that in prod scenarios).
1618 */
1619 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1620 elog(DEBUG3, "partial read, will retry");
1621 }
1622
1623 Assert(newly_read_blocks > 0);
1624 Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1625
1626 operation->nblocks_done += newly_read_blocks;
1627
1628 Assert(operation->nblocks_done <= operation->nblocks);
1629}
1630
1631void
1632WaitReadBuffers(ReadBuffersOperation *operation)
1633{
1634 PgAioReturn *aio_ret = &operation->io_return;
1635 IOContext io_context;
1636 IOObject io_object;
1637
1638 if (operation->persistence == RELPERSISTENCE_TEMP)
1639 {
1640 io_context = IOCONTEXT_NORMAL;
1641 io_object = IOOBJECT_TEMP_RELATION;
1642 }
1643 else
1644 {
1645 io_context = IOContextForStrategy(operation->strategy);
1646 io_object = IOOBJECT_RELATION;
1647 }
1648
1649 /*
1650 * If we get here without an IO operation having been issued, the
1651 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1652 * caller should not have called WaitReadBuffers().
1653 *
1654 * In the case of IOMETHOD_SYNC, we start the IO in WaitReadBuffers() - as
1655 * we used to before the introduction of AIO. This is done as part of the
1656 * retry logic below; no extra code is required.
1657 *
1658 * This path is expected to eventually go away.
1659 */
1660 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1661 elog(ERROR, "waiting for read operation that didn't read");
1662
1663 /*
1664 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1665 * done. We may need multiple retries, not just because we could get
1666 * multiple partial reads, but also because some of the remaining
1667 * to-be-read buffers may have been read in by other backends, limiting
1668 * the IO size.
1669 */
1670 while (true)
1671 {
1672 int ignored_nblocks_progress;
1673
1674 CheckReadBuffersOperation(operation, false);
1675
1676 /*
1677 * If there is an IO associated with the operation, we may need to
1678 * wait for it.
1679 */
1680 if (pgaio_wref_valid(&operation->io_wref))
1681 {
1682 /*
1683 * Track the time spent waiting for the IO to complete. As
1684 * tracking a wait even if we don't actually need to wait
1685 *
1686 * a) is not cheap, due to the timestamping overhead
1687 *
1688 * b) reports some time as waiting, even if we never waited
1689 *
1690 * we first check if we already know the IO is complete.
1691 */
1692 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1693 !pgaio_wref_check_done(&operation->io_wref))
1694 {
1696
1697 pgaio_wref_wait(&operation->io_wref);
1698
1699 /*
1700 * The IO operation itself was already counted earlier, in
1701 * AsyncReadBuffers(), this just accounts for the wait time.
1702 */
1703 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1704 io_start, 0, 0);
1705 }
1706 else
1707 {
1708 Assert(pgaio_wref_check_done(&operation->io_wref));
1709 }
1710
1711 /*
1712 * We now are sure the IO completed. Check the results. This
1713 * includes reporting on errors if there were any.
1714 */
1715 ProcessReadBuffersResult(operation);
1716 }
1717
1718 /*
1719 * Most of the time, the one IO we already started, will read in
1720 * everything. But we need to deal with partial reads and buffers not
1721 * needing IO anymore.
1722 */
1723 if (operation->nblocks_done == operation->nblocks)
1724 break;
1725
1727
1728 /*
1729 * This may only complete the IO partially, either because some
1730 * buffers were already valid, or because of a partial read.
1731 *
1732 * NB: In contrast to after the AsyncReadBuffers() call in
1733 * StartReadBuffers(), we do *not* reduce
1734 * ReadBuffersOperation->nblocks here, callers expect the full
1735 * operation to be completed at this point (as more operations may
1736 * have been queued).
1737 */
1738 AsyncReadBuffers(operation, &ignored_nblocks_progress);
1739 }
1740
1741 CheckReadBuffersOperation(operation, true);
1742
1743 /* NB: READ_DONE tracepoint was already executed in completion callback */
1744}
1745
1746/*
1747 * Initiate IO for the ReadBuffersOperation
1748 *
1749 * This function only starts a single IO at a time. The size of the IO may be
1750 * limited to below the to-be-read blocks, if one of the buffers has
1751 * concurrently been read in. If the first to-be-read buffer is already valid,
1752 * no IO will be issued.
1753 *
1754 * To support retries after partial reads, the first operation->nblocks_done
1755 * buffers are skipped.
1756 *
1757 * On return *nblocks_progress is updated to reflect the number of buffers
1758 * affected by the call. If the first buffer is valid, *nblocks_progress is
1759 * set to 1 and operation->nblocks_done is incremented.
1760 *
1761 * Returns true if IO was initiated, false if no IO was necessary.
1762 */
1763static bool
1764AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
1765{
1766 Buffer *buffers = &operation->buffers[0];
1767 int flags = operation->flags;
1768 BlockNumber blocknum = operation->blocknum;
1769 ForkNumber forknum = operation->forknum;
1770 char persistence = operation->persistence;
1771 int16 nblocks_done = operation->nblocks_done;
1772 Buffer *io_buffers = &operation->buffers[nblocks_done];
1773 int io_buffers_len = 0;
1774 PgAioHandle *ioh;
1775 uint32 ioh_flags = 0;
1776 void *io_pages[MAX_IO_COMBINE_LIMIT];
1777 IOContext io_context;
1778 IOObject io_object;
1779 bool did_start_io;
1780
1781 /*
1782 * When this IO is executed synchronously, either because the caller will
1783 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1784 * the AIO subsystem needs to know.
1785 */
1786 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1787 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1788
1789 if (persistence == RELPERSISTENCE_TEMP)
1790 {
1791 io_context = IOCONTEXT_NORMAL;
1792 io_object = IOOBJECT_TEMP_RELATION;
1793 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1794 }
1795 else
1796 {
1797 io_context = IOContextForStrategy(operation->strategy);
1798 io_object = IOOBJECT_RELATION;
1799 }
1800
1801 /*
1802 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1803 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1804 * set globally, but on a per-session basis. The completion callback,
1805 * which may be run in other processes, e.g. in IO workers, may have a
1806 * different value of the zero_damaged_pages GUC.
1807 *
1808 * XXX: We probably should eventually use a different flag for
1809 * zero_damaged_pages, so we can report different log levels / error codes
1810 * for zero_damaged_pages and ZERO_ON_ERROR.
1811 */
1814
1815 /*
1816 * For the same reason as with zero_damaged_pages we need to use this
1817 * backend's ignore_checksum_failure value.
1818 */
1821
1822
1823 /*
1824 * To be allowed to report stats in the local completion callback we need
1825 * to prepare to report stats now. This ensures we can safely report the
1826 * checksum failure even in a critical section.
1827 */
1829
1830 /*
1831 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1832 * might block, which we don't want after setting IO_IN_PROGRESS.
1833 *
1834 * If we need to wait for IO before we can get a handle, submit
1835 * already-staged IO first, so that other backends don't need to wait.
1836 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1837 * wait for already submitted IO, which doesn't require additional locks,
1838 * but it could still cause undesirable waits.
1839 *
1840 * A secondary benefit is that this would allow us to measure the time in
1841 * pgaio_io_acquire() without causing undue timer overhead in the common,
1842 * non-blocking, case. However, currently the pgstats infrastructure
1843 * doesn't really allow that, as it a) asserts that an operation can't
1844 * have time without operations b) doesn't have an API to report
1845 * "accumulated" time.
1846 */
1848 if (unlikely(!ioh))
1849 {
1851
1853 }
1854
1855 /*
1856 * Check if we can start IO on the first to-be-read buffer.
1857 *
1858 * If an I/O is already in progress in another backend, we want to wait
1859 * for the outcome: either done, or something went wrong and we will
1860 * retry.
1861 */
1862 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1863 {
1864 /*
1865 * Someone else has already completed this block, we're done.
1866 *
1867 * When IO is necessary, ->nblocks_done is updated in
1868 * ProcessReadBuffersResult(), but that is not called if no IO is
1869 * necessary. Thus update here.
1870 */
1871 operation->nblocks_done += 1;
1872 *nblocks_progress = 1;
1873
1874 pgaio_io_release(ioh);
1875 pgaio_wref_clear(&operation->io_wref);
1876 did_start_io = false;
1877
1878 /*
1879 * Report and track this as a 'hit' for this backend, even though it
1880 * must have started out as a miss in PinBufferForBlock(). The other
1881 * backend will track this as a 'read'.
1882 */
1883 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1884 operation->smgr->smgr_rlocator.locator.spcOid,
1885 operation->smgr->smgr_rlocator.locator.dbOid,
1886 operation->smgr->smgr_rlocator.locator.relNumber,
1887 operation->smgr->smgr_rlocator.backend,
1888 true);
1889
1890 if (persistence == RELPERSISTENCE_TEMP)
1892 else
1894
1895 if (operation->rel)
1896 pgstat_count_buffer_hit(operation->rel);
1897
1898 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1899
1900 if (VacuumCostActive)
1902 }
1903 else
1904 {
1905 instr_time io_start;
1906
1907 /* We found a buffer that we need to read in. */
1908 Assert(io_buffers[0] == buffers[nblocks_done]);
1909 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
1910 io_buffers_len = 1;
1911
1912 /*
1913 * How many neighboring-on-disk blocks can we scatter-read into other
1914 * buffers at the same time? In this case we don't wait if we see an
1915 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1916 * head block, so we should get on with that I/O as soon as possible.
1917 */
1918 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1919 {
1920 if (!ReadBuffersCanStartIO(buffers[i], true))
1921 break;
1922 /* Must be consecutive block numbers. */
1923 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1924 BufferGetBlockNumber(buffers[i]) - 1);
1925 Assert(io_buffers[io_buffers_len] == buffers[i]);
1926
1927 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1928 }
1929
1930 /* get a reference to wait for in WaitReadBuffers() */
1931 pgaio_io_get_wref(ioh, &operation->io_wref);
1932
1933 /* provide the list of buffers to the completion callbacks */
1934 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
1935
1937 persistence == RELPERSISTENCE_TEMP ?
1940 flags);
1941
1942 pgaio_io_set_flag(ioh, ioh_flags);
1943
1944 /* ---
1945 * Even though we're trying to issue IO asynchronously, track the time
1946 * in smgrstartreadv():
1947 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
1948 * immediately
1949 * - the io method might not support the IO (e.g. worker IO for a temp
1950 * table)
1951 * ---
1952 */
1954 smgrstartreadv(ioh, operation->smgr, forknum,
1955 blocknum + nblocks_done,
1956 io_pages, io_buffers_len);
1957 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1958 io_start, 1, io_buffers_len * BLCKSZ);
1959
1960 if (persistence == RELPERSISTENCE_TEMP)
1961 pgBufferUsage.local_blks_read += io_buffers_len;
1962 else
1963 pgBufferUsage.shared_blks_read += io_buffers_len;
1964
1965 /*
1966 * Track vacuum cost when issuing IO, not after waiting for it.
1967 * Otherwise we could end up issuing a lot of IO in a short timespan,
1968 * despite a low cost limit.
1969 */
1970 if (VacuumCostActive)
1971 VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1972
1973 *nblocks_progress = io_buffers_len;
1974 did_start_io = true;
1975 }
1976
1977 return did_start_io;
1978}
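
/*
 * Editor's note: an illustrative, self-contained sketch (not part of
 * bufmgr.c) of the coalescing idea used in the loop above: starting from the
 * head block, keep absorbing neighbors only while their block numbers are
 * consecutive.  The array and function names are hypothetical; nblocks is
 * assumed to be at least 1.
 */
#include <stddef.h>
#include <stdint.h>

static size_t
count_consecutive_run(const uint32_t *blknos, size_t nblocks)
{
	size_t		run = 1;

	/* extend the run while the next block is exactly one past the previous */
	while (run < nblocks && blknos[run] == blknos[run - 1] + 1)
		run++;

	return run;					/* length of the run starting at blknos[0] */
}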
1979
1980/*
1981 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1982 * buffer. If no buffer exists already, selects a replacement victim and
1983 * evicts the old page, but does NOT read in new page.
1984 *
1985 * "strategy" can be a buffer replacement strategy object, or NULL for
1986 * the default strategy. The selected buffer's usage_count is advanced when
1987 * using the default strategy, but otherwise possibly not (see PinBuffer).
1988 *
1989 * The returned buffer is pinned and is already marked as holding the
1990 * desired page. If it already did have the desired page, *foundPtr is
1991 * set true. Otherwise, *foundPtr is set false.
1992 *
1993 * io_context is passed as an output parameter to avoid calling
1994 * IOContextForStrategy() when there is a shared buffers hit and no IO
1995 * statistics need be captured.
1996 *
1997 * No locks are held either at entry or exit.
1998 */
2000BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
2001 BlockNumber blockNum,
2002 BufferAccessStrategy strategy,
2003 bool *foundPtr, IOContext io_context)
2004{
2005 BufferTag newTag; /* identity of requested block */
2006 uint32 newHash; /* hash value for newTag */
2007 LWLock *newPartitionLock; /* buffer partition lock for it */
2008 int existing_buf_id;
2009 Buffer victim_buffer;
2010 BufferDesc *victim_buf_hdr;
2011 uint32 victim_buf_state;
2012
2013 /* Make sure we will have room to remember the buffer pin */
2016
2017 /* create a tag so we can lookup the buffer */
2018 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2019
2020 /* determine its hash code and partition lock ID */
2021 newHash = BufTableHashCode(&newTag);
2022 newPartitionLock = BufMappingPartitionLock(newHash);
2023
2024 /* see if the block is in the buffer pool already */
2025 LWLockAcquire(newPartitionLock, LW_SHARED);
2026 existing_buf_id = BufTableLookup(&newTag, newHash);
2027 if (existing_buf_id >= 0)
2028 {
2029 BufferDesc *buf;
2030 bool valid;
2031
2032 /*
2033 * Found it. Now, pin the buffer so no one can steal it from the
2034 * buffer pool, and check to see if the correct data has been loaded
2035 * into the buffer.
2036 */
2037 buf = GetBufferDescriptor(existing_buf_id);
2038
2039 valid = PinBuffer(buf, strategy);
2040
2041 /* Can release the mapping lock as soon as we've pinned it */
2042 LWLockRelease(newPartitionLock);
2043
2044 *foundPtr = true;
2045
2046 if (!valid)
2047 {
2048 /*
2049 * We can only get here if (a) someone else is still reading in
2050 * the page, (b) a previous read attempt failed, or (c) someone
2051 * called StartReadBuffers() but not yet WaitReadBuffers().
2052 */
2053 *foundPtr = false;
2054 }
2055
2056 return buf;
2057 }
2058
2059 /*
2060 * Didn't find it in the buffer pool. We'll have to initialize a new
2061 * buffer. Remember to unlock the mapping lock while doing the work.
2062 */
2063 LWLockRelease(newPartitionLock);
2064
2065 /*
2066 * Acquire a victim buffer. Somebody else might try to do the same, we
2067 * don't hold any conflicting locks. If so we'll have to undo our work
2068 * later.
2069 */
2070 victim_buffer = GetVictimBuffer(strategy, io_context);
2071 victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2072
2073 /*
2074 * Try to make a hashtable entry for the buffer under its new tag. If
2075 * somebody else inserted another buffer for the tag, we'll release the
2076 * victim buffer we acquired and use the already inserted one.
2077 */
2078 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2079 existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2080 if (existing_buf_id >= 0)
2081 {
2082 BufferDesc *existing_buf_hdr;
2083 bool valid;
2084
2085 /*
2086 * Got a collision. Someone has already done what we were about to do.
2087 * We'll just handle this as if it were found in the buffer pool in
2088 * the first place. First, give up the buffer we were planning to
2089 * use.
2090 *
2091 * We could do this after releasing the partition lock, but then we'd
2092 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2093 * before acquiring the lock, for the rare case of such a collision.
2094 */
2095 UnpinBuffer(victim_buf_hdr);
2096
2097 /* remaining code should match code at top of routine */
2098
2099 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2100
2101 valid = PinBuffer(existing_buf_hdr, strategy);
2102
2103 /* Can release the mapping lock as soon as we've pinned it */
2104 LWLockRelease(newPartitionLock);
2105
2106 *foundPtr = true;
2107
2108 if (!valid)
2109 {
2110 /*
2111 * We can only get here if (a) someone else is still reading in
2112 * the page, (b) a previous read attempt failed, or (c) someone
2113 * called StartReadBuffers() but not yet WaitReadBuffers().
2114 */
2115 *foundPtr = false;
2116 }
2117
2118 return existing_buf_hdr;
2119 }
2120
2121 /*
2122 * Need to lock the buffer header too in order to change its tag.
2123 */
2124 victim_buf_state = LockBufHdr(victim_buf_hdr);
2125
2126 /* some sanity checks while we hold the buffer header lock */
2127 Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2128 Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2129
2130 victim_buf_hdr->tag = newTag;
2131
2132 /*
2133 * Make sure BM_PERMANENT is set for buffers that must be written at every
2134 * checkpoint. Unlogged buffers only need to be written at shutdown
2135 * checkpoints, except for their "init" forks, which need to be treated
2136 * just like permanent relations.
2137 */
2138 victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2139 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2140 victim_buf_state |= BM_PERMANENT;
2141
2142 UnlockBufHdr(victim_buf_hdr, victim_buf_state);
2143
2144 LWLockRelease(newPartitionLock);
2145
2146 /*
2147 * Buffer contents are currently invalid.
2148 */
2149 *foundPtr = false;
2150
2151 return victim_buf_hdr;
2152}
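
/*
 * Editor's note: a minimal sketch (not PostgreSQL code) of the "look up,
 * else insert; if someone beat us to the insert, discard our candidate and
 * adopt theirs" pattern that BufferAlloc() follows with the buffer mapping
 * table.  Everything here is a hypothetical simplification: one mutex
 * instead of partitioned LWLocks, and a tiny linear-probe table instead of
 * buf_table.c.
 */
#include <pthread.h>
#include <stdint.h>

#define MAP_SIZE 64

typedef struct MapEntry
{
	uint64_t	key;			/* 0 means empty */
	int			value;			/* e.g. a buffer id */
} MapEntry;

static MapEntry map[MAP_SIZE];
static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Insert (key, candidate_value) unless key is already present; return the
 * value that ends up associated with key.  *found is set when the key was
 * already there, mirroring *foundPtr above; the caller then releases its
 * candidate, as BufferAlloc() unpins its victim buffer.
 */
static int
map_find_or_insert(uint64_t key, int candidate_value, int *found)
{
	int			result = candidate_value;

	*found = 0;
	pthread_mutex_lock(&map_lock);
	for (int i = 0; i < MAP_SIZE; i++)
	{
		MapEntry   *e = &map[(key + i) % MAP_SIZE];

		if (e->key == key)
		{
			/* collision with an existing entry: adopt it, drop our candidate */
			result = e->value;
			*found = 1;
			break;
		}
		if (e->key == 0)
		{
			e->key = key;
			e->value = candidate_value;
			break;
		}
	}
	pthread_mutex_unlock(&map_lock);
	return result;
}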
2153
2154/*
2155 * InvalidateBuffer -- mark a shared buffer invalid.
2156 *
2157 * The buffer header spinlock must be held at entry. We drop it before
2158 * returning. (This is sane because the caller must have locked the
2159 * buffer in order to be sure it should be dropped.)
2160 *
2161 * This is used only in contexts such as dropping a relation. We assume
2162 * that no other backend could possibly be interested in using the page,
2163 * so the only reason the buffer might be pinned is if someone else is
2164 * trying to write it out. We have to let them finish before we can
2165 * reclaim the buffer.
2166 *
2167 * The buffer could get reclaimed by someone else while we are waiting
2168 * to acquire the necessary locks; if so, don't mess it up.
2169 */
2170static void
2172{
2173 BufferTag oldTag;
2174 uint32 oldHash; /* hash value for oldTag */
2175 LWLock *oldPartitionLock; /* buffer partition lock for it */
2176 uint32 oldFlags;
2177 uint32 buf_state;
2178
2179 /* Save the original buffer tag before dropping the spinlock */
2180 oldTag = buf->tag;
2181
2182 buf_state = pg_atomic_read_u32(&buf->state);
2183 Assert(buf_state & BM_LOCKED);
2184 UnlockBufHdr(buf, buf_state);
2185
2186 /*
2187 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2188 * worth storing the hashcode in BufferDesc so we need not recompute it
2189 * here? Probably not.
2190 */
2191 oldHash = BufTableHashCode(&oldTag);
2192 oldPartitionLock = BufMappingPartitionLock(oldHash);
2193
2194retry:
2195
2196 /*
2197 * Acquire exclusive mapping lock in preparation for changing the buffer's
2198 * association.
2199 */
2200 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2201
2202 /* Re-lock the buffer header */
2203 buf_state = LockBufHdr(buf);
2204
2205 /* If it's changed while we were waiting for lock, do nothing */
2206 if (!BufferTagsEqual(&buf->tag, &oldTag))
2207 {
2208 UnlockBufHdr(buf, buf_state);
2209 LWLockRelease(oldPartitionLock);
2210 return;
2211 }
2212
2213 /*
2214 * We assume the reason for it to be pinned is that either we were
2215 * asynchronously reading the page in before erroring out or someone else
2216 * is flushing the page out. Wait for the IO to finish. (This could be
2217 * an infinite loop if the refcount is messed up... it would be nice to
2218 * time out after a while, but there seems no way to be sure how many loops
2219 * may be needed. Note that if the other guy has pinned the buffer but
2220 * not yet done StartBufferIO, WaitIO will fall through and we'll
2221 * effectively be busy-looping here.)
2222 */
2223 if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2224 {
2225 UnlockBufHdr(buf, buf_state);
2226 LWLockRelease(oldPartitionLock);
2227 /* safety check: should definitely not be our *own* pin */
2229 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2230 WaitIO(buf);
2231 goto retry;
2232 }
2233
2234 /*
2235 * Clear out the buffer's tag and flags. We must do this to ensure that
2236 * linear scans of the buffer array don't think the buffer is valid.
2237 */
2238 oldFlags = buf_state & BUF_FLAG_MASK;
2239 ClearBufferTag(&buf->tag);
2240 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2241 UnlockBufHdr(buf, buf_state);
2242
2243 /*
2244 * Remove the buffer from the lookup hashtable, if it was in there.
2245 */
2246 if (oldFlags & BM_TAG_VALID)
2247 BufTableDelete(&oldTag, oldHash);
2248
2249 /*
2250 * Done with mapping lock.
2251 */
2252 LWLockRelease(oldPartitionLock);
2253}
2254
2255/*
2256 * Helper routine for GetVictimBuffer()
2257 *
2258 * Needs to be called on a buffer with a valid tag, pinned, but without the
2259 * buffer header spinlock held.
2260 *
2261 * Returns true if the buffer can be reused, in which case the buffer is only
2262 * pinned by this backend and marked as invalid, false otherwise.
2263 */
2264static bool
2266{
2267 uint32 buf_state;
2268 uint32 hash;
2269 LWLock *partition_lock;
2270 BufferTag tag;
2271
2273
2274 /* have buffer pinned, so it's safe to read tag without lock */
2275 tag = buf_hdr->tag;
2276
2277 hash = BufTableHashCode(&tag);
2278 partition_lock = BufMappingPartitionLock(hash);
2279
2280 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2281
2282 /* lock the buffer header */
2283 buf_state = LockBufHdr(buf_hdr);
2284
2285 /*
2286 * We have the buffer pinned, so nobody else should have been able to
2287 * unset this concurrently.
2288 */
2289 Assert(buf_state & BM_TAG_VALID);
2290 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2291 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2292
2293 /*
2294 * If somebody else pinned the buffer since, or even worse, dirtied it,
2295 * give up on this buffer: It's clearly in use.
2296 */
2297 if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2298 {
2299 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2300
2301 UnlockBufHdr(buf_hdr, buf_state);
2302 LWLockRelease(partition_lock);
2303
2304 return false;
2305 }
2306
2307 /*
2308 * Clear out the buffer's tag and flags and usagecount. This is not
2309 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2310 * doing anything with the buffer. But currently it's beneficial, as the
2311 * cheaper pre-check used by several linear scans of shared buffers relies
2312 * on the tag (see e.g. FlushDatabaseBuffers()).
2313 */
2314 ClearBufferTag(&buf_hdr->tag);
2315 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2316 UnlockBufHdr(buf_hdr, buf_state);
2317
2318 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2319
2320 /* finally delete buffer from the buffer mapping table */
2321 BufTableDelete(&tag, hash);
2322
2323 LWLockRelease(partition_lock);
2324
2325 Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2326 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2328
2329 return true;
2330}
2331
2332static Buffer
2334{
2335 BufferDesc *buf_hdr;
2336 Buffer buf;
2337 uint32 buf_state;
2338 bool from_ring;
2339
2340 /*
2341 * Ensure, while the spinlock's not yet held, that there's a free refcount
2342 * entry, and a resource owner slot for the pin.
2343 */
2346
2347 /* we return here if a prospective victim buffer gets used concurrently */
2348again:
2349
2350 /*
2351 * Select a victim buffer. The buffer is returned with its header
2352 * spinlock still held!
2353 */
2354 buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2355 buf = BufferDescriptorGetBuffer(buf_hdr);
2356
2357 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
2358
2359 /* Pin the buffer and then release the buffer spinlock */
2360 PinBuffer_Locked(buf_hdr);
2361
2362 /*
2363 * We shouldn't have any other pins for this buffer.
2364 */
2366
2367 /*
2368 * If the buffer was dirty, try to write it out. There is a race
2369 * condition here, in that someone might dirty it after we released the
2370 * buffer header lock above, or even while we are writing it out (since
2371 * our share-lock won't prevent hint-bit updates). We will recheck the
2372 * dirty bit after re-locking the buffer header.
2373 */
2374 if (buf_state & BM_DIRTY)
2375 {
2376 LWLock *content_lock;
2377
2378 Assert(buf_state & BM_TAG_VALID);
2379 Assert(buf_state & BM_VALID);
2380
2381 /*
2382 * We need a share-lock on the buffer contents to write it out (else
2383 * we might write invalid data, eg because someone else is compacting
2384 * the page contents while we write). We must use a conditional lock
2385 * acquisition here to avoid deadlock. Even though the buffer was not
2386 * pinned (and therefore surely not locked) when StrategyGetBuffer
2387 * returned it, someone else could have pinned and exclusive-locked it
2388 * by the time we get here. If we try to get the lock unconditionally,
2389 * we'd block waiting for them; if they later block waiting for us,
2390 * deadlock ensues. (This has been observed to happen when two
2391 * backends are both trying to split btree index pages, and the second
2392 * one just happens to be trying to split the page the first one got
2393 * from StrategyGetBuffer.)
2394 */
2395 content_lock = BufferDescriptorGetContentLock(buf_hdr);
2396 if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2397 {
2398 /*
2399 * Someone else has locked the buffer, so give it up and loop back
2400 * to get another one.
2401 */
2402 UnpinBuffer(buf_hdr);
2403 goto again;
2404 }
2405
2406 /*
2407 * If using a nondefault strategy, and writing the buffer would
2408 * require a WAL flush, let the strategy decide whether to go ahead
2409 * and write/reuse the buffer or to choose another victim. We need a
2410 * lock to inspect the page LSN, so this can't be done inside
2411 * StrategyGetBuffer.
2412 */
2413 if (strategy != NULL)
2414 {
2415 XLogRecPtr lsn;
2416
2417 /* Read the LSN while holding buffer header lock */
2418 buf_state = LockBufHdr(buf_hdr);
2419 lsn = BufferGetLSN(buf_hdr);
2420 UnlockBufHdr(buf_hdr, buf_state);
2421
2422 if (XLogNeedsFlush(lsn)
2423 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2424 {
2425 LWLockRelease(content_lock);
2426 UnpinBuffer(buf_hdr);
2427 goto again;
2428 }
2429 }
2430
2431 /* OK, do the I/O */
2432 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2433 LWLockRelease(content_lock);
2434
2436 &buf_hdr->tag);
2437 }
2438
2439
2440 if (buf_state & BM_VALID)
2441 {
2442 /*
2443 * When a BufferAccessStrategy is in use, blocks evicted from shared
2444 * buffers are counted as IOOP_EVICT in the corresponding context
2445 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2446 * strategy in two cases: 1) while initially claiming buffers for the
2447 * strategy ring 2) to replace an existing strategy ring buffer
2448 * because it is pinned or in use and cannot be reused.
2449 *
2450 * Blocks evicted from buffers already in the strategy ring are
2451 * counted as IOOP_REUSE in the corresponding strategy context.
2452 *
2453 * At this point, we can accurately count evictions and reuses,
2454 * because we have successfully claimed the valid buffer. Previously,
2455 * we may have been forced to release the buffer due to concurrent
2456 * pinners or erroring out.
2457 */
2459 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2460 }
2461
2462 /*
2463 * If the buffer has an entry in the buffer mapping table, delete it. This
2464 * can fail because another backend could have pinned or dirtied the
2465 * buffer.
2466 */
2467 if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2468 {
2469 UnpinBuffer(buf_hdr);
2470 goto again;
2471 }
2472
2473 /* a final set of sanity checks */
2474#ifdef USE_ASSERT_CHECKING
2475 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2476
2477 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2478 Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2479
2481#endif
2482
2483 return buf;
2484}
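
/*
 * Editor's note: a simplified, self-contained sketch (not PostgreSQL code)
 * of the clock-sweep idea behind StrategyGetBuffer(), on which
 * GetVictimBuffer() relies: advance a hand around the pool, decrement
 * nonzero usage counts, and take the first unpinned buffer whose count has
 * reached zero.  Pins, header spinlocks and the strategy ring are omitted,
 * and the loop assumes at least one buffer is eventually reclaimable.
 */
#include <stddef.h>

typedef struct ToyBuffer
{
	int			usage_count;
	int			pinned;
} ToyBuffer;

static size_t
clock_sweep_pick(ToyBuffer *pool, size_t npool, size_t *hand)
{
	for (;;)
	{
		ToyBuffer  *b = &pool[*hand];
		size_t		victim = *hand;

		*hand = (*hand + 1) % npool;

		if (b->pinned)
			continue;			/* someone is using it, skip */
		if (b->usage_count > 0)
		{
			b->usage_count--;	/* give it another chance this lap */
			continue;
		}
		return victim;			/* unpinned and usage count exhausted */
	}
}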
2485
2486/*
2487 * Return the maximum number of buffers that a backend should try to pin at once,
2488 * to avoid exceeding its fair share. This is the highest value that
2489 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2490 * system with a very small buffer pool relative to max_connections.
2491 */
2492uint32
2494{
2495 return MaxProportionalPins;
2496}
2497
2498/*
2499 * Return the maximum number of additional buffers that this backend should
2500 * pin if it wants to stay under the per-backend limit, considering the number
2501 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2502 * returned by this function can be zero.
2503 */
2504uint32
2506{
2507 uint32 estimated_pins_held;
2508
2509 /*
2510 * We get the number of "overflowed" pins for free, but don't know the
2511 * number of pins in PrivateRefCountArray. The cost of calculating that
2512 * exactly doesn't seem worth it, so just assume the max.
2513 */
2514 estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2515
2516 /* Is this backend already holding more than its fair share? */
2517 if (estimated_pins_held > MaxProportionalPins)
2518 return 0;
2519
2520 return MaxProportionalPins - estimated_pins_held;
2521}
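
/*
 * Editor's note: the arithmetic above, restated as a stand-alone helper
 * (not PostgreSQL code).  max_proportional, overflowed and array_entries
 * are hypothetical stand-ins for MaxProportionalPins,
 * PrivateRefCountOverflowed and REFCOUNT_ARRAY_ENTRIES.
 */
#include <stdint.h>

static uint32_t
additional_pin_limit(uint32_t max_proportional,
					 uint32_t overflowed,
					 uint32_t array_entries)
{
	/* assume the worst case for pins tracked in the fixed-size array */
	uint32_t	estimated_pins_held = overflowed + array_entries;

	if (estimated_pins_held > max_proportional)
		return 0;				/* already over the fair share */

	return max_proportional - estimated_pins_held;
}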
2522
2523/*
2524 * Limit the number of pins a batch operation may additionally acquire, to
2525 * avoid running out of pinnable buffers.
2526 *
2527 * One additional pin is always allowed, on the assumption that the operation
2528 * requires at least one to make progress.
2529 */
2530void
2532{
2533 uint32 limit;
2534
2535 if (*additional_pins <= 1)
2536 return;
2537
2538 limit = GetAdditionalPinLimit();
2539 limit = Max(limit, 1);
2540 if (limit < *additional_pins)
2541 *additional_pins = limit;
2542}
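
/*
 * Editor's note: a hypothetical usage sketch (not PostgreSQL code) of the
 * clamping rule above: a batch wanting N additional pins is cut down to the
 * current limit, but never below one, so the operation can always make
 * progress.
 */
static unsigned
clamp_batch_pins(unsigned wanted, unsigned current_limit)
{
	unsigned	limit = current_limit > 0 ? current_limit : 1;

	return wanted < limit ? wanted : limit;
}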
2543
2544/*
2545 * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2546 * avoid duplicating the tracing and relpersistence related logic.
2547 */
2548static BlockNumber
2550 ForkNumber fork,
2551 BufferAccessStrategy strategy,
2552 uint32 flags,
2553 uint32 extend_by,
2554 BlockNumber extend_upto,
2555 Buffer *buffers,
2556 uint32 *extended_by)
2557{
2558 BlockNumber first_block;
2559
2560 TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2565 extend_by);
2566
2567 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2568 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2569 extend_by, extend_upto,
2570 buffers, &extend_by);
2571 else
2572 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2573 extend_by, extend_upto,
2574 buffers, &extend_by);
2575 *extended_by = extend_by;
2576
2577 TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2582 *extended_by,
2583 first_block);
2584
2585 return first_block;
2586}
2587
2588/*
2589 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2590 * shared buffers.
2591 */
2592static BlockNumber
2594 ForkNumber fork,
2595 BufferAccessStrategy strategy,
2596 uint32 flags,
2597 uint32 extend_by,
2598 BlockNumber extend_upto,
2599 Buffer *buffers,
2600 uint32 *extended_by)
2601{
2602 BlockNumber first_block;
2603 IOContext io_context = IOContextForStrategy(strategy);
2604 instr_time io_start;
2605
2606 LimitAdditionalPins(&extend_by);
2607
2608 /*
2609 * Acquire victim buffers for extension without holding extension lock.
2610 * Writing out victim buffers is the most expensive part of extending the
2611 * relation, particularly when doing so requires WAL flushes. Zeroing out
2612 * the buffers is also quite expensive, so do that before holding the
2613 * extension lock as well.
2614 *
2615 * These pages are pinned by us and not valid. While we hold the pin they
2616 * can't be acquired as victim buffers by another backend.
2617 */
2618 for (uint32 i = 0; i < extend_by; i++)
2619 {
2620 Block buf_block;
2621
2622 buffers[i] = GetVictimBuffer(strategy, io_context);
2623 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2624
2625 /* new buffers are zero-filled */
2626 MemSet(buf_block, 0, BLCKSZ);
2627 }
2628
2629 /*
2630 * Lock relation against concurrent extensions, unless requested not to.
2631 *
2632 * We use the same extension lock for all forks. That's unnecessarily
2633 * restrictive, but currently extensions for forks don't happen often
2634 * enough to make it worth locking more granularly.
2635 *
2636 * Note that another backend might have extended the relation by the time
2637 * we get the lock.
2638 */
2639 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2641
2642 /*
2643 * If requested, invalidate size cache, so that smgrnblocks asks the
2644 * kernel.
2645 */
2646 if (flags & EB_CLEAR_SIZE_CACHE)
2648
2649 first_block = smgrnblocks(bmr.smgr, fork);
2650
2651 /*
2652 * Now that we have the accurate relation size, check if the caller wants
2653 * us to extend to only up to a specific size. If there were concurrent
2654 * extensions, we might have acquired too many buffers and need to release
2655 * them.
2656 */
2657 if (extend_upto != InvalidBlockNumber)
2658 {
2659 uint32 orig_extend_by = extend_by;
2660
2661 if (first_block > extend_upto)
2662 extend_by = 0;
2663 else if ((uint64) first_block + extend_by > extend_upto)
2664 extend_by = extend_upto - first_block;
2665
2666 for (uint32 i = extend_by; i < orig_extend_by; i++)
2667 {
2668 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2669
2670 UnpinBuffer(buf_hdr);
2671 }
2672
2673 if (extend_by == 0)
2674 {
2675 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2677 *extended_by = extend_by;
2678 return first_block;
2679 }
2680 }
2681
2682 /* Fail if relation is already at maximum possible length */
2683 if ((uint64) first_block + extend_by >= MaxBlockNumber)
2684 ereport(ERROR,
2685 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2686 errmsg("cannot extend relation %s beyond %u blocks",
2687 relpath(bmr.smgr->smgr_rlocator, fork).str,
2688 MaxBlockNumber)));
2689
2690 /*
2691 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2692 *
2693 * This needs to happen before we extend the relation, because as soon as
2694 * we do, other backends can start to read in those pages.
2695 */
2696 for (uint32 i = 0; i < extend_by; i++)
2697 {
2698 Buffer victim_buf = buffers[i];
2699 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2700 BufferTag tag;
2701 uint32 hash;
2702 LWLock *partition_lock;
2703 int existing_id;
2704
2705 /* in case we need to pin an existing buffer below */
2708
2709 InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2710 hash = BufTableHashCode(&tag);
2711 partition_lock = BufMappingPartitionLock(hash);
2712
2713 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2714
2715 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2716
2717 /*
2718 * We get here only in the corner case where we are trying to extend
2719 * the relation but we found a pre-existing buffer. This can happen
2720 * because a prior attempt at extending the relation failed, and
2721 * because mdread doesn't complain about reads beyond EOF (when
2722 * zero_damaged_pages is ON) and so a previous attempt to read a block
2723 * beyond EOF could have left a "valid" zero-filled buffer.
2724 *
2725 * This has also been observed when the relation was overwritten by an
2726 * external process. Since the legitimate cases should always have
2727 * left a zero-filled buffer, complain if not PageIsNew.
2728 */
2729 if (existing_id >= 0)
2730 {
2731 BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2732 Block buf_block;
2733 bool valid;
2734
2735 /*
2736 * Pin the existing buffer before releasing the partition lock,
2737 * preventing it from being evicted.
2738 */
2739 valid = PinBuffer(existing_hdr, strategy);
2740
2741 LWLockRelease(partition_lock);
2742 UnpinBuffer(victim_buf_hdr);
2743
2744 buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2745 buf_block = BufHdrGetBlock(existing_hdr);
2746
2747 if (valid && !PageIsNew((Page) buf_block))
2748 ereport(ERROR,
2749 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2750 existing_hdr->tag.blockNum,
2751 relpath(bmr.smgr->smgr_rlocator, fork).str)));
2752
2753 /*
2754 * We *must* do smgr[zero]extend before succeeding, else the page
2755 * will not be reserved by the kernel, and the next P_NEW call
2756 * will decide to return the same page. Clear the BM_VALID bit,
2757 * do StartBufferIO() and proceed.
2758 *
2759 * Loop to handle the very small possibility that someone re-sets
2760 * BM_VALID between our clearing it and StartBufferIO inspecting
2761 * it.
2762 */
2763 do
2764 {
2765 uint32 buf_state = LockBufHdr(existing_hdr);
2766
2767 buf_state &= ~BM_VALID;
2768 UnlockBufHdr(existing_hdr, buf_state);
2769 } while (!StartBufferIO(existing_hdr, true, false));
2770 }
2771 else
2772 {
2773 uint32 buf_state;
2774
2775 buf_state = LockBufHdr(victim_buf_hdr);
2776
2777 /* some sanity checks while we hold the buffer header lock */
2778 Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2779 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2780
2781 victim_buf_hdr->tag = tag;
2782
2783 buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2784 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2785 buf_state |= BM_PERMANENT;
2786
2787 UnlockBufHdr(victim_buf_hdr, buf_state);
2788
2789 LWLockRelease(partition_lock);
2790
2791 /* XXX: could combine the locked operations in it with the above */
2792 StartBufferIO(victim_buf_hdr, true, false);
2793 }
2794 }
2795
2797
2798 /*
2799 * Note: if smgrzeroextend fails, we will end up with buffers that are
2800 * allocated but not marked BM_VALID. The next relation extension will
2801 * still select the same block number (because the relation didn't get any
2802 * longer on disk) and so future attempts to extend the relation will find
2803 * the same buffers (if they have not been recycled) but come right back
2804 * here to try smgrzeroextend again.
2805 *
2806 * We don't need to set checksum for all-zero pages.
2807 */
2808 smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2809
2810 /*
2811 * Release the file-extension lock; it's now OK for someone else to extend
2812 * the relation some more.
2813 *
2814 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2815 * take noticeable time.
2816 */
2817 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2819
2821 io_start, 1, extend_by * BLCKSZ);
2822
2823 /* Set BM_VALID, terminate IO, and wake up any waiters */
2824 for (uint32 i = 0; i < extend_by; i++)
2825 {
2826 Buffer buf = buffers[i];
2827 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2828 bool lock = false;
2829
2830 if (flags & EB_LOCK_FIRST && i == 0)
2831 lock = true;
2832 else if (flags & EB_LOCK_TARGET)
2833 {
2834 Assert(extend_upto != InvalidBlockNumber);
2835 if (first_block + i + 1 == extend_upto)
2836 lock = true;
2837 }
2838
2839 if (lock)
2841
2842 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2843 }
2844
2846
2847 *extended_by = extend_by;
2848
2849 return first_block;
2850}
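
/*
 * Editor's note: a small stand-alone restatement (not PostgreSQL code) of
 * the extend_upto clamping performed above once the accurate relation size
 * is known: if the relation already reached the target, extend by nothing;
 * otherwise never extend past the target block number.
 */
#include <stdint.h>

static uint32_t
clamp_extend_by(uint32_t first_block, uint32_t extend_by, uint32_t extend_upto)
{
	if (first_block > extend_upto)
		return 0;				/* concurrent extensions already got there */
	if ((uint64_t) first_block + extend_by > extend_upto)
		return extend_upto - first_block;
	return extend_by;
}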
2851
2852/*
2853 * BufferIsExclusiveLocked
2854 *
2855 * Checks if buffer is exclusive-locked.
2856 *
2857 * Buffer must be pinned.
2858 */
2859bool
2861{
2862 BufferDesc *bufHdr;
2863
2865
2866 if (BufferIsLocal(buffer))
2867 {
2868 /* Content locks are not maintained for local buffers. */
2869 return true;
2870 }
2871 else
2872 {
2873 bufHdr = GetBufferDescriptor(buffer - 1);
2875 LW_EXCLUSIVE);
2876 }
2877}
2878
2879/*
2880 * BufferIsDirty
2881 *
2882 * Checks if buffer is already dirty.
2883 *
2884 * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2885 * the result may be stale before it's returned.)
2886 */
2887bool
2889{
2890 BufferDesc *bufHdr;
2891
2893
2894 if (BufferIsLocal(buffer))
2895 {
2896 int bufid = -buffer - 1;
2897
2898 bufHdr = GetLocalBufferDescriptor(bufid);
2899 /* Content locks are not maintained for local buffers. */
2900 }
2901 else
2902 {
2903 bufHdr = GetBufferDescriptor(buffer - 1);
2905 LW_EXCLUSIVE));
2906 }
2907
2908 return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2909}
2910
2911/*
2912 * MarkBufferDirty
2913 *
2914 * Marks buffer contents as dirty (actual write happens later).
2915 *
2916 * Buffer must be pinned and exclusive-locked. (If caller does not hold
2917 * exclusive lock, then somebody could be in the process of writing the
2918 * buffer, leading to a risk of bad data being written to disk.)
2919 */
2920void
2922{
2923 BufferDesc *bufHdr;
2924 uint32 buf_state;
2925 uint32 old_buf_state;
2926
2927 if (!BufferIsValid(buffer))
2928 elog(ERROR, "bad buffer ID: %d", buffer);
2929
2930 if (BufferIsLocal(buffer))
2931 {
2933 return;
2934 }
2935
2936 bufHdr = GetBufferDescriptor(buffer - 1);
2937
2940 LW_EXCLUSIVE));
2941
2942 old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2943 for (;;)
2944 {
2945 if (old_buf_state & BM_LOCKED)
2946 old_buf_state = WaitBufHdrUnlocked(bufHdr);
2947
2948 buf_state = old_buf_state;
2949
2950 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2951 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2952
2953 if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2954 buf_state))
2955 break;
2956 }
2957
2958 /*
2959 * If the buffer was not dirty already, do vacuum accounting.
2960 */
2961 if (!(old_buf_state & BM_DIRTY))
2962 {
2964 if (VacuumCostActive)
2966 }
2967}
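
/*
 * Editor's note: a generic, self-contained sketch (not PostgreSQL code) of
 * the compare-and-swap loop MarkBufferDirty() uses to set flag bits in the
 * packed state word without taking the header spinlock.  C11 atomics stand
 * in for pg_atomic_*; the spinlock-held (BM_LOCKED) wait is omitted, and the
 * flag values are hypothetical.
 */
#include <stdatomic.h>
#include <stdint.h>

#define TOY_DIRTY			(1u << 0)
#define TOY_JUST_DIRTIED	(1u << 1)

/* Returns the state word as it was before our update. */
static uint32_t
set_dirty_flags(_Atomic uint32_t *state)
{
	uint32_t	old = atomic_load(state);

	while (!atomic_compare_exchange_weak(state, &old,
										 old | TOY_DIRTY | TOY_JUST_DIRTIED))
	{
		/* 'old' was refreshed by the failed CAS; just retry */
	}
	return old;
}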
2968
2969/*
2970 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2971 *
2972 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2973 * compared to calling the two routines separately. Now it's mainly just
2974 * a convenience function. However, if the passed buffer is valid and
2975 * already contains the desired block, we just return it as-is; and that
2976 * does save considerable work compared to a full release and reacquire.
2977 *
2978 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2979 * buffer actually needs to be released. This case is the same as ReadBuffer,
2980 * but can save some tests in the caller.
2981 */
2982Buffer
2984 Relation relation,
2985 BlockNumber blockNum)
2986{
2987 ForkNumber forkNum = MAIN_FORKNUM;
2988 BufferDesc *bufHdr;
2989
2990 if (BufferIsValid(buffer))
2991 {
2993 if (BufferIsLocal(buffer))
2994 {
2995 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2996 if (bufHdr->tag.blockNum == blockNum &&
2997 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2998 BufTagGetForkNum(&bufHdr->tag) == forkNum)
2999 return buffer;
3001 }
3002 else
3003 {
3004 bufHdr = GetBufferDescriptor(buffer - 1);
3005 /* we have pin, so it's ok to examine tag without spinlock */
3006 if (bufHdr->tag.blockNum == blockNum &&
3007 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3008 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3009 return buffer;
3010 UnpinBuffer(bufHdr);
3011 }
3012 }
3013
3014 return ReadBuffer(relation, blockNum);
3015}
3016
3017/*
3018 * PinBuffer -- make buffer unavailable for replacement.
3019 *
3020 * For the default access strategy, the buffer's usage_count is incremented
3021 * when we first pin it; for other strategies we just make sure the usage_count
3022 * isn't zero. (The idea of the latter is that we don't want synchronized
3023 * heap scans to inflate the count, but we need it to not be zero to discourage
3024 * other backends from stealing buffers from our ring. As long as we cycle
3025 * through the ring faster than the global clock-sweep cycles, buffers in
3026 * our ring won't be chosen as victims for replacement by other backends.)
3027 *
3028 * This should be applied only to shared buffers, never local ones.
3029 *
3030 * Since buffers are pinned/unpinned very frequently, pin buffers without
3031 * taking the buffer header lock; instead update the state variable in a loop of
3032 * CAS operations. Hopefully it's just a single CAS.
3033 *
3034 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3035 * must have been done already.
3036 *
3037 * Returns true if buffer is BM_VALID, else false. This provision allows
3038 * some callers to avoid an extra spinlock cycle.
3039 */
3040static bool
3042{
3044 bool result;
3046
3049
3050 ref = GetPrivateRefCountEntry(b, true);
3051
3052 if (ref == NULL)
3053 {
3054 uint32 buf_state;
3055 uint32 old_buf_state;
3056
3058
3059 old_buf_state = pg_atomic_read_u32(&buf->state);
3060 for (;;)
3061 {
3062 if (old_buf_state & BM_LOCKED)
3063 old_buf_state = WaitBufHdrUnlocked(buf);
3064
3065 buf_state = old_buf_state;
3066
3067 /* increase refcount */
3068 buf_state += BUF_REFCOUNT_ONE;
3069
3070 if (strategy == NULL)
3071 {
3072 /* Default case: increase usagecount unless already max. */
3074 buf_state += BUF_USAGECOUNT_ONE;
3075 }
3076 else
3077 {
3078 /*
3079 * Ring buffers shouldn't evict others from pool. Thus we
3080 * don't make usagecount more than 1.
3081 */
3082 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3083 buf_state += BUF_USAGECOUNT_ONE;
3084 }
3085
3086 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3087 buf_state))
3088 {
3089 result = (buf_state & BM_VALID) != 0;
3090
3091 /*
3092 * Assume that we acquired a buffer pin for the purposes of
3093 * Valgrind buffer client checks (even in !result case) to
3094 * keep things simple. Buffers that are unsafe to access are
3095 * not generally guaranteed to be marked undefined or
3096 * non-accessible in any case.
3097 */
3099 break;
3100 }
3101 }
3102 }
3103 else
3104 {
3105 /*
3106 * If we previously pinned the buffer, it is likely to be valid, but
3107 * it may not be if StartReadBuffers() was called and
3108 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3109 * the flags without locking. This is racy, but it's OK to return
3110 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3111 * it'll see that it's now valid.
3112 *
3113 * Note: We deliberately avoid a Valgrind client request here.
3114 * Individual access methods can optionally superimpose buffer page
3115 * client requests on top of our client requests to enforce that
3116 * buffers are only accessed while locked (and pinned). It's possible
3117 * that the buffer page is legitimately non-accessible here. We
3118 * cannot meddle with that.
3119 */
3120 result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3121 }
3122
3123 ref->refcount++;
3124 Assert(ref->refcount > 0);
3126 return result;
3127}
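
/*
 * Editor's note: a tiny stand-alone restatement (not PostgreSQL code) of the
 * usage-count policy described above: the default strategy increments the
 * count up to a cap, while ring strategies only ensure it is nonzero so
 * other backends are discouraged from stealing ring buffers.
 */
static unsigned
next_usage_count(unsigned current, int using_ring_strategy, unsigned max_count)
{
	if (!using_ring_strategy)
		return current < max_count ? current + 1 : current;

	return current == 0 ? 1 : current;
}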
3128
3129/*
3130 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3131 * The spinlock is released before return.
3132 *
3133 * As this function is called with the spinlock held, the caller has to
3134 * previously call ReservePrivateRefCountEntry() and
3135 * ResourceOwnerEnlarge(CurrentResourceOwner);
3136 *
3137 * Currently, no callers of this function want to modify the buffer's
3138 * usage_count at all, so there's no need for a strategy parameter.
3139 * Also we don't bother with a BM_VALID test (the caller could check that for
3140 * itself).
3141 *
3142 * Also all callers only ever use this function when it's known that the
3143 * buffer can't have a preexisting pin by this backend. That allows us to skip
3144 * searching the private refcount array & hash, which is a boon, because the
3145 * spinlock is still held.
3146 *
3147 * Note: use of this routine is frequently mandatory, not just an optimization
3148 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3149 * its state can change under us.
3150 */
3151static void
3153{
3154 Buffer b;
3156 uint32 buf_state;
3157
3158 /*
3159 * As explained, we don't expect any preexisting pins. That allows us to
3160 * manipulate the PrivateRefCount after releasing the spinlock.
3161 */
3163
3164 /*
3165 * Buffer can't have a preexisting pin, so mark its page as defined to
3166 * Valgrind (this is similar to the PinBuffer() case where the backend
3167 * doesn't already have a buffer pin)
3168 */
3170
3171 /*
3172 * Since we hold the buffer spinlock, we can update the buffer state and
3173 * release the lock in one operation.
3174 */
3175 buf_state = pg_atomic_read_u32(&buf->state);
3176 Assert(buf_state & BM_LOCKED);
3177 buf_state += BUF_REFCOUNT_ONE;
3178 UnlockBufHdr(buf, buf_state);
3179
3181
3183 ref->refcount++;
3184
3186}
3187
3188/*
3189 * Support for waking up another backend that is waiting for the cleanup lock
3190 * to be released using BM_PIN_COUNT_WAITER.
3191 *
3192 * See LockBufferForCleanup().
3193 *
3194 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3195 * not just reducing the backend-local pincount for the buffer).
3196 */
3197static void
3199{
3200 /*
3201 * Acquire the buffer header lock, re-check that there's a waiter. Another
3202 * backend could have unpinned this buffer, and already woken up the
3203 * waiter.
3204 *
3205 * There's no danger of the buffer being replaced after we unpinned it
3206 * above, as it's pinned by the waiter. The waiter removes
3207 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3208 * backend waking it up.
3209 */
3210 uint32 buf_state = LockBufHdr(buf);
3211
3212 if ((buf_state & BM_PIN_COUNT_WAITER) &&
3213 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3214 {
3215 /* we just released the last pin other than the waiter's */
3216 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3217
3218 buf_state &= ~BM_PIN_COUNT_WAITER;
3219 UnlockBufHdr(buf, buf_state);
3220 ProcSendSignal(wait_backend_pgprocno);
3221 }
3222 else
3223 UnlockBufHdr(buf, buf_state);
3224}
3225
3226/*
3227 * UnpinBuffer -- make buffer available for replacement.
3228 *
3229 * This should be applied only to shared buffers, never local ones. This
3230 * always adjusts CurrentResourceOwner.
3231 */
3232static void
3234{
3236
3239}
3240
3241static void
3243{
3246
3248
3249 /* not moving as we're likely deleting it soon anyway */
3250 ref = GetPrivateRefCountEntry(b, false);
3251 Assert(ref != NULL);
3252 Assert(ref->refcount > 0);
3253 ref->refcount--;
3254 if (ref->refcount == 0)
3255 {
3256 uint32 buf_state;
3257 uint32 old_buf_state;
3258
3259 /*
3260 * Mark buffer non-accessible to Valgrind.
3261 *
3262 * Note that the buffer may have already been marked non-accessible
3263 * within access method code that enforces that buffers are only
3264 * accessed while a buffer lock is held.
3265 */
3267
3268 /* I'd better not still hold the buffer content lock */
3270
3271 /*
3272 * Decrement the shared reference count.
3273 *
3274 * Since buffer spinlock holder can update status using just write,
3275 * it's not safe to use atomic decrement here; thus use a CAS loop.
3276 */
3277 old_buf_state = pg_atomic_read_u32(&buf->state);
3278 for (;;)
3279 {
3280 if (old_buf_state & BM_LOCKED)
3281 old_buf_state = WaitBufHdrUnlocked(buf);
3282
3283 buf_state = old_buf_state;
3284
3285 buf_state -= BUF_REFCOUNT_ONE;
3286
3287 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3288 buf_state))
3289 break;
3290 }
3291
3292 /* Support LockBufferForCleanup() */
3293 if (buf_state & BM_PIN_COUNT_WAITER)
3295
3297 }
3298}
3299
3300#define ST_SORT sort_checkpoint_bufferids
3301#define ST_ELEMENT_TYPE CkptSortItem
3302#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3303#define ST_SCOPE static
3304#define ST_DEFINE
3305#include "lib/sort_template.h"
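
/*
 * Editor's note: the sort_template.h include above generates a specialized
 * in-place sort over CkptSortItem.  As a hedged, generic illustration (not
 * PostgreSQL code), the same ordering idea can be expressed with qsort()
 * over a hypothetical checkpoint item: group by tablespace first, so writes
 * can later be balanced per tablespace, then order by on-disk position.
 */
#include <stdlib.h>
#include <stdint.h>

typedef struct ToyCkptItem
{
	uint32_t	tablespace;
	uint32_t	relation;
	uint32_t	fork;
	uint32_t	block;
} ToyCkptItem;

static int
toy_ckpt_cmp(const void *pa, const void *pb)
{
	const ToyCkptItem *a = pa;
	const ToyCkptItem *b = pb;

	if (a->tablespace != b->tablespace)
		return a->tablespace < b->tablespace ? -1 : 1;
	if (a->relation != b->relation)
		return a->relation < b->relation ? -1 : 1;
	if (a->fork != b->fork)
		return a->fork < b->fork ? -1 : 1;
	if (a->block != b->block)
		return a->block < b->block ? -1 : 1;
	return 0;
}

/* usage: qsort(items, nitems, sizeof(ToyCkptItem), toy_ckpt_cmp); */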
3306
3307/*
3308 * BufferSync -- Write out all dirty buffers in the pool.
3309 *
3310 * This is called at checkpoint time to write out all dirty shared buffers.
3311 * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3312 * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3313 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3314 * even unlogged buffers, which are otherwise skipped. The remaining flags
3315 * currently have no effect here.
3316 */
3317static void
3318BufferSync(int flags)
3319{
3320 uint32 buf_state;
3321 int buf_id;
3322 int num_to_scan;
3323 int num_spaces;
3324 int num_processed;
3325 int num_written;
3326 CkptTsStatus *per_ts_stat = NULL;
3327 Oid last_tsid;
3328 binaryheap *ts_heap;
3329 int i;
3330 int mask = BM_DIRTY;
3331 WritebackContext wb_context;
3332
3333 /*
3334 * Unless this is a shutdown checkpoint or we have been explicitly told,
3335 * we write only permanent, dirty buffers. But at shutdown or end of
3336 * recovery, we write all dirty buffers.
3337 */
3340 mask |= BM_PERMANENT;
3341
3342 /*
3343 * Loop over all buffers, and mark the ones that need to be written with
3344 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3345 * can estimate how much work needs to be done.
3346 *
3347 * This allows us to write only those pages that were dirty when the
3348 * checkpoint began, and not those that get dirtied while it proceeds.
3349 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3350 * later in this function, or by normal backends or the bgwriter cleaning
3351 * scan, the flag is cleared. Any buffer dirtied after this point won't
3352 * have the flag set.
3353 *
3354 * Note that if we fail to write some buffer, we may leave buffers with
3355 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3356 * certainly need to be written for the next checkpoint attempt, too.
3357 */
3358 num_to_scan = 0;
3359 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3360 {
3361 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3362
3363 /*
3364 * Header spinlock is enough to examine BM_DIRTY, see comment in
3365 * SyncOneBuffer.
3366 */
3367 buf_state = LockBufHdr(bufHdr);
3368
3369 if ((buf_state & mask) == mask)
3370 {
3371 CkptSortItem *item;
3372
3373 buf_state |= BM_CHECKPOINT_NEEDED;
3374
3375 item = &CkptBufferIds[num_to_scan++];
3376 item->buf_id = buf_id;
3377 item->tsId = bufHdr->tag.spcOid;
3378 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3379 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3380 item->blockNum = bufHdr->tag.blockNum;
3381 }
3382
3383 UnlockBufHdr(bufHdr, buf_state);
3384
3385 /* Check for barrier events in case NBuffers is large. */
3388 }
3389
3390 if (num_to_scan == 0)
3391 return; /* nothing to do */
3392
3394
3395 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3396
3397 /*
3398 * Sort buffers that need to be written to reduce the likelihood of random
3399 * IO. The sorting is also important for the implementation of balancing
3400 * writes between tablespaces. Without balancing writes we'd potentially
3401 * end up writing to the tablespaces one-by-one, possibly overloading the
3402 * underlying system.
3403 */
3404 sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3405
3406 num_spaces = 0;
3407
3408 /*
3409 * Allocate progress status for each tablespace with buffers that need to
3410 * be flushed. This requires the to-be-flushed array to be sorted.
3411 */
3412 last_tsid = InvalidOid;
3413 for (i = 0; i < num_to_scan; i++)
3414 {
3415 CkptTsStatus *s;
3416 Oid cur_tsid;
3417
3418 cur_tsid = CkptBufferIds[i].tsId;
3419
3420 /*
3421 * Grow array of per-tablespace status structs, every time a new
3422 * tablespace is found.
3423 */
3424 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3425 {
3426 Size sz;
3427
3428 num_spaces++;
3429
3430 /*
3431 * Not worth adding grow-by-power-of-2 logic here - even with a
3432 * few hundred tablespaces this should be fine.
3433 */
3434 sz = sizeof(CkptTsStatus) * num_spaces;
3435
3436 if (per_ts_stat == NULL)
3437 per_ts_stat = (CkptTsStatus *) palloc(sz);
3438 else
3439 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3440
3441 s = &per_ts_stat[num_spaces - 1];
3442 memset(s, 0, sizeof(*s));
3443 s->tsId = cur_tsid;
3444
3445 /*
3446 * The first buffer in this tablespace. As CkptBufferIds is sorted
3447 * by tablespace all (s->num_to_scan) buffers in this tablespace
3448 * will follow afterwards.
3449 */
3450 s->index = i;
3451
3452 /*
3453 * progress_slice will be determined once we know how many buffers
3454 * are in each tablespace, i.e. after this loop.
3455 */
3456
3457 last_tsid = cur_tsid;
3458 }
3459 else
3460 {
3461 s = &per_ts_stat[num_spaces - 1];
3462 }
3463
3464 s->num_to_scan++;
3465
3466 /* Check for barrier events. */
3469 }
3470
3471 Assert(num_spaces > 0);
3472
3473 /*
3474 * Build a min-heap over the write-progress in the individual tablespaces,
3475 * and compute how large a portion of the total progress a single
3476 * processed buffer is.
3477 */
3478 ts_heap = binaryheap_allocate(num_spaces,
3480 NULL);
3481
3482 for (i = 0; i < num_spaces; i++)
3483 {
3484 CkptTsStatus *ts_stat = &per_ts_stat[i];
3485
3486 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3487
3488 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3489 }
3490
3491 binaryheap_build(ts_heap);
3492
3493 /*
3494 * Iterate through to-be-checkpointed buffers and write the ones (still)
3495 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3496 * tablespaces; otherwise the sorting would lead to only one tablespace
3497 * receiving writes at a time, making inefficient use of the hardware.
3498 */
3499 num_processed = 0;
3500 num_written = 0;
3501 while (!binaryheap_empty(ts_heap))
3502 {
3503 BufferDesc *bufHdr = NULL;
3504 CkptTsStatus *ts_stat = (CkptTsStatus *)
3506
3507 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3508 Assert(buf_id != -1);
3509
3510 bufHdr = GetBufferDescriptor(buf_id);
3511
3512 num_processed++;
3513
3514 /*
3515 * We don't need to acquire the lock here, because we're only looking
3516 * at a single bit. It's possible that someone else writes the buffer
3517 * and clears the flag right after we check, but that doesn't matter
3518 * since SyncOneBuffer will then do nothing. However, there is a
3519 * further race condition: it's conceivable that between the time we
3520 * examine the bit here and the time SyncOneBuffer acquires the lock,
3521 * someone else not only wrote the buffer but replaced it with another
3522 * page and dirtied it. In that improbable case, SyncOneBuffer will
3523 * write the buffer though we didn't need to. It doesn't seem worth
3524 * guarding against this, though.
3525 */
3527 {
3528 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3529 {
3530 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3532 num_written++;
3533 }
3534 }
3535
3536 /*
3537 * Measure progress independently of actually having to flush the buffer
3538 * - otherwise the writes would become unbalanced.
3539 */
3540 ts_stat->progress += ts_stat->progress_slice;
3541 ts_stat->num_scanned++;
3542 ts_stat->index++;
3543
3544 /* Have all the buffers from the tablespace been processed? */
3545 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3546 {
3547 binaryheap_remove_first(ts_heap);
3548 }
3549 else
3550 {
3551 /* update heap with the new progress */
3552 binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3553 }
3554
3555 /*
3556 * Sleep to throttle our I/O rate.
3557 *
3558 * (This will check for barrier events even if it doesn't sleep.)
3559 */
3560 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3561 }
3562
3563 /*
3564 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3565 * IOContext will always be IOCONTEXT_NORMAL.
3566 */
3568
3569 pfree(per_ts_stat);
3570 per_ts_stat = NULL;
3571 binaryheap_free(ts_heap);
3572
3573 /*
3574 * Update checkpoint statistics. As noted above, this doesn't include
3575 * buffers written by other backends or bgwriter scan.
3576 */
3577 CheckpointStats.ckpt_bufs_written += num_written;
3578
3579 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3580}
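
/*
 * Editor's note: a stand-alone numeric sketch (not PostgreSQL code) of the
 * progress balancing used above.  Each tablespace's slice is
 * total_to_scan / its_to_scan, so every written buffer advances that
 * tablespace by the same fraction of the overall checkpoint, and the
 * min-heap always picks the tablespace that is furthest behind.
 * ts_to_scan is assumed to be nonzero.
 */
static double
progress_after(int total_to_scan, int ts_to_scan, int ts_scanned)
{
	double		slice = (double) total_to_scan / ts_to_scan;

	/*
	 * Example: with 1000 buffers total, a tablespace holding 100 of them has
	 * slice 10.0; after 50 of its buffers are processed its progress is 500,
	 * i.e. halfway through the whole checkpoint's progress range.
	 */
	return slice * ts_scanned;
}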
3581
3582/*
3583 * BgBufferSync -- Write out some dirty buffers in the pool.
3584 *
3585 * This is called periodically by the background writer process.
3586 *
3587 * Returns true if it's appropriate for the bgwriter process to go into
3588 * low-power hibernation mode. (This happens if the strategy clock-sweep
3589 * has been "lapped" and no buffer allocations have occurred recently,
3590 * or if the bgwriter has been effectively disabled by setting
3591 * bgwriter_lru_maxpages to 0.)
3592 */
3593bool
3595{
3596 /* info obtained from freelist.c */
3597 int strategy_buf_id;
3598 uint32 strategy_passes;
3599 uint32 recent_alloc;
3600
3601 /*
3602 * Information saved between calls so we can determine the strategy
3603 * point's advance rate and avoid scanning already-cleaned buffers.
3604 */
3605 static bool saved_info_valid = false;
3606 static int prev_strategy_buf_id;
3607 static uint32 prev_strategy_passes;
3608 static int next_to_clean;
3609 static uint32 next_passes;
3610
3611 /* Moving averages of allocation rate and clean-buffer density */
3612 static float smoothed_alloc = 0;
3613 static float smoothed_density = 10.0;
3614
3615 /* Potentially these could be tunables, but for now, not */
3616 float smoothing_samples = 16;
3617 float scan_whole_pool_milliseconds = 120000.0;
3618
3619 /* Used to compute how far we scan ahead */
3620 long strategy_delta;
3621 int bufs_to_lap;
3622 int bufs_ahead;
3623 float scans_per_alloc;
3624 int reusable_buffers_est;
3625 int upcoming_alloc_est;
3626 int min_scan_buffers;
3627
3628 /* Variables for the scanning loop proper */
3629 int num_to_scan;
3630 int num_written;
3631 int reusable_buffers;
3632
3633 /* Variables for final smoothed_density update */
3634 long new_strategy_delta;
3635 uint32 new_recent_alloc;
3636
3637 /*
3638 * Find out where the clock-sweep currently is, and how many buffer
3639 * allocations have happened since our last call.
3640 */
3641 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3642
3643 /* Report buffer alloc counts to pgstat */
3644 PendingBgWriterStats.buf_alloc += recent_alloc;
3645
3646 /*
3647 * If we're not running the LRU scan, just stop after doing the stats
3648 * stuff. We mark the saved state invalid so that we can recover sanely
3649 * if LRU scan is turned back on later.
3650 */
3651 if (bgwriter_lru_maxpages <= 0)
3652 {
3653 saved_info_valid = false;
3654 return true;
3655 }
3656
3657 /*
3658 * Compute strategy_delta = how many buffers have been scanned by the
3659 * clock-sweep since last time. If first time through, assume none. Then
3660 * see if we are still ahead of the clock-sweep, and if so, how many
3661 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3662 * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
3663 * behavior when the passes counts wrap around.
3664 */
3665 if (saved_info_valid)
3666 {
3667 int32 passes_delta = strategy_passes - prev_strategy_passes;
3668
3669 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3670 strategy_delta += (long) passes_delta * NBuffers;
3671
3672 Assert(strategy_delta >= 0);
3673
3674 if ((int32) (next_passes - strategy_passes) > 0)
3675 {
3676 /* we're one pass ahead of the strategy point */
3677 bufs_to_lap = strategy_buf_id - next_to_clean;
3678#ifdef BGW_DEBUG
3679 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3680 next_passes, next_to_clean,
3681 strategy_passes, strategy_buf_id,
3682 strategy_delta, bufs_to_lap);
3683#endif
3684 }
3685 else if (next_passes == strategy_passes &&
3686 next_to_clean >= strategy_buf_id)
3687 {
3688 /* on same pass, but ahead or at least not behind */
3689 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3690#ifdef BGW_DEBUG
3691 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3692 next_passes, next_to_clean,
3693 strategy_passes, strategy_buf_id,
3694 strategy_delta, bufs_to_lap);
3695#endif
3696 }
3697 else
3698 {
3699 /*
3700 * We're behind, so skip forward to the strategy point and start
3701 * cleaning from there.
3702 */
3703#ifdef BGW_DEBUG
3704 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3705 next_passes, next_to_clean,
3706 strategy_passes, strategy_buf_id,
3707 strategy_delta);
3708#endif
3709 next_to_clean = strategy_buf_id;
3710 next_passes = strategy_passes;
3711 bufs_to_lap = NBuffers;
3712 }
3713 }
3714 else
3715 {
3716 /*
3717 * Initializing at startup or after LRU scanning had been off. Always
3718 * start at the strategy point.
3719 */
3720#ifdef BGW_DEBUG
3721 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3722 strategy_passes, strategy_buf_id);
3723#endif
3724 strategy_delta = 0;
3725 next_to_clean = strategy_buf_id;
3726 next_passes = strategy_passes;
3727 bufs_to_lap = NBuffers;
3728 }
3729
3730 /* Update saved info for next time */
3731 prev_strategy_buf_id = strategy_buf_id;
3732 prev_strategy_passes = strategy_passes;
3733 saved_info_valid = true;
3734
3735 /*
3736 * Compute how many buffers had to be scanned for each new allocation, ie,
3737 * 1/density of reusable buffers, and track a moving average of that.
3738 *
3739 * If the strategy point didn't move, we don't update the density estimate.
3740 */
3741 if (strategy_delta > 0 && recent_alloc > 0)
3742 {
3743 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3744 smoothed_density += (scans_per_alloc - smoothed_density) /
3745 smoothing_samples;
3746 }
3747
3748 /*
3749 * Estimate how many reusable buffers there are between the current
3750 * strategy point and where we've scanned ahead to, based on the smoothed
3751 * density estimate.
3752 */
3753 bufs_ahead = NBuffers - bufs_to_lap;
3754 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3755
3756 /*
3757 * Track a moving average of recent buffer allocations. Here, rather than
3758 * a true average we want a fast-attack, slow-decline behavior: we
3759 * immediately follow any increase.
3760 */
3761 if (smoothed_alloc <= (float) recent_alloc)
3762 smoothed_alloc = recent_alloc;
3763 else
3764 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3765 smoothing_samples;
3766
3767 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3768 upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3769
3770 /*
3771 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3772 * eventually underflow to zero, and the underflows produce annoying
3773 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3774 * zero, there's no point in tracking smaller and smaller values of
3775 * smoothed_alloc, so just reset it to exactly zero to avoid this
3776 * syndrome. It will pop back up as soon as recent_alloc increases.
3777 */
3778 if (upcoming_alloc_est == 0)
3779 smoothed_alloc = 0;
3780
3781 /*
3782 * Even in cases where there's been little or no buffer allocation
3783 * activity, we want to make a small amount of progress through the buffer
3784 * cache so that as many reusable buffers as possible are clean after an
3785 * idle period.
3786 *
3787 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3788 * the BGW will be called during the scan_whole_pool time; slice the
3789 * buffer pool into that many sections.
3790 */
3791 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
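	/*
	 * Editor's example (assumed values, not taken from any particular build):
	 * with scan_whole_pool_milliseconds = 120000, BgWriterDelay = 200 ms and
	 * NBuffers = 16384, the pool is sliced into 120000 / 200 = 600 sections,
	 * so each bgwriter round covers at least 16384 / 600 = 27 buffers.
	 */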
3792
3793 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3794 {
3795#ifdef BGW_DEBUG
3796 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3797 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3798#endif
3799 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3800 }
3801
3802 /*
3803 * Now write out dirty reusable buffers, working forward from the
3804 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3805 * enough buffers to match our estimate of the next cycle's allocation
3806 * requirements, or hit the bgwriter_lru_maxpages limit.
3807 */
3808
3809 num_to_scan = bufs_to_lap;
3810 num_written = 0;
3811 reusable_buffers = reusable_buffers_est;
3812
3813 /* Execute the LRU scan */
3814 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3815 {
3816 int sync_state = SyncOneBuffer(next_to_clean, true,
3817 wb_context);
3818
3819 if (++next_to_clean >= NBuffers)
3820 {
3821 next_to_clean = 0;
3822 next_passes++;
3823 }
3824 num_to_scan--;
3825
3826 if (sync_state & BUF_WRITTEN)
3827 {
3828 reusable_buffers++;
3829 if (++num_written >= bgwriter_lru_maxpages)
3830 {
3832 break;
3833 }
3834 }
3835 else if (sync_state & BUF_REUSABLE)
3836 reusable_buffers++;
3837 }
3838
3840
3841#ifdef BGW_DEBUG
3842 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3843 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3844 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3845 bufs_to_lap - num_to_scan,
3846 num_written,
3847 reusable_buffers - reusable_buffers_est);
3848#endif
3849
3850 /*
3851 * Consider the above scan as being like a new allocation scan.
3852 * Characterize its density and update the smoothed one based on it. This
3853 * effectively halves the moving average period in cases where both the
3854 * strategy and the background writer are doing some useful scanning,
3855 * which is helpful because a long memory isn't as desirable on the
3856 * density estimates.
3857 */
3858 new_strategy_delta = bufs_to_lap - num_to_scan;
3859 new_recent_alloc = reusable_buffers - reusable_buffers_est;
3860 if (new_strategy_delta > 0 && new_recent_alloc > 0)
3861 {
3862 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3863 smoothed_density += (scans_per_alloc - smoothed_density) /
3864 smoothing_samples;
3865
3866#ifdef BGW_DEBUG
3867 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3868 new_recent_alloc, new_strategy_delta,
3869 scans_per_alloc, smoothed_density);
3870#endif
3871 }
3872
3873 /* Return true if OK to hibernate */
3874 return (bufs_to_lap == 0 && recent_alloc == 0);
3875}
3876
3877/*
3878 * SyncOneBuffer -- process a single buffer during syncing.
3879 *
3880 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3881 * buffers marked recently used, as these are not replacement candidates.
3882 *
3883 * Returns a bitmask containing the following flag bits:
3884 * BUF_WRITTEN: we wrote the buffer.
 3885 * BUF_REUSABLE: buffer is available for replacement, i.e., it has
3886 * pin count 0 and usage count 0.
3887 *
3888 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3889 * after locking it, but we don't care all that much.)
3890 */
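/*
 * Editor's sketch of reading the result bitmask (illustrative only; the real
 * consumer is the LRU scan loop in BgBufferSync() above, which additionally
 * counts a written buffer as reusable):
 *
 *		int		rc = SyncOneBuffer(buf_id, true, wb_context);
 *
 *		if (rc & BUF_WRITTEN)
 *			num_written++;
 *		if (rc & BUF_REUSABLE)
 *			reusable_buffers++;
 */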
3891static int
3892SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3893{
3894 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3895 int result = 0;
3896 uint32 buf_state;
3897 BufferTag tag;
3898
3899 /* Make sure we can handle the pin */
3902
3903 /*
3904 * Check whether buffer needs writing.
3905 *
3906 * We can make this check without taking the buffer content lock so long
3907 * as we mark pages dirty in access methods *before* logging changes with
 3908 * XLogInsert(): if someone marks the buffer dirty just after our check, we
 3909 * don't worry, because our checkpoint.redo points before the log record for
 3910 * the upcoming changes, and so we are not required to write such a dirty buffer.
3911 */
3912 buf_state = LockBufHdr(bufHdr);
3913
3914 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3915 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3916 {
3917 result |= BUF_REUSABLE;
3918 }
3919 else if (skip_recently_used)
3920 {
3921 /* Caller told us not to write recently-used buffers */
3922 UnlockBufHdr(bufHdr, buf_state);
3923 return result;
3924 }
3925
3926 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3927 {
3928 /* It's clean, so nothing to do */
3929 UnlockBufHdr(bufHdr, buf_state);
3930 return result;
3931 }
3932
3933 /*
3934 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3935 * buffer is clean by the time we've locked it.)
3936 */
3937 PinBuffer_Locked(bufHdr);
3939
3941
3943
3944 tag = bufHdr->tag;
3945
3946 UnpinBuffer(bufHdr);
3947
3948 /*
3949 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3950 * IOContext will always be IOCONTEXT_NORMAL.
3951 */
3953
3954 return result | BUF_WRITTEN;
3955}
3956
3957/*
3958 * AtEOXact_Buffers - clean up at end of transaction.
3959 *
3960 * As of PostgreSQL 8.0, buffer pins should get released by the
3961 * ResourceOwner mechanism. This routine is just a debugging
3962 * cross-check that no pins remain.
3963 */
3964void
3965AtEOXact_Buffers(bool isCommit)
3966{
3968
3969 AtEOXact_LocalBuffers(isCommit);
3970
3972}
3973
3974/*
3975 * Initialize access to shared buffer pool
3976 *
3977 * This is called during backend startup (whether standalone or under the
3978 * postmaster). It sets up for this backend's access to the already-existing
3979 * buffer pool.
3980 */
3981void
3983{
3984 HASHCTL hash_ctl;
3985
3986 /*
3987 * An advisory limit on the number of pins each backend should hold, based
3988 * on shared_buffers and the maximum number of connections possible.
3989 * That's very pessimistic, but outside toy-sized shared_buffers it should
3990 * allow plenty of pins. LimitAdditionalPins() and
3991 * GetAdditionalPinLimit() can be used to check the remaining balance.
3992 */
3994
3995 memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3996
3997 hash_ctl.keysize = sizeof(int32);
3998 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3999
4000 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4002
4003 /*
4004 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4005 * the corresponding phase of backend shutdown.
4006 */
4007 Assert(MyProc != NULL);
4009}
4010
4011/*
4012 * During backend exit, ensure that we released all shared-buffer locks and
4013 * assert that we have no remaining pins.
4014 */
4015static void
4017{
4018 UnlockBuffers();
4019
4021
4022 /* localbuf.c needs a chance too */
4024}
4025
4026/*
4027 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4028 *
4029 * As of PostgreSQL 8.0, buffer pins should get released by the
4030 * ResourceOwner mechanism. This routine is just a debugging
4031 * cross-check that no pins remain.
4032 */
4033static void
4035{
4036#ifdef USE_ASSERT_CHECKING
4037 int RefCountErrors = 0;
4039 int i;
4040 char *s;
4041
4042 /* check the array */
4043 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4044 {
4045 res = &PrivateRefCountArray[i];
4046
4047 if (res->buffer != InvalidBuffer)
4048 {
4050 elog(WARNING, "buffer refcount leak: %s", s);
4051 pfree(s);
4052
4053 RefCountErrors++;
4054 }
4055 }
4056
4057 /* if necessary search the hash */
4059 {
4060 HASH_SEQ_STATUS hstat;
4061
4063 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4064 {
4066 elog(WARNING, "buffer refcount leak: %s", s);
4067 pfree(s);
4068 RefCountErrors++;
4069 }
4070 }
4071
4072 Assert(RefCountErrors == 0);
4073#endif
4074}
4075
4076#ifdef USE_ASSERT_CHECKING
4077/*
4078 * Check for exclusive-locked catalog buffers. This is the core of
4079 * AssertCouldGetRelation().
4080 *
4081 * A backend would self-deadlock on LWLocks if the catalog scan read the
4082 * exclusive-locked buffer. The main threat is exclusive-locked buffers of
4083 * catalogs used in relcache, because a catcache search on any catalog may
4084 * build that catalog's relcache entry. We don't have an inventory of
4085 * catalogs relcache uses, so just check buffers of most catalogs.
4086 *
4087 * It's better to minimize waits while holding an exclusive buffer lock, so it
4088 * would be nice to broaden this check not to be catalog-specific. However,
4089 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4090 * read tables. That is deadlock-free as long as there's no loop in the
4091 * dependency graph: modifying table A may cause an opclass to read table B,
4092 * but it must not cause a read of table A.
4093 */
4094void
4095AssertBufferLocksPermitCatalogRead(void)
4096{
4097 ForEachLWLockHeldByMe(AssertNotCatalogBufferLock, NULL);
4098}
4099
4100static void
4101AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
4102 void *unused_context)
4103{
4104 BufferDesc *bufHdr;
4105 BufferTag tag;
4106 Oid relid;
4107
4108 if (mode != LW_EXCLUSIVE)
4109 return;
4110
4111 if (!((BufferDescPadded *) lock > BufferDescriptors &&
4113 return; /* not a buffer lock */
4114
4115 bufHdr = (BufferDesc *)
4116 ((char *) lock - offsetof(BufferDesc, content_lock));
4117 tag = bufHdr->tag;
4118
4119 /*
4120 * This relNumber==relid assumption holds until a catalog experiences
4121 * VACUUM FULL or similar. After a command like that, relNumber will be
4122 * in the normal (non-catalog) range, and we lose the ability to detect
4123 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4124 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4125 * held lock.
4126 */
4127 relid = tag.relNumber;
4128
4129 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4130 return;
4131
4133}
4134#endif
4135
4136
4137/*
4138 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4139 */
4140char *
4142{
4143 BufferDesc *buf;
4144 int32 loccount;
4145 char *result;
4146 ProcNumber backend;
4147 uint32 buf_state;
4148
4150 if (BufferIsLocal(buffer))
4151 {
4153 loccount = LocalRefCount[-buffer - 1];
4154 backend = MyProcNumber;
4155 }
4156 else
4157 {
4159 loccount = GetPrivateRefCount(buffer);
4160 backend = INVALID_PROC_NUMBER;
4161 }
4162
4163 /* theoretically we should lock the bufhdr here */
4164 buf_state = pg_atomic_read_u32(&buf->state);
4165
4166 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4167 buffer,
4169 BufTagGetForkNum(&buf->tag)).str,
4170 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4171 BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4172 return result;
4173}
4174
4175/*
4176 * CheckPointBuffers
4177 *
4178 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4179 *
4180 * Note: temporary relations do not participate in checkpoints, so they don't
4181 * need to be flushed.
4182 */
4183void
4185{
4186 BufferSync(flags);
4187}
4188
4189/*
4190 * BufferGetBlockNumber
4191 * Returns the block number associated with a buffer.
4192 *
4193 * Note:
4194 * Assumes that the buffer is valid and pinned, else the
4195 * value may be obsolete immediately...
4196 */
4199{
4200 BufferDesc *bufHdr;
4201
4203
4204 if (BufferIsLocal(buffer))
4205 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4206 else
4207 bufHdr = GetBufferDescriptor(buffer - 1);
4208
4209 /* pinned, so OK to read tag without spinlock */
4210 return bufHdr->tag.blockNum;
4211}
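/*
 * Editor's note on the index arithmetic above (and in BufferGetTag below):
 * local buffers are represented by negative Buffer values, so Buffer -1 maps
 * to local descriptor slot 0 via -(-1) - 1 = 0, while shared Buffer 1 maps to
 * shared descriptor slot 0 via 1 - 1 = 0. Buffer 0 (InvalidBuffer) maps to
 * neither array.
 */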
4212
4213/*
4214 * BufferGetTag
4215 * Returns the relfilelocator, fork number and block number associated with
4216 * a buffer.
4217 */
4218void
4220 BlockNumber *blknum)
4221{
4222 BufferDesc *bufHdr;
4223
4224 /* Do the same checks as BufferGetBlockNumber. */
4226
4227 if (BufferIsLocal(buffer))
4228 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4229 else
4230 bufHdr = GetBufferDescriptor(buffer - 1);
4231
4232 /* pinned, so OK to read tag without spinlock */
4233 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4234 *forknum = BufTagGetForkNum(&bufHdr->tag);
4235 *blknum = bufHdr->tag.blockNum;
4236}
4237
4238/*
4239 * FlushBuffer
4240 * Physically write out a shared buffer.
4241 *
4242 * NOTE: this actually just passes the buffer contents to the kernel; the
4243 * real write to disk won't happen until the kernel feels like it. This
4244 * is okay from our point of view since we can redo the changes from WAL.
4245 * However, we will need to force the changes to disk via fsync before
4246 * we can checkpoint WAL.
4247 *
4248 * The caller must hold a pin on the buffer and have share-locked the
4249 * buffer contents. (Note: a share-lock does not prevent updates of
4250 * hint bits in the buffer, so the page could change while the write
4251 * is in progress, but we assume that that will not invalidate the data
4252 * written.)
4253 *
4254 * If the caller has an smgr reference for the buffer's relation, pass it
4255 * as the second parameter. If not, pass NULL.
4256 */
4257static void
4259 IOContext io_context)
4260{
4261 XLogRecPtr recptr;
4262 ErrorContextCallback errcallback;
4263 instr_time io_start;
4264 Block bufBlock;
4265 char *bufToWrite;
4266 uint32 buf_state;
4267
4268 /*
4269 * Try to start an I/O operation. If StartBufferIO returns false, then
4270 * someone else flushed the buffer before we could, so we need not do
4271 * anything.
4272 */
4273 if (!StartBufferIO(buf, false, false))
4274 return;
4275
4276 /* Setup error traceback support for ereport() */
4278 errcallback.arg = buf;
4279 errcallback.previous = error_context_stack;
4280 error_context_stack = &errcallback;
4281
4282 /* Find smgr relation for buffer */
4283 if (reln == NULL)
4285
4286 TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4287 buf->tag.blockNum,
4291
4292 buf_state = LockBufHdr(buf);
4293
4294 /*
4295 * Run PageGetLSN while holding header lock, since we don't have the
4296 * buffer locked exclusively in all cases.
4297 */
4298 recptr = BufferGetLSN(buf);
4299
4300 /* To check if block content changes while flushing. - vadim 01/17/97 */
4301 buf_state &= ~BM_JUST_DIRTIED;
4302 UnlockBufHdr(buf, buf_state);
4303
4304 /*
4305 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4306 * rule that log updates must hit disk before any of the data-file changes
4307 * they describe do.
4308 *
4309 * However, this rule does not apply to unlogged relations, which will be
4310 * lost after a crash anyway. Most unlogged relation pages do not bear
4311 * LSNs since we never emit WAL records for them, and therefore flushing
4312 * up through the buffer LSN would be useless, but harmless. However,
4313 * GiST indexes use LSNs internally to track page-splits, and therefore
4314 * unlogged GiST pages bear "fake" LSNs generated by
4315 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4316 * LSN counter could advance past the WAL insertion point; and if it did
4317 * happen, attempting to flush WAL through that location would fail, with
4318 * disastrous system-wide consequences. To make sure that can't happen,
4319 * skip the flush if the buffer isn't permanent.
4320 */
4321 if (buf_state & BM_PERMANENT)
4322 XLogFlush(recptr);
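	/*
	 * Editor's illustration (hypothetical LSN): if the page's LSN is
	 * 0/1A2B3C4D, the XLogFlush() call above guarantees WAL up to that LSN is
	 * durable before the smgrwrite() further down hands the page to the
	 * kernel, preserving the WAL-before-data rule for permanent buffers.
	 */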
4323
4324 /*
4325 * Now it's safe to write the buffer to disk. Note that no one else should
4326 * have been able to write it, while we were busy with log flushing,
4327 * because we got the exclusive right to perform I/O by setting the
4328 * BM_IO_IN_PROGRESS bit.
4329 */
4330 bufBlock = BufHdrGetBlock(buf);
4331
4332 /*
4333 * Update page checksum if desired. Since we have only shared lock on the
4334 * buffer, other processes might be updating hint bits in it, so we must
4335 * copy the page to private storage if we do checksumming.
4336 */
4337 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4338
4340
4341 /*
4342 * bufToWrite is either the shared buffer or a copy, as appropriate.
4343 */
4344 smgrwrite(reln,
4345 BufTagGetForkNum(&buf->tag),
4346 buf->tag.blockNum,
4347 bufToWrite,
4348 false);
4349
4350 /*
4351 * When a strategy is in use, only flushes of dirty buffers already in the
4352 * strategy ring are counted as strategy writes (IOCONTEXT
4353 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4354 * statistics tracking.
4355 *
4356 * If a shared buffer initially added to the ring must be flushed before
4357 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4358 *
4359 * If a shared buffer which was added to the ring later because the
4360 * current strategy buffer is pinned or in use or because all strategy
4361 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4362 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4363 * (from_ring will be false).
4364 *
4365 * When a strategy is not in use, the write can only be a "regular" write
4366 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4367 */
4369 IOOP_WRITE, io_start, 1, BLCKSZ);
4370
4372
4373 /*
4374 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4375 * end the BM_IO_IN_PROGRESS state.
4376 */
4377 TerminateBufferIO(buf, true, 0, true, false);
4378
4379 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4380 buf->tag.blockNum,
4384
4385 /* Pop the error context stack */
4386 error_context_stack = errcallback.previous;
4387}
4388
4389/*
4390 * RelationGetNumberOfBlocksInFork
4391 * Determines the current number of pages in the specified relation fork.
4392 *
4393 * Note that the accuracy of the result will depend on the details of the
4394 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4395 * it might not be.
4396 */
4399{
4400 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4401 {
4402 /*
4403 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4404 * tableam returns the size in bytes - but for the purpose of this
4405 * routine, we want the number of blocks. Therefore divide, rounding
4406 * up.
4407 */
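		/*
		 * Editor's example (assuming the default BLCKSZ of 8192): a
		 * 16385-byte relation yields (16385 + 8191) / 8192 = 3 blocks,
		 * whereas plain integer division would report only 2.
		 */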
4408 uint64 szbytes;
4409
4410 szbytes = table_relation_size(relation, forkNum);
4411
4412 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4413 }
4414 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4415 {
4416 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4417 }
4418 else
4419 Assert(false);
4420
4421 return 0; /* keep compiler quiet */
4422}
4423
4424/*
4425 * BufferIsPermanent
4426 * Determines whether a buffer will potentially still be around after
4427 * a crash. Caller must hold a buffer pin.
4428 */
4429bool
4431{
4432 BufferDesc *bufHdr;
4433
4434 /* Local buffers are used only for temp relations. */
4435 if (BufferIsLocal(buffer))
4436 return false;
4437
4438 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4441
4442 /*
4443 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4444 * need not bother with the buffer header spinlock. Even if someone else
4445 * changes the buffer header state while we're doing this, the state is
4446 * changed atomically, so we'll read the old value or the new value, but
4447 * not random garbage.
4448 */
4449 bufHdr = GetBufferDescriptor(buffer - 1);
4450 return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4451}
4452
4453/*
4454 * BufferGetLSNAtomic
4455 * Retrieves the LSN of the buffer atomically using a buffer header lock.
4456 * This is necessary for some callers who may not have an exclusive lock
4457 * on the buffer.
4458 */
4461{
4462 char *page = BufferGetPage(buffer);
4463 BufferDesc *bufHdr;
4464 XLogRecPtr lsn;
4465 uint32 buf_state;
4466
4467 /*
4468 * If we don't need locking for correctness, fastpath out.
4469 */
4471 return PageGetLSN(page);
4472
4473 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4476
4477 bufHdr = GetBufferDescriptor(buffer - 1);
4478 buf_state = LockBufHdr(bufHdr);
4479 lsn = PageGetLSN(page);
4480 UnlockBufHdr(bufHdr, buf_state);
4481
4482 return lsn;
4483}
4484
4485/* ---------------------------------------------------------------------
4486 * DropRelationBuffers
4487 *
4488 * This function removes from the buffer pool all the pages of the
4489 * specified relation forks that have block numbers >= firstDelBlock.
4490 * (In particular, with firstDelBlock = 0, all pages are removed.)
4491 * Dirty pages are simply dropped, without bothering to write them
4492 * out first. Therefore, this is NOT rollback-able, and so should be
4493 * used only with extreme caution!
4494 *
4495 * Currently, this is called only from smgr.c when the underlying file
4496 * is about to be deleted or truncated (firstDelBlock is needed for
4497 * the truncation case). The data in the affected pages would therefore
4498 * be deleted momentarily anyway, and there is no point in writing it.
4499 * It is the responsibility of higher-level code to ensure that the
4500 * deletion or truncation does not lose any data that could be needed
4501 * later. It is also the responsibility of higher-level code to ensure
4502 * that no other process could be trying to load more pages of the
4503 * relation into buffers.
4504 * --------------------------------------------------------------------
4505 */
4506void
4508 int nforks, BlockNumber *firstDelBlock)
4509{
4510 int i;
4511 int j;
4512 RelFileLocatorBackend rlocator;
4513 BlockNumber nForkBlock[MAX_FORKNUM];
4514 uint64 nBlocksToInvalidate = 0;
4515
4516 rlocator = smgr_reln->smgr_rlocator;
4517
4518 /* If it's a local relation, it's localbuf.c's problem. */
4519 if (RelFileLocatorBackendIsTemp(rlocator))
4520 {
4521 if (rlocator.backend == MyProcNumber)
4522 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4523 firstDelBlock);
4524
4525 return;
4526 }
4527
4528 /*
4529 * To remove all the pages of the specified relation forks from the buffer
 4530 * pool, we need to scan the entire buffer pool, but we can optimize this by
 4531 * looking the buffers up in the BufMapping table, provided we know the exact
 4532 * size of each fork of the relation. The exact size is required to ensure
 4533 * that we don't leave behind any buffer for the relation being dropped;
 4534 * otherwise the background writer or checkpointer could hit a PANIC error
 4535 * while flushing buffers corresponding to files that no longer exist.
4536 *
4537 * To know the exact size, we rely on the size cached for each fork by us
 4538 * during recovery, which limits the optimization to recovery and to
 4539 * standbys, but we can easily extend it once we have a shared cache for
 4540 * relation sizes.
4541 *
4542 * In recovery, we cache the value returned by the first lseek(SEEK_END)
 4543 * and subsequent writes keep the cached value up-to-date. See
4544 * smgrextend. It is possible that the value of the first lseek is smaller
4545 * than the actual number of existing blocks in the file due to buggy
4546 * Linux kernels that might not have accounted for the recent write. But
4547 * that should be fine because there must not be any buffers after that
4548 * file size.
4549 */
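	/*
	 * Editor's example of the threshold (assumed sizes): with a default 8 kB
	 * BLCKSZ and shared_buffers = 128MB, NBuffers is 16384, so a threshold of
	 * NBuffers / 32 works out to 512 blocks; smaller drops take the per-block
	 * lookup path in FindAndDropRelationBuffers(), larger ones fall through
	 * to the full buffer-pool scan below.
	 */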
4550 for (i = 0; i < nforks; i++)
4551 {
4552 /* Get the number of blocks for a relation's fork */
4553 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4554
4555 if (nForkBlock[i] == InvalidBlockNumber)
4556 {
4557 nBlocksToInvalidate = InvalidBlockNumber;
4558 break;
4559 }
4560
4561 /* calculate the number of blocks to be invalidated */
4562 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4563 }
4564
4565 /*
4566 * We apply the optimization iff the total number of blocks to invalidate
4567 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4568 */
4569 if (BlockNumberIsValid(nBlocksToInvalidate) &&
4570 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4571 {
4572 for (j = 0; j < nforks; j++)
4573 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4574 nForkBlock[j], firstDelBlock[j]);
4575 return;
4576 }
4577
4578 for (i = 0; i < NBuffers; i++)
4579 {
4580 BufferDesc *bufHdr = GetBufferDescriptor(i);
4581 uint32 buf_state;
4582
4583 /*
4584 * We can make this a tad faster by prechecking the buffer tag before
4585 * we attempt to lock the buffer; this saves a lot of lock
4586 * acquisitions in typical cases. It should be safe because the
4587 * caller must have AccessExclusiveLock on the relation, or some other
4588 * reason to be certain that no one is loading new pages of the rel
4589 * into the buffer pool. (Otherwise we might well miss such pages
4590 * entirely.) Therefore, while the tag might be changing while we
4591 * look at it, it can't be changing *to* a value we care about, only
4592 * *away* from such a value. So false negatives are impossible, and
4593 * false positives are safe because we'll recheck after getting the
4594 * buffer lock.
4595 *
4596 * We could check forkNum and blockNum as well as the rlocator, but
4597 * the incremental win from doing so seems small.
4598 */
4599 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4600 continue;
4601
4602 buf_state = LockBufHdr(bufHdr);
4603
4604 for (j = 0; j < nforks; j++)
4605 {
4606 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4607 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4608 bufHdr->tag.blockNum >= firstDelBlock[j])
4609 {
4610 InvalidateBuffer(bufHdr); /* releases spinlock */
4611 break;
4612 }
4613 }
4614 if (j >= nforks)
4615 UnlockBufHdr(bufHdr, buf_state);
4616 }
4617}
4618
4619/* ---------------------------------------------------------------------
4620 * DropRelationsAllBuffers
4621 *
4622 * This function removes from the buffer pool all the pages of all
4623 * forks of the specified relations. It's equivalent to calling
4624 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4625 * --------------------------------------------------------------------
4626 */
4627void
4628DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4629{
4630 int i;
4631 int n = 0;
4632 SMgrRelation *rels;
4633 BlockNumber (*block)[MAX_FORKNUM + 1];
4634 uint64 nBlocksToInvalidate = 0;
4635 RelFileLocator *locators;
4636 bool cached = true;
4637 bool use_bsearch;
4638
4639 if (nlocators == 0)
4640 return;
4641
4642 rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4643
4644 /* If it's a local relation, it's localbuf.c's problem. */
4645 for (i = 0; i < nlocators; i++)
4646 {
4647 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4648 {
4649 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4650 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4651 }
4652 else
4653 rels[n++] = smgr_reln[i];
4654 }
4655
4656 /*
4657 * If there are no non-local relations, then we're done. Release the
4658 * memory and return.
4659 */
4660 if (n == 0)
4661 {
4662 pfree(rels);
4663 return;
4664 }
4665
4666 /*
 4667 * This is used to remember the number of blocks for all forks of all the
 4668 * given relations.
4669 */
4670 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4671 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4672
4673 /*
4674 * We can avoid scanning the entire buffer pool if we know the exact size
4675 * of each of the given relation forks. See DropRelationBuffers.
4676 */
4677 for (i = 0; i < n && cached; i++)
4678 {
4679 for (int j = 0; j <= MAX_FORKNUM; j++)
4680 {
4681 /* Get the number of blocks for a relation's fork. */
4682 block[i][j] = smgrnblocks_cached(rels[i], j);
4683
 4685 /* We only need to consider the relation forks that exist. */
4685 if (block[i][j] == InvalidBlockNumber)
4686 {
4687 if (!smgrexists(rels[i], j))
4688 continue;
4689 cached = false;
4690 break;
4691 }
4692
4693 /* calculate the total number of blocks to be invalidated */
4694 nBlocksToInvalidate += block[i][j];
4695 }
4696 }
4697
4698 /*
4699 * We apply the optimization iff the total number of blocks to invalidate
4700 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4701 */
4702 if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4703 {
4704 for (i = 0; i < n; i++)
4705 {
4706 for (int j = 0; j <= MAX_FORKNUM; j++)
4707 {
 4709 /* ignore relation forks that don't exist */
4709 if (!BlockNumberIsValid(block[i][j]))
4710 continue;
4711
4712 /* drop all the buffers for a particular relation fork */
4713 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4714 j, block[i][j], 0);
4715 }
4716 }
4717
4718 pfree(block);
4719 pfree(rels);
4720 return;
4721 }
4722
4723 pfree(block);
4724 locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4725 for (i = 0; i < n; i++)
4726 locators[i] = rels[i]->smgr_rlocator.locator;
4727
4728 /*
 4729 * For a low number of relations to drop, just use a simple walk-through to
 4730 * save the bsearch overhead. The threshold is more of a guess than an
 4731 * exactly determined value, as it depends on many factors (CPU and RAM
 4732 * speeds, amount of shared buffers, etc.).
4733 */
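	/*
	 * Editor's note: the trade-off is roughly n tag comparisons per buffer
	 * for the linear walk versus about log2(n) per buffer (plus one upfront
	 * qsort) for bsearch; e.g., dropping 100 relations against 16384 buffers
	 * means ~1.6M comparisons linearly but only ~110K with bsearch.
	 */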
4734 use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4735
4736 /* sort the list of rlocators if necessary */
4737 if (use_bsearch)
4738 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4739
4740 for (i = 0; i < NBuffers; i++)
4741 {
4742 RelFileLocator *rlocator = NULL;
4743 BufferDesc *bufHdr = GetBufferDescriptor(i);
4744 uint32 buf_state;
4745
4746 /*
4747 * As in DropRelationBuffers, an unlocked precheck should be safe and
4748 * saves some cycles.
4749 */
4750
4751 if (!use_bsearch)
4752 {
4753 int j;
4754
4755 for (j = 0; j < n; j++)
4756 {
4757 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4758 {
4759 rlocator = &locators[j];
4760 break;
4761 }
4762 }
4763 }
4764 else
4765 {
4766 RelFileLocator locator;
4767
4768 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4769 rlocator = bsearch(&locator,
4770 locators, n, sizeof(RelFileLocator),
4772 }
4773
4774 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4775 if (rlocator == NULL)
4776 continue;
4777
4778 buf_state = LockBufHdr(bufHdr);
4779 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4780 InvalidateBuffer(bufHdr); /* releases spinlock */
4781 else
4782 UnlockBufHdr(bufHdr, buf_state);
4783 }
4784
4785 pfree(locators);
4786 pfree(rels);
4787}
4788
4789/* ---------------------------------------------------------------------
4790 * FindAndDropRelationBuffers
4791 *
 4792 * This function performs lookups in the BufMapping table and removes from
 4793 * the buffer pool all the pages of the specified relation fork that have
 4794 * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4795 * pages are removed.)
4796 * --------------------------------------------------------------------
4797 */
4798static void
4800 BlockNumber nForkBlock,
4801 BlockNumber firstDelBlock)
4802{
4803 BlockNumber curBlock;
4804
4805 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4806 {
4807 uint32 bufHash; /* hash value for tag */
4808 BufferTag bufTag; /* identity of requested block */
4809 LWLock *bufPartitionLock; /* buffer partition lock for it */
4810 int buf_id;
4811 BufferDesc *bufHdr;
4812 uint32 buf_state;
4813
4814 /* create a tag so we can lookup the buffer */
4815 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4816
4817 /* determine its hash code and partition lock ID */
4818 bufHash = BufTableHashCode(&bufTag);
4819 bufPartitionLock = BufMappingPartitionLock(bufHash);
4820
4821 /* Check that it is in the buffer pool. If not, do nothing. */
4822 LWLockAcquire(bufPartitionLock, LW_SHARED);
4823 buf_id = BufTableLookup(&bufTag, bufHash);
4824 LWLockRelease(bufPartitionLock);
4825
4826 if (buf_id < 0)
4827 continue;
4828
4829 bufHdr = GetBufferDescriptor(buf_id);
4830
4831 /*
4832 * We need to lock the buffer header and recheck if the buffer is
4833 * still associated with the same block because the buffer could be
4834 * evicted by some other backend loading blocks for a different
 4835 * relation after we release the lock on the BufMapping table.
4836 */
4837 buf_state = LockBufHdr(bufHdr);
4838
4839 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4840 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4841 bufHdr->tag.blockNum >= firstDelBlock)
4842 InvalidateBuffer(bufHdr); /* releases spinlock */
4843 else
4844 UnlockBufHdr(bufHdr, buf_state);
4845 }
4846}
4847
4848/* ---------------------------------------------------------------------
4849 * DropDatabaseBuffers
4850 *
4851 * This function removes all the buffers in the buffer cache for a
4852 * particular database. Dirty pages are simply dropped, without
4853 * bothering to write them out first. This is used when we destroy a
4854 * database, to avoid trying to flush data to disk when the directory
4855 * tree no longer exists. Implementation is pretty similar to
4856 * DropRelationBuffers() which is for destroying just one relation.
4857 * --------------------------------------------------------------------
4858 */
4859void
4861{
4862 int i;
4863
4864 /*
4865 * We needn't consider local buffers, since by assumption the target
4866 * database isn't our own.
4867 */
4868
4869 for (i = 0; i < NBuffers; i++)
4870 {
4871 BufferDesc *bufHdr = GetBufferDescriptor(i);
4872 uint32 buf_state;
4873
4874 /*
4875 * As in DropRelationBuffers, an unlocked precheck should be safe and
4876 * saves some cycles.
4877 */
4878 if (bufHdr->tag.dbOid != dbid)
4879 continue;
4880
4881 buf_state = LockBufHdr(bufHdr);
4882 if (bufHdr->tag.dbOid == dbid)
4883 InvalidateBuffer(bufHdr); /* releases spinlock */
4884 else
4885 UnlockBufHdr(bufHdr, buf_state);
4886 }
4887}
4888
4889/* ---------------------------------------------------------------------
4890 * FlushRelationBuffers
4891 *
4892 * This function writes all dirty pages of a relation out to disk
4893 * (or more accurately, out to kernel disk buffers), ensuring that the
4894 * kernel has an up-to-date view of the relation.
4895 *
4896 * Generally, the caller should be holding AccessExclusiveLock on the
4897 * target relation to ensure that no other backend is busy dirtying
4898 * more blocks of the relation; the effects can't be expected to last
4899 * after the lock is released.
4900 *
4901 * XXX currently it sequentially searches the buffer pool, should be
4902 * changed to more clever ways of searching. This routine is not
4903 * used in any performance-critical code paths, so it's not worth
4904 * adding additional overhead to normal paths to make it go faster.
4905 * --------------------------------------------------------------------
4906 */
4907void
4909{
4910 int i;
4911 BufferDesc *bufHdr;
4912 SMgrRelation srel = RelationGetSmgr(rel);
4913
4914 if (RelationUsesLocalBuffers(rel))
4915 {
4916 for (i = 0; i < NLocBuffer; i++)
4917 {
4918 uint32 buf_state;
4919
4920 bufHdr = GetLocalBufferDescriptor(i);
4921 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4922 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4923 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4924 {
4925 ErrorContextCallback errcallback;
4926
4927 /* Setup error traceback support for ereport() */
4929 errcallback.arg = bufHdr;
4930 errcallback.previous = error_context_stack;
4931 error_context_stack = &errcallback;
4932
4933 /* Make sure we can handle the pin */
4936
4937 /*
4938 * Pin/unpin mostly to make valgrind work, but it also seems
4939 * like the right thing to do.
4940 */
4941 PinLocalBuffer(bufHdr, false);
4942
4943
4944 FlushLocalBuffer(bufHdr, srel);
4945
4947
4948 /* Pop the error context stack */
4949 error_context_stack = errcallback.previous;
4950 }
4951 }
4952
4953 return;
4954 }
4955
4956 for (i = 0; i < NBuffers; i++)
4957 {
4958 uint32 buf_state;
4959
4960 bufHdr = GetBufferDescriptor(i);
4961
4962 /*
4963 * As in DropRelationBuffers, an unlocked precheck should be safe and
4964 * saves some cycles.
4965 */
4966 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4967 continue;
4968
4969 /* Make sure we can handle the pin */
4972
4973 buf_state = LockBufHdr(bufHdr);
4974 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4975 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4976 {
4977 PinBuffer_Locked(bufHdr);
4981 UnpinBuffer(bufHdr);
4982 }
4983 else
4984 UnlockBufHdr(bufHdr, buf_state);
4985 }
4986}
4987
4988/* ---------------------------------------------------------------------
4989 * FlushRelationsAllBuffers
4990 *
4991 * This function flushes out of the buffer pool all the pages of all
4992 * forks of the specified smgr relations. It's equivalent to calling
4993 * FlushRelationBuffers once per relation. The relations are assumed not
4994 * to use local buffers.
4995 * --------------------------------------------------------------------
4996 */
4997void
4999{
5000 int i;
5001 SMgrSortArray *srels;
5002 bool use_bsearch;
5003
5004 if (nrels == 0)
5005 return;
5006
5007 /* fill-in array for qsort */
5008 srels = palloc(sizeof(SMgrSortArray) * nrels);
5009
5010 for (i = 0; i < nrels; i++)
5011 {
5012 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5013
5014 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5015 srels[i].srel = smgrs[i];
5016 }
5017
5018 /*
5019 * Save the bsearch overhead for low number of relations to sync. See
5020 * DropRelationsAllBuffers for details.
5021 */
5022 use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
5023
5024 /* sort the list of SMgrRelations if necessary */
5025 if (use_bsearch)
5026 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5027
5028 for (i = 0; i < NBuffers; i++)
5029 {
5030 SMgrSortArray *srelent = NULL;
5031 BufferDesc *bufHdr = GetBufferDescriptor(i);
5032 uint32 buf_state;
5033
5034 /*
5035 * As in DropRelationBuffers, an unlocked precheck should be safe and
5036 * saves some cycles.
5037 */
5038
5039 if (!use_bsearch)
5040 {
5041 int j;
5042
5043 for (j = 0; j < nrels; j++)
5044 {
5045 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5046 {
5047 srelent = &srels[j];
5048 break;
5049 }
5050 }
5051 }
5052 else
5053 {
5054 RelFileLocator rlocator;
5055
5056 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5057 srelent = bsearch(&rlocator,
5058 srels, nrels, sizeof(SMgrSortArray),
5060 }
5061
5062 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5063 if (srelent == NULL)
5064 continue;
5065
5066 /* Make sure we can handle the pin */
5069
5070 buf_state = LockBufHdr(bufHdr);
5071 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5072 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5073 {
5074 PinBuffer_Locked(bufHdr);
5078 UnpinBuffer(bufHdr);
5079 }
5080 else
5081 UnlockBufHdr(bufHdr, buf_state);
5082 }
5083
5084 pfree(srels);
5085}
5086
5087/* ---------------------------------------------------------------------
5088 * RelationCopyStorageUsingBuffer
5089 *
 5090 * Copy a fork's data using bufmgr. Same as RelationCopyStorage, but instead
 5091 * of using smgrread and smgrextend, this will copy using bufmgr APIs.
5092 *
 5093 * Refer to the comments atop CreateAndCopyRelationData() for details about
 5094 * the 'permanent' parameter.
5095 * --------------------------------------------------------------------
5096 */
5097static void
5099 RelFileLocator dstlocator,
5100 ForkNumber forkNum, bool permanent)
5101{
5102 Buffer srcBuf;
5103 Buffer dstBuf;
5104 Page srcPage;
5105 Page dstPage;
5106 bool use_wal;
5107 BlockNumber nblocks;
5108 BlockNumber blkno;
5110 BufferAccessStrategy bstrategy_src;
5111 BufferAccessStrategy bstrategy_dst;
5113 ReadStream *src_stream;
5114 SMgrRelation src_smgr;
5115
5116 /*
5117 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5118 * can skip it when copying any fork of an unlogged relation other than
5119 * the init fork.
5120 */
5121 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5122
5123 /* Get number of blocks in the source relation. */
5124 nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5125 forkNum);
5126
5127 /* Nothing to copy; just return. */
5128 if (nblocks == 0)
5129 return;
5130
5131 /*
 5132 * Bulk extend the destination relation to the same size as the source
5133 * relation before starting to copy block by block.
5134 */
5135 memset(buf.data, 0, BLCKSZ);
5136 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5137 buf.data, true);
5138
5139 /* This is a bulk operation, so use buffer access strategies. */
5140 bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5141 bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5142
5143 /* Initialize streaming read */
5144 p.current_blocknum = 0;
5145 p.last_exclusive = nblocks;
5146 src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5147
5148 /*
5149 * It is safe to use batchmode as block_range_read_stream_cb takes no
5150 * locks.
5151 */
5154 bstrategy_src,
5155 src_smgr,
5156 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5157 forkNum,
5159 &p,
5160 0);
5161
5162 /* Iterate over each block of the source relation file. */
5163 for (blkno = 0; blkno < nblocks; blkno++)
5164 {
5166
5167 /* Read block from source relation. */
5168 srcBuf = read_stream_next_buffer(src_stream, NULL);
5170 srcPage = BufferGetPage(srcBuf);
5171
5172 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5173 BufferGetBlockNumber(srcBuf),
5174 RBM_ZERO_AND_LOCK, bstrategy_dst,
5175 permanent);
5176 dstPage = BufferGetPage(dstBuf);
5177
5179
5180 /* Copy page data from the source to the destination. */
5181 memcpy(dstPage, srcPage, BLCKSZ);
5182 MarkBufferDirty(dstBuf);
5183
5184 /* WAL-log the copied page. */
5185 if (use_wal)
5186 log_newpage_buffer(dstBuf, true);
5187
5189
5190 UnlockReleaseBuffer(dstBuf);
5191 UnlockReleaseBuffer(srcBuf);
5192 }
5193 Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5194 read_stream_end(src_stream);
5195
5196 FreeAccessStrategy(bstrategy_src);
5197 FreeAccessStrategy(bstrategy_dst);
5198}
5199
5200/* ---------------------------------------------------------------------
5201 * CreateAndCopyRelationData
5202 *
5203 * Create destination relation storage and copy all forks from the
5204 * source relation to the destination.
5205 *
5206 * Pass permanent as true for permanent relations and false for
5207 * unlogged relations. Currently this API is not supported for
5208 * temporary relations.
5209 * --------------------------------------------------------------------
5210 */
5211void
5213 RelFileLocator dst_rlocator, bool permanent)
5214{
5215 char relpersistence;
5216 SMgrRelation src_rel;
5217 SMgrRelation dst_rel;
5218
5219 /* Set the relpersistence. */
5220 relpersistence = permanent ?
5221 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5222
5223 src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5224 dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5225
5226 /*
5227 * Create and copy all forks of the relation. During create database we
 5228 * have a separate cleanup mechanism which deletes the complete database
5229 * directory. Therefore, each individual relation doesn't need to be
5230 * registered for cleanup.
5231 */
5232 RelationCreateStorage(dst_rlocator, relpersistence, false);
5233
5234 /* copy main fork. */
5235 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5236 permanent);
5237
5238 /* copy those extra forks that exist */
5239 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5240 forkNum <= MAX_FORKNUM; forkNum++)
5241 {
5242 if (smgrexists(src_rel, forkNum))
5243 {
5244 smgrcreate(dst_rel, forkNum, false);
5245
5246 /*
5247 * WAL log creation if the relation is persistent, or this is the
5248 * init fork of an unlogged relation.
5249 */
5250 if (permanent || forkNum == INIT_FORKNUM)
5251 log_smgrcreate(&dst_rlocator, forkNum);
5252
5253 /* Copy a fork's data, block by block. */
5254 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5255 permanent);
5256 }
5257 }
5258}
5259
5260/* ---------------------------------------------------------------------
5261 * FlushDatabaseBuffers
5262 *
5263 * This function writes all dirty pages of a database out to disk
5264 * (or more accurately, out to kernel disk buffers), ensuring that the
5265 * kernel has an up-to-date view of the database.
5266 *
5267 * Generally, the caller should be holding an appropriate lock to ensure
5268 * no other backend is active in the target database; otherwise more
5269 * pages could get dirtied.
5270 *
5271 * Note we don't worry about flushing any pages of temporary relations.
5272 * It's assumed these wouldn't be interesting.
5273 * --------------------------------------------------------------------
5274 */
5275void
5277{
5278 int i;
5279 BufferDesc *bufHdr;
5280
5281 for (i = 0; i < NBuffers; i++)
5282 {
5283 uint32 buf_state;
5284
5285 bufHdr = GetBufferDescriptor(i);
5286
5287 /*
5288 * As in DropRelationBuffers, an unlocked precheck should be safe and
5289 * saves some cycles.
5290 */
5291 if (bufHdr->tag.dbOid != dbid)
5292 continue;
5293
5294 /* Make sure we can handle the pin */
5297
5298 buf_state = LockBufHdr(bufHdr);
5299 if (bufHdr->tag.dbOid == dbid &&
5300 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5301 {
5302 PinBuffer_Locked(bufHdr);
5306 UnpinBuffer(bufHdr);
5307 }
5308 else
5309 UnlockBufHdr(bufHdr, buf_state);
5310 }
5311}
5312
5313/*
 5314 * Flush a previously pinned buffer, locked in either share or exclusive
 5315 * mode, to the OS.
5316 */
5317void
5319{
5320 BufferDesc *bufHdr;
5321
5322 /* currently not needed, but no fundamental reason not to support */
5324
5326
5327 bufHdr = GetBufferDescriptor(buffer - 1);
5328
5330
5332}
5333
5334/*
5335 * ReleaseBuffer -- release the pin on a buffer
5336 */
5337void
5339{
5340 if (!BufferIsValid(buffer))
5341 elog(ERROR, "bad buffer ID: %d", buffer);
5342
5343 if (BufferIsLocal(buffer))
5345 else
5347}
5348
5349/*
5350 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5351 *
5352 * This is just a shorthand for a common combination.
5353 */
5354void
5356{
5359}
5360
5361/*
5362 * IncrBufferRefCount
5363 * Increment the pin count on a buffer that we have *already* pinned
5364 * at least once.
5365 *
5366 * This function cannot be used on a buffer we do not have pinned,
5367 * because it doesn't change the shared buffer state.
5368 */
5369void
5371{
5374 if (BufferIsLocal(buffer))
5375 LocalRefCount[-buffer - 1]++;
5376 else
5377 {
5379
5380 ref = GetPrivateRefCountEntry(buffer, true);
5381 Assert(ref != NULL);
5382 ref->refcount++;
5383 }
5385}
5386
5387/*
5388 * MarkBufferDirtyHint
5389 *
5390 * Mark a buffer dirty for non-critical changes.
5391 *
5392 * This is essentially the same as MarkBufferDirty, except:
5393 *
5394 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5395 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5396 * 2. The caller might have only share-lock instead of exclusive-lock on the
5397 * buffer's content lock.
5398 * 3. This function does not guarantee that the buffer is always marked dirty
5399 * (due to a race condition), so it cannot be used for important changes.
5400 */
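/*
 * Editor's sketch of a typical call site (illustrative only, not part of the
 * file): setting a tuple status hint while holding just a pin and a share
 * lock, much as heapam's hint-bit setting does:
 *
 *		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
 *		MarkBufferDirtyHint(buffer, true);
 *
 * Passing true for buffer_std says the page has a standard layout, letting
 * any full-page image written for checksum protection omit the hole between
 * pd_lower and pd_upper.
 */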
5401void
5403{
5404 BufferDesc *bufHdr;
5405 Page page = BufferGetPage(buffer);
5406
5407 if (!BufferIsValid(buffer))
5408 elog(ERROR, "bad buffer ID: %d", buffer);
5409
5410 if (BufferIsLocal(buffer))
5411 {
5413 return;
5414 }
5415
5416 bufHdr = GetBufferDescriptor(buffer - 1);
5417
5419 /* here, either share or exclusive lock is OK */
5421
5422 /*
5423 * This routine might get called many times on the same page, if we are
5424 * making the first scan after commit of an xact that added/deleted many
5425 * tuples. So, be as quick as we can if the buffer is already dirty. We
5426 * do this by not acquiring spinlock if it looks like the status bits are
5427 * already set. Since we make this test unlocked, there's a chance we
 5428 * might fail to notice that the flags have just been cleared, and fail
 5429 * to reset them, due to memory-ordering issues. But since this function
5430 * is only intended to be used in cases where failing to write out the
5431 * data would be harmless anyway, it doesn't really matter.
5432 */
5433 if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5435 {
5437 bool dirtied = false;
5438 bool delayChkptFlags = false;
5439 uint32 buf_state;
5440
5441 /*
5442 * If we need to protect hint bit updates from torn writes, WAL-log a
5443 * full page image of the page. This full page image is only necessary
5444 * if the hint bit update is the first change to the page since the
5445 * last checkpoint.
5446 *
5447 * We don't check full_page_writes here because that logic is included
5448 * when we call XLogInsert() since the value changes dynamically.
5449 */
5450 if (XLogHintBitIsNeeded() &&
5452 {
5453 /*
5454 * If we must not write WAL, due to a relfilelocator-specific
5455 * condition or being in recovery, don't dirty the page. We can
 5456 * still set the hint, just not dirty the page as a result, so the
 5457 * hint is lost when we evict the page or shut down.
5458 *
5459 * See src/backend/storage/page/README for longer discussion.
5460 */
5461 if (RecoveryInProgress() ||
5463 return;
5464
5465 /*
5466 * If the block is already dirty because we either made a change
5467 * or set a hint already, then we don't need to write a full page
5468 * image. Note that aggressive cleaning of blocks dirtied by hint
5469 * bit setting would increase the call rate. Bulk setting of hint
5470 * bits would reduce the call rate...
5471 *
5472 * We must issue the WAL record before we mark the buffer dirty.
5473 * Otherwise we might write the page before we write the WAL. That
5474 * causes a race condition, since a checkpoint might occur between
5475 * writing the WAL record and marking the buffer dirty. We solve
5476 * that with a kluge, but one that is already in use during
5477 * transaction commit to prevent race conditions. Basically, we
5478 * simply prevent the checkpoint WAL record from being written
5479 * until we have marked the buffer dirty. We don't start the
5480 * checkpoint flush until we have marked dirty, so our checkpoint
5481 * must flush the change to disk successfully or the checkpoint
 5482 * never gets written, in which case crash recovery will fix things up.
5483 *
5484 * It's possible we may enter here without an xid, so it is
5485 * essential that CreateCheckPoint waits for virtual transactions
5486 * rather than full transactionids.
5487 */
5490 delayChkptFlags = true;
5491 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5492 }
5493
5494 buf_state = LockBufHdr(bufHdr);
5495
5496 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5497
5498 if (!(buf_state & BM_DIRTY))
5499 {
5500 dirtied = true; /* Means "will be dirtied by this action" */
5501
5502 /*
5503 * Set the page LSN if we wrote a backup block. We aren't supposed
5504 * to set this when only holding a share lock but as long as we
5505 * serialise it somehow we're OK. We choose to set LSN while
5506 * holding the buffer header lock, which causes any reader of an
5507 * LSN who holds only a share lock to also obtain a buffer header
5508 * lock before using PageGetLSN(), which is enforced in
5509 * BufferGetLSNAtomic().
5510 *
5511 * If checksums are enabled, you might think we should reset the
5512 * checksum here. That will happen when the page is written
5513 * sometime later in this checkpoint cycle.
5514 */
5515 if (!XLogRecPtrIsInvalid(lsn))
5516 PageSetLSN(page, lsn);
5517 }
5518
5519 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5520 UnlockBufHdr(bufHdr, buf_state);
5521
5522 if (delayChkptFlags)
5523 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5524
5525 if (dirtied)
5526 {
5528 if (VacuumCostActive)
5530 }
5531 }
5532}
5533
5534/*
5535 * Release buffer content locks for shared buffers.
5536 *
5537 * Used to clean up after errors.
5538 *
5539 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5540 * of releasing buffer content locks per se; the only thing we need to deal
5541 * with here is clearing any PIN_COUNT request that was in progress.
5542 */
5543void
5545{
5547
5548 if (buf)
5549 {
5550 uint32 buf_state;
5551
5552 buf_state = LockBufHdr(buf);
5553
5554 /*
5555 * Don't complain if flag bit not set; it could have been reset but we
5556 * got a cancel/die interrupt before getting the signal.
5557 */
5558 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5559 buf->wait_backend_pgprocno == MyProcNumber)
5560 buf_state &= ~BM_PIN_COUNT_WAITER;
5561
5562 UnlockBufHdr(buf, buf_state);
5563
5564 PinCountWaitBuf = NULL;
5565 }
5566}
5567
5568/*
5569 * Acquire or release the content_lock for the buffer.
5570 */
5571void
5573{
5574 BufferDesc *buf;
5575
5577 if (BufferIsLocal(buffer))
5578 return; /* local buffers need no lock */
5579
5581
5582 if (mode == BUFFER_LOCK_UNLOCK)
5584 else if (mode == BUFFER_LOCK_SHARE)
5586 else if (mode == BUFFER_LOCK_EXCLUSIVE)
5588 else
5589 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5590}
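/*
 * Editor's usage sketch for LockBuffer (illustrative only):
 *
 *		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 *		... read the page ...
 *		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 *
 * The caller must already hold a pin; the content lock only serializes access
 * to the page contents and is no substitute for the pin that keeps the buffer
 * from being evicted.
 */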
5591
5592/*
5593 * Acquire the content_lock for the buffer, but only if we don't have to wait.
5594 *
5595 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5596 */
5597bool
5599{
5600 BufferDesc *buf;
5601
5603 if (BufferIsLocal(buffer))
5604 return true; /* act as though we got it */
5605
5607
5609 LW_EXCLUSIVE);
5610}
5611
5612/*
5613 * Verify that this backend is pinning the buffer exactly once.
5614 *
5615 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5616 * holds a pin on the buffer. We do not care whether some other backend does.
5617 */
5618void
5620{
5621 if (BufferIsLocal(buffer))
5622 {
5623 if (LocalRefCount[-buffer - 1] != 1)
5624 elog(ERROR, "incorrect local pin count: %d",
5625 LocalRefCount[-buffer - 1]);
5626 }
5627 else
5628 {
5629 if (GetPrivateRefCount(buffer) != 1)
5630 elog(ERROR, "incorrect local pin count: %d",
5632 }
5633}
5634
5635/*
5636 * LockBufferForCleanup - lock a buffer in preparation for deleting items
5637 *
5638 * Items may be deleted from a disk page only when the caller (a) holds an
5639 * exclusive lock on the buffer and (b) has observed that no other backend
5640 * holds a pin on the buffer. If there is a pin, then the other backend
5641 * might have a pointer into the buffer (for example, a heapscan reference
5642 * to an item --- see README for more details). It's OK if a pin is added
5643 * after the cleanup starts, however; the newly-arrived backend will be
5644 * unable to look at the page until we release the exclusive lock.
5645 *
5646 * To implement this protocol, a would-be deleter must pin the buffer and
5647 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5648 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5649 * it has successfully observed pin count = 1.
5650 */
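/*
 * Editor's sketch of the protocol described above (illustrative only, not
 * part of the file):
 *
 *		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
 *		LockBufferForCleanup(buf);
 *		... prune or delete items; at this point we hold the exclusive lock
 *		... and have observed a pin count of exactly one ...
 *		UnlockReleaseBuffer(buf);
 */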
5651void
5653{
5654 BufferDesc *bufHdr;
5655 TimestampTz waitStart = 0;
5656 bool waiting = false;
5657 bool logged_recovery_conflict = false;
5658
5660 Assert(PinCountWaitBuf == NULL);
5661
5663
5664 /*
5665 * We do not yet need to be worried about in-progress AIOs holding a pin,
5666 * as we, so far, only support doing reads via AIO and this function can
5667 * only be called once the buffer is valid (i.e. no read can be in
5668 * flight).
5669 */
5670
5671 /* Nobody else to wait for */
5672 if (BufferIsLocal(buffer))
5673 return;
5674
5675 bufHdr = GetBufferDescriptor(buffer - 1);
5676
5677 for (;;)
5678 {
5679 uint32 buf_state;
5680
5681 /* Try to acquire lock */
5683 buf_state = LockBufHdr(bufHdr);
5684
5685 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5686 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5687 {
5688 /* Successfully acquired exclusive lock with pincount 1 */
5689 UnlockBufHdr(bufHdr, buf_state);
5690
5691 /*
5692 * Emit the log message if recovery conflict on buffer pin was
5693 * resolved but the startup process waited longer than
5694 * deadlock_timeout for it.
5695 */
5696 if (logged_recovery_conflict)
5698 waitStart, GetCurrentTimestamp(),
5699 NULL, false);
5700
5701 if (waiting)
5702 {
5703 /* reset ps display to remove the suffix if we added one */
5705 waiting = false;
5706 }
5707 return;
5708 }
5709 /* Failed, so mark myself as waiting for pincount 1 */
5710 if (buf_state & BM_PIN_COUNT_WAITER)
5711 {
5712 UnlockBufHdr(bufHdr, buf_state);
 5713 ReleaseBuffer(buffer);
 5714 elog(ERROR, "multiple backends attempting to wait for pincount 1");
5715 }
 5716 bufHdr->wait_backend_pgprocno = MyProcNumber;
 5717 PinCountWaitBuf = bufHdr;
5718 buf_state |= BM_PIN_COUNT_WAITER;
5719 UnlockBufHdr(bufHdr, buf_state);
 5720 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 5721
5722 /* Wait to be signaled by UnpinBuffer() */
5723 if (InHotStandby)
5724 {
5725 if (!waiting)
5726 {
5727 /* adjust the process title to indicate that it's waiting */
5728 set_ps_display_suffix("waiting");
5729 waiting = true;
5730 }
5731
5732 /*
5733 * Emit the log message if the startup process is waiting longer
5734 * than deadlock_timeout for recovery conflict on buffer pin.
5735 *
 5736 * Skip this the first time through, because the startup process has
 5737 * not started waiting yet in that case. So, the wait start
5738 * timestamp is set after this logic.
5739 */
5740 if (waitStart != 0 && !logged_recovery_conflict)
5741 {
 5742 TimestampTz now = GetCurrentTimestamp();
 5743
5744 if (TimestampDifferenceExceeds(waitStart, now,
 5745 DeadlockTimeout))
 5746 {
 5747 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
 5748 waitStart, now, NULL, true);
5749 logged_recovery_conflict = true;
5750 }
5751 }
5752
5753 /*
5754 * Set the wait start timestamp if logging is enabled and first
5755 * time through.
5756 */
5757 if (log_recovery_conflict_waits && waitStart == 0)
5758 waitStart = GetCurrentTimestamp();
5759
5760 /* Publish the bufid that Startup process waits on */
 5761 SetStartupBufferPinWaitBufId(buffer - 1);
 5762 /* Set alarm and then wait to be signaled by UnpinBuffer() */
 5763 ResolveRecoveryConflictWithBufferPin();
 5764 /* Reset the published bufid */
 5765 SetStartupBufferPinWaitBufId(-1);
 5766 }
5767 else
5768 ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5769
5770 /*
5771 * Remove flag marking us as waiter. Normally this will not be set
5772 * anymore, but ProcWaitForSignal() can return for other signals as
5773 * well. We take care to only reset the flag if we're the waiter, as
5774 * theoretically another backend could have started waiting. That's
5775 * impossible with the current usages due to table level locking, but
5776 * better be safe.
5777 */
5778 buf_state = LockBufHdr(bufHdr);
5779 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
 5780 bufHdr->wait_backend_pgprocno == MyProcNumber)
 5781 buf_state &= ~BM_PIN_COUNT_WAITER;
5782 UnlockBufHdr(bufHdr, buf_state);
5783
5784 PinCountWaitBuf = NULL;
5785 /* Loop back and try again */
5786 }
5787}
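/*
 * [Editor's illustrative sketch -- not part of bufmgr.c] The cleanup-lock
 * protocol described in the comment above, from a would-be deleter's point
 * of view. The example_cleanup_page() helper is hypothetical; the calls are
 * the real API.
 */
static void
example_cleanup_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* pin it first */

	LockBufferForCleanup(buf);	/* exclusive lock, and nobody else pins it */
	/* safe to remove or compact items; newly arrived pins can't see the page yet */
	UnlockReleaseBuffer(buf);
}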
5788
5789/*
5790 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5791 * requests cancellation of all pin holders that are blocking it.
5792 */
5793bool
 5794 HoldingBufferPinThatDelaysRecovery(void)
 5795 {
5796 int bufid = GetStartupBufferPinWaitBufId();
5797
5798 /*
5799 * If we get woken slowly then it's possible that the Startup process was
 5800 * already woken by other backends before we got here. It is also possible
 5801 * that we got here via multiple interrupts or interrupts at inappropriate
5802 * times, so make sure we do nothing if the bufid is not set.
5803 */
5804 if (bufid < 0)
5805 return false;
5806
5807 if (GetPrivateRefCount(bufid + 1) > 0)
5808 return true;
5809
5810 return false;
5811}
5812
5813/*
5814 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5815 *
5816 * We won't loop, but just check once to see if the pin count is OK. If
5817 * not, return false with no lock held.
5818 */
5819bool
 5820 ConditionalLockBufferForCleanup(Buffer buffer)
 5821 {
5822 BufferDesc *bufHdr;
5823 uint32 buf_state,
5824 refcount;
5825
 5826 Assert(BufferIsValid(buffer));
 5827
5828 /* see AIO related comment in LockBufferForCleanup() */
5829
5830 if (BufferIsLocal(buffer))
5831 {
 5832 refcount = LocalRefCount[-buffer - 1];
 5833 /* There should be exactly one pin */
5834 Assert(refcount > 0);
5835 if (refcount != 1)
5836 return false;
5837 /* Nobody else to wait for */
5838 return true;
5839 }
5840
5841 /* There should be exactly one local pin */
 5842 refcount = GetPrivateRefCount(buffer);
 5843 Assert(refcount);
 5844 if (refcount != 1)
5845 return false;
5846
5847 /* Try to acquire lock */
 5848 if (!ConditionalLockBuffer(buffer))
 5849 return false;
5850
5851 bufHdr = GetBufferDescriptor(buffer - 1);
5852 buf_state = LockBufHdr(bufHdr);
5853 refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5854
5855 Assert(refcount > 0);
5856 if (refcount == 1)
5857 {
5858 /* Successfully acquired exclusive lock with pincount 1 */
5859 UnlockBufHdr(bufHdr, buf_state);
5860 return true;
5861 }
5862
5863 /* Failed, so release the lock */
5864 UnlockBufHdr(bufHdr, buf_state);
 5865 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 5866 return false;
5867}
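/*
 * [Editor's illustrative sketch -- not part of bufmgr.c] A VACUUM-style
 * caller that skips pages it cannot cleanup-lock immediately. The
 * example_try_cleanup() helper is hypothetical.
 */
static bool
example_try_cleanup(Buffer buf)
{
	if (!ConditionalLockBufferForCleanup(buf))
		return false;			/* pin still held; caller decides what's next */

	/* cleanup lock held: prune/defragment, then release lock and pin */
	UnlockReleaseBuffer(buf);
	return true;
}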
5868
5869/*
5870 * IsBufferCleanupOK - as above, but we already have the lock
5871 *
5872 * Check whether it's OK to perform cleanup on a buffer we've already
5873 * locked. If we observe that the pin count is 1, our exclusive lock
5874 * happens to be a cleanup lock, and we can proceed with anything that
5875 * would have been allowable had we sought a cleanup lock originally.
5876 */
5877bool
 5878 IsBufferCleanupOK(Buffer buffer)
 5879 {
5880 BufferDesc *bufHdr;
5881 uint32 buf_state;
5882
 5883 Assert(BufferIsValid(buffer));
 5884
5885 /* see AIO related comment in LockBufferForCleanup() */
5886
5887 if (BufferIsLocal(buffer))
5888 {
5889 /* There should be exactly one pin */
5890 if (LocalRefCount[-buffer - 1] != 1)
5891 return false;
5892 /* Nobody else to wait for */
5893 return true;
5894 }
5895
5896 /* There should be exactly one local pin */
5897 if (GetPrivateRefCount(buffer) != 1)
5898 return false;
5899
5900 bufHdr = GetBufferDescriptor(buffer - 1);
5901
5902 /* caller must hold exclusive lock on buffer */
 5903 Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
 5904 LW_EXCLUSIVE));
5905
5906 buf_state = LockBufHdr(bufHdr);
5907
5908 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5909 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5910 {
5911 /* pincount is OK. */
5912 UnlockBufHdr(bufHdr, buf_state);
5913 return true;
5914 }
5915
5916 UnlockBufHdr(bufHdr, buf_state);
5917 return false;
5918}
5919
5920
5921/*
5922 * Functions for buffer I/O handling
5923 *
5924 * Also note that these are used only for shared buffers, not local ones.
5925 */
5926
5927/*
5928 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5929 */
5930static void
 5931 WaitIO(BufferDesc *buf)
 5932 {
 5933 ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
 5934
 5935 ConditionVariablePrepareToSleep(cv);
 5936 for (;;)
5937 {
5938 uint32 buf_state;
5939 PgAioWaitRef iow;
5940
5941 /*
5942 * It may not be necessary to acquire the spinlock to check the flag
5943 * here, but since this test is essential for correctness, we'd better
5944 * play it safe.
5945 */
5946 buf_state = LockBufHdr(buf);
5947
5948 /*
5949 * Copy the wait reference while holding the spinlock. This protects
 5950 * against a concurrent TerminateBufferIO() in another backend
 5951 * clearing the wref while it's being read.
5952 */
5953 iow = buf->io_wref;
5954 UnlockBufHdr(buf, buf_state);
5955
5956 /* no IO in progress, we don't need to wait */
5957 if (!(buf_state & BM_IO_IN_PROGRESS))
5958 break;
5959
5960 /*
5961 * The buffer has asynchronous IO in progress, wait for it to
5962 * complete.
5963 */
5964 if (pgaio_wref_valid(&iow))
5965 {
5966 pgaio_wref_wait(&iow);
5967
5968 /*
5969 * The AIO subsystem internally uses condition variables and thus
5970 * might remove this backend from the BufferDesc's CV. While that
5971 * wouldn't cause a correctness issue (the first CV sleep just
5972 * immediately returns if not already registered), it seems worth
5973 * avoiding unnecessary loop iterations, given that we take care
5974 * to do so at the start of the function.
5975 */
 5976 ConditionVariablePrepareToSleep(cv);
 5977 continue;
5978 }
5979
5980 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
5981 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5982 }
 5983 ConditionVariableCancelSleep();
 5984 }
5985
5986/*
5987 * StartBufferIO: begin I/O on this buffer
5988 * (Assumptions)
5989 * My process is executing no IO on this buffer
5990 * The buffer is Pinned
5991 *
5992 * In some scenarios multiple backends could attempt the same I/O operation
5993 * concurrently. If someone else has already started I/O on this buffer then
5994 * we will wait for completion of the IO using WaitIO().
5995 *
5996 * Input operations are only attempted on buffers that are not BM_VALID,
5997 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5998 * so we can always tell if the work is already done.
5999 *
6000 * Returns true if we successfully marked the buffer as I/O busy,
6001 * false if someone else already did the work.
6002 *
6003 * If nowait is true, then we don't wait for an I/O to be finished by another
 6004 * backend. In that case, false indicates either that the I/O was already
 6005 * finished or that it is still in progress. This is useful for callers that want to
6006 * find out if they can perform the I/O as part of a larger operation, without
6007 * waiting for the answer or distinguishing the reasons why not.
6008 */
6009bool
6010StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
6011{
6012 uint32 buf_state;
6013
 6014 ResourceOwnerEnlarge(CurrentResourceOwner);
 6015
6016 for (;;)
6017 {
6018 buf_state = LockBufHdr(buf);
6019
6020 if (!(buf_state & BM_IO_IN_PROGRESS))
6021 break;
6022 UnlockBufHdr(buf, buf_state);
6023 if (nowait)
6024 return false;
6025 WaitIO(buf);
6026 }
6027
6028 /* Once we get here, there is definitely no I/O active on this buffer */
6029
6030 /* Check if someone else already did the I/O */
6031 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6032 {
6033 UnlockBufHdr(buf, buf_state);
6034 return false;
6035 }
6036
6037 buf_state |= BM_IO_IN_PROGRESS;
6038 UnlockBufHdr(buf, buf_state);
6039
 6040 ResourceOwnerRememberBufferIO(CurrentResourceOwner,
 6041 BufferDescriptorGetBuffer(buf));
 6042
6043 return true;
6044}
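/*
 * [Editor's illustrative sketch -- not part of bufmgr.c] How a synchronous
 * read path pairs StartBufferIO() with TerminateBufferIO(); the actual block
 * read is elided. The example_read_one_buffer() helper is hypothetical, but
 * the flag/parameter choices follow the protocol documented above.
 */
static void
example_read_one_buffer(BufferDesc *bufHdr)
{
	if (!StartBufferIO(bufHdr, true /* forInput */ , false /* nowait */ ))
		return;					/* somebody else already read the page in */

	/* ... issue the read of the block into the buffer here ... */

	/* success: mark the page valid and clear the in-progress state */
	TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
}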
6045
6046/*
6047 * TerminateBufferIO: release a buffer we were doing I/O on
6048 * (Assumptions)
6049 * My process is executing IO for the buffer
6050 * BM_IO_IN_PROGRESS bit is set for the buffer
6051 * The buffer is Pinned
6052 *
6053 * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6054 * buffer's BM_DIRTY flag. This is appropriate when terminating a
6055 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6056 * marking the buffer clean if it was re-dirtied while we were writing.
6057 *
6058 * set_flag_bits gets ORed into the buffer's flags. It must include
6059 * BM_IO_ERROR in a failure case. For successful completion it could
6060 * be 0, or BM_VALID if we just finished reading in the page.
6061 *
6062 * If forget_owner is true, we release the buffer I/O from the current
6063 * resource owner. (forget_owner=false is used when the resource owner itself
6064 * is being released)
6065 */
6066void
6067TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
6068 bool forget_owner, bool release_aio)
6069{
6070 uint32 buf_state;
6071
6072 buf_state = LockBufHdr(buf);
6073
6074 Assert(buf_state & BM_IO_IN_PROGRESS);
6075 buf_state &= ~BM_IO_IN_PROGRESS;
6076
6077 /* Clear earlier errors, if this IO failed, it'll be marked again */
6078 buf_state &= ~BM_IO_ERROR;
6079
6080 if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6081 buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
6082
6083 if (release_aio)
6084 {
6085 /* release ownership by the AIO subsystem */
6086 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6087 buf_state -= BUF_REFCOUNT_ONE;
6088 pgaio_wref_clear(&buf->io_wref);
6089 }
6090
6091 buf_state |= set_flag_bits;
6092 UnlockBufHdr(buf, buf_state);
6093
6094 if (forget_owner)
 6095 ResourceOwnerForgetBufferIO(CurrentResourceOwner,
 6096 BufferDescriptorGetBuffer(buf));
 6097
 6098 ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
 6099
6100 /*
6101 * Support LockBufferForCleanup()
6102 *
6103 * We may have just released the last pin other than the waiter's. In most
6104 * cases, this backend holds another pin on the buffer. But, if, for
6105 * example, this backend is completing an IO issued by another backend, it
6106 * may be time to wake the waiter.
6107 */
6108 if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
 6109 WakePinCountWaiter(buf);
 6110 }
6111
6112/*
6113 * AbortBufferIO: Clean up active buffer I/O after an error.
6114 *
6115 * All LWLocks we might have held have been released,
6116 * but we haven't yet released buffer pins, so the buffer is still pinned.
6117 *
6118 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
6119 * possible the error condition wasn't related to the I/O.
6120 *
6121 * Note: this does not remove the buffer I/O from the resource owner.
6122 * That's correct when we're releasing the whole resource owner, but
6123 * beware if you use this in other contexts.
6124 */
6125static void
 6126 AbortBufferIO(Buffer buffer)
 6127 {
6128 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6129 uint32 buf_state;
6130
6131 buf_state = LockBufHdr(buf_hdr);
6132 Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6133
6134 if (!(buf_state & BM_VALID))
6135 {
6136 Assert(!(buf_state & BM_DIRTY));
6137 UnlockBufHdr(buf_hdr, buf_state);
6138 }
6139 else
6140 {
6141 Assert(buf_state & BM_DIRTY);
6142 UnlockBufHdr(buf_hdr, buf_state);
6143
6144 /* Issue notice if this is not the first failure... */
6145 if (buf_state & BM_IO_ERROR)
6146 {
6147 /* Buffer is pinned, so we can read tag without spinlock */
 6148 ereport(WARNING,
 6149 (errcode(ERRCODE_IO_ERROR),
6150 errmsg("could not write block %u of %s",
6151 buf_hdr->tag.blockNum,
 6152 relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
 6153 BufTagGetForkNum(&buf_hdr->tag)).str),
6154 errdetail("Multiple failures --- write error might be permanent.")));
6155 }
6156 }
6157
6158 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
6159}
6160
6161/*
6162 * Error context callback for errors occurring during shared buffer writes.
6163 */
6164static void
 6165 shared_buffer_write_error_callback(void *arg)
 6166 {
6167 BufferDesc *bufHdr = (BufferDesc *) arg;
6168
6169 /* Buffer is pinned, so we can read the tag without locking the spinlock */
6170 if (bufHdr != NULL)
6171 errcontext("writing block %u of relation \"%s\"",
6172 bufHdr->tag.blockNum,
 6173 relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
 6174 BufTagGetForkNum(&bufHdr->tag)).str);
6175}
6176
6177/*
6178 * Error context callback for errors occurring during local buffer writes.
6179 */
6180static void
 6181 local_buffer_write_error_callback(void *arg)
 6182 {
6183 BufferDesc *bufHdr = (BufferDesc *) arg;
6184
6185 if (bufHdr != NULL)
6186 errcontext("writing block %u of relation \"%s\"",
6187 bufHdr->tag.blockNum,
 6188 relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
 6189 MyProcNumber,
 6190 BufTagGetForkNum(&bufHdr->tag)).str);
6191}
6192
6193/*
6194 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
6195 */
6196static int
6197rlocator_comparator(const void *p1, const void *p2)
6198{
6199 RelFileLocator n1 = *(const RelFileLocator *) p1;
6200 RelFileLocator n2 = *(const RelFileLocator *) p2;
6201
6202 if (n1.relNumber < n2.relNumber)
6203 return -1;
6204 else if (n1.relNumber > n2.relNumber)
6205 return 1;
6206
6207 if (n1.dbOid < n2.dbOid)
6208 return -1;
6209 else if (n1.dbOid > n2.dbOid)
6210 return 1;
6211
6212 if (n1.spcOid < n2.spcOid)
6213 return -1;
6214 else if (n1.spcOid > n2.spcOid)
6215 return 1;
6216 else
6217 return 0;
6218}
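/*
 * [Editor's illustrative sketch -- not part of bufmgr.c] Using the
 * comparator with qsort()/bsearch(), as the comment above suggests. The
 * example_contains_locator() helper and its arguments are hypothetical.
 */
static bool
example_contains_locator(RelFileLocator *locators, int n, RelFileLocator key)
{
	/* sort once so that bsearch() can reuse the same comparator */
	qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);

	return bsearch(&key, locators, n, sizeof(RelFileLocator),
				   rlocator_comparator) != NULL;
}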
6219
6220/*
6221 * Lock buffer header - set BM_LOCKED in buffer state.
6222 */
6223uint32
 6224 LockBufHdr(BufferDesc *desc)
 6225 {
6226 SpinDelayStatus delayStatus;
6227 uint32 old_buf_state;
6228
 6229 Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
 6230
6231 init_local_spin_delay(&delayStatus);
6232
6233 while (true)
6234 {
6235 /* set BM_LOCKED flag */
6236 old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6237 /* if it wasn't set before we're OK */
6238 if (!(old_buf_state & BM_LOCKED))
6239 break;
6240 perform_spin_delay(&delayStatus);
6241 }
6242 finish_spin_delay(&delayStatus);
6243 return old_buf_state | BM_LOCKED;
6244}
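/*
 * [Editor's illustrative sketch -- not part of bufmgr.c] The usual header
 * spinlock pattern: LockBufHdr() returns the state with BM_LOCKED set, and
 * UnlockBufHdr() stores the (possibly modified) state back while clearing
 * BM_LOCKED. The example_header_is_dirty() helper is hypothetical.
 */
static bool
example_header_is_dirty(BufferDesc *bufHdr)
{
	uint32		buf_state = LockBufHdr(bufHdr); /* spins until we own BM_LOCKED */
	bool		dirty = (buf_state & BM_DIRTY) != 0;

	UnlockBufHdr(bufHdr, buf_state);	/* writes state back, clears BM_LOCKED */
	return dirty;
}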
6245
6246/*
6247 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
6248 * state at that point.
6249 *
6250 * Obviously the buffer could be locked by the time the value is returned, so
6251 * this is primarily useful in CAS style loops.
6252 */
6253static uint32
 6254 WaitBufHdrUnlocked(BufferDesc *buf)
 6255 {
6256 SpinDelayStatus delayStatus;
6257 uint32 buf_state;
6258
6259 init_local_spin_delay(&delayStatus);
6260
6261 buf_state = pg_atomic_read_u32(&buf->state);
6262
6263 while (buf_state & BM_LOCKED)
6264 {
6265 perform_spin_delay(&delayStatus);
6266 buf_state = pg_atomic_read_u32(&buf->state);
6267 }
6268
6269 finish_spin_delay(&delayStatus);
6270
6271 return buf_state;
6272}
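/*
 * [Editor's illustrative sketch -- not part of bufmgr.c] The CAS-style loop
 * the comment above refers to: wait for the header lock to be released, then
 * try to install an updated state atomically, retrying if somebody raced us.
 * The example_set_flag_with_cas() helper is hypothetical; the same shape is
 * used by the real pin/unpin fast paths.
 */
static void
example_set_flag_with_cas(BufferDesc *buf, uint32 flag)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);

	for (;;)
	{
		uint32		new_buf_state;

		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		new_buf_state = old_buf_state | flag;
		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   new_buf_state))
			break;				/* installed our update atomically */
		/* CAS failed: old_buf_state now holds the current value; retry */
	}
}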
6273
6274/*
6275 * BufferTag comparator.
6276 */
6277static inline int
 6278 buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
 6279 {
6280 int ret;
6281 RelFileLocator rlocatora;
6282 RelFileLocator rlocatorb;
6283
6284 rlocatora = BufTagGetRelFileLocator(ba);
6285 rlocatorb = BufTagGetRelFileLocator(bb);
6286
6287 ret = rlocator_comparator(&rlocatora, &rlocatorb);
6288
6289 if (ret != 0)
6290 return ret;
6291
6292 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
6293 return -1;
6294 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
6295 return 1;
6296
6297 if (ba->blockNum < bb->blockNum)
6298 return -1;
6299 if (ba->blockNum > bb->blockNum)
6300 return 1;
6301
6302 return 0;
6303}
6304
6305/*
6306 * Comparator determining the writeout order in a checkpoint.
6307 *
6308 * It is important that tablespaces are compared first, the logic balancing
6309 * writes between tablespaces relies on it.
6310 */
6311static inline int
 6312 ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
 6313 {
6314 /* compare tablespace */
6315 if (a->tsId < b->tsId)
6316 return -1;
6317 else if (a->tsId > b->tsId)
6318 return 1;
6319 /* compare relation */
6320 if (a->relNumber < b->relNumber)
6321 return -1;
6322 else if (a->relNumber > b->relNumber)
6323 return 1;
6324 /* compare fork */
6325 else if (a->forkNum < b->forkNum)
6326 return -1;
6327 else if (a->forkNum > b->forkNum)
6328 return 1;
6329 /* compare block number */
6330 else if (a->blockNum < b->blockNum)
6331 return -1;
6332 else if (a->blockNum > b->blockNum)
6333 return 1;
6334 /* equal page IDs are unlikely, but not impossible */
6335 return 0;
6336}
6337
6338/*
6339 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
6340 * progress.
6341 */
6342static int
 6343 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
 6344 {
6347
 6348 /* we want a min-heap, so return 1 when a < b */
6349 if (sa->progress < sb->progress)
6350 return 1;
6351 else if (sa->progress == sb->progress)
6352 return 0;
6353 else
6354 return -1;
6355}
6356
6357/*
6358 * Initialize a writeback context, discarding potential previous state.
6359 *
6360 * *max_pending is a pointer instead of an immediate value, so the coalesce
 6361 * limits can easily be changed by the GUC mechanism, and so calling code does
6362 * not have to check the current configuration. A value of 0 means that no
6363 * writeback control will be performed.
6364 */
6365void
6366WritebackContextInit(WritebackContext *context, int *max_pending)
6367{
6368 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6369
6370 context->max_pending = max_pending;
6371 context->nr_pending = 0;
6372}
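/*
 * [Editor's illustrative sketch -- not part of bufmgr.c] Initializing a
 * writeback context from a *_flush_after GUC, scheduling a tag after each
 * buffer write, and flushing the hints at the end of the loop. The
 * example_writeback_usage() helper is hypothetical; the bufmgr calls and the
 * bgwriter_flush_after GUC are real.
 */
static void
example_writeback_usage(BufferTag *tag)
{
	WritebackContext wb_context;

	/* a *max_pending of 0 disables writeback control entirely */
	WritebackContextInit(&wb_context, &bgwriter_flush_after);

	/* after writing a buffer: remember its tag for later writeback */
	ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, tag);

	/* once the write loop is done: hand the accumulated hints to the OS */
	IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
}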
6373
6374/*
6375 * Add buffer to list of pending writeback requests.
6376 */
6377void
 6378 ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
 6379 BufferTag *tag)
6380{
6381 PendingWriteback *pending;
6382
6383 /*
6384 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6385 * point in tracking in that case.
6386 */
 6387 if (io_direct_flags & IO_DIRECT_DATA ||
 6388 !enableFsync)
6389 return;
6390
6391 /*
6392 * Add buffer to the pending writeback array, unless writeback control is
6393 * disabled.
6394 */
6395 if (*wb_context->max_pending > 0)
6396 {
 6397 Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
 6398
6399 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6400
6401 pending->tag = *tag;
6402 }
6403
6404 /*
6405 * Perform pending flushes if the writeback limit is exceeded. This
6406 * includes the case where previously an item has been added, but control
6407 * is now disabled.
6408 */
6409 if (wb_context->nr_pending >= *wb_context->max_pending)
6410 IssuePendingWritebacks(wb_context, io_context);
6411}
6412
6413#define ST_SORT sort_pending_writebacks
6414#define ST_ELEMENT_TYPE PendingWriteback
6415#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
6416#define ST_SCOPE static
6417#define ST_DEFINE
6418#include "lib/sort_template.h"
6419
6420/*
6421 * Issue all pending writeback requests, previously scheduled with
6422 * ScheduleBufferTagForWriteback, to the OS.
6423 *
 6424 * Because this is only used to improve the OS's IO scheduling we try to never
6425 * error out - it's just a hint.
6426 */
6427void
 6428 IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
 6429 {
6430 instr_time io_start;
6431 int i;
6432
6433 if (wb_context->nr_pending == 0)
6434 return;
6435
6436 /*
 6437 * Executing the writes in-order can make them a lot faster, and allows
 6438 * merging writeback requests for consecutive blocks into larger writebacks.
6439 */
6440 sort_pending_writebacks(wb_context->pending_writebacks,
6441 wb_context->nr_pending);
6442
 6443 io_start = pgstat_prepare_io_time(track_io_timing);
 6444
6445 /*
6446 * Coalesce neighbouring writes, but nothing else. For that we iterate
 6447 * through the now-sorted array of pending flushes, and look ahead to
6448 * find all neighbouring (or identical) writes.
6449 */
6450 for (i = 0; i < wb_context->nr_pending; i++)
6451 {
 6452 PendingWriteback *cur;
 6453 PendingWriteback *next;
 6454 SMgrRelation reln;
6455 int ahead;
6456 BufferTag tag;
6457 RelFileLocator currlocator;
6458 Size nblocks = 1;
6459
6460 cur = &wb_context->pending_writebacks[i];
6461 tag = cur->tag;
6462 currlocator = BufTagGetRelFileLocator(&tag);
6463
6464 /*
6465 * Peek ahead, into following writeback requests, to see if they can
6466 * be combined with the current one.
6467 */
6468 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6469 {
6470
6471 next = &wb_context->pending_writebacks[i + ahead + 1];
6472
6473 /* different file, stop */
6474 if (!RelFileLocatorEquals(currlocator,
6475 BufTagGetRelFileLocator(&next->tag)) ||
6476 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6477 break;
6478
6479 /* ok, block queued twice, skip */
6480 if (cur->tag.blockNum == next->tag.blockNum)
6481 continue;
6482
6483 /* only merge consecutive writes */
6484 if (cur->tag.blockNum + 1 != next->tag.blockNum)
6485 break;
6486
6487 nblocks++;
6488 cur = next;
6489 }
6490
6491 i += ahead;
6492
6493 /* and finally tell the kernel to write the data to storage */
6494 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6495 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6496 }
6497
6498 /*
6499 * Assume that writeback requests are only issued for buffers containing
6500 * blocks of permanent relations.
6501 */
6503 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6504
6505 wb_context->nr_pending = 0;
6506}
6507
6508/* ResourceOwner callbacks */
6509
6510static void
 6511 ResOwnerReleaseBufferIO(Datum res)
 6512 {
 6513 Buffer buffer = DatumGetInt32(res);
 6514
 6515 AbortBufferIO(buffer);
 6516 }
6517
6518static char *
 6519 ResOwnerPrintBufferIO(Datum res)
 6520 {
 6521 Buffer buffer = DatumGetInt32(res);
 6522
6523 return psprintf("lost track of buffer IO on buffer %d", buffer);
6524}
6525
6526static void
 6527 ResOwnerReleaseBufferPin(Datum res)
 6528 {
 6529 Buffer buffer = DatumGetInt32(res);
 6530
6531 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6532 if (!BufferIsValid(buffer))
6533 elog(ERROR, "bad buffer ID: %d", buffer);
6534
6535 if (BufferIsLocal(buffer))
 6536 UnpinLocalBufferNoOwner(buffer);
 6537 else
 6538 UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
 6539 }
6540
6541static char *
 6542 ResOwnerPrintBufferPin(Datum res)
 6543 {
 6544 return DebugPrintBufferRefcount(DatumGetInt32(res));
 6545 }
6546
6547/*
6548 * Helper function to evict unpinned buffer whose buffer header lock is
6549 * already acquired.
6550 */
6551static bool
6552EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
6553{
6554 uint32 buf_state;
6555 bool result;
6556
6557 *buffer_flushed = false;
6558
6559 buf_state = pg_atomic_read_u32(&(desc->state));
6560 Assert(buf_state & BM_LOCKED);
6561
6562 if ((buf_state & BM_VALID) == 0)
6563 {
6564 UnlockBufHdr(desc, buf_state);
6565 return false;
6566 }
6567
6568 /* Check that it's not pinned already. */
6569 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6570 {
6571 UnlockBufHdr(desc, buf_state);
6572 return false;
6573 }
6574
6575 PinBuffer_Locked(desc); /* releases spinlock */
6576
6577 /* If it was dirty, try to clean it once. */
6578 if (buf_state & BM_DIRTY)
6579 {
 6580 LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
 6581 FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 6582 *buffer_flushed = true;
 6583 LWLockRelease(BufferDescriptorGetContentLock(desc));
 6584 }
6585
6586 /* This will return false if it becomes dirty or someone else pins it. */
6587 result = InvalidateVictimBuffer(desc);
6588
6589 UnpinBuffer(desc);
6590
6591 return result;
6592}
6593
6594/*
6595 * Try to evict the current block in a shared buffer.
6596 *
6597 * This function is intended for testing/development use only!
6598 *
6599 * To succeed, the buffer must not be pinned on entry, so if the caller had a
6600 * particular block in mind, it might already have been replaced by some other
6601 * block by the time this function runs. It's also unpinned on return, so the
6602 * buffer might be occupied again by the time control is returned, potentially
6603 * even by the same block. This inherent raciness without other interlocking
6604 * makes the function unsuitable for non-testing usage.
6605 *
6606 * *buffer_flushed is set to true if the buffer was dirty and has been
6607 * flushed, false otherwise. However, *buffer_flushed=true does not
6608 * necessarily mean that we flushed the buffer, it could have been flushed by
6609 * someone else.
6610 *
6611 * Returns true if the buffer was valid and it has now been made invalid.
6612 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6613 * or if the buffer becomes dirty again while we're trying to write it out.
6614 */
6615bool
6616EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
6617{
6618 BufferDesc *desc;
6619
 6620 Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
 6621
 6622 /* Make sure we can pin the buffer. */
 6623 ResourceOwnerEnlarge(CurrentResourceOwner);
 6624 ReservePrivateRefCountEntry();
 6625
6626 desc = GetBufferDescriptor(buf - 1);
6627 LockBufHdr(desc);
6628
6629 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6630}
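/*
 * [Editor's illustrative sketch -- not part of bufmgr.c] How a
 * testing/development caller might use EvictUnpinnedBuffer(), distinguishing
 * the "flushed first" case from a plain eviction. The example_evict() helper
 * and its log messages are hypothetical.
 */
static void
example_evict(Buffer buf)
{
	bool		flushed;

	if (EvictUnpinnedBuffer(buf, &flushed))
		elog(DEBUG1, "buffer %d evicted%s", buf,
			 flushed ? " (dirty, flushed first)" : "");
	else
		elog(DEBUG1, "buffer %d was pinned or invalid, not evicted", buf);
}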
6631
6632/*
6633 * Try to evict all the shared buffers.
6634 *
6635 * This function is intended for testing/development use only! See
6636 * EvictUnpinnedBuffer().
6637 *
6638 * The buffers_* parameters are mandatory and indicate the total count of
6639 * buffers that:
6640 * - buffers_evicted - were evicted
6641 * - buffers_flushed - were flushed
6642 * - buffers_skipped - could not be evicted
6643 */
6644void
6645EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
6646 int32 *buffers_skipped)
6647{
6648 *buffers_evicted = 0;
6649 *buffers_skipped = 0;
6650 *buffers_flushed = 0;
6651
6652 for (int buf = 1; buf <= NBuffers; buf++)
6653 {
6654 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6655 uint32 buf_state;
6656 bool buffer_flushed;
6657
6658 buf_state = pg_atomic_read_u32(&desc->state);
6659 if (!(buf_state & BM_VALID))
6660 continue;
6661
 6662 ResourceOwnerEnlarge(CurrentResourceOwner);
 6663 ReservePrivateRefCountEntry();
 6664
6665 LockBufHdr(desc);
6666
6667 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6668 (*buffers_evicted)++;
6669 else
6670 (*buffers_skipped)++;
6671
6672 if (buffer_flushed)
6673 (*buffers_flushed)++;
6674 }
6675}
6676
6677/*
6678 * Try to evict all the shared buffers containing provided relation's pages.
6679 *
6680 * This function is intended for testing/development use only! See
6681 * EvictUnpinnedBuffer().
6682 *
6683 * The caller must hold at least AccessShareLock on the relation to prevent
6684 * the relation from being dropped.
6685 *
6686 * The buffers_* parameters are mandatory and indicate the total count of
6687 * buffers that:
6688 * - buffers_evicted - were evicted
6689 * - buffers_flushed - were flushed
6690 * - buffers_skipped - could not be evicted
6691 */
6692void
 6693 EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
 6694 int32 *buffers_flushed, int32 *buffers_skipped)
6695{
 6696 Assert(!RelationUsesLocalBuffers(rel));
 6697
6698 *buffers_skipped = 0;
6699 *buffers_evicted = 0;
6700 *buffers_flushed = 0;
6701
6702 for (int buf = 1; buf <= NBuffers; buf++)
6703 {
6704 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6705 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6706 bool buffer_flushed;
6707
6708 /* An unlocked precheck should be safe and saves some cycles. */
6709 if ((buf_state & BM_VALID) == 0 ||
 6710 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
 6711 continue;
6712
6713 /* Make sure we can pin the buffer. */
 6714 ResourceOwnerEnlarge(CurrentResourceOwner);
 6715 ReservePrivateRefCountEntry();
 6716
6717 buf_state = LockBufHdr(desc);
6718
6719 /* recheck, could have changed without the lock */
6720 if ((buf_state & BM_VALID) == 0 ||
 6721 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
 6722 {
6723 UnlockBufHdr(desc, buf_state);
6724 continue;
6725 }
6726
6727 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6728 (*buffers_evicted)++;
6729 else
6730 (*buffers_skipped)++;
6731
6732 if (buffer_flushed)
6733 (*buffers_flushed)++;
6734 }
6735}
6736
6737/*
6738 * Generic implementation of the AIO handle staging callback for readv/writev
6739 * on local/shared buffers.
6740 *
6741 * Each readv/writev can target multiple buffers. The buffers have already
6742 * been registered with the IO handle.
6743 *
6744 * To make the IO ready for execution ("staging"), we need to ensure that the
6745 * targeted buffers are in an appropriate state while the IO is ongoing. For
6746 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
6747 * in this backend could lead to this backend's buffer pin being released as
6748 * part of error handling, which in turn could lead to the buffer being
6749 * replaced while IO is ongoing.
6750 */
 6751 static pg_attribute_always_inline void
 6752 buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
6753{
6754 uint64 *io_data;
6755 uint8 handle_data_len;
6756 PgAioWaitRef io_ref;
6758
6759 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
6760
6761 pgaio_io_get_wref(ioh, &io_ref);
6762
6763 /* iterate over all buffers affected by the vectored readv/writev */
6764 for (int i = 0; i < handle_data_len; i++)
6765 {
6766 Buffer buffer = (Buffer) io_data[i];
6767 BufferDesc *buf_hdr = is_temp ?
6770 uint32 buf_state;
6771
6772 /*
6773 * Check that all the buffers are actually ones that could conceivably
6774 * be done in one IO, i.e. are sequential. This is the last
6775 * buffer-aware code before IO is actually executed and confusion
6776 * about which buffers are targeted by IO can be hard to debug, making
6777 * it worth doing extra-paranoid checks.
6778 */
6779 if (i == 0)
6780 first = buf_hdr->tag;
6781 else
6782 {
6783 Assert(buf_hdr->tag.relNumber == first.relNumber);
6784 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
6785 }
6786
6787 if (is_temp)
6788 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6789 else
6790 buf_state = LockBufHdr(buf_hdr);
6791
6792 /* verify the buffer is in the expected state */
6793 Assert(buf_state & BM_TAG_VALID);
6794 if (is_write)
6795 {
6796 Assert(buf_state & BM_VALID);
6797 Assert(buf_state & BM_DIRTY);
6798 }
6799 else
6800 {
6801 Assert(!(buf_state & BM_VALID));
6802 Assert(!(buf_state & BM_DIRTY));
6803 }
6804
6805 /* temp buffers don't use BM_IO_IN_PROGRESS */
6806 if (!is_temp)
6807 Assert(buf_state & BM_IO_IN_PROGRESS);
6808
6809 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
6810
6811 /*
6812 * Reflect that the buffer is now owned by the AIO subsystem.
6813 *
6814 * For local buffers: This can't be done just via LocalRefCount, as
6815 * one might initially think, as this backend could error out while
6816 * AIO is still in progress, releasing all the pins by the backend
6817 * itself.
6818 *
6819 * This pin is released again in TerminateBufferIO().
6820 */
6821 buf_state += BUF_REFCOUNT_ONE;
6822 buf_hdr->io_wref = io_ref;
6823
6824 if (is_temp)
6825 pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
6826 else
6827 UnlockBufHdr(buf_hdr, buf_state);
6828
6829 /*
6830 * Ensure the content lock that prevents buffer modifications while
6831 * the buffer is being written out is not released early due to an
6832 * error.
6833 */
6834 if (is_write && !is_temp)
6835 {
6836 LWLock *content_lock;
6837
6838 content_lock = BufferDescriptorGetContentLock(buf_hdr);
6839
6840 Assert(LWLockHeldByMe(content_lock));
6841
6842 /*
6843 * Lock is now owned by AIO subsystem.
6844 */
6845 LWLockDisown(content_lock);
6846 }
6847
6848 /*
6849 * Stop tracking this buffer via the resowner - the AIO system now
6850 * keeps track.
6851 */
6852 if (!is_temp)
6854 }
6855}
6856
6857/*
6858 * Decode readv errors as encoded by buffer_readv_encode_error().
6859 */
6860static inline void
6862 bool *zeroed_any,
6863 bool *ignored_any,
6864 uint8 *zeroed_or_error_count,
6865 uint8 *checkfail_count,
6866 uint8 *first_off)
6867{
6868 uint32 rem_error = result.error_data;
6869
6870 /* see static asserts in buffer_readv_encode_error */
6871#define READV_COUNT_BITS 7
6872#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
6873
6874 *zeroed_any = rem_error & 1;
6875 rem_error >>= 1;
6876
6877 *ignored_any = rem_error & 1;
6878 rem_error >>= 1;
6879
6880 *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
6881 rem_error >>= READV_COUNT_BITS;
6882
6883 *checkfail_count = rem_error & READV_COUNT_MASK;
6884 rem_error >>= READV_COUNT_BITS;
6885
6886 *first_off = rem_error & READV_COUNT_MASK;
6887 rem_error >>= READV_COUNT_BITS;
6888}
6889
6890/*
6891 * Helper to encode errors for buffer_readv_complete()
6892 *
6893 * Errors are encoded as follows:
6894 * - bit 0 indicates whether any page was zeroed (1) or not (0)
6895 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
6896 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
6897 * - next READV_COUNT_BITS bits indicate the number of checksum failures
6898 * - next READV_COUNT_BITS bits indicate the first offset of the first page
6899 * that was errored or zeroed or, if no errors/zeroes, the first ignored
6900 * checksum
6901 */
6902static inline void
6904 bool is_temp,
6905 bool zeroed_any,
6906 bool ignored_any,
6907 uint8 error_count,
6908 uint8 zeroed_count,
6909 uint8 checkfail_count,
6910 uint8 first_error_off,
6911 uint8 first_zeroed_off,
6912 uint8 first_ignored_off)
6913{
6914
6915 uint8 shift = 0;
6916 uint8 zeroed_or_error_count =
6917 error_count > 0 ? error_count : zeroed_count;
6918 uint8 first_off;
6919
6921 "PG_IOV_MAX is bigger than reserved space for error data");
6923 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
6924
6925 /*
6926 * We only have space to encode one offset - but luckily that's good
6927 * enough. If there is an error, the error is the interesting offset, same
6928 * with a zeroed buffer vs an ignored buffer.
6929 */
6930 if (error_count > 0)
6931 first_off = first_error_off;
6932 else if (zeroed_count > 0)
6933 first_off = first_zeroed_off;
6934 else
6935 first_off = first_ignored_off;
6936
6937 Assert(!zeroed_any || error_count == 0);
6938
6939 result->error_data = 0;
6940
6941 result->error_data |= zeroed_any << shift;
6942 shift += 1;
6943
6944 result->error_data |= ignored_any << shift;
6945 shift += 1;
6946
6947 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
6948 shift += READV_COUNT_BITS;
6949
6950 result->error_data |= ((uint32) checkfail_count) << shift;
6951 shift += READV_COUNT_BITS;
6952
6953 result->error_data |= ((uint32) first_off) << shift;
6954 shift += READV_COUNT_BITS;
6955
6956 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
6958
6959 if (error_count > 0)
6960 result->status = PGAIO_RS_ERROR;
6961 else
6962 result->status = PGAIO_RS_WARNING;
6963
6964 /*
6965 * The encoding is complicated enough to warrant cross-checking it against
6966 * the decode function.
6967 */
6968#ifdef USE_ASSERT_CHECKING
6969 {
6970 bool zeroed_any_2,
6971 ignored_any_2;
6972 uint8 zeroed_or_error_count_2,
6973 checkfail_count_2,
6974 first_off_2;
6975
6977 &zeroed_any_2, &ignored_any_2,
6978 &zeroed_or_error_count_2,
6979 &checkfail_count_2,
6980 &first_off_2);
6981 Assert(zeroed_any == zeroed_any_2);
6982 Assert(ignored_any == ignored_any_2);
6983 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
6984 Assert(checkfail_count == checkfail_count_2);
6985 Assert(first_off == first_off_2);
6986 }
6987#endif
6988
6989#undef READV_COUNT_BITS
6990#undef READV_COUNT_MASK
6991}
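/*
 * [Editor's worked example -- not part of bufmgr.c] With READV_COUNT_BITS = 7,
 * error_data is laid out as: bit 0 = zeroed_any, bit 1 = ignored_any,
 * bits 2..8 = zeroed_or_error_count, bits 9..15 = checkfail_count,
 * bits 16..22 = first_off. For instance, a read in which two pages were
 * zeroed, one of them because of a checksum failure, with the first zeroed
 * page at buffer offset 3, would encode as
 *     1 | (0 << 1) | (2 << 2) | (1 << 9) | (3 << 16)
 * and decode back to zeroed_any = true, ignored_any = false,
 * zeroed_or_error_count = 2, checkfail_count = 1, first_off = 3.
 */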
6992
6993/*
6994 * Helper for AIO readv completion callbacks, supporting both shared and temp
6995 * buffers. Gets called once for each buffer in a multi-page read.
6996 */
 6997 static pg_attribute_always_inline void
 6998 buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
 6999 uint8 flags, bool failed, bool is_temp,
7000 bool *buffer_invalid,
7001 bool *failed_checksum,
7002 bool *ignored_checksum,
7003 bool *zeroed_buffer)
7004{
7005 BufferDesc *buf_hdr = is_temp ?
7008 BufferTag tag = buf_hdr->tag;
7009 char *bufdata = BufferGetBlock(buffer);
7010 uint32 set_flag_bits;
7011 int piv_flags;
7012
7013 /* check that the buffer is in the expected state for a read */
7014#ifdef USE_ASSERT_CHECKING
7015 {
7016 uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
7017
7018 Assert(buf_state & BM_TAG_VALID);
7019 Assert(!(buf_state & BM_VALID));
7020 /* temp buffers don't use BM_IO_IN_PROGRESS */
7021 if (!is_temp)
7022 Assert(buf_state & BM_IO_IN_PROGRESS);
7023 Assert(!(buf_state & BM_DIRTY));
7024 }
7025#endif
7026
7027 *buffer_invalid = false;
7028 *failed_checksum = false;
7029 *ignored_checksum = false;
7030 *zeroed_buffer = false;
7031
7032 /*
7033 * We ask PageIsVerified() to only log the message about checksum errors,
7034 * as the completion might be run in any backend (or IO workers). We will
7035 * report checksum errors in buffer_readv_report().
7036 */
7037 piv_flags = PIV_LOG_LOG;
7038
7039 /* the local zero_damaged_pages may differ from the definer's */
7041 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7042
7043 /* Check for garbage data. */
7044 if (!failed)
7045 {
7046 /*
7047 * If the buffer is not currently pinned by this backend, e.g. because
7048 * we're completing this IO after an error, the buffer data will have
7049 * been marked as inaccessible when the buffer was unpinned. The AIO
7050 * subsystem holds a pin, but that doesn't prevent the buffer from
7051 * having been marked as inaccessible. The completion might also be
7052 * executed in a different process.
7053 */
7054#ifdef USE_VALGRIND
7055 if (!BufferIsPinned(buffer))
7056 VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7057#endif
7058
7059 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7060 failed_checksum))
7061 {
7062 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7063 {
7064 memset(bufdata, 0, BLCKSZ);
7065 *zeroed_buffer = true;
7066 }
7067 else
7068 {
7069 *buffer_invalid = true;
7070 /* mark buffer as having failed */
7071 failed = true;
7072 }
7073 }
7074 else if (*failed_checksum)
7075 *ignored_checksum = true;
7076
7077 /* undo what we did above */
7078#ifdef USE_VALGRIND
7079 if (!BufferIsPinned(buffer))
7080 VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7081#endif
7082
7083 /*
7084 * Immediately log a message about the invalid page, but only to the
7085 * server log. The reason to do so immediately is that this may be
7086 * executed in a different backend than the one that originated the
 7087 * request. Another reason is that the originator
 7088 * might not process the query result promptly (because it is busy
 7089 * doing another part of query processing) or at all (e.g. if it was
 7090 * cancelled or errored out due to another IO also failing). The
 7091 * definer of the IO will emit an ERROR or WARNING when processing the
 7092 * IO's results.
7093 *
7094 * To avoid duplicating the code to emit these log messages, we reuse
7095 * buffer_readv_report().
7096 */
7097 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
7098 {
7099 PgAioResult result_one = {0};
7100
7101 buffer_readv_encode_error(&result_one, is_temp,
7102 *zeroed_buffer,
7103 *ignored_checksum,
7104 *buffer_invalid,
7105 *zeroed_buffer ? 1 : 0,
7106 *failed_checksum ? 1 : 0,
7107 buf_off, buf_off, buf_off);
7108 pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7109 }
7110 }
7111
7112 /* Terminate I/O and set BM_VALID. */
7113 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7114 if (is_temp)
7115 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7116 else
7117 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7118
7119 /*
7120 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7121 * callback may not be executed in the same backend that called
7122 * BUFFER_READ_START. The alternative would be to defer calling the
7123 * tracepoint to a later point (e.g. the local completion callback for
7124 * shared buffer reads), which seems even less helpful.
7125 */
7126 TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7127 tag.blockNum,
7128 tag.spcOid,
7129 tag.dbOid,
7130 tag.relNumber,
7132 false);
7133}
7134
7135/*
7136 * Perform completion handling of a single AIO read. This read may cover
7137 * multiple blocks / buffers.
7138 *
7139 * Shared between shared and local buffers, to reduce code duplication.
7140 */
 7141 static pg_attribute_always_inline PgAioResult
 7142 buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
 7143 uint8 cb_data, bool is_temp)
7144{
7145 PgAioResult result = prior_result;
 7146 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
 7147 uint8 first_error_off = 0;
7148 uint8 first_zeroed_off = 0;
7149 uint8 first_ignored_off = 0;
7150 uint8 error_count = 0;
7151 uint8 zeroed_count = 0;
7152 uint8 ignored_count = 0;
7153 uint8 checkfail_count = 0;
7154 uint64 *io_data;
7155 uint8 handle_data_len;
7156
7157 if (is_temp)
7158 {
7159 Assert(td->smgr.is_temp);
7161 }
7162 else
7163 Assert(!td->smgr.is_temp);
7164
7165 /*
7166 * Iterate over all the buffers affected by this IO and call the
7167 * per-buffer completion function for each buffer.
7168 */
7169 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7170 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7171 {
7172 Buffer buf = io_data[buf_off];
7173 bool failed;
7174 bool failed_verification = false;
7175 bool failed_checksum = false;
7176 bool zeroed_buffer = false;
7177 bool ignored_checksum = false;
7178
7180
7181 /*
7182 * If the entire I/O failed on a lower-level, each buffer needs to be
7183 * marked as failed. In case of a partial read, the first few buffers
7184 * may be ok.
7185 */
7186 failed =
7187 prior_result.status == PGAIO_RS_ERROR
7188 || prior_result.result <= buf_off;
7189
7190 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7191 &failed_verification,
7192 &failed_checksum,
7193 &ignored_checksum,
7194 &zeroed_buffer);
7195
7196 /*
7197 * Track information about the number of different kinds of error
7198 * conditions across all pages, as there can be multiple pages failing
7199 * verification as part of one IO.
7200 */
7201 if (failed_verification && !zeroed_buffer && error_count++ == 0)
7202 first_error_off = buf_off;
7203 if (zeroed_buffer && zeroed_count++ == 0)
7204 first_zeroed_off = buf_off;
7205 if (ignored_checksum && ignored_count++ == 0)
7206 first_ignored_off = buf_off;
7207 if (failed_checksum)
7208 checkfail_count++;
7209 }
7210
7211 /*
7212 * If the smgr read succeeded [partially] and page verification failed for
7213 * some of the pages, adjust the IO's result state appropriately.
7214 */
7215 if (prior_result.status != PGAIO_RS_ERROR &&
7216 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7217 {
7218 buffer_readv_encode_error(&result, is_temp,
7219 zeroed_count > 0, ignored_count > 0,
7220 error_count, zeroed_count, checkfail_count,
7221 first_error_off, first_zeroed_off,
7222 first_ignored_off);
7223 pgaio_result_report(result, td, DEBUG1);
7224 }
7225
7226 /*
7227 * For shared relations this reporting is done in
7228 * shared_buffer_readv_complete_local().
7229 */
7230 if (is_temp && checkfail_count > 0)
7232 checkfail_count);
7233
7234 return result;
7235}
7236
7237/*
7238 * AIO error reporting callback for aio_shared_buffer_readv_cb and
7239 * aio_local_buffer_readv_cb.
7240 *
7241 * The error is encoded / decoded in buffer_readv_encode_error() /
7242 * buffer_readv_decode_error().
7243 */
7244static void
 7245 buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
 7246 int elevel)
7247{
7248 int nblocks = td->smgr.nblocks;
7249 BlockNumber first = td->smgr.blockNum;
7250 BlockNumber last = first + nblocks - 1;
7251 ProcNumber errProc =
7253 RelPathStr rpath =
7254 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7255 bool zeroed_any,
7256 ignored_any;
7257 uint8 zeroed_or_error_count,
7258 checkfail_count,
7259 first_off;
7260 uint8 affected_count;
7261 const char *msg_one,
7262 *msg_mult,
7263 *det_mult,
7264 *hint_mult;
7265
7266 buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7267 &zeroed_or_error_count,
7268 &checkfail_count,
7269 &first_off);
7270
7271 /*
7272 * Treat a read that had both zeroed buffers *and* ignored checksums as a
 7273 * special case; it's too irregular to be emitted the same way as the
7274 * other cases.
7275 */
7276 if (zeroed_any && ignored_any)
7277 {
7278 Assert(zeroed_any && ignored_any);
7279 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7280 Assert(result.status != PGAIO_RS_ERROR);
7281 affected_count = zeroed_or_error_count;
7282
7283 ereport(elevel,
7285 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
7286 affected_count, checkfail_count, first, last, rpath.str),
7287 affected_count > 1 ?
7288 errdetail("Block %u held the first zeroed page.",
7289 first + first_off) : 0,
7290 errhint_plural("See server log for details about the other %d invalid block.",
7291 "See server log for details about the other %d invalid blocks.",
7292 affected_count + checkfail_count - 1,
7293 affected_count + checkfail_count - 1));
7294 return;
7295 }
7296
7297 /*
7298 * The other messages are highly repetitive. To avoid duplicating a long
7299 * and complicated ereport(), gather the translated format strings
7300 * separately and then do one common ereport.
7301 */
7302 if (result.status == PGAIO_RS_ERROR)
7303 {
7304 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7305 affected_count = zeroed_or_error_count;
7306 msg_one = _("invalid page in block %u of relation \"%s\"");
7307 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
7308 det_mult = _("Block %u held the first invalid page.");
7309 hint_mult = _("See server log for the other %u invalid block(s).");
7310 }
7311 else if (zeroed_any && !ignored_any)
7312 {
7313 affected_count = zeroed_or_error_count;
7314 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
7315 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
7316 det_mult = _("Block %u held the first zeroed page.");
7317 hint_mult = _("See server log for the other %u zeroed block(s).");
7318 }
7319 else if (!zeroed_any && ignored_any)
7320 {
7321 affected_count = checkfail_count;
7322 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
7323 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
7324 det_mult = _("Block %u held the first ignored page.");
7325 hint_mult = _("See server log for the other %u ignored block(s).");
7326 }
7327 else
 7328 pg_unreachable();
 7329
7330 ereport(elevel,
7332 affected_count == 1 ?
7333 errmsg_internal(msg_one, first + first_off, rpath.str) :
7334 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7335 affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7336 affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7337}
7338
7339static void
 7340 shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
 7341 {
7342 buffer_stage_common(ioh, false, false);
7343}
7344
7345static PgAioResult
 7346 shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
 7347 uint8 cb_data)
7348{
7349 return buffer_readv_complete(ioh, prior_result, cb_data, false);
7350}
7351
7352/*
7353 * We need a backend-local completion callback for shared buffers, to be able
7354 * to report checksum errors correctly. Unfortunately that can only safely
7355 * happen if the reporting backend has previously called
7356 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
7357 * the backend that started the IO. Hence this callback.
7358 */
7359static PgAioResult
 7360 shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
 7361 uint8 cb_data)
7362{
7363 bool zeroed_any,
7364 ignored_any;
7365 uint8 zeroed_or_error_count,
7366 checkfail_count,
7367 first_off;
7368
7369 if (prior_result.status == PGAIO_RS_OK)
7370 return prior_result;
7371
7372 buffer_readv_decode_error(prior_result,
7373 &zeroed_any,
7374 &ignored_any,
7375 &zeroed_or_error_count,
7376 &checkfail_count,
7377 &first_off);
7378
7379 if (checkfail_count)
7380 {
7382
7384 checkfail_count);
7385 }
7386
7387 return prior_result;
7388}
7389
7390static void
 7391 local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
 7392 {
7393 buffer_stage_common(ioh, false, true);
7394}
7395
7396static PgAioResult
 7397 local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
 7398 uint8 cb_data)
7399{
7400 return buffer_readv_complete(ioh, prior_result, cb_data, true);
7401}
7402
7403/* readv callback is passed READ_BUFFERS_* flags as callback data */
 7404 const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
 7405 .stage = shared_buffer_readv_stage,
 7406 .complete_shared = shared_buffer_readv_complete,
7407 /* need a local callback to report checksum failures */
7408 .complete_local = shared_buffer_readv_complete_local,
7409 .report = buffer_readv_report,
7410};
7411
7412/* readv callback is passed READ_BUFFERS_* flags as callback data */
 7413 const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
 7414 .stage = local_buffer_readv_stage,
 7415
7416 /*
7417 * Note that this, in contrast to the shared_buffers case, uses
7418 * complete_local, as only the issuing backend has access to the required
 7419 * data structures. This is important because the IO completion may be
 7420 * consumed incidentally by another backend.
7421 */
7422 .complete_local = local_buffer_readv_complete,
7423 .report = buffer_readv_report,
7424};
int io_method
Definition: aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:968
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:159
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:961
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:363
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:327
bool pgaio_have_staged(void)
Definition: aio.c:1104
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:1002
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:352
void pgaio_submit_staged(void)
Definition: aio.c:1120
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:988
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:237
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:185
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition: aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition: aio.h:198
@ IOMETHOD_SYNC
Definition: aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition: aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
Definition: aio_callback.c:140
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
Definition: aio_callback.c:156
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:173
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition: aio_types.h:98
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_OK
Definition: aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:347
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:408
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:293
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:237
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
int BgWriterDelay
Definition: bgwriter.c:58
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
#define binaryheap_empty(h)
Definition: binaryheap.h:65
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define MaxBlockNumber
Definition: block.h:35
static int32 next
Definition: blutils.c:224
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
#define BufferIsLocal(buffer)
Definition: buf.h:37
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
BufferDescPadded * BufferDescriptors
Definition: buf_init.c:21
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:86
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
Definition: buf_internals.h:71
#define BM_PERMANENT
Definition: buf_internals.h:77
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:53
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:51
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
Definition: buf_internals.h:56
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:75
#define BM_DIRTY
Definition: buf_internals.h:69
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
Definition: buf_internals.h:68
#define BM_JUST_DIRTIED
Definition: buf_internals.h:74
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:60
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:72
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:54
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:59
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:70
#define BM_IO_ERROR
Definition: buf_internals.h:73
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:76
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
bool track_io_timing
Definition: bufmgr.c:147
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5619
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition: bufmgr.c:4998
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5370
void DropDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:4860
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition: bufmgr.c:6312
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition: bufmgr.c:7142
bool BufferIsExclusiveLocked(Buffer buffer)
Definition: bufmgr.c:2860
const ResourceOwnerDesc buffer_pin_resowner_desc
Definition: bufmgr.c:244
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4198
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:325
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1564
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: bufmgr.c:4507
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:2983
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7360
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition: bufmgr.c:1262
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition: bufmgr.c:1527
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:651
static uint32 PrivateRefCountClock
Definition: bufmgr.c:218
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4258
static void ResOwnerReleaseBufferIO(Datum res)
Definition: bufmgr.c:6511
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7397
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition: bufmgr.c:1489
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6645
int io_max_combine_limit
Definition: bufmgr.c:172
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:3041
const ResourceOwnerDesc buffer_io_resowner_desc
Definition: bufmgr.c:235
bool zero_damaged_pages
Definition: bufmgr.c:144
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:91
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:3152
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6693
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition: bufmgr.c:6998
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:6254
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition: bufmgr.c:6278
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:5878
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:73
static char * ResOwnerPrintBufferIO(Datum res)
Definition: bufmgr.c:6519
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:858
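A minimal usage sketch for ExtendBufferedRel(), assuming a caller-supplied Relation rel; with EB_LOCK_FIRST the new page comes back pinned and exclusively locked, so only initialization and release remain:

	Buffer		newbuf;

	/* Append one zero-filled page; BMR_REL() wraps the Relation for the call. */
	newbuf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);

	/* ... initialize the page via BufferGetPage(newbuf) ... */

	UnlockReleaseBuffer(newbuf);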
void AtEOXact_Buffers(bool isCommit)
Definition: bufmgr.c:3965
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:6126
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition: bufmgr.c:7404
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:890
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1193
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition: bufmgr.c:1593
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1031
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:2000
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:4034
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition: bufmgr.c:1551
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition: bufmgr.c:5212
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition: bufmgr.c:4628
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:6197
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition: bufmgr.c:922
struct SMgrSortArray SMgrSortArray
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition: bufmgr.c:7413
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:2265
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:4016
int io_combine_limit_guc
Definition: bufmgr.c:171
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:6343
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:4219
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:72
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition: bufmgr.c:6752
#define BUF_REUSABLE
Definition: bufmgr.c:81
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6181
static void BufferSync(int flags)
Definition: bufmgr.c:3318
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition: bufmgr.c:1764
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7391
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:4141
static char * ResOwnerPrintBufferPin(Datum res)
Definition: bufmgr.c:6542
void CheckPointBuffers(int flags)
Definition: bufmgr.c:4184
bool BufferIsDirty(Buffer buffer)
Definition: bufmgr.c:2888
static uint32 MaxProportionalPins
Definition: bufmgr.c:221
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2593
bool BgBufferSync(WritebackContext *wb_context)
Definition: bufmgr.c:3594
static void WakePinCountWaiter(BufferDesc *buf)
Definition: bufmgr.c:3198
bool BufferIsPermanent(Buffer buffer)
Definition: bufmgr.c:4430
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:100
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7340
void UnlockBuffers(void)
Definition: bufmgr.c:5544
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:561
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7346
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:2333
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5598
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition: bufmgr.c:4398
int bgwriter_flush_after
Definition: bufmgr.c:179
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5338
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4799
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4460
bool HoldingBufferPinThatDelaysRecovery(void)
Definition: bufmgr.c:5794
int checkpoint_flush_after
Definition: bufmgr.c:178
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5355
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1110
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
Definition: bufmgr.c:6067
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:3242
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6165
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:6378
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1632
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:6366
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2921
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:483
double bgwriter_lru_multiplier
Definition: bufmgr.c:146
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition: bufmgr.c:6552
int backend_flush_after
Definition: bufmgr.c:180
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2531
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition: bufmgr.c:7245
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:259
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:183
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:425
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2549
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5652
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5572
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:219
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5402
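A hedged illustration of the protocol behind MarkBufferDirtyHint(): a change that is safe to lose may be made while holding only a shared content lock, but must then be reported with this function rather than MarkBufferDirty(). SetSomeHintBit() is a hypothetical stand-in for, e.g., a tuple infomask update:

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	SetSomeHintBit(BufferGetPage(buf));	/* hypothetical, loss-tolerant change */
	MarkBufferDirtyHint(buf, true);		/* true: page has the standard layout */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);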
void FlushRelationBuffers(Relation rel)
Definition: bufmgr.c:4908
#define READV_COUNT_BITS
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:6428
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:448
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition: bufmgr.c:6616
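A small sketch of single-buffer eviction with EvictUnpinnedBuffer(), assuming buf is a valid shared buffer that the caller does not have pinned; the out-parameter reports whether the buffer had to be written back first:

	bool		buffer_flushed;

	if (EvictUnpinnedBuffer(buf, &buffer_flushed))
		elog(DEBUG1, "evicted buffer %d%s", buf,
			 buffer_flushed ? " after flushing it" : "");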
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:842
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition: bufmgr.c:682
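A hedged sketch combining PrefetchBuffer() and ReadRecentBuffer(): the prefetch result can carry a recent_buffer hint that lets a later access skip the buffer-mapping lookup, with ReadBufferExtended() as the fallback; rel and blkno are assumed caller-supplied:

	PrefetchBufferResult pref;
	Buffer		buf;

	pref = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

	/* ... later, when the block is actually needed ... */
	if (BufferIsValid(pref.recent_buffer) &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno,
						 pref.recent_buffer))
		buf = pref.recent_buffer;	/* re-pinned without a mapping lookup */
	else
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);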
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:83
int maintenance_io_concurrency
Definition: bufmgr.c:162
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:3233
void FlushDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:5276
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:2171
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:5098
int effective_io_concurrency
Definition: bufmgr.c:155
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:351
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:6010
struct PrivateRefCountEntry PrivateRefCountEntry
struct CkptTsStatus CkptTsStatus
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1508
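A hedged sketch of the two-step read API, patterned on ReadBuffer_common() in this file: the caller fills in the public ReadBuffersOperation fields and calls WaitReadBuffers() only if StartReadBuffer() reports that I/O was started; the rel and persistence field names are assumed from bufmgr.h:

	ReadBuffersOperation operation;
	Buffer		buffer;

	operation.smgr = RelationGetSmgr(rel);
	operation.rel = rel;
	operation.persistence = rel->rd_rel->relpersistence;
	operation.forknum = MAIN_FORKNUM;
	operation.strategy = NULL;

	if (StartReadBuffer(&operation, &buffer, blkno, 0 /* flags */))
		WaitReadBuffers(&operation);

	/* buffer is now pinned and valid */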
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:805
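The canonical single-page read/modify pattern built from ReadBufferExtended() and the pin/lock primitives indexed here; a sketch assuming a caller-supplied rel and blkno, with the page change made inside a critical section:

	Buffer		buf;
	Page		page;

	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);

	START_CRIT_SECTION();
	/* ... modify the page and emit WAL if required ... */
	MarkBufferDirty(buf);
	END_CRIT_SECTION();

	UnlockReleaseBuffer(buf);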
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:6224
static void ResOwnerReleaseBufferPin(Datum res)
Definition: bufmgr.c:6527
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:215
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition: bufmgr.c:6861
#define READV_COUNT_MASK
int io_combine_limit
Definition: bufmgr.c:170
void InitBufferManagerAccess(void)
Definition: bufmgr.c:3982
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition: bufmgr.c:6903
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3892
uint32 GetAdditionalPinLimit(void)
Definition: bufmgr.c:2505
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:758
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:216
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:217
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5820
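A hedged sketch of the cleanup-lock pattern used by VACUUM-style callers: try for the super-exclusive lock without waiting, and fall back to work that a share lock permits; vac_strategy stands in for a caller-owned BufferAccessStrategy:

	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy);

	if (ConditionalLockBufferForCleanup(buf))
	{
		/* sole pin + exclusive lock: safe to rearrange the page */
		/* ... prune/defragment ... */
		UnlockReleaseBuffer(buf);
	}
	else
	{
		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... do only what a share lock allows, or skip the page ... */
		UnlockReleaseBuffer(buf);
	}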
int bgwriter_lru_maxpages
Definition: bufmgr.c:145
uint32 GetPinLimit(void)
Definition: bufmgr.c:2493
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5931
#define BUF_WRITTEN
Definition: bufmgr.c:80
void FlushOneBuffer(Buffer buffer)
Definition: bufmgr.c:5318
@ BAS_BULKREAD
Definition: bufmgr.h:37
@ BAS_BULKWRITE
Definition: bufmgr.h:39
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:196
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:197
#define P_NEW
Definition: bufmgr.h:191
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:115
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:417
#define DEFAULT_IO_COMBINE_LIMIT
Definition: bufmgr.h:167
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:384
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:117
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:166
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition: bufmgr.h:161
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition: bufmgr.h:119
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition: bufmgr.h:162
void * Block
Definition: bufmgr.h:26
@ EB_LOCK_TARGET
Definition: bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:75
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition: bufmgr.h:121
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:198
ReadBufferMode
Definition: bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:47
@ RBM_NORMAL
Definition: bufmgr.h:46
#define BMR_REL(p_rel)
Definition: bufmgr.h:111
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:368
bool ignore_checksum_failure
Definition: bufpage.c:27
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1509
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition: bufpage.c:94
#define PIV_LOG_LOG
Definition: bufpage.h:469
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
PageData * Page
Definition: bufpage.h:82
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition: bufpage.h:470
#define likely(x)
Definition: c.h:402
uint8_t uint8
Definition: c.h:537
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:223
#define Max(x, y)
Definition: c.h:998
double float8
Definition: c.h:636
#define pg_attribute_always_inline
Definition: c.h:269
int16_t int16
Definition: c.h:534
int32_t int32
Definition: c.h:535
uint64_t uint64
Definition: c.h:540
#define pg_unreachable()
Definition: c.h:331
#define unlikely(x)
Definition: c.h:403
uint32_t uint32
Definition: c.h:539
#define lengthof(array)
Definition: c.h:788
#define MemSet(start, val, len)
Definition: c.h:1020
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:938
size_t Size
Definition: c.h:611
bool IsCatalogRelationOid(Oid relid)
Definition: catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition: catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:785
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int64 TimestampTz
Definition: timestamp.h:39
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:952
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:358
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1380
struct cursor * cur
Definition: ecpg.c:29
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1161
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1234
int errdetail(const char *fmt,...)
Definition: elog.c:1207
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint_internal(const char *fmt,...)
Definition: elog.c:1343
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1364
#define _(x)
Definition: elog.c:91
#define errcontext
Definition: elog.h:198
#define DEBUG3
Definition: elog.h:28
#define LOG_SERVER_ONLY
Definition: elog.h:32
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:284
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:424
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:606
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:683
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:171
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:723
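A short sketch of how the freelist.c strategy entry points are typically driven by bulk operations: create a ring-buffer strategy once, pass it to every read, and free it at the end; rel and nblocks are assumed caller-supplied:

	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
											 RBM_NORMAL, strategy);

		/* ... lock, examine, and unlock the page here ... */
		ReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);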
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:40
int NBuffers
Definition: globals.c:142
bool enableFsync
Definition: globals.c:129
ProcNumber MyProcNumber
Definition: globals.c:90
int VacuumCostPageMiss
Definition: globals.c:152
bool VacuumCostActive
Definition: globals.c:158
int VacuumCostBalance
Definition: globals.c:157
int MaxBackends
Definition: globals.c:146
int VacuumCostPageDirty
Definition: globals.c:153
int VacuumCostPageHit
Definition: globals.c:151
Assert(PointerIsAligned(start, uint64))
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
BufferUsage pgBufferUsage
Definition: instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
int b
Definition: isn.c:74
int a
Definition: isn.c:73
int j
Definition: isn.c:78
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
int32 * LocalRefCount
Definition: localbuf.c:48
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition: localbuf.c:182
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:839
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition: localbuf.c:521
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:1001
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:1012
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:803
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:489
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:700
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
Definition: localbuf.c:560
int NLocBuffer
Definition: localbuf.c:44
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:71
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:345
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:846
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: localbuf.c:663
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:118
#define ExclusiveLock
Definition: lockdefs.h:42
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1977
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockDisown(LWLock *lock)
Definition: lwlock.c:1883
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2021
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1345
void ForEachLWLockHeldByMe(void(*callback)(LWLock *, LWLockMode, void *), void *context)
Definition: lwlock.c:1962
LWLockMode
Definition: lwlock.h:111
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
static PgChecksumMode mode
Definition: pg_checksums.c:55
static int64 current_size
Definition: pg_checksums.c:63
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
#define PG_IOV_MAX
Definition: pg_iovec.h:47
static char * buf
Definition: pg_test_fsync.c:72
IOObject
Definition: pgstat.h:274
@ IOOBJECT_RELATION
Definition: pgstat.h:275
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:276
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:705
IOContext
Definition: pgstat.h:283
@ IOCONTEXT_NORMAL
Definition: pgstat.h:287
@ IOOP_EXTEND
Definition: pgstat.h:312
@ IOOP_READ
Definition: pgstat.h:313
@ IOOP_WRITEBACK
Definition: pgstat.h:309
@ IOOP_HIT
Definition: pgstat.h:307
@ IOOP_EVICT
Definition: pgstat.h:305
@ IOOP_REUSE
Definition: pgstat.h:308
@ IOOP_WRITE
Definition: pgstat.h:314
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:710
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:122
#define qsort(a, b, c, d)
Definition: port.h:479
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:332
uint64_t Datum
Definition: postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
#define NUM_AUXILIARY_PROCS
Definition: proc.h:463
#define DELAY_CHKPT_START
Definition: proc.h:135
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:499
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:439
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:387
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:761
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:791
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1089
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
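A hedged sketch of consuming a fixed block range through the read-stream entry points above, loosely modeled on RelationCopyStorageUsingBuffer() in this file; BlockRangeReadStreamPrivate and its fields are assumed from read_stream.h, and smgr/nblocks are caller-supplied:

	BlockRangeReadStreamPrivate p;
	ReadStream *stream;
	Buffer		buf;

	p.current_blocknum = 0;
	p.last_exclusive = nblocks;

	stream = read_stream_begin_smgr_relation(READ_STREAM_FULL,
											 NULL,	/* no strategy */
											 smgr,
											 RELPERSISTENCE_PERMANENT,
											 MAIN_FORKNUM,
											 block_range_read_stream_cb,
											 &p,
											 0);	/* no per-buffer data */

	while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		/* ... use the pinned, valid buffer ... */
		ReleaseBuffer(buf);
	}

	read_stream_end(stream);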
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:576
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:646
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:667
#define RelationIsValid(relation)
Definition: rel.h:489
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
#define MAX_FORKNUM
Definition: relpath.h:70
#define relpath(rlocator, forknum)
Definition: relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:449
#define RELEASE_PRIO_BUFFER_IOS
Definition: resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition: resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition: resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:733
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:131
void ProcSendSignal(ProcNumber procNumber)
Definition: proc.c:1986
PGPROC * MyProc
Definition: proc.c:66
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:766
int DeadlockTimeout
Definition: proc.c:57
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:754
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1974
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:187
int wait_backend_pgprocno
BufferTag tag
pg_atomic_uint32 state
PgAioWaitRef io_wref
SMgrRelation smgr
Definition: bufmgr.h:107
int64 shared_blks_dirtied
Definition: instrument.h:28
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_read
Definition: instrument.h:27
int64 shared_blks_written
Definition: instrument.h:29
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:119
int index
Definition: bufmgr.c:127
int num_scanned
Definition: bufmgr.c:124
float8 progress
Definition: bufmgr.c:118
int num_to_scan
Definition: bufmgr.c:122
Oid tsId
Definition: bufmgr.c:109
struct ErrorContextCallback * previous
Definition: elog.h:297
void(* callback)(void *arg)
Definition: elog.h:298
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
Definition: dynahash.c:222
Definition: lwlock.h:42
int delayChkptFlags
Definition: proc.h:257
PgAioHandleCallbackStage stage
Definition: aio.h:219
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
int32 result
Definition: aio_types.h:113
uint32 id
Definition: aio_types.h:105
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133
PgStat_Counter buf_written_clean
Definition: pgstat.h:240
PgStat_Counter maxwritten_clean
Definition: pgstat.h:241
PgStat_Counter buf_alloc
Definition: pgstat.h:242
PgStat_Counter buffers_written
Definition: pgstat.h:264
Buffer recent_buffer
Definition: bufmgr.h:61
ForkNumber forknum
Definition: bufmgr.h:130
PgAioWaitRef io_wref
Definition: bufmgr.h:143
Buffer * buffers
Definition: bufmgr.h:138
SMgrRelation smgr
Definition: bufmgr.h:128
BufferAccessStrategy strategy
Definition: bufmgr.h:131
BlockNumber blocknum
Definition: bufmgr.h:139
PgAioReturn io_return
Definition: bufmgr.h:144
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
RelFileLocator rd_locator
Definition: rel.h:57
Form_pg_class rd_rel
Definition: rel.h:111
const char * name
Definition: resowner.h:93
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:47
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38
SMgrRelation srel
Definition: bufmgr.c:140
RelFileLocator rlocator
Definition: bufmgr.c:139
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
Oid spcOid
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1837
struct PgAioTargetData::@123 smgr
BlockNumber blockNum
Definition: aio_types.h:66
RelFileLocator rlocator
Definition: aio_types.h:65
BlockNumber nblocks
Definition: aio_types.h:67
ForkNumber forkNum
Definition: aio_types.h:68
static volatile sig_atomic_t waiting
Definition: waiteventset.c:171
bool RecoveryInProgress(void)
Definition: xlog.c:6383
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3124
CheckpointStatsData CheckpointStats
Definition: xlog.c:210
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2780
#define CHECKPOINT_FLUSH_UNLOGGED
Definition: xlog.h:143
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139
#define XLogIsNeeded()
Definition: xlog.h:109
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1077
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1249
#define InHotStandby
Definition: xlogutils.h:60