
PostgreSQL Source Code git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/aio.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include "lib/sort_template.h"

Go to the source code of this file.

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define READV_COUNT_BITS   7
 
#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static void ResOwnerReleaseBufferIO (Datum res)
 
static char * ResOwnerPrintBufferIO (Datum res)
 
static void ResOwnerReleaseBufferPin (Datum res)
 
static char * ResOwnerPrintBufferPin (Datum res)
 
static Buffer ReadBuffer_common (Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void UnpinBufferNoOwner (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static void AbortBufferIO (Buffer buffer)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static bool AsyncReadBuffers (ReadBuffersOperation *operation, int *nblocks_progress)
 
static void CheckReadBuffersOperation (ReadBuffersOperation *operation, bool is_complete)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void ZeroAndLockBuffer (Buffer buffer, ReadBufferMode mode, bool already_valid)
 
static pg_attribute_always_inline Buffer PinBufferForBlock (Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static pg_attribute_always_inline bool StartReadBuffersImpl (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
 
bool StartReadBuffers (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffer (ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
 
static bool ReadBuffersCanStartIOOnce (Buffer buffer, bool nowait)
 
static bool ReadBuffersCanStartIO (Buffer buffer, bool nowait)
 
static void ProcessReadBuffersResult (ReadBuffersOperation *operation)
 
void WaitReadBuffers (ReadBuffersOperation *operation)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
uint32 GetPinLimit (void)
 
uint32 GetAdditionalPinLimit (void)
 
void LimitAdditionalPins (uint32 *additional_pins)
 
bool BufferIsExclusiveLocked (Buffer buffer)
 
bool BufferIsDirty (Buffer buffer)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
static void WakePinCountWaiter (BufferDesc *buf)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferManagerAccess (void)
 
char * DebugPrintBufferRefcount (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
bool StartBufferIO (BufferDesc *buf, bool forInput, bool nowait)
 
void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 
static bool EvictUnpinnedBufferInternal (BufferDesc *desc, bool *buffer_flushed)
 
bool EvictUnpinnedBuffer (Buffer buf, bool *buffer_flushed)
 
void EvictAllUnpinnedBuffers (int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
void EvictRelUnpinnedBuffers (Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
static pg_attribute_always_inline void buffer_stage_common (PgAioHandle *ioh, bool is_write, bool is_temp)
 
static void buffer_readv_decode_error (PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
 
static void buffer_readv_encode_error (PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
 
static pg_attribute_always_inline void buffer_readv_complete_one (PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
 
static pg_attribute_always_inline PgAioResult buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
 
static void buffer_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static void shared_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete_local (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void local_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult local_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT
 
int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 
static uint32 MaxProportionalPins
 
const ResourceOwnerDesc buffer_io_resowner_desc
 
const ResourceOwnerDesc buffer_pin_resowner_desc
 
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
 
const PgAioHandleCallbacks aio_local_buffer_readv_cb
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 91 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 81 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 80 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 73 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:425
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:368
int32 * LocalRefCount
Definition: localbuf.c:48

Definition at line 483 of file bufmgr.c.
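
For illustration, a minimal standalone simulation of the pin test follows; the _sim names are invented, and the real macro consults LocalRefCount[] for local buffers and GetPrivateRefCount() for shared ones.

/* Standalone sketch of the Buffer sign convention behind BufferIsPinned:
 * buffer 0 is invalid, negative numbers are backend-local buffers, and
 * positive numbers are shared buffers (1-based). Not PostgreSQL code. */
#include <stdbool.h>
#include <stdio.h>

#define InvalidBuffer 0

static int LocalRefCount_sim[16];   /* stand-in for LocalRefCount[] */
static int PrivateRefCount_sim[16]; /* stand-in for GetPrivateRefCount() */

static bool
BufferIsPinned_sim(int bufnum)
{
    if (bufnum == InvalidBuffer)
        return false;
    if (bufnum < 0)                 /* local buffers are negative */
        return LocalRefCount_sim[-bufnum - 1] > 0;
    return PrivateRefCount_sim[bufnum - 1] > 0; /* shared, 1-based */
}

int
main(void)
{
    PrivateRefCount_sim[0] = 1;     /* shared buffer 1 pinned once */
    LocalRefCount_sim[2] = 2;       /* local buffer -3 pinned twice */
    printf("%d %d %d\n",
           BufferIsPinned_sim(1),   /* prints 1 */
           BufferIsPinned_sim(-3),  /* prints 1 */
           BufferIsPinned_sim(2));  /* prints 0 */
    return 0;
}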

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 72 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 76 of file bufmgr.c.
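
Both block-lookup macros reduce to simple index arithmetic. A standalone sketch, assuming local buffer descriptors are assigned buf_id = -i - 2 as done in localbuf.c:

#include <assert.h>

int
main(void)
{
    /* shared: BufHdrGetBlock indexes BufferBlocks by buf_id directly;
     * byte offset of buffer buf_id's block, assuming BLCKSZ of 8192 */
    int shared_buf_id = 3;
    long offset = (long) shared_buf_id * 8192;
    assert(offset == 3 * 8192L);

    /* local: LocalBufHdrGetBlock round-trips the negative-id encoding */
    for (int i = 0; i < 8; i++)
    {
        int buf_id = -i - 2;          /* assignment done in localbuf.c */
        assert(-(buf_id + 2) == i);   /* the macro's index math */
    }
    return 0;
}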

◆ READV_COUNT_BITS

#define READV_COUNT_BITS   7

◆ READV_COUNT_MASK

#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 100 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 83 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 6415 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 6415 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 6417 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 6417 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 6414 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 6414 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 6416 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 6416 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 6413 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 6413 of file bufmgr.c.

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

static void AbortBufferIO ( Buffer  buffer)
static

Definition at line 6126 of file bufmgr.c.

6127{
6128 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6129 uint32 buf_state;
6130
6131 buf_state = LockBufHdr(buf_hdr);
6132 Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6133
6134 if (!(buf_state & BM_VALID))
6135 {
6136 Assert(!(buf_state & BM_DIRTY));
6137 UnlockBufHdr(buf_hdr, buf_state);
6138 }
6139 else
6140 {
6141 Assert(buf_state & BM_DIRTY);
6142 UnlockBufHdr(buf_hdr, buf_state);
6143
6144 /* Issue notice if this is not the first failure... */
6145 if (buf_state & BM_IO_ERROR)
6146 {
6147 /* Buffer is pinned, so we can read tag without spinlock */
 6148 ereport(WARNING,
 6149 (errcode(ERRCODE_IO_ERROR),
6150 errmsg("could not write block %u of %s",
6151 buf_hdr->tag.blockNum,
 6152 relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
 6153 BufTagGetForkNum(&buf_hdr->tag)).str),
6154 errdetail("Multiple failures --- write error might be permanent.")));
6155 }
6156 }
6157
6158 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
6159}
#define BM_TAG_VALID
Definition: buf_internals.h:71
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
#define BM_DIRTY
Definition: buf_internals.h:69
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:72
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:70
#define BM_IO_ERROR
Definition: buf_internals.h:73
static BufferDesc * GetBufferDescriptor(uint32 id)
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
Definition: bufmgr.c:6067
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:6224
uint32_t uint32
Definition: c.h:539
int errdetail(const char *fmt,...)
Definition: elog.c:1207
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define WARNING
Definition: elog.h:36
#define ereport(elevel,...)
Definition: elog.h:150
Assert(PointerIsAligned(start, uint64))
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146
BufferTag tag
BlockNumber blockNum

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg(), GetBufferDescriptor(), LockBufHdr(), relpathperm, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResOwnerReleaseBufferIO().

◆ AsyncReadBuffers()

static bool AsyncReadBuffers ( ReadBuffersOperation *  operation,
int *  nblocks_progress 
)
static

Definition at line 1764 of file bufmgr.c.

1765{
1766 Buffer *buffers = &operation->buffers[0];
1767 int flags = operation->flags;
1768 BlockNumber blocknum = operation->blocknum;
1769 ForkNumber forknum = operation->forknum;
1770 char persistence = operation->persistence;
1771 int16 nblocks_done = operation->nblocks_done;
1772 Buffer *io_buffers = &operation->buffers[nblocks_done];
1773 int io_buffers_len = 0;
1774 PgAioHandle *ioh;
1775 uint32 ioh_flags = 0;
1776 void *io_pages[MAX_IO_COMBINE_LIMIT];
1777 IOContext io_context;
1778 IOObject io_object;
1779 bool did_start_io;
1780
1781 /*
1782 * When this IO is executed synchronously, either because the caller will
1783 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1784 * the AIO subsystem needs to know.
1785 */
1786 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1787 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1788
1789 if (persistence == RELPERSISTENCE_TEMP)
1790 {
1791 io_context = IOCONTEXT_NORMAL;
1792 io_object = IOOBJECT_TEMP_RELATION;
1793 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1794 }
1795 else
1796 {
1797 io_context = IOContextForStrategy(operation->strategy);
1798 io_object = IOOBJECT_RELATION;
1799 }
1800
1801 /*
1802 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1803 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1804 * set globally, but on a per-session basis. The completion callback,
1805 * which may be run in other processes, e.g. in IO workers, may have a
1806 * different value of the zero_damaged_pages GUC.
1807 *
1808 * XXX: We probably should eventually use a different flag for
1809 * zero_damaged_pages, so we can report different log levels / error codes
1810 * for zero_damaged_pages and ZERO_ON_ERROR.
1811 */
 1812 if (zero_damaged_pages)
 1813 flags |= READ_BUFFERS_ZERO_ON_ERROR;
 1814
1815 /*
1816 * For the same reason as with zero_damaged_pages we need to use this
1817 * backend's ignore_checksum_failure value.
1818 */
 1819 if (ignore_checksum_failure)
 1820 flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
 1821
1822
1823 /*
1824 * To be allowed to report stats in the local completion callback we need
1825 * to prepare to report stats now. This ensures we can safely report the
1826 * checksum failure even in a critical section.
1827 */
 1828 pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
 1829
1830 /*
1831 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1832 * might block, which we don't want after setting IO_IN_PROGRESS.
1833 *
1834 * If we need to wait for IO before we can get a handle, submit
1835 * already-staged IO first, so that other backends don't need to wait.
1836 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1837 * wait for already submitted IO, which doesn't require additional locks,
1838 * but it could still cause undesirable waits.
1839 *
1840 * A secondary benefit is that this would allow us to measure the time in
1841 * pgaio_io_acquire() without causing undue timer overhead in the common,
1842 * non-blocking, case. However, currently the pgstats infrastructure
1843 * doesn't really allow that, as it a) asserts that an operation can't
1844 * have time without operations b) doesn't have an API to report
1845 * "accumulated" time.
1846 */
 1847 ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
 1848 if (unlikely(!ioh))
1849 {
 1850 pgaio_submit_staged();
 1851
 1852 ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
1853 }
1854
1855 /*
1856 * Check if we can start IO on the first to-be-read buffer.
1857 *
1858 * If an I/O is already in progress in another backend, we want to wait
1859 * for the outcome: either done, or something went wrong and we will
1860 * retry.
1861 */
1862 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1863 {
1864 /*
1865 * Someone else has already completed this block, we're done.
1866 *
1867 * When IO is necessary, ->nblocks_done is updated in
1868 * ProcessReadBuffersResult(), but that is not called if no IO is
1869 * necessary. Thus update here.
1870 */
1871 operation->nblocks_done += 1;
1872 *nblocks_progress = 1;
1873
1874 pgaio_io_release(ioh);
1875 pgaio_wref_clear(&operation->io_wref);
1876 did_start_io = false;
1877
1878 /*
1879 * Report and track this as a 'hit' for this backend, even though it
1880 * must have started out as a miss in PinBufferForBlock(). The other
1881 * backend will track this as a 'read'.
1882 */
1883 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1884 operation->smgr->smgr_rlocator.locator.spcOid,
1885 operation->smgr->smgr_rlocator.locator.dbOid,
1886 operation->smgr->smgr_rlocator.locator.relNumber,
1887 operation->smgr->smgr_rlocator.backend,
1888 true);
1889
 1890 if (persistence == RELPERSISTENCE_TEMP)
 1891 pgBufferUsage.local_blks_hit += 1;
 1892 else
 1893 pgBufferUsage.shared_blks_hit += 1;
 1894
1895 if (operation->rel)
1896 pgstat_count_buffer_hit(operation->rel);
1897
1898 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1899
 1900 if (VacuumCostActive)
 1901 VacuumCostBalance += VacuumCostPageHit;
 1902 }
1903 else
1904 {
1905 instr_time io_start;
1906
1907 /* We found a buffer that we need to read in. */
1908 Assert(io_buffers[0] == buffers[nblocks_done]);
1909 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
1910 io_buffers_len = 1;
1911
1912 /*
1913 * How many neighboring-on-disk blocks can we scatter-read into other
1914 * buffers at the same time? In this case we don't wait if we see an
1915 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1916 * head block, so we should get on with that I/O as soon as possible.
1917 */
1918 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1919 {
1920 if (!ReadBuffersCanStartIO(buffers[i], true))
1921 break;
1922 /* Must be consecutive block numbers. */
1923 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1924 BufferGetBlockNumber(buffers[i]) - 1);
1925 Assert(io_buffers[io_buffers_len] == buffers[i]);
1926
1927 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1928 }
1929
1930 /* get a reference to wait for in WaitReadBuffers() */
1931 pgaio_io_get_wref(ioh, &operation->io_wref);
1932
1933 /* provide the list of buffers to the completion callbacks */
1934 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
 1935
 1936 pgaio_io_register_callbacks(ioh,
 1937 persistence == RELPERSISTENCE_TEMP ?
 1938 PGAIO_HCB_LOCAL_BUFFER_READV :
 1939 PGAIO_HCB_SHARED_BUFFER_READV,
 1940 flags);
1941
1942 pgaio_io_set_flag(ioh, ioh_flags);
1943
1944 /* ---
1945 * Even though we're trying to issue IO asynchronously, track the time
1946 * in smgrstartreadv():
1947 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
1948 * immediately
1949 * - the io method might not support the IO (e.g. worker IO for a temp
1950 * table)
1951 * ---
1952 */
 1953 io_start = pgstat_prepare_io_time(track_io_timing);
 1954 smgrstartreadv(ioh, operation->smgr, forknum,
1955 blocknum + nblocks_done,
1956 io_pages, io_buffers_len);
1957 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1958 io_start, 1, io_buffers_len * BLCKSZ);
1959
1960 if (persistence == RELPERSISTENCE_TEMP)
1961 pgBufferUsage.local_blks_read += io_buffers_len;
1962 else
1963 pgBufferUsage.shared_blks_read += io_buffers_len;
1964
1965 /*
1966 * Track vacuum cost when issuing IO, not after waiting for it.
1967 * Otherwise we could end up issuing a lot of IO in a short timespan,
1968 * despite a low cost limit.
1969 */
1970 if (VacuumCostActive)
1971 VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1972
1973 *nblocks_progress = io_buffers_len;
1974 did_start_io = true;
1975 }
1976
1977 return did_start_io;
1978}
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:159
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:961
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:363
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:327
void pgaio_submit_staged(void)
Definition: aio.c:1120
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:237
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:185
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition: aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition: aio.h:198
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition: aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
Definition: aio_callback.c:140
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
bool track_io_timing
Definition: bufmgr.c:147
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4198
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1564
bool zero_damaged_pages
Definition: bufmgr.c:144
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:115
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:384
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:166
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition: bufmgr.h:119
#define READ_BUFFERS_SYNCHRONOUSLY
Definition: bufmgr.h:121
bool ignore_checksum_failure
Definition: bufpage.c:27
int16_t int16
Definition: c.h:534
#define unlikely(x)
Definition: c.h:403
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:683
int VacuumCostPageMiss
Definition: globals.c:152
bool VacuumCostActive
Definition: globals.c:158
int VacuumCostBalance
Definition: globals.c:157
int VacuumCostPageHit
Definition: globals.c:151
BufferUsage pgBufferUsage
Definition: instrument.c:20
int i
Definition: isn.c:77
IOObject
Definition: pgstat.h:274
@ IOOBJECT_RELATION
Definition: pgstat.h:275
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:276
IOContext
Definition: pgstat.h:283
@ IOCONTEXT_NORMAL
Definition: pgstat.h:287
@ IOOP_READ
Definition: pgstat.h:313
@ IOOP_HIT
Definition: pgstat.h:307
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:710
void pgstat_prepare_report_checksum_failure(Oid dboid)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:122
ForkNumber
Definition: relpath.h:56
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_read
Definition: instrument.h:27
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
ForkNumber forknum
Definition: bufmgr.h:130
PgAioWaitRef io_wref
Definition: bufmgr.h:143
Buffer * buffers
Definition: bufmgr.h:138
SMgrRelation smgr
Definition: bufmgr.h:128
BufferAccessStrategy strategy
Definition: bufmgr.h:131
BlockNumber blocknum
Definition: bufmgr.h:139
PgAioReturn io_return
Definition: bufmgr.h:144
RelFileLocator locator
RelFileNumber relNumber
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38

References Assert(), RelFileLocatorBackend::backend, ReadBuffersOperation::blocknum, BufferGetBlock(), BufferGetBlockNumber(), ReadBuffersOperation::buffers, CurrentResourceOwner, RelFileLocator::dbOid, ReadBuffersOperation::flags, ReadBuffersOperation::forknum, i, ignore_checksum_failure, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_HIT, IOOP_READ, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, RelFileLocatorBackend::locator, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_HF_REFERENCES_LOCAL, PGAIO_HF_SYNCHRONOUS, pgaio_io_acquire(), pgaio_io_acquire_nb(), pgaio_io_get_wref(), pgaio_io_register_callbacks(), pgaio_io_release(), pgaio_io_set_flag(), pgaio_io_set_handle_data_32(), pgaio_submit_staged(), pgaio_wref_clear(), pgBufferUsage, pgstat_count_buffer_hit, pgstat_count_io_op(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), pgstat_prepare_report_checksum_failure(), READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, ReadBuffersCanStartIO(), ReadBuffersOperation::rel, RelFileLocator::relNumber, BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, ReadBuffersOperation::smgr, SMgrRelationData::smgr_rlocator, smgrstartreadv(), RelFileLocator::spcOid, ReadBuffersOperation::strategy, track_io_timing, unlikely, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, and zero_damaged_pages.

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().
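
AsyncReadBuffers() is reached through the public StartReadBuffers()/WaitReadBuffers() pair. A hedged usage sketch follows: read_four_blocks is an invented helper name, the field setup follows the caller-settable members of ReadBuffersOperation noted in bufmgr.h, and locking and error handling are omitted. It assumes rel is an open, suitably locked Relation and that blkno through blkno+3 exist.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Read four consecutive blocks of rel's main fork with one vectored read. */
static void
read_four_blocks(Relation rel, BlockNumber blkno)
{
    ReadBuffersOperation op = {0};
    Buffer      bufs[4];
    int         nblocks = 4;

    op.rel = rel;
    op.smgr = RelationGetSmgr(rel);
    op.persistence = rel->rd_rel->relpersistence;
    op.forknum = MAIN_FORKNUM;
    op.strategy = NULL;

    /* StartReadBuffers() pins the buffers; true means IO was issued. */
    if (StartReadBuffers(&op, bufs, blkno, &nblocks, 0))
        WaitReadBuffers(&op);

    /* all nblocks buffers are now valid and pinned */
    for (int i = 0; i < nblocks; i++)
        ReleaseBuffer(bufs[i]);
}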

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 3965 of file bufmgr.c.

 3966{
 3967 CheckForBufferLeaks();
 3968
 3969 AtEOXact_LocalBuffers(isCommit);
 3970
 3971 Assert(PrivateRefCountOverflowed == 0);
 3972}
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:4034
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:217
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:1001

References Assert(), AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 4016 of file bufmgr.c.

 4017{
 4018 UnlockBuffers();
 4019
 4020 CheckForBufferLeaks();
 4021
 4022 /* localbuf.c needs a chance too */
 4023 AtProcExit_LocalBuffers();
 4024}
void UnlockBuffers(void)
Definition: bufmgr.c:5544
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:1012

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferManagerAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 3594 of file bufmgr.c.

3595{
3596 /* info obtained from freelist.c */
3597 int strategy_buf_id;
3598 uint32 strategy_passes;
3599 uint32 recent_alloc;
3600
3601 /*
3602 * Information saved between calls so we can determine the strategy
3603 * point's advance rate and avoid scanning already-cleaned buffers.
3604 */
3605 static bool saved_info_valid = false;
3606 static int prev_strategy_buf_id;
3607 static uint32 prev_strategy_passes;
3608 static int next_to_clean;
3609 static uint32 next_passes;
3610
3611 /* Moving averages of allocation rate and clean-buffer density */
3612 static float smoothed_alloc = 0;
3613 static float smoothed_density = 10.0;
3614
3615 /* Potentially these could be tunables, but for now, not */
3616 float smoothing_samples = 16;
3617 float scan_whole_pool_milliseconds = 120000.0;
3618
3619 /* Used to compute how far we scan ahead */
3620 long strategy_delta;
3621 int bufs_to_lap;
3622 int bufs_ahead;
3623 float scans_per_alloc;
3624 int reusable_buffers_est;
3625 int upcoming_alloc_est;
3626 int min_scan_buffers;
3627
3628 /* Variables for the scanning loop proper */
3629 int num_to_scan;
3630 int num_written;
3631 int reusable_buffers;
3632
3633 /* Variables for final smoothed_density update */
3634 long new_strategy_delta;
3635 uint32 new_recent_alloc;
3636
3637 /*
3638 * Find out where the clock-sweep currently is, and how many buffer
3639 * allocations have happened since our last call.
3640 */
3641 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3642
3643 /* Report buffer alloc counts to pgstat */
3644 PendingBgWriterStats.buf_alloc += recent_alloc;
3645
3646 /*
3647 * If we're not running the LRU scan, just stop after doing the stats
3648 * stuff. We mark the saved state invalid so that we can recover sanely
3649 * if LRU scan is turned back on later.
3650 */
3651 if (bgwriter_lru_maxpages <= 0)
3652 {
3653 saved_info_valid = false;
3654 return true;
3655 }
3656
3657 /*
3658 * Compute strategy_delta = how many buffers have been scanned by the
3659 * clock-sweep since last time. If first time through, assume none. Then
3660 * see if we are still ahead of the clock-sweep, and if so, how many
3661 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3662 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3663 * behavior when the passes counts wrap around.
3664 */
3665 if (saved_info_valid)
3666 {
3667 int32 passes_delta = strategy_passes - prev_strategy_passes;
3668
3669 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3670 strategy_delta += (long) passes_delta * NBuffers;
3671
3672 Assert(strategy_delta >= 0);
3673
3674 if ((int32) (next_passes - strategy_passes) > 0)
3675 {
3676 /* we're one pass ahead of the strategy point */
3677 bufs_to_lap = strategy_buf_id - next_to_clean;
3678#ifdef BGW_DEBUG
3679 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3680 next_passes, next_to_clean,
3681 strategy_passes, strategy_buf_id,
3682 strategy_delta, bufs_to_lap);
3683#endif
3684 }
3685 else if (next_passes == strategy_passes &&
3686 next_to_clean >= strategy_buf_id)
3687 {
3688 /* on same pass, but ahead or at least not behind */
3689 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3690#ifdef BGW_DEBUG
3691 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3692 next_passes, next_to_clean,
3693 strategy_passes, strategy_buf_id,
3694 strategy_delta, bufs_to_lap);
3695#endif
3696 }
3697 else
3698 {
3699 /*
3700 * We're behind, so skip forward to the strategy point and start
3701 * cleaning from there.
3702 */
3703#ifdef BGW_DEBUG
3704 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3705 next_passes, next_to_clean,
3706 strategy_passes, strategy_buf_id,
3707 strategy_delta);
3708#endif
3709 next_to_clean = strategy_buf_id;
3710 next_passes = strategy_passes;
3711 bufs_to_lap = NBuffers;
3712 }
3713 }
3714 else
3715 {
3716 /*
3717 * Initializing at startup or after LRU scanning had been off. Always
3718 * start at the strategy point.
3719 */
3720#ifdef BGW_DEBUG
3721 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3722 strategy_passes, strategy_buf_id);
3723#endif
3724 strategy_delta = 0;
3725 next_to_clean = strategy_buf_id;
3726 next_passes = strategy_passes;
3727 bufs_to_lap = NBuffers;
3728 }
3729
3730 /* Update saved info for next time */
3731 prev_strategy_buf_id = strategy_buf_id;
3732 prev_strategy_passes = strategy_passes;
3733 saved_info_valid = true;
3734
3735 /*
3736 * Compute how many buffers had to be scanned for each new allocation, ie,
3737 * 1/density of reusable buffers, and track a moving average of that.
3738 *
3739 * If the strategy point didn't move, we don't update the density estimate
3740 */
3741 if (strategy_delta > 0 && recent_alloc > 0)
3742 {
3743 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3744 smoothed_density += (scans_per_alloc - smoothed_density) /
3745 smoothing_samples;
3746 }
3747
3748 /*
3749 * Estimate how many reusable buffers there are between the current
3750 * strategy point and where we've scanned ahead to, based on the smoothed
3751 * density estimate.
3752 */
3753 bufs_ahead = NBuffers - bufs_to_lap;
3754 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3755
3756 /*
3757 * Track a moving average of recent buffer allocations. Here, rather than
3758 * a true average we want a fast-attack, slow-decline behavior: we
3759 * immediately follow any increase.
3760 */
3761 if (smoothed_alloc <= (float) recent_alloc)
3762 smoothed_alloc = recent_alloc;
3763 else
3764 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3765 smoothing_samples;
3766
3767 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3768 upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3769
3770 /*
3771 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3772 * eventually underflow to zero, and the underflows produce annoying
3773 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3774 * zero, there's no point in tracking smaller and smaller values of
3775 * smoothed_alloc, so just reset it to exactly zero to avoid this
3776 * syndrome. It will pop back up as soon as recent_alloc increases.
3777 */
3778 if (upcoming_alloc_est == 0)
3779 smoothed_alloc = 0;
3780
3781 /*
3782 * Even in cases where there's been little or no buffer allocation
3783 * activity, we want to make a small amount of progress through the buffer
3784 * cache so that as many reusable buffers as possible are clean after an
3785 * idle period.
3786 *
3787 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3788 * the BGW will be called during the scan_whole_pool time; slice the
3789 * buffer pool into that many sections.
3790 */
3791 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3792
3793 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3794 {
3795#ifdef BGW_DEBUG
3796 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3797 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3798#endif
3799 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3800 }
3801
3802 /*
3803 * Now write out dirty reusable buffers, working forward from the
3804 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3805 * enough buffers to match our estimate of the next cycle's allocation
3806 * requirements, or hit the bgwriter_lru_maxpages limit.
3807 */
3808
3809 num_to_scan = bufs_to_lap;
3810 num_written = 0;
3811 reusable_buffers = reusable_buffers_est;
3812
3813 /* Execute the LRU scan */
3814 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3815 {
3816 int sync_state = SyncOneBuffer(next_to_clean, true,
3817 wb_context);
3818
3819 if (++next_to_clean >= NBuffers)
3820 {
3821 next_to_clean = 0;
3822 next_passes++;
3823 }
3824 num_to_scan--;
3825
3826 if (sync_state & BUF_WRITTEN)
3827 {
3828 reusable_buffers++;
3829 if (++num_written >= bgwriter_lru_maxpages)
3830 {
 3831 PendingBgWriterStats.maxwritten_clean++;
 3832 break;
3833 }
3834 }
3835 else if (sync_state & BUF_REUSABLE)
3836 reusable_buffers++;
3837 }
3838
 3838
 3839 PendingBgWriterStats.buf_written_clean += num_written;
 3840
3841#ifdef BGW_DEBUG
3842 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3843 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3844 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3845 bufs_to_lap - num_to_scan,
3846 num_written,
3847 reusable_buffers - reusable_buffers_est);
3848#endif
3849
3850 /*
3851 * Consider the above scan as being like a new allocation scan.
3852 * Characterize its density and update the smoothed one based on it. This
3853 * effectively halves the moving average period in cases where both the
3854 * strategy and the background writer are doing some useful scanning,
3855 * which is helpful because a long memory isn't as desirable on the
3856 * density estimates.
3857 */
3858 new_strategy_delta = bufs_to_lap - num_to_scan;
3859 new_recent_alloc = reusable_buffers - reusable_buffers_est;
3860 if (new_strategy_delta > 0 && new_recent_alloc > 0)
3861 {
3862 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3863 smoothed_density += (scans_per_alloc - smoothed_density) /
3864 smoothing_samples;
3865
3866#ifdef BGW_DEBUG
3867 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3868 new_recent_alloc, new_strategy_delta,
3869 scans_per_alloc, smoothed_density);
3870#endif
3871 }
3872
3873 /* Return true if OK to hibernate */
3874 return (bufs_to_lap == 0 && recent_alloc == 0);
3875}
int BgWriterDelay
Definition: bgwriter.c:58
#define BUF_REUSABLE
Definition: bufmgr.c:81
double bgwriter_lru_multiplier
Definition: bufmgr.c:146
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3892
int bgwriter_lru_maxpages
Definition: bufmgr.c:145
#define BUF_WRITTEN
Definition: bufmgr.c:80
int32_t int32
Definition: c.h:535
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:226
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:284
int NBuffers
Definition: globals.c:142
PgStat_BgWriterStats PendingBgWriterStats
PgStat_Counter buf_written_clean
Definition: pgstat.h:240
PgStat_Counter maxwritten_clean
Definition: pgstat.h:241
PgStat_Counter buf_alloc
Definition: pgstat.h:242

References Assert(), bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, DEBUG1, DEBUG2, elog, PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().
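
The allocation smoothing at lines 3756-3765 above is a one-pole filter with a fast-attack, slow-decline shape: it jumps up to any higher sample immediately and decays toward lower samples over roughly smoothing_samples calls. A standalone sketch with made-up sample values:

#include <stdio.h>

int
main(void)
{
    const float smoothing_samples = 16;  /* same constant as BgBufferSync */
    float smoothed_alloc = 0;
    int recent_allocs[] = {0, 400, 100, 100, 100, 0, 0, 0};

    for (int i = 0; i < 8; i++)
    {
        int recent_alloc = recent_allocs[i];

        if (smoothed_alloc <= (float) recent_alloc)
            smoothed_alloc = recent_alloc;  /* fast attack */
        else
            smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
                smoothing_samples;          /* slow decline */

        printf("alloc=%3d smoothed=%.1f\n", recent_alloc, smoothed_alloc);
    }
    return 0;
}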

◆ buffer_readv_complete()

static pg_attribute_always_inline PgAioResult buffer_readv_complete ( PgAioHandle *  ioh,
PgAioResult  prior_result,
uint8  cb_data,
bool  is_temp 
)
static

Definition at line 7142 of file bufmgr.c.

7144{
 7145 PgAioResult result = prior_result;
 7146 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
 7147 uint8 first_error_off = 0;
7148 uint8 first_zeroed_off = 0;
7149 uint8 first_ignored_off = 0;
7150 uint8 error_count = 0;
7151 uint8 zeroed_count = 0;
7152 uint8 ignored_count = 0;
7153 uint8 checkfail_count = 0;
7154 uint64 *io_data;
7155 uint8 handle_data_len;
7156
7157 if (is_temp)
7158 {
 7159 Assert(td->smgr.is_temp);
 7160 Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
 7161 }
7162 else
7163 Assert(!td->smgr.is_temp);
7164
7165 /*
7166 * Iterate over all the buffers affected by this IO and call the
7167 * per-buffer completion function for each buffer.
7168 */
7169 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7170 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7171 {
7172 Buffer buf = io_data[buf_off];
7173 bool failed;
7174 bool failed_verification = false;
7175 bool failed_checksum = false;
7176 bool zeroed_buffer = false;
7177 bool ignored_checksum = false;
7178
 7179 Assert(BufferIsValid(buf));
 7180
7181 /*
7182 * If the entire I/O failed on a lower-level, each buffer needs to be
7183 * marked as failed. In case of a partial read, the first few buffers
7184 * may be ok.
7185 */
7186 failed =
7187 prior_result.status == PGAIO_RS_ERROR
7188 || prior_result.result <= buf_off;
7189
7190 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7191 &failed_verification,
7192 &failed_checksum,
7193 &ignored_checksum,
7194 &zeroed_buffer);
7195
7196 /*
7197 * Track information about the number of different kinds of error
7198 * conditions across all pages, as there can be multiple pages failing
7199 * verification as part of one IO.
7200 */
7201 if (failed_verification && !zeroed_buffer && error_count++ == 0)
7202 first_error_off = buf_off;
7203 if (zeroed_buffer && zeroed_count++ == 0)
7204 first_zeroed_off = buf_off;
7205 if (ignored_checksum && ignored_count++ == 0)
7206 first_ignored_off = buf_off;
7207 if (failed_checksum)
7208 checkfail_count++;
7209 }
7210
7211 /*
7212 * If the smgr read succeeded [partially] and page verification failed for
7213 * some of the pages, adjust the IO's result state appropriately.
7214 */
7215 if (prior_result.status != PGAIO_RS_ERROR &&
7216 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7217 {
7218 buffer_readv_encode_error(&result, is_temp,
7219 zeroed_count > 0, ignored_count > 0,
7220 error_count, zeroed_count, checkfail_count,
7221 first_error_off, first_zeroed_off,
7222 first_ignored_off);
7223 pgaio_result_report(result, td, DEBUG1);
7224 }
7225
7226 /*
7227 * For shared relations this reporting is done in
7228 * shared_buffer_readv_complete_local().
7229 */
 7230 if (is_temp && checkfail_count > 0)
 7231 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
 7232 checkfail_count);
7233
7234 return result;
7235}
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:352
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
Definition: aio_callback.c:156
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:173
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:73
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition: bufmgr.c:6998
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition: bufmgr.c:6903
uint8_t uint8
Definition: c.h:537
uint64_t uint64
Definition: c.h:540
ProcNumber MyProcNumber
Definition: globals.c:90
static char * buf
Definition: pg_test_fsync.c:72
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
uint32 status
Definition: aio_types.h:108
int32 result
Definition: aio_types.h:113
struct PgAioTargetData::@123 smgr
RelFileLocator rlocator
Definition: aio_types.h:65

References Assert(), buf, buffer_readv_complete_one(), buffer_readv_encode_error(), BufferIsValid(), RelFileLocator::dbOid, DEBUG1, PgAioTargetData::is_temp, MyProcNumber, pgaio_io_get_handle_data(), pgaio_io_get_owner(), pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, pgstat_report_checksum_failures_in_db(), PgAioResult::result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and PgAioResult::status.

Referenced by local_buffer_readv_complete(), and shared_buffer_readv_complete().
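
The partial-read rule at lines 7186-7188 treats prior_result.result as a count of successfully read blocks, so any buffer offset at or past that count is considered failed. A standalone sketch with illustrative values:

#include <stdio.h>

int
main(void)
{
    int result = 2;            /* 2 of 4 blocks were read before an error */
    int status_is_error = 0;   /* not a hard PGAIO_RS_ERROR */

    for (int buf_off = 0; buf_off < 4; buf_off++)
    {
        int failed = status_is_error || result <= buf_off;

        printf("buf_off=%d failed=%d\n", buf_off, failed);
    }
    return 0;                  /* offsets 0 and 1 are ok; 2 and 3 failed */
}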

◆ buffer_readv_complete_one()

static pg_attribute_always_inline void buffer_readv_complete_one ( PgAioTargetData *  td,
uint8  buf_off,
Buffer  buffer,
uint8  flags,
bool  failed,
bool  is_temp,
bool *  buffer_invalid,
bool *  failed_checksum,
bool *  ignored_checksum,
bool *  zeroed_buffer 
)
static

Definition at line 6998 of file bufmgr.c.

7004{
7005 BufferDesc *buf_hdr = is_temp ?
7006 GetLocalBufferDescriptor(-buffer - 1)
7007 : GetBufferDescriptor(buffer - 1);
7008 BufferTag tag = buf_hdr->tag;
7009 char *bufdata = BufferGetBlock(buffer);
7010 uint32 set_flag_bits;
7011 int piv_flags;
7012
7013 /* check that the buffer is in the expected state for a read */
7014#ifdef USE_ASSERT_CHECKING
7015 {
7016 uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
7017
7018 Assert(buf_state & BM_TAG_VALID);
7019 Assert(!(buf_state & BM_VALID));
7020 /* temp buffers don't use BM_IO_IN_PROGRESS */
7021 if (!is_temp)
7022 Assert(buf_state & BM_IO_IN_PROGRESS);
7023 Assert(!(buf_state & BM_DIRTY));
7024 }
7025#endif
7026
7027 *buffer_invalid = false;
7028 *failed_checksum = false;
7029 *ignored_checksum = false;
7030 *zeroed_buffer = false;
7031
7032 /*
7033 * We ask PageIsVerified() to only log the message about checksum errors,
7034 * as the completion might be run in any backend (or IO workers). We will
7035 * report checksum errors in buffer_readv_report().
7036 */
7037 piv_flags = PIV_LOG_LOG;
7038
7039 /* the local zero_damaged_pages may differ from the definer's */
 7040 if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
 7041 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7042
7043 /* Check for garbage data. */
7044 if (!failed)
7045 {
7046 /*
7047 * If the buffer is not currently pinned by this backend, e.g. because
7048 * we're completing this IO after an error, the buffer data will have
7049 * been marked as inaccessible when the buffer was unpinned. The AIO
7050 * subsystem holds a pin, but that doesn't prevent the buffer from
7051 * having been marked as inaccessible. The completion might also be
7052 * executed in a different process.
7053 */
7054#ifdef USE_VALGRIND
7055 if (!BufferIsPinned(buffer))
7056 VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7057#endif
7058
7059 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7060 failed_checksum))
7061 {
7062 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7063 {
7064 memset(bufdata, 0, BLCKSZ);
7065 *zeroed_buffer = true;
7066 }
7067 else
7068 {
7069 *buffer_invalid = true;
7070 /* mark buffer as having failed */
7071 failed = true;
7072 }
7073 }
7074 else if (*failed_checksum)
7075 *ignored_checksum = true;
7076
7077 /* undo what we did above */
7078#ifdef USE_VALGRIND
7079 if (!BufferIsPinned(buffer))
7080 VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7081#endif
7082
7083 /*
7084 * Immediately log a message about the invalid page, but only to the
7085 * server log. The reason to do so immediately is that this may be
7086 * executed in a different backend than the one that originated the
7087 * request. The reason to do so immediately is that the originator
7088 * might not process the query result immediately (because it is busy
7089 * doing another part of query processing) or at all (e.g. if it was
7090 * cancelled or errored out due to another IO also failing). The
7091 * definer of the IO will emit an ERROR or WARNING when processing the
7092 * IO's results
7093 *
7094 * To avoid duplicating the code to emit these log messages, we reuse
7095 * buffer_readv_report().
7096 */
7097 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
7098 {
7099 PgAioResult result_one = {0};
7100
7101 buffer_readv_encode_error(&result_one, is_temp,
7102 *zeroed_buffer,
7103 *ignored_checksum,
7104 *buffer_invalid,
7105 *zeroed_buffer ? 1 : 0,
7106 *failed_checksum ? 1 : 0,
7107 buf_off, buf_off, buf_off);
7108 pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7109 }
7110 }
7111
7112 /* Terminate I/O and set BM_VALID. */
7113 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7114 if (is_temp)
7115 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7116 else
7117 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7118
7119 /*
7120 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7121 * callback may not be executed in the same backend that called
7122 * BUFFER_READ_START. The alternative would be to defer calling the
7123 * tracepoint to a later point (e.g. the local completion callback for
7124 * shared buffer reads), which seems even less helpful.
7125 */
7126 TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7127 tag.blockNum,
7128 tag.spcOid,
7129 tag.dbOid,
7130 tag.relNumber,
 7131 is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
 7132 false);
7133}
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:237
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:483
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition: bufpage.c:94
#define PIV_LOG_LOG
Definition: bufpage.h:469
PageData * Page
Definition: bufpage.h:82
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition: bufpage.h:470
#define LOG_SERVER_ONLY
Definition: elog.h:32
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
Definition: localbuf.c:560
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
pg_atomic_uint32 state
RelFileNumber relNumber
ForkNumber forkNum
Oid spcOid

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, buffer_readv_encode_error(), BufferGetBlock(), BufferIsPinned, buftag::dbOid, buftag::forkNum, GetBufferDescriptor(), GetLocalBufferDescriptor(), INVALID_PROC_NUMBER, LOG_SERVER_ONLY, MyProcNumber, PageIsVerified(), pg_atomic_read_u32(), pgaio_result_report(), PIV_IGNORE_CHECKSUM_FAILURE, PIV_LOG_LOG, READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_ZERO_ON_ERROR, buftag::relNumber, buftag::spcOid, BufferDesc::state, BufferDesc::tag, TerminateBufferIO(), TerminateLocalBufferIO(), VALGRIND_MAKE_MEM_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by buffer_readv_complete().
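
The verification outcome at lines 7044-7075 leaves each buffer in one of three states: zeroed (when READ_BUFFERS_ZERO_ON_ERROR is set), invalid (marked BM_IO_ERROR), or valid with an ignored checksum failure. A standalone sketch of the zero-vs-invalid branch, with flags reduced to plain ints for illustration:

#include <stdio.h>

int
main(void)
{
    int page_ok = 0;           /* PageIsVerified() said the page is bad */
    int zero_on_error = 1;     /* READ_BUFFERS_ZERO_ON_ERROR was set */
    int zeroed_buffer = 0, buffer_invalid = 0;

    if (!page_ok)
    {
        if (zero_on_error)
            zeroed_buffer = 1;   /* page replaced with zeros, read "succeeds" */
        else
            buffer_invalid = 1;  /* buffer ends up marked BM_IO_ERROR */
    }
    printf("zeroed=%d invalid=%d\n", zeroed_buffer, buffer_invalid);
    return 0;
}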

◆ buffer_readv_decode_error()

static void buffer_readv_decode_error ( PgAioResult  result,
bool *  zeroed_any,
bool *  ignored_any,
uint8 zeroed_or_error_count,
uint8 checkfail_count,
uint8 first_off 
)
inlinestatic

Definition at line 6861 of file bufmgr.c.

6867{
6868 uint32 rem_error = result.error_data;
6869
6870 /* see static asserts in buffer_readv_encode_error */
6871#define READV_COUNT_BITS 7
6872#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
6873
6874 *zeroed_any = rem_error & 1;
6875 rem_error >>= 1;
6876
6877 *ignored_any = rem_error & 1;
6878 rem_error >>= 1;
6879
6880 *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
6881 rem_error >>= READV_COUNT_BITS;
6882
6883 *checkfail_count = rem_error & READV_COUNT_MASK;
6884 rem_error >>= READV_COUNT_BITS;
6885
6886 *first_off = rem_error & READV_COUNT_MASK;
6887 rem_error >>= READV_COUNT_BITS;
6888}
#define READV_COUNT_BITS
#define READV_COUNT_MASK
uint32 error_data
Definition: aio_types.h:111

References PgAioResult::error_data, READV_COUNT_BITS, and READV_COUNT_MASK.

Referenced by buffer_readv_encode_error(), buffer_readv_report(), and shared_buffer_readv_complete_local().

◆ buffer_readv_encode_error()

static void buffer_readv_encode_error ( PgAioResult *  result,
bool  is_temp,
bool  zeroed_any,
bool  ignored_any,
uint8  error_count,
uint8  zeroed_count,
uint8  checkfail_count,
uint8  first_error_off,
uint8  first_zeroed_off,
uint8  first_ignored_off 
)
inlinestatic

Definition at line 6903 of file bufmgr.c.

6913{
6914
6915 uint8 shift = 0;
6916 uint8 zeroed_or_error_count =
6917 error_count > 0 ? error_count : zeroed_count;
6918 uint8 first_off;
6919
6921 "PG_IOV_MAX is bigger than reserved space for error data");
6923 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
6924
6925 /*
6926 * We only have space to encode one offset - but luckily that's good
6927 * enough. If there is an error, the error is the interesting offset, same
6928 * with a zeroed buffer vs an ignored buffer.
6929 */
6930 if (error_count > 0)
6931 first_off = first_error_off;
6932 else if (zeroed_count > 0)
6933 first_off = first_zeroed_off;
6934 else
6935 first_off = first_ignored_off;
6936
6937 Assert(!zeroed_any || error_count == 0);
6938
6939 result->error_data = 0;
6940
6941 result->error_data |= zeroed_any << shift;
6942 shift += 1;
6943
6944 result->error_data |= ignored_any << shift;
6945 shift += 1;
6946
6947 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
6948 shift += READV_COUNT_BITS;
6949
6950 result->error_data |= ((uint32) checkfail_count) << shift;
6951 shift += READV_COUNT_BITS;
6952
6953 result->error_data |= ((uint32) first_off) << shift;
6954 shift += READV_COUNT_BITS;
6955
6956 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
6957 PGAIO_HCB_SHARED_BUFFER_READV;
6958
6959 if (error_count > 0)
6960 result->status = PGAIO_RS_ERROR;
6961 else
6962 result->status = PGAIO_RS_WARNING;
6963
6964 /*
6965 * The encoding is complicated enough to warrant cross-checking it against
6966 * the decode function.
6967 */
6968#ifdef USE_ASSERT_CHECKING
6969 {
6970 bool zeroed_any_2,
6971 ignored_any_2;
6972 uint8 zeroed_or_error_count_2,
6973 checkfail_count_2,
6974 first_off_2;
6975
6976 buffer_readv_decode_error(*result,
6977 &zeroed_any_2, &ignored_any_2,
6978 &zeroed_or_error_count_2,
6979 &checkfail_count_2,
6980 &first_off_2);
6981 Assert(zeroed_any == zeroed_any_2);
6982 Assert(ignored_any == ignored_any_2);
6983 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
6984 Assert(checkfail_count == checkfail_count_2);
6985 Assert(first_off == first_off_2);
6986 }
6987#endif
6988
6989#undef READV_COUNT_BITS
6990#undef READV_COUNT_MASK
6991}
#define PGAIO_RESULT_ERROR_BITS
Definition: aio_types.h:98
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition: bufmgr.c:6861
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:938
#define PG_IOV_MAX
Definition: pg_iovec.h:47
uint32 id
Definition: aio_types.h:105

References Assert(), buffer_readv_decode_error(), PgAioResult::error_data, PgAioResult::id, PG_IOV_MAX, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_RESULT_ERROR_BITS, PGAIO_RS_ERROR, PGAIO_RS_WARNING, READV_COUNT_BITS, StaticAssertStmt, and PgAioResult::status.

Referenced by buffer_readv_complete(), and buffer_readv_complete_one().
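
The packed error_data layout is easiest to see in isolation. Below is a minimal, standalone C sketch of the same bit layout (encode, COUNT_BITS and COUNT_MASK are local names for this example, not the PostgreSQL symbols); it mirrors the shift sequence of buffer_readv_encode_error() and unpacks the fields in the order buffer_readv_decode_error() uses:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define COUNT_BITS 7
#define COUNT_MASK ((1u << COUNT_BITS) - 1)

/* Pack the five readv-error fields, LSB first:
 * bit 0: zeroed_any, bit 1: ignored_any, then three 7-bit counts. */
static uint32_t
encode(unsigned zeroed_any, unsigned ignored_any,
       uint32_t zeroed_or_error_count, uint32_t checkfail_count,
       uint32_t first_off)
{
    uint32_t v = 0;
    unsigned shift = 0;

    v |= (zeroed_any & 1u) << shift;
    shift += 1;
    v |= (ignored_any & 1u) << shift;
    shift += 1;
    v |= (zeroed_or_error_count & COUNT_MASK) << shift;
    shift += COUNT_BITS;
    v |= (checkfail_count & COUNT_MASK) << shift;
    shift += COUNT_BITS;
    v |= (first_off & COUNT_MASK) << shift;
    return v;
}

int
main(void)
{
    /* Two zeroed pages, the first of them at offset 3 within the readv. */
    uint32_t v = encode(1, 0, 2, 0, 3);

    /* Unpack in the same order buffer_readv_decode_error() uses. */
    assert((v & 1) == 1);               /* zeroed_any */
    v >>= 1;
    assert((v & 1) == 0);               /* ignored_any */
    v >>= 1;
    assert((v & COUNT_MASK) == 2);      /* zeroed_or_error_count */
    v >>= COUNT_BITS;
    assert((v & COUNT_MASK) == 0);      /* checkfail_count */
    v >>= COUNT_BITS;
    assert((v & COUNT_MASK) == 3);      /* first_off */

    printf("round-trip OK\n");
    return 0;
}

Run, the asserts pass and it prints "round-trip OK" -- the same encode/decode invariant the USE_ASSERT_CHECKING block above cross-checks at runtime.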

◆ buffer_readv_report()

static void buffer_readv_report ( PgAioResult  result,
const PgAioTargetData td,
int  elevel 
)
static

Definition at line 7245 of file bufmgr.c.

7247{
7248 int nblocks = td->smgr.nblocks;
7249 BlockNumber first = td->smgr.blockNum;
7250 BlockNumber last = first + nblocks - 1;
7251 ProcNumber errProc =
7252 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
7253 RelPathStr rpath =
7254 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7255 bool zeroed_any,
7256 ignored_any;
7257 uint8 zeroed_or_error_count,
7258 checkfail_count,
7259 first_off;
7260 uint8 affected_count;
7261 const char *msg_one,
7262 *msg_mult,
7263 *det_mult,
7264 *hint_mult;
7265
7266 buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7267 &zeroed_or_error_count,
7268 &checkfail_count,
7269 &first_off);
7270
7271 /*
7272 * Treat a read that had both zeroed buffers *and* ignored checksums as a
7273 * special case, it's too irregular to be emitted the same way as the
7274 * other cases.
7275 */
7276 if (zeroed_any && ignored_any)
7277 {
7278 Assert(zeroed_any && ignored_any);
7279 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7280 Assert(result.status != PGAIO_RS_ERROR);
7281 affected_count = zeroed_or_error_count;
7282
7283 ereport(elevel,
7284 errcode(ERRCODE_DATA_CORRUPTED),
7285 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
7286 affected_count, checkfail_count, first, last, rpath.str),
7287 affected_count > 1 ?
7288 errdetail("Block %u held the first zeroed page.",
7289 first + first_off) : 0,
7290 errhint_plural("See server log for details about the other %d invalid block.",
7291 "See server log for details about the other %d invalid blocks.",
7292 affected_count + checkfail_count - 1,
7293 affected_count + checkfail_count - 1));
7294 return;
7295 }
7296
7297 /*
7298 * The other messages are highly repetitive. To avoid duplicating a long
7299 * and complicated ereport(), gather the translated format strings
7300 * separately and then do one common ereport.
7301 */
7302 if (result.status == PGAIO_RS_ERROR)
7303 {
7304 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7305 affected_count = zeroed_or_error_count;
7306 msg_one = _("invalid page in block %u of relation \"%s\"");
7307 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
7308 det_mult = _("Block %u held the first invalid page.");
7309 hint_mult = _("See server log for the other %u invalid block(s).");
7310 }
7311 else if (zeroed_any && !ignored_any)
7312 {
7313 affected_count = zeroed_or_error_count;
7314 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
7315 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
7316 det_mult = _("Block %u held the first zeroed page.");
7317 hint_mult = _("See server log for the other %u zeroed block(s).");
7318 }
7319 else if (!zeroed_any && ignored_any)
7320 {
7321 affected_count = checkfail_count;
7322 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
7323 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
7324 det_mult = _("Block %u held the first ignored page.");
7325 hint_mult = _("See server log for the other %u ignored block(s).");
7326 }
7327 else
7328 pg_unreachable();
7329
7330 ereport(elevel,
7331 errcode(ERRCODE_DATA_CORRUPTED),
7332 affected_count == 1 ?
7333 errmsg_internal(msg_one, first + first_off, rpath.str) :
7334 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7335 affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7336 affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7337}
#define pg_unreachable()
Definition: c.h:331
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1161
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1234
int errhint_internal(const char *fmt,...)
Definition: elog.c:1343
int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1364
#define _(x)
Definition: elog.c:91
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
int ProcNumber
Definition: procnumber.h:24
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
BlockNumber blockNum
Definition: aio_types.h:66
BlockNumber nblocks
Definition: aio_types.h:67
ForkNumber forkNum
Definition: aio_types.h:68

References _, Assert(), PgAioTargetData::blockNum, buffer_readv_decode_error(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail(), errdetail_internal(), errhint_internal(), errhint_plural(), errmsg(), errmsg_internal(), PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, pg_unreachable, PGAIO_RS_ERROR, relpathbackend, PgAioTargetData::rlocator, PgAioTargetData::smgr, PgAioResult::status, and RelPathStr::str.
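
A worked example of the message selection: for a four-block read starting at block 10 in which blocks 12 and 13 were zeroed and no checksum failures were ignored, decoding yields zeroed_any = true, ignored_any = false, zeroed_or_error_count = 2 and first_off = 2. That selects the zeroed branch's msg_mult, producing "zeroing out 2 invalid pages among blocks 10..13 of relation ...", an errdetail naming block 12 (first + first_off) as the first zeroed page, and an errhint pointing at the server log for the remaining one.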

◆ buffer_stage_common()

static pg_attribute_always_inline void buffer_stage_common ( PgAioHandle ioh,
bool  is_write,
bool  is_temp 
)
static

Definition at line 6752 of file bufmgr.c.

6753{
6754 uint64 *io_data;
6755 uint8 handle_data_len;
6756 PgAioWaitRef io_ref;
6757 BufferTag first PG_USED_FOR_ASSERTS_ONLY = {0};
6758
6759 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
6760
6761 pgaio_io_get_wref(ioh, &io_ref);
6762
6763 /* iterate over all buffers affected by the vectored readv/writev */
6764 for (int i = 0; i < handle_data_len; i++)
6765 {
6766 Buffer buffer = (Buffer) io_data[i];
6767 BufferDesc *buf_hdr = is_temp ?
6768 GetLocalBufferDescriptor(-buffer - 1)
6769 : GetBufferDescriptor(buffer - 1);
6770 uint32 buf_state;
6771
6772 /*
6773 * Check that all the buffers are actually ones that could conceivably
6774 * be done in one IO, i.e. are sequential. This is the last
6775 * buffer-aware code before IO is actually executed and confusion
6776 * about which buffers are targeted by IO can be hard to debug, making
6777 * it worth doing extra-paranoid checks.
6778 */
6779 if (i == 0)
6780 first = buf_hdr->tag;
6781 else
6782 {
6783 Assert(buf_hdr->tag.relNumber == first.relNumber);
6784 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
6785 }
6786
6787 if (is_temp)
6788 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6789 else
6790 buf_state = LockBufHdr(buf_hdr);
6791
6792 /* verify the buffer is in the expected state */
6793 Assert(buf_state & BM_TAG_VALID);
6794 if (is_write)
6795 {
6796 Assert(buf_state & BM_VALID);
6797 Assert(buf_state & BM_DIRTY);
6798 }
6799 else
6800 {
6801 Assert(!(buf_state & BM_VALID));
6802 Assert(!(buf_state & BM_DIRTY));
6803 }
6804
6805 /* temp buffers don't use BM_IO_IN_PROGRESS */
6806 if (!is_temp)
6807 Assert(buf_state & BM_IO_IN_PROGRESS);
6808
6809 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
6810
6811 /*
6812 * Reflect that the buffer is now owned by the AIO subsystem.
6813 *
6814 * For local buffers: This can't be done just via LocalRefCount, as
6815 * one might initially think, as this backend could error out while
6816 * AIO is still in progress, releasing all the pins by the backend
6817 * itself.
6818 *
6819 * This pin is released again in TerminateBufferIO().
6820 */
6821 buf_state += BUF_REFCOUNT_ONE;
6822 buf_hdr->io_wref = io_ref;
6823
6824 if (is_temp)
6825 pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
6826 else
6827 UnlockBufHdr(buf_hdr, buf_state);
6828
6829 /*
6830 * Ensure the content lock that prevents buffer modifications while
6831 * the buffer is being written out is not released early due to an
6832 * error.
6833 */
6834 if (is_write && !is_temp)
6835 {
6836 LWLock *content_lock;
6837
6838 content_lock = BufferDescriptorGetContentLock(buf_hdr);
6839
6840 Assert(LWLockHeldByMe(content_lock));
6841
6842 /*
6843 * Lock is now owned by AIO subsystem.
6844 */
6845 LWLockDisown(content_lock);
6846 }
6847
6848 /*
6849 * Stop tracking this buffer via the resowner - the AIO system now
6850 * keeps track.
6851 */
6852 if (!is_temp)
6853 ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
6854 }
6855}
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:293
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:51
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:59
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:223
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1977
void LWLockDisown(LWLock *lock)
Definition: lwlock.c:1883
PgAioWaitRef io_wref
Definition: lwlock.h:42

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), CurrentResourceOwner, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, BufferDesc::io_wref, LockBufHdr(), LWLockDisown(), LWLockHeldByMe(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), PG_USED_FOR_ASSERTS_ONLY, pgaio_io_get_handle_data(), pgaio_io_get_wref(), buftag::relNumber, ResourceOwnerForgetBufferIO(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by local_buffer_readv_stage(), and shared_buffer_readv_stage().

◆ BufferAlloc()

static pg_attribute_always_inline BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr,
IOContext  io_context 
)
inlinestatic

Definition at line 2000 of file bufmgr.c.

2004{
2005 BufferTag newTag; /* identity of requested block */
2006 uint32 newHash; /* hash value for newTag */
2007 LWLock *newPartitionLock; /* buffer partition lock for it */
2008 int existing_buf_id;
2009 Buffer victim_buffer;
2010 BufferDesc *victim_buf_hdr;
2011 uint32 victim_buf_state;
2012
2013 /* Make sure we will have room to remember the buffer pin */
2014 ResourceOwnerEnlarge(CurrentResourceOwner);
2015 ReservePrivateRefCountEntry();
2016
2017 /* create a tag so we can lookup the buffer */
2018 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2019
2020 /* determine its hash code and partition lock ID */
2021 newHash = BufTableHashCode(&newTag);
2022 newPartitionLock = BufMappingPartitionLock(newHash);
2023
2024 /* see if the block is in the buffer pool already */
2025 LWLockAcquire(newPartitionLock, LW_SHARED);
2026 existing_buf_id = BufTableLookup(&newTag, newHash);
2027 if (existing_buf_id >= 0)
2028 {
2029 BufferDesc *buf;
2030 bool valid;
2031
2032 /*
2033 * Found it. Now, pin the buffer so no one can steal it from the
2034 * buffer pool, and check to see if the correct data has been loaded
2035 * into the buffer.
2036 */
2037 buf = GetBufferDescriptor(existing_buf_id);
2038
2039 valid = PinBuffer(buf, strategy);
2040
2041 /* Can release the mapping lock as soon as we've pinned it */
2042 LWLockRelease(newPartitionLock);
2043
2044 *foundPtr = true;
2045
2046 if (!valid)
2047 {
2048 /*
2049 * We can only get here if (a) someone else is still reading in
2050 * the page, (b) a previous read attempt failed, or (c) someone
2051 * called StartReadBuffers() but not yet WaitReadBuffers().
2052 */
2053 *foundPtr = false;
2054 }
2055
2056 return buf;
2057 }
2058
2059 /*
2060 * Didn't find it in the buffer pool. We'll have to initialize a new
2061 * buffer. Remember to unlock the mapping lock while doing the work.
2062 */
2063 LWLockRelease(newPartitionLock);
2064
2065 /*
2066 * Acquire a victim buffer. Somebody else might try to do the same, we
2067 * don't hold any conflicting locks. If so we'll have to undo our work
2068 * later.
2069 */
2070 victim_buffer = GetVictimBuffer(strategy, io_context);
2071 victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2072
2073 /*
2074 * Try to make a hashtable entry for the buffer under its new tag. If
2075 * somebody else inserted another buffer for the tag, we'll release the
2076 * victim buffer we acquired and use the already inserted one.
2077 */
2078 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2079 existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2080 if (existing_buf_id >= 0)
2081 {
2082 BufferDesc *existing_buf_hdr;
2083 bool valid;
2084
2085 /*
2086 * Got a collision. Someone has already done what we were about to do.
2087 * We'll just handle this as if it were found in the buffer pool in
2088 * the first place. First, give up the buffer we were planning to
2089 * use.
2090 *
2091 * We could do this after releasing the partition lock, but then we'd
2092 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2093 * before acquiring the lock, for the rare case of such a collision.
2094 */
2095 UnpinBuffer(victim_buf_hdr);
2096
2097 /* remaining code should match code at top of routine */
2098
2099 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2100
2101 valid = PinBuffer(existing_buf_hdr, strategy);
2102
2103 /* Can release the mapping lock as soon as we've pinned it */
2104 LWLockRelease(newPartitionLock);
2105
2106 *foundPtr = true;
2107
2108 if (!valid)
2109 {
2110 /*
2111 * We can only get here if (a) someone else is still reading in
2112 * the page, (b) a previous read attempt failed, or (c) someone
2113 * called StartReadBuffers() but not yet WaitReadBuffers().
2114 */
2115 *foundPtr = false;
2116 }
2117
2118 return existing_buf_hdr;
2119 }
2120
2121 /*
2122 * Need to lock the buffer header too in order to change its tag.
2123 */
2124 victim_buf_state = LockBufHdr(victim_buf_hdr);
2125
2126 /* some sanity checks while we hold the buffer header lock */
2127 Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2128 Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2129
2130 victim_buf_hdr->tag = newTag;
2131
2132 /*
2133 * Make sure BM_PERMANENT is set for buffers that must be written at every
2134 * checkpoint. Unlogged buffers only need to be written at shutdown
2135 * checkpoints, except for their "init" forks, which need to be treated
2136 * just like permanent relations.
2137 */
2138 victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2139 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2140 victim_buf_state |= BM_PERMANENT;
2141
2142 UnlockBufHdr(victim_buf_hdr, victim_buf_state);
2143
2144 LWLockRelease(newPartitionLock);
2145
2146 /*
2147 * Buffer contents are currently invalid.
2148 */
2149 *foundPtr = false;
2150
2151 return victim_buf_hdr;
2152}
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
Definition: buf_internals.h:77
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:54
static LWLock * BufMappingPartitionLock(uint32 hashcode)
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:3041
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:2333
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:259
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:3233
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
@ INIT_FORKNUM
Definition: relpath.h:61
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:449

References Assert(), BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), CurrentResourceOwner, GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrRelationData::smgr_rlocator, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by PinBufferForBlock().
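
Stripped of locking and buffer headers, the control flow above reduces to a "lookup, else install a victim, defer to a concurrent winner" pattern. A toy, single-threaded C sketch (illustrative names only, not PostgreSQL code; a fixed-size array stands in for the BufTable, and the lost race is simulated rather than concurrent):

#include <stdio.h>

#define NSLOTS 8

/* block number -> buffer id; -1 means empty (toy BufTable stand-in) */
static int table[NSLOTS];

static int
lookup(int block)
{
    return table[block % NSLOTS];
}

/* Returns -1 if buf_id was installed, else the id already present --
 * the "collision" case BufferAlloc() handles by unpinning its victim
 * and reusing the existing buffer. */
static int
insert(int block, int buf_id)
{
    int *slot = &table[block % NSLOTS];

    if (*slot >= 0)
        return *slot;
    *slot = buf_id;
    return -1;
}

int
main(void)
{
    int block = 42;
    int winner;

    for (int i = 0; i < NSLOTS; i++)
        table[i] = -1;

    /* Case 1: lookup misses, so our victim buffer (id 3) is installed. */
    if (lookup(block) < 0)
    {
        winner = insert(block, 3);
        printf("miss: %s\n", winner < 0 ? "installed victim 3"
                                        : "lost race, reuse existing");
    }

    /* Case 2: the block is now in the "pool"; lookup finds it. */
    printf("hit: buffer %d\n", lookup(block));

    /* Case 3: simulate losing the race -- another inserter got there
     * between our lookup and our insert, so we give up our victim. */
    winner = insert(block, 7);
    if (winner >= 0)
        printf("collision: release victim 7, use buffer %d\n", winner);
    return 0;
}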

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 4198 of file bufmgr.c.

4199{
4200 BufferDesc *bufHdr;
4201
4202 Assert(BufferIsPinned(buffer));
4203
4204 if (BufferIsLocal(buffer))
4205 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4206 else
4207 bufHdr = GetBufferDescriptor(buffer - 1);
4208
4209 /* pinned, so OK to read tag without spinlock */
4210 return bufHdr->tag.blockNum;
4211}
#define BufferIsLocal(buffer)
Definition: buf.h:37

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_finish_split(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), AsyncReadBuffers(), BitmapHeapScanNextBlock(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), btvacuumpage(), check_index_page(), CheckReadBuffersOperation(), collect_corrupt_items(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), gistvacuumpage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_fetch_next_buffer(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_page_prune_and_freeze(), heap_prepare_pagescan(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), heapam_scan_analyze_next_block(), heapgettup(), heapgettup_pagemode(), index_compute_xid_horizon_for_tuples(), lazy_scan_heap(), lazy_scan_noprune(), lazy_scan_prune(), lazy_vacuum_heap_rel(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), read_stream_start_pending_read(), ReadBufferBI(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), StartReadBuffersImpl(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), verify_heapam(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), and visibilitymap_set().

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 4460 of file bufmgr.c.

4461{
4462 char *page = BufferGetPage(buffer);
4463 BufferDesc *bufHdr;
4464 XLogRecPtr lsn;
4465 uint32 buf_state;
4466
4467 /*
4468 * If we don't need locking for correctness, fastpath out.
4469 */
4470 if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
4471 return PageGetLSN(page);
4472
4473 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4474 Assert(BufferIsValid(buffer));
4475 Assert(BufferIsPinned(buffer));
4476
4477 bufHdr = GetBufferDescriptor(buffer - 1);
4478 buf_state = LockBufHdr(bufHdr);
4479 lsn = PageGetLSN(page);
4480 UnlockBufHdr(bufHdr, buf_state);
4481
4482 return lsn;
4483}
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:417
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert(), PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_killitems(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator rlocator,
ForkNumber forknum,
BlockNumber blknum 
)

Definition at line 4219 of file bufmgr.c.

4221{
4222 BufferDesc *bufHdr;
4223
4224 /* Do the same checks as BufferGetBlockNumber. */
4225 Assert(BufferIsPinned(buffer));
4226
4227 if (BufferIsLocal(buffer))
4228 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4229 else
4230 bufHdr = GetBufferDescriptor(buffer - 1);
4231
4232 /* pinned, so OK to read tag without spinlock */
4233 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4234 *forknum = BufTagGetForkNum(&bufHdr->tag);
4235 *blknum = bufHdr->tag.blockNum;
4236}

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), heap_inplace_update_and_unlock(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().
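
Both BufferGetBlockNumber() and BufferGetTag() require the caller to hold a pin, which is what makes the unlocked tag read safe. A hedged usage sketch (server-side code with the usual backend includes such as storage/bufmgr.h; rel is assumed to be a valid, opened Relation, and error handling is omitted):

static void
inspect_first_block(Relation rel)
{
    Buffer          buf;
    RelFileLocator  rlocator;
    ForkNumber      forknum;
    BlockNumber     blknum;

    buf = ReadBuffer(rel, 0);       /* pins block 0 of the main fork */

    Assert(BufferGetBlockNumber(buf) == 0);
    BufferGetTag(buf, &rlocator, &forknum, &blknum);    /* pin held, so safe */

    ReleaseBuffer(buf);             /* drop the pin again */
}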

◆ BufferIsDirty()

bool BufferIsDirty ( Buffer  buffer)

Definition at line 2888 of file bufmgr.c.

2889{
2890 BufferDesc *bufHdr;
2891
2892 Assert(BufferIsPinned(buffer));
2893
2894 if (BufferIsLocal(buffer))
2895 {
2896 int bufid = -buffer - 1;
2897
2898 bufHdr = GetLocalBufferDescriptor(bufid);
2899 /* Content locks are not maintained for local buffers. */
2900 }
2901 else
2902 {
2903 bufHdr = GetBufferDescriptor(buffer - 1);
2904 Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2905 LW_EXCLUSIVE));
2906 }
2907
2908 return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2909}
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2021

References Assert(), BM_DIRTY, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by XLogRegisterBuffer().

◆ BufferIsExclusiveLocked()

bool BufferIsExclusiveLocked ( Buffer  buffer)

Definition at line 2860 of file bufmgr.c.

2861{
2862 BufferDesc *bufHdr;
2863
2864 Assert(BufferIsPinned(buffer));
2865
2866 if (BufferIsLocal(buffer))
2867 {
2868 /* Content locks are not maintained for local buffers. */
2869 return true;
2870 }
2871 else
2872 {
2873 bufHdr = GetBufferDescriptor(buffer - 1);
2874 return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2875 LW_EXCLUSIVE);
2876 }
2877}

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), LW_EXCLUSIVE, and LWLockHeldByMeInMode().

Referenced by visibilitymap_set(), and XLogRegisterBuffer().

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 4430 of file bufmgr.c.

4431{
4432 BufferDesc *bufHdr;
4433
4434 /* Local buffers are used only for temp relations. */
4435 if (BufferIsLocal(buffer))
4436 return false;
4437
4438 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4439 Assert(BufferIsValid(buffer));
4440 Assert(BufferIsPinned(buffer));
4441
4442 /*
4443 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4444 * need not bother with the buffer header spinlock. Even if someone else
4445 * changes the buffer header state while we're doing this, the state is
4446 * changed atomically, so we'll read the old value or the new value, but
4447 * not random garbage.
4448 */
4449 bufHdr = GetBufferDescriptor(buffer - 1);
4450 return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4451}

References Assert(), BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 3318 of file bufmgr.c.

3319{
3320 uint32 buf_state;
3321 int buf_id;
3322 int num_to_scan;
3323 int num_spaces;
3324 int num_processed;
3325 int num_written;
3326 CkptTsStatus *per_ts_stat = NULL;
3327 Oid last_tsid;
3328 binaryheap *ts_heap;
3329 int i;
3330 int mask = BM_DIRTY;
3331 WritebackContext wb_context;
3332
3333 /*
3334 * Unless this is a shutdown checkpoint or we have been explicitly told,
3335 * we write only permanent, dirty buffers. But at shutdown or end of
3336 * recovery, we write all dirty buffers.
3337 */
3338 if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
3339 CHECKPOINT_FLUSH_UNLOGGED))))
3340 mask |= BM_PERMANENT;
3341
3342 /*
3343 * Loop over all buffers, and mark the ones that need to be written with
3344 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3345 * can estimate how much work needs to be done.
3346 *
3347 * This allows us to write only those pages that were dirty when the
3348 * checkpoint began, and not those that get dirtied while it proceeds.
3349 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3350 * later in this function, or by normal backends or the bgwriter cleaning
3351 * scan, the flag is cleared. Any buffer dirtied after this point won't
3352 * have the flag set.
3353 *
3354 * Note that if we fail to write some buffer, we may leave buffers with
3355 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3356 * certainly need to be written for the next checkpoint attempt, too.
3357 */
3358 num_to_scan = 0;
3359 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3360 {
3361 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3362
3363 /*
3364 * Header spinlock is enough to examine BM_DIRTY, see comment in
3365 * SyncOneBuffer.
3366 */
3367 buf_state = LockBufHdr(bufHdr);
3368
3369 if ((buf_state & mask) == mask)
3370 {
3371 CkptSortItem *item;
3372
3373 buf_state |= BM_CHECKPOINT_NEEDED;
3374
3375 item = &CkptBufferIds[num_to_scan++];
3376 item->buf_id = buf_id;
3377 item->tsId = bufHdr->tag.spcOid;
3378 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3379 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3380 item->blockNum = bufHdr->tag.blockNum;
3381 }
3382
3383 UnlockBufHdr(bufHdr, buf_state);
3384
3385 /* Check for barrier events in case NBuffers is large. */
3386 if (ProcSignalBarrierPending)
3387 ProcessProcSignalBarrier();
3388 }
3389
3390 if (num_to_scan == 0)
3391 return; /* nothing to do */
3392
3393 WritebackContextInit(&wb_context, &checkpoint_flush_after);
3394
3395 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3396
3397 /*
3398 * Sort buffers that need to be written to reduce the likelihood of random
3399 * IO. The sorting is also important for the implementation of balancing
3400 * writes between tablespaces. Without balancing writes we'd potentially
3401 * end up writing to the tablespaces one-by-one; possibly overloading the
3402 * underlying system.
3403 */
3404 sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3405
3406 num_spaces = 0;
3407
3408 /*
3409 * Allocate progress status for each tablespace with buffers that need to
3410 * be flushed. This requires the to-be-flushed array to be sorted.
3411 */
3412 last_tsid = InvalidOid;
3413 for (i = 0; i < num_to_scan; i++)
3414 {
3415 CkptTsStatus *s;
3416 Oid cur_tsid;
3417
3418 cur_tsid = CkptBufferIds[i].tsId;
3419
3420 /*
3421 * Grow array of per-tablespace status structs, every time a new
3422 * tablespace is found.
3423 */
3424 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3425 {
3426 Size sz;
3427
3428 num_spaces++;
3429
3430 /*
3431 * Not worth adding grow-by-power-of-2 logic here - even with a
3432 * few hundred tablespaces this should be fine.
3433 */
3434 sz = sizeof(CkptTsStatus) * num_spaces;
3435
3436 if (per_ts_stat == NULL)
3437 per_ts_stat = (CkptTsStatus *) palloc(sz);
3438 else
3439 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3440
3441 s = &per_ts_stat[num_spaces - 1];
3442 memset(s, 0, sizeof(*s));
3443 s->tsId = cur_tsid;
3444
3445 /*
3446 * The first buffer in this tablespace. As CkptBufferIds is sorted
3447 * by tablespace all (s->num_to_scan) buffers in this tablespace
3448 * will follow afterwards.
3449 */
3450 s->index = i;
3451
3452 /*
3453 * progress_slice will be determined once we know how many buffers
3454 * are in each tablespace, i.e. after this loop.
3455 */
3456
3457 last_tsid = cur_tsid;
3458 }
3459 else
3460 {
3461 s = &per_ts_stat[num_spaces - 1];
3462 }
3463
3464 s->num_to_scan++;
3465
3466 /* Check for barrier events. */
3467 if (ProcSignalBarrierPending)
3468 ProcessProcSignalBarrier();
3469 }
3470
3471 Assert(num_spaces > 0);
3472
3473 /*
3474 * Build a min-heap over the write-progress in the individual tablespaces,
3475 * and compute how large a portion of the total progress a single
3476 * processed buffer is.
3477 */
3478 ts_heap = binaryheap_allocate(num_spaces,
3479 ts_ckpt_progress_comparator,
3480 NULL);
3481
3482 for (i = 0; i < num_spaces; i++)
3483 {
3484 CkptTsStatus *ts_stat = &per_ts_stat[i];
3485
3486 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3487
3488 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3489 }
3490
3491 binaryheap_build(ts_heap);
3492
3493 /*
3494 * Iterate through to-be-checkpointed buffers and write the ones (still)
3495 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3496 * tablespaces; otherwise the sorting would lead to only one tablespace
3497 * receiving writes at a time, making inefficient use of the hardware.
3498 */
3499 num_processed = 0;
3500 num_written = 0;
3501 while (!binaryheap_empty(ts_heap))
3502 {
3503 BufferDesc *bufHdr = NULL;
3504 CkptTsStatus *ts_stat = (CkptTsStatus *)
3505 DatumGetPointer(binaryheap_first(ts_heap));
3506
3507 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3508 Assert(buf_id != -1);
3509
3510 bufHdr = GetBufferDescriptor(buf_id);
3511
3512 num_processed++;
3513
3514 /*
3515 * We don't need to acquire the lock here, because we're only looking
3516 * at a single bit. It's possible that someone else writes the buffer
3517 * and clears the flag right after we check, but that doesn't matter
3518 * since SyncOneBuffer will then do nothing. However, there is a
3519 * further race condition: it's conceivable that between the time we
3520 * examine the bit here and the time SyncOneBuffer acquires the lock,
3521 * someone else not only wrote the buffer but replaced it with another
3522 * page and dirtied it. In that improbable case, SyncOneBuffer will
3523 * write the buffer though we didn't need to. It doesn't seem worth
3524 * guarding against this, though.
3525 */
3526 if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3527 {
3528 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3529 {
3530 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3531 PendingCheckpointerStats.buffers_written++;
3532 num_written++;
3533 }
3534 }
3535
3536 /*
3537 * Measure progress independent of actually having to flush the buffer
3538 * - otherwise writing becomes unbalanced.
3539 */
3540 ts_stat->progress += ts_stat->progress_slice;
3541 ts_stat->num_scanned++;
3542 ts_stat->index++;
3543
3544 /* Have all the buffers from the tablespace been processed? */
3545 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3546 {
3547 binaryheap_remove_first(ts_heap);
3548 }
3549 else
3550 {
3551 /* update heap with the new progress */
3552 binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3553 }
3554
3555 /*
3556 * Sleep to throttle our I/O rate.
3557 *
3558 * (This will check for barrier events even if it doesn't sleep.)
3559 */
3560 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3561 }
3562
3563 /*
3564 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3565 * IOContext will always be IOCONTEXT_NORMAL.
3566 */
3567 IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3568
3569 pfree(per_ts_stat);
3570 per_ts_stat = NULL;
3571 binaryheap_free(ts_heap);
3572
3573 /*
3574 * Update checkpoint statistics. As noted above, this doesn't include
3575 * buffers written by other backends or bgwriter scan.
3576 */
3577 CheckpointStats.ckpt_bufs_written += num_written;
3578
3579 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3580}
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
#define binaryheap_empty(h)
Definition: binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:76
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:6343
int checkpoint_flush_after
Definition: bufmgr.c:178
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:6366
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:6428
struct CkptTsStatus CkptTsStatus
double float8
Definition: c.h:636
size_t Size
Definition: c.h:611
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:785
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:40
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:332
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:499
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:119
int index
Definition: bufmgr.c:127
int num_scanned
Definition: bufmgr.c:124
float8 progress
Definition: bufmgr.c:118
int num_to_scan
Definition: bufmgr.c:122
Oid tsId
Definition: bufmgr.c:109
PgStat_Counter buffers_written
Definition: pgstat.h:264
CheckpointStatsData CheckpointStats
Definition: xlog.c:210
#define CHECKPOINT_FLUSH_UNLOGGED
Definition: xlog.h:143
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139

References Assert(), binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buffers_written, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_UNLOGGED, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, DatumGetPointer(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u32(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), buftag::spcOid, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr(), and WritebackContextInit().

Referenced by CheckPointBuffers().
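
The tablespace balancing hinges on progress_slice: each written buffer advances its tablespace's progress by num_to_scan(total) / num_to_scan(tablespace), so a tablespace with few dirty buffers takes larger steps and is scheduled proportionally less often. A minimal standalone sketch of that arithmetic (two tablespaces, a plain comparison standing in for the binary heap; illustrative only, not PostgreSQL code):

#include <stdio.h>

int
main(void)
{
    int     to_scan[2] = {6, 2};        /* dirty buffers per tablespace */
    int     total = 8;
    int     scanned[2] = {0, 0};
    double  progress[2] = {0.0, 0.0};

    for (int done = 0; done < total; done++)
    {
        int ts;

        /* pick the unfinished tablespace with the least progress
         * (BufferSync() uses a binary heap for this) */
        if (scanned[0] >= to_scan[0])
            ts = 1;
        else if (scanned[1] >= to_scan[1])
            ts = 0;
        else
            ts = (progress[0] <= progress[1]) ? 0 : 1;

        scanned[ts]++;
        progress[ts] += (double) total / to_scan[ts];   /* progress_slice */
        printf("write from tablespace %d  (progress %5.2f / %5.2f)\n",
               ts, progress[0], progress[1]);
    }
    return 0;
}

The output interleaves the small tablespace's two writes among the large one's six instead of leaving either idle, which is exactly the behavior the sorting-plus-heap machinery above is built to produce.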

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag ba,
const BufferTag bb 
)
inlinestatic

Definition at line 6278 of file bufmgr.c.

6279{
6280 int ret;
6281 RelFileLocator rlocatora;
6282 RelFileLocator rlocatorb;
6283
6284 rlocatora = BufTagGetRelFileLocator(ba);
6285 rlocatorb = BufTagGetRelFileLocator(bb);
6286
6287 ret = rlocator_comparator(&rlocatora, &rlocatorb);
6288
6289 if (ret != 0)
6290 return ret;
6291
6292 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
6293 return -1;
6294 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
6295 return 1;
6296
6297 if (ba->blockNum < bb->blockNum)
6298 return -1;
6299 if (ba->blockNum > bb->blockNum)
6300 return 1;
6301
6302 return 0;
6303}
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:6197

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 5619 of file bufmgr.c.

5620{
5621 if (BufferIsLocal(buffer))
5622 {
5623 if (LocalRefCount[-buffer - 1] != 1)
5624 elog(ERROR, "incorrect local pin count: %d",
5625 LocalRefCount[-buffer - 1]);
5626 }
5627 else
5628 {
5629 if (GetPrivateRefCount(buffer) != 1)
5630 elog(ERROR, "incorrect local pin count: %d",
5631 GetPrivateRefCount(buffer));
5632 }
5633}
#define ERROR
Definition: elog.h:39

References PrivateRefCountEntry::buffer, BufferIsLocal, elog, ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), lazy_scan_heap(), and LockBufferForCleanup().

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 4034 of file bufmgr.c.

4035{
4036#ifdef USE_ASSERT_CHECKING
4037 int RefCountErrors = 0;
4038 PrivateRefCountEntry *res;
4039 int i;
4040 char *s;
4041
4042 /* check the array */
4043 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4044 {
4045 res = &PrivateRefCountArray[i];
4046
4047 if (res->buffer != InvalidBuffer)
4048 {
4049 s = DebugPrintBufferRefcount(res->buffer);
4050 elog(WARNING, "buffer refcount leak: %s", s);
4051 pfree(s);
4052
4053 RefCountErrors++;
4054 }
4055 }
4056
4057 /* if necessary search the hash */
4058 if (PrivateRefCountOverflowed)
4059 {
4060 HASH_SEQ_STATUS hstat;
4061
4062 hash_seq_init(&hstat, PrivateRefCountHash);
4063 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4064 {
4065 s = DebugPrintBufferRefcount(res->buffer);
4066 elog(WARNING, "buffer refcount leak: %s", s);
4067 pfree(s);
4068 RefCountErrors++;
4069 }
4070 }
4071
4072 Assert(RefCountErrors == 0);
4073#endif
4074}
#define InvalidBuffer
Definition: buf.h:25
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:4141
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:100
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:215
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:216
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1380

References Assert(), PrivateRefCountEntry::buffer, DebugPrintBufferRefcount(), elog, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, pfree(), PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and WARNING.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 4184 of file bufmgr.c.

4185{
4186 BufferSync(flags);
4187}
static void BufferSync(int flags)
Definition: bufmgr.c:3318

References BufferSync().

Referenced by CheckPointGuts().

◆ CheckReadBuffersOperation()

static void CheckReadBuffersOperation ( ReadBuffersOperation operation,
bool  is_complete 
)
static

Definition at line 1527 of file bufmgr.c.

1528{
1529#ifdef USE_ASSERT_CHECKING
1530 Assert(operation->nblocks_done <= operation->nblocks);
1531 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1532
1533 for (int i = 0; i < operation->nblocks; i++)
1534 {
1535 Buffer buffer = operation->buffers[i];
1536 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1537 GetLocalBufferDescriptor(-buffer - 1) :
1538 GetBufferDescriptor(buffer - 1);
1539
1540 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1541 Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_TAG_VALID);
1542
1543 if (i < operation->nblocks_done)
1544 Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_VALID);
1545 }
1546#endif
1547}

References Assert(), ReadBuffersOperation::blocknum, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufferGetBlockNumber(), BufferIsLocal, ReadBuffersOperation::buffers, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, pg_atomic_read_u32(), and BufferDesc::state.

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 6312 of file bufmgr.c.

6313{
6314 /* compare tablespace */
6315 if (a->tsId < b->tsId)
6316 return -1;
6317 else if (a->tsId > b->tsId)
6318 return 1;
6319 /* compare relation */
6320 if (a->relNumber < b->relNumber)
6321 return -1;
6322 else if (a->relNumber > b->relNumber)
6323 return 1;
6324 /* compare fork */
6325 else if (a->forkNum < b->forkNum)
6326 return -1;
6327 else if (a->forkNum > b->forkNum)
6328 return 1;
6329 /* compare block number */
6330 else if (a->blockNum < b->blockNum)
6331 return -1;
6332 else if (a->blockNum > b->blockNum)
6333 return 1;
6334 /* equal page IDs are unlikely, but not impossible */
6335 return 0;
6336}
int b
Definition: isn.c:74
int a
Definition: isn.c:73

References a, and b.

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 5598 of file bufmgr.c.

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 5820 of file bufmgr.c.

5821{
5822 BufferDesc *bufHdr;
5823 uint32 buf_state,
5824 refcount;
5825
5826 Assert(BufferIsValid(buffer));
5827
5828 /* see AIO related comment in LockBufferForCleanup() */
5829
5830 if (BufferIsLocal(buffer))
5831 {
5832 refcount = LocalRefCount[-buffer - 1];
5833 /* There should be exactly one pin */
5834 Assert(refcount > 0);
5835 if (refcount != 1)
5836 return false;
5837 /* Nobody else to wait for */
5838 return true;
5839 }
5840
5841 /* There should be exactly one local pin */
5842 refcount = GetPrivateRefCount(buffer);
5843 Assert(refcount);
5844 if (refcount != 1)
5845 return false;
5846
5847 /* Try to acquire lock */
5848 if (!ConditionalLockBuffer(buffer))
5849 return false;
5850
5851 bufHdr = GetBufferDescriptor(buffer - 1);
5852 buf_state = LockBufHdr(bufHdr);
5853 refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5854
5855 Assert(refcount > 0);
5856 if (refcount == 1)
5857 {
5858 /* Successfully acquired exclusive lock with pincount 1 */
5859 UnlockBufHdr(bufHdr, buf_state);
5860 return true;
5861 }
5862
5863 /* Failed, so release the lock */
5864 UnlockBufHdr(bufHdr, buf_state);
5865 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5866 return false;
5867}
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5598
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5572
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:196

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().
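
A hedged sketch of the opportunistic pattern its callers follow (for example lazy_scan_heap()): attempt the cleanup lock, and simply skip the page when it is contended. buf is assumed to be an already pinned buffer:

if (ConditionalLockBufferForCleanup(buf))
{
    /* sole pin + exclusive lock: safe to prune or defragment the page */
    UnlockReleaseBuffer(buf);
}
else
    ReleaseBuffer(buf);         /* contended: drop the pin and move on */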

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 5212 of file bufmgr.c.

5214{
5215 char relpersistence;
5216 SMgrRelation src_rel;
5217 SMgrRelation dst_rel;
5218
5219 /* Set the relpersistence. */
5220 relpersistence = permanent ?
5221 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5222
5223 src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5224 dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5225
5226 /*
5227 * Create and copy all forks of the relation. During create database we
5228 * have a separate cleanup mechanism which deletes complete database
5229 * directory. Therefore, each individual relation doesn't need to be
5230 * registered for cleanup.
5231 */
5232 RelationCreateStorage(dst_rlocator, relpersistence, false);
5233
5234 /* copy main fork. */
5235 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5236 permanent);
5237
5238 /* copy those extra forks that exist */
5239 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5240 forkNum <= MAX_FORKNUM; forkNum++)
5241 {
5242 if (smgrexists(src_rel, forkNum))
5243 {
5244 smgrcreate(dst_rel, forkNum, false);
5245
5246 /*
5247 * WAL log creation if the relation is persistent, or this is the
5248 * init fork of an unlogged relation.
5249 */
5250 if (permanent || forkNum == INIT_FORKNUM)
5251 log_smgrcreate(&dst_rlocator, forkNum);
5252
5253 /* Copy a fork's data, block by block. */
5254 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5255 permanent);
5256 }
5257 }
5258}
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:5098
@ MAIN_FORKNUM
Definition: relpath.h:58
#define MAX_FORKNUM
Definition: relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:187

References INIT_FORKNUM, INVALID_PROC_NUMBER, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DebugPrintBufferRefcount()

char * DebugPrintBufferRefcount ( Buffer  buffer)

Definition at line 4141 of file bufmgr.c.

4142{
4143 BufferDesc *buf;
4144 int32 loccount;
4145 char *result;
4146 ProcNumber backend;
4147 uint32 buf_state;
4148
4149 Assert(BufferIsValid(buffer));
4150 if (BufferIsLocal(buffer))
4151 {
4152 buf = GetLocalBufferDescriptor(-buffer - 1);
4153 loccount = LocalRefCount[-buffer - 1];
4154 backend = MyProcNumber;
4155 }
4156 else
4157 {
4158 buf = GetBufferDescriptor(buffer - 1);
4159 loccount = GetPrivateRefCount(buffer);
4160 backend = INVALID_PROC_NUMBER;
4161 }
4162
4163 /* theoretically we should lock the bufhdr here */
4164 buf_state = pg_atomic_read_u32(&buf->state);
4165
4166 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4167 buffer,
4168 relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
4169 BufTagGetForkNum(&buf->tag)).str,
4170 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4171 BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4172 return result;
4173}
#define BUF_FLAG_MASK
Definition: buf_internals.h:56
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43

References Assert(), buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), INVALID_PROC_NUMBER, LocalRefCount, MyProcNumber, pg_atomic_read_u32(), psprintf(), and relpathbackend.

Referenced by buffer_call_start_io(), buffer_call_terminate_io(), CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResOwnerPrintBufferPin().

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 4860 of file bufmgr.c.

4861{
4862 int i;
4863
4864 /*
4865 * We needn't consider local buffers, since by assumption the target
4866 * database isn't our own.
4867 */
4868
4869 for (i = 0; i < NBuffers; i++)
4870 {
4871 BufferDesc *bufHdr = GetBufferDescriptor(i);
4872 uint32 buf_state;
4873
4874 /*
4875 * As in DropRelationBuffers, an unlocked precheck should be safe and
4876 * saves some cycles.
4877 */
4878 if (bufHdr->tag.dbOid != dbid)
4879 continue;
4880
4881 buf_state = LockBufHdr(bufHdr);
4882 if (bufHdr->tag.dbOid == dbid)
4883 InvalidateBuffer(bufHdr); /* releases spinlock */
4884 else
4885 UnlockBufHdr(bufHdr, buf_state);
4886 }
4887}
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:2171

References buftag::dbOid, GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, BufferDesc::tag, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 4507 of file bufmgr.c.

4509{
4510 int i;
4511 int j;
4512 RelFileLocatorBackend rlocator;
4513 BlockNumber nForkBlock[MAX_FORKNUM];
4514 uint64 nBlocksToInvalidate = 0;
4515
4516 rlocator = smgr_reln->smgr_rlocator;
4517
4518 /* If it's a local relation, it's localbuf.c's problem. */
4519 if (RelFileLocatorBackendIsTemp(rlocator))
4520 {
4521 if (rlocator.backend == MyProcNumber)
4522 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4523 firstDelBlock);
4524
4525 return;
4526 }
4527
4528 /*
4529 * To remove all the pages of the specified relation forks from the buffer
4530 * pool, we need to scan the entire buffer pool but we can optimize it by
4531 * finding the buffers from BufMapping table provided we know the exact
4532 * size of each fork of the relation. The exact size is required to ensure
4533 * that we don't leave any buffer for the relation being dropped as
4534 * otherwise the background writer or checkpointer can lead to a PANIC
4535 * error while flushing buffers corresponding to files that don't exist.
4536 *
4537 * To know the exact size, we rely on the size cached for each fork by us
4538 * during recovery which limits the optimization to recovery and on
4539 * standbys but we can easily extend it once we have shared cache for
4540 * relation size.
4541 *
4542 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4543 * and the future writes keeps the cached value up-to-date. See
4544 * smgrextend. It is possible that the value of the first lseek is smaller
4545 * than the actual number of existing blocks in the file due to buggy
4546 * Linux kernels that might not have accounted for the recent write. But
4547 * that should be fine because there must not be any buffers after that
4548 * file size.
4549 */
4550 for (i = 0; i < nforks; i++)
4551 {
4552 /* Get the number of blocks for a relation's fork */
4553 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4554
4555 if (nForkBlock[i] == InvalidBlockNumber)
4556 {
4557 nBlocksToInvalidate = InvalidBlockNumber;
4558 break;
4559 }
4560
4561 /* calculate the number of blocks to be invalidated */
4562 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4563 }
4564
4565 /*
4566 * We apply the optimization iff the total number of blocks to invalidate
4567 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4568 */
4569 if (BlockNumberIsValid(nBlocksToInvalidate) &&
4570 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4571 {
4572 for (j = 0; j < nforks; j++)
4573 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4574 nForkBlock[j], firstDelBlock[j]);
4575 return;
4576 }
4577
4578 for (i = 0; i < NBuffers; i++)
4579 {
4580 BufferDesc *bufHdr = GetBufferDescriptor(i);
4581 uint32 buf_state;
4582
4583 /*
4584 * We can make this a tad faster by prechecking the buffer tag before
4585 * we attempt to lock the buffer; this saves a lot of lock
4586 * acquisitions in typical cases. It should be safe because the
4587 * caller must have AccessExclusiveLock on the relation, or some other
4588 * reason to be certain that no one is loading new pages of the rel
4589 * into the buffer pool. (Otherwise we might well miss such pages
4590 * entirely.) Therefore, while the tag might be changing while we
4591 * look at it, it can't be changing *to* a value we care about, only
4592 * *away* from such a value. So false negatives are impossible, and
4593 * false positives are safe because we'll recheck after getting the
4594 * buffer lock.
4595 *
4596 * We could check forkNum and blockNum as well as the rlocator, but
4597 * the incremental win from doing so seems small.
4598 */
4599 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4600 continue;
4601
4602 buf_state = LockBufHdr(bufHdr);
4603
4604 for (j = 0; j < nforks; j++)
4605 {
4606 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4607 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4608 bufHdr->tag.blockNum >= firstDelBlock[j])
4609 {
4610 InvalidateBuffer(bufHdr); /* releases spinlock */
4611 break;
4612 }
4613 }
4614 if (j >= nforks)
4615 UnlockBufHdr(bufHdr, buf_state);
4616 }
4617}
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:91
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4799
int j
Definition: isn.c:78
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: localbuf.c:663
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847

References RelFileLocatorBackend::backend, buftag::blockNum, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, RelFileLocatorBackendIsTemp, SMgrRelationData::smgr_rlocator, smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrtruncate().
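
As a worked example of the threshold (illustrative numbers): with shared_buffers = 1GB there are NBuffers = 131072 8kB buffers, so BUF_DROP_FULL_SCAN_THRESHOLD is 131072 / 32 = 4096 blocks. Truncating away fewer than 4096 blocks, with all fork sizes cached, proceeds via targeted lookups in FindAndDropRelationBuffers(); anything larger, or any uncached fork size, falls back to scanning all NBuffers headers.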

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation smgr_reln,
int  nlocators 
)

Definition at line 4628 of file bufmgr.c.

4629{
4630 int i;
4631 int n = 0;
4632 SMgrRelation *rels;
4633 BlockNumber (*block)[MAX_FORKNUM + 1];
4634 uint64 nBlocksToInvalidate = 0;
4635 RelFileLocator *locators;
4636 bool cached = true;
4637 bool use_bsearch;
4638
4639 if (nlocators == 0)
4640 return;
4641
4642 rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4643
4644 /* If it's a local relation, it's localbuf.c's problem. */
4645 for (i = 0; i < nlocators; i++)
4646 {
4647 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4648 {
4649 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4650 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4651 }
4652 else
4653 rels[n++] = smgr_reln[i];
4654 }
4655
4656 /*
4657 * If there are no non-local relations, then we're done. Release the
4658 * memory and return.
4659 */
4660 if (n == 0)
4661 {
4662 pfree(rels);
4663 return;
4664 }
4665
4666 /*
4667 * This is used to remember the number of blocks for all the relation
4668 * forks.
4669 */
4670 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4671 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4672
4673 /*
4674 * We can avoid scanning the entire buffer pool if we know the exact size
4675 * of each of the given relation forks. See DropRelationBuffers.
4676 */
4677 for (i = 0; i < n && cached; i++)
4678 {
4679 for (int j = 0; j <= MAX_FORKNUM; j++)
4680 {
4681 /* Get the number of blocks for a relation's fork. */
4682 block[i][j] = smgrnblocks_cached(rels[i], j);
4683
4684 /* We need to only consider the relation forks that exist. */
4685 if (block[i][j] == InvalidBlockNumber)
4686 {
4687 if (!smgrexists(rels[i], j))
4688 continue;
4689 cached = false;
4690 break;
4691 }
4692
4693 /* calculate the total number of blocks to be invalidated */
4694 nBlocksToInvalidate += block[i][j];
4695 }
4696 }
4697
4698 /*
4699 * We apply the optimization iff the total number of blocks to invalidate
4700 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4701 */
4702 if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4703 {
4704 for (i = 0; i < n; i++)
4705 {
4706 for (int j = 0; j <= MAX_FORKNUM; j++)
4707 {
4708 /* ignore relation forks that don't exist */
4709 if (!BlockNumberIsValid(block[i][j]))
4710 continue;
4711
4712 /* drop all the buffers for a particular relation fork */
4713 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4714 j, block[i][j], 0);
4715 }
4716 }
4717
4718 pfree(block);
4719 pfree(rels);
4720 return;
4721 }
4722
4723 pfree(block);
4724 locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4725 for (i = 0; i < n; i++)
4726 locators[i] = rels[i]->smgr_rlocator.locator;
4727
4728 /*
4729 * For a low number of relations to drop, just use a simple walk-through to
4730 * save the bsearch overhead. The threshold to use is more a guess than
4731 * an exactly determined value, as it depends on many factors (CPU and RAM
4732 * speeds, amount of shared buffers etc.).
4733 */
4734 use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4735
4736 /* sort the list of rlocators if necessary */
4737 if (use_bsearch)
4738 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4739
4740 for (i = 0; i < NBuffers; i++)
4741 {
4742 RelFileLocator *rlocator = NULL;
4743 BufferDesc *bufHdr = GetBufferDescriptor(i);
4744 uint32 buf_state;
4745
4746 /*
4747 * As in DropRelationBuffers, an unlocked precheck should be safe and
4748 * saves some cycles.
4749 */
4750
4751 if (!use_bsearch)
4752 {
4753 int j;
4754
4755 for (j = 0; j < n; j++)
4756 {
4757 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4758 {
4759 rlocator = &locators[j];
4760 break;
4761 }
4762 }
4763 }
4764 else
4765 {
4766 RelFileLocator locator;
4767
4768 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4769 rlocator = bsearch(&locator,
4770 locators, n, sizeof(RelFileLocator),
4771 rlocator_comparator);
4772 }
4773
4774 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4775 if (rlocator == NULL)
4776 continue;
4777
4778 buf_state = LockBufHdr(bufHdr);
4779 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4780 InvalidateBuffer(bufHdr); /* releases spinlock */
4781 else
4782 UnlockBufHdr(bufHdr, buf_state);
4783 }
4784
4785 pfree(locators);
4786 pfree(rels);
4787}
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:83
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:700
#define qsort(a, b, c, d)
Definition: port.h:479

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, if(), InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, palloc(), pfree(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrdounlinkall().
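A hedged sketch of the calling pattern (hypothetical helper; cf. smgrdounlinkall()): collect the storage managers of the relations being dropped and discard all of their buffered pages in one call.

static void
drop_all_buffers_sketch(SMgrRelation rel_a, SMgrRelation rel_b)
{
    SMgrRelation rels[2];

    rels[0] = rel_a;
    rels[1] = rel_b;

    /*
     * One buffer-pool scan (or targeted lookups, when the cached sizes are
     * below BUF_DROP_FULL_SCAN_THRESHOLD) invalidates both relations' pages;
     * temp relations are routed to DropRelationAllLocalBuffers() internally.
     */
    DropRelationsAllBuffers(rels, 2);
}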

◆ EvictAllUnpinnedBuffers()

void EvictAllUnpinnedBuffers ( int32 *  buffers_evicted,
int32 *  buffers_flushed,
int32 *  buffers_skipped 
)

Definition at line 6645 of file bufmgr.c.

6647{
6648 *buffers_evicted = 0;
6649 *buffers_skipped = 0;
6650 *buffers_flushed = 0;
6651
6652 for (int buf = 1; buf <= NBuffers; buf++)
6653 {
6654 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6655 uint32 buf_state;
6656 bool buffer_flushed;
6657
6658 buf_state = pg_atomic_read_u32(&desc->state);
6659 if (!(buf_state & BM_VALID))
6660 continue;
6661
6662 ResourceOwnerEnlarge(CurrentResourceOwner);
6663 ReservePrivateRefCountEntry();
6664
6665 LockBufHdr(desc);
6666
6667 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6668 (*buffers_evicted)++;
6669 else
6670 (*buffers_skipped)++;
6671
6672 if (buffer_flushed)
6673 (*buffers_flushed)++;
6674 }
6675}
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition: bufmgr.c:6552

References BM_VALID, buf, CurrentResourceOwner, EvictUnpinnedBufferInternal(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u32(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_evict_all().
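A minimal usage sketch (hypothetical helper, modeled on pg_buffercache_evict_all()): the three out-parameters report how much of the pool could be evicted right now.

static void
evict_all_sketch(void)
{
    int32       evicted;
    int32       flushed;
    int32       skipped;

    EvictAllUnpinnedBuffers(&evicted, &flushed, &skipped);

    /* skipped counts buffers that were pinned or changed concurrently */
    elog(LOG, "evicted=%d flushed=%d skipped=%d", evicted, flushed, skipped);
}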

◆ EvictRelUnpinnedBuffers()

void EvictRelUnpinnedBuffers ( Relation  rel,
int32 *  buffers_evicted,
int32 *  buffers_flushed,
int32 *  buffers_skipped 
)

Definition at line 6693 of file bufmgr.c.

6695{
6696 Assert(!RelationUsesLocalBuffers(rel));
6697
6698 *buffers_skipped = 0;
6699 *buffers_evicted = 0;
6700 *buffers_flushed = 0;
6701
6702 for (int buf = 1; buf <= NBuffers; buf++)
6703 {
6704 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6705 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6706 bool buffer_flushed;
6707
6708 /* An unlocked precheck should be safe and saves some cycles. */
6709 if ((buf_state & BM_VALID) == 0 ||
6710 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6711 continue;
6712
6713 /* Make sure we can pin the buffer. */
6714 ResourceOwnerEnlarge(CurrentResourceOwner);
6715 ReservePrivateRefCountEntry();
6716
6717 buf_state = LockBufHdr(desc);
6718
6719 /* recheck, could have changed without the lock */
6720 if ((buf_state & BM_VALID) == 0 ||
6721 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6722 {
6723 UnlockBufHdr(desc, buf_state);
6724 continue;
6725 }
6726
6727 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6728 (*buffers_evicted)++;
6729 else
6730 (*buffers_skipped)++;
6731
6732 if (buffer_flushed)
6733 (*buffers_flushed)++;
6734 }
6735}
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:646
RelFileLocator rd_locator
Definition: rel.h:57

References Assert(), BM_VALID, buf, BufTagMatchesRelFileLocator(), CurrentResourceOwner, EvictUnpinnedBufferInternal(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u32(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by pg_buffercache_evict_relation().
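A minimal usage sketch (hypothetical helper, modeled on pg_buffercache_evict_relation()); note the function asserts that the relation does not use local buffers:

static void
evict_rel_sketch(Relation rel)
{
    int32       evicted;
    int32       flushed;
    int32       skipped;

    /* Shared-buffer relations only; temp relations use local buffers. */
    EvictRelUnpinnedBuffers(rel, &evicted, &flushed, &skipped);

    elog(DEBUG1, "relation %u: evicted=%d flushed=%d skipped=%d",
         RelationGetRelid(rel), evicted, flushed, skipped);
}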

◆ EvictUnpinnedBuffer()

bool EvictUnpinnedBuffer ( Buffer  buf,
bool *  buffer_flushed 
)

Definition at line 6616 of file bufmgr.c.

6617{
6618 BufferDesc *desc;
6619
6620 Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
6621
6622 /* Make sure we can pin the buffer. */
6623 ResourceOwnerEnlarge(CurrentResourceOwner);
6624 ReservePrivateRefCountEntry();
6625
6626 desc = GetBufferDescriptor(buf - 1);
6627 LockBufHdr(desc);
6628
6629 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6630}

References Assert(), buf, BufferIsLocal, BufferIsValid(), CurrentResourceOwner, EvictUnpinnedBufferInternal(), GetBufferDescriptor(), LockBufHdr(), ReservePrivateRefCountEntry(), and ResourceOwnerEnlarge().

Referenced by invalidate_rel_block(), modify_rel_block(), and pg_buffercache_evict().
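A minimal usage sketch (hypothetical helper; cf. pg_buffercache_evict()): try to evict one specific shared buffer, noting whether a dirty page had to be written back first.

static bool
evict_one_sketch(Buffer buf)
{
    bool        flushed;
    bool        evicted = EvictUnpinnedBuffer(buf, &flushed);

    if (evicted && flushed)
        elog(DEBUG1, "buffer %d was dirty and written back before eviction", buf);

    return evicted;
}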

◆ EvictUnpinnedBufferInternal()

static bool EvictUnpinnedBufferInternal ( BufferDesc *  desc,
bool *  buffer_flushed 
)
static

Definition at line 6552 of file bufmgr.c.

6553{
6554 uint32 buf_state;
6555 bool result;
6556
6557 *buffer_flushed = false;
6558
6559 buf_state = pg_atomic_read_u32(&(desc->state));
6560 Assert(buf_state & BM_LOCKED);
6561
6562 if ((buf_state & BM_VALID) == 0)
6563 {
6564 UnlockBufHdr(desc, buf_state);
6565 return false;
6566 }
6567
6568 /* Check that it's not pinned already. */
6569 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6570 {
6571 UnlockBufHdr(desc, buf_state);
6572 return false;
6573 }
6574
6575 PinBuffer_Locked(desc); /* releases spinlock */
6576
6577 /* If it was dirty, try to clean it once. */
6578 if (buf_state & BM_DIRTY)
6579 {
6580 LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
6581 FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
6582 *buffer_flushed = true;
6583 LWLockRelease(BufferDescriptorGetContentLock(desc));
6584 }
6585
6586 /* This will return false if it becomes dirty or someone else pins it. */
6587 result = InvalidateVictimBuffer(desc);
6588
6589 UnpinBuffer(desc);
6590
6591 return result;
6592}
#define BM_LOCKED
Definition: buf_internals.h:68
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4258
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:3152
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:2265

References Assert(), BM_DIRTY, BM_LOCKED, BM_VALID, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock(), FlushBuffer(), InvalidateVictimBuffer(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LW_SHARED, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), PinBuffer_Locked(), BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), and EvictUnpinnedBuffer().

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 858 of file bufmgr.c.

862{
863 Buffer buf;
864 uint32 extend_by = 1;
865
866 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
867 &buf, &extend_by);
868
869 return buf;
870}
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:890

References buf, and ExtendBufferedRelBy().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().
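A minimal usage sketch (hypothetical helper, assuming the backend environment): add one zero-filled page to the main fork and get it back locked, as index AMs such as _bt_allocbuf() do.

static Buffer
add_one_page_sketch(Relation rel)
{
    Buffer      buf;

    /* EB_LOCK_FIRST returns the new page pinned and exclusively locked. */
    buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);

    return buf;
}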

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer *  buffers,
uint32 *  extended_by 
)

Definition at line 890 of file bufmgr.c.

897{
898 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
899 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
900 Assert(extend_by > 0);
901
902 if (bmr.smgr == NULL)
903 {
904 bmr.smgr = RelationGetSmgr(bmr.rel);
905 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
906 }
907
908 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
909 extend_by, InvalidBlockNumber,
910 buffers, extended_by);
911}
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2549
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:576
SMgrRelation smgr
Definition: bufmgr.h:107
Form_pg_class rd_rel
Definition: rel.h:111

References Assert(), ExtendBufferedRelCommon(), InvalidBlockNumber, RelationData::rd_rel, BufferManagerRelation::rel, RelationGetSmgr(), BufferManagerRelation::relpersistence, and BufferManagerRelation::smgr.

Referenced by ExtendBufferedRel(), grow_rel(), and RelationAddBlocks().
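A bulk-extension sketch (hypothetical helper, in the spirit of RelationAddBlocks()): request several pages at once and release the pins that are not needed immediately.

static void
bulk_extend_sketch(Relation rel, BufferAccessStrategy strategy)
{
    Buffer      buffers[16];
    uint32      extended_by = 0;
    BlockNumber first_block;

    first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, strategy,
                                      0, lengthof(buffers),
                                      buffers, &extended_by);

    /* extended_by can be smaller than requested if the pin limit applied */
    for (uint32 i = 0; i < extended_by; i++)
        ReleaseBuffer(buffers[i]);

    elog(DEBUG1, "extended from block %u by %u blocks",
         first_block, extended_by);
}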

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer *  buffers,
uint32 *  extended_by 
)
static

Definition at line 2549 of file bufmgr.c.

2557{
2558 BlockNumber first_block;
2559
2560 TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2561 bmr.smgr->smgr_rlocator.locator.spcOid,
2562 bmr.smgr->smgr_rlocator.locator.dbOid,
2563 bmr.smgr->smgr_rlocator.locator.relNumber,
2564 bmr.smgr->smgr_rlocator.backend,
2565 extend_by);
2566
2567 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2568 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2569 extend_by, extend_upto,
2570 buffers, &extend_by);
2571 else
2572 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2573 extend_by, extend_upto,
2574 buffers, &extend_by);
2575 *extended_by = extend_by;
2576
2577 TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2578 bmr.smgr->smgr_rlocator.locator.spcOid,
2579 bmr.smgr->smgr_rlocator.locator.dbOid,
2580 bmr.smgr->smgr_rlocator.locator.relNumber,
2581 bmr.smgr->smgr_rlocator.backend,
2582 *extended_by,
2583 first_block);
2584
2585 return first_block;
2586}
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2593
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:345

References RelFileLocatorBackend::backend, RelFileLocator::dbOid, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), RelFileLocatorBackend::locator, RelFileLocator::relNumber, BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_rlocator, and RelFileLocator::spcOid.

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer *  buffers,
uint32 *  extended_by 
)
static

Definition at line 2593 of file bufmgr.c.

2601{
2602 BlockNumber first_block;
2603 IOContext io_context = IOContextForStrategy(strategy);
2604 instr_time io_start;
2605
2606 LimitAdditionalPins(&extend_by);
2607
2608 /*
2609 * Acquire victim buffers for extension without holding extension lock.
2610 * Writing out victim buffers is the most expensive part of extending the
2611 * relation, particularly when doing so requires WAL flushes. Zeroing out
2612 * the buffers is also quite expensive, so do that before holding the
2613 * extension lock as well.
2614 *
2615 * These pages are pinned by us and not valid. While we hold the pin they
2616 * can't be acquired as victim buffers by another backend.
2617 */
2618 for (uint32 i = 0; i < extend_by; i++)
2619 {
2620 Block buf_block;
2621
2622 buffers[i] = GetVictimBuffer(strategy, io_context);
2623 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2624
2625 /* new buffers are zero-filled */
2626 MemSet(buf_block, 0, BLCKSZ);
2627 }
2628
2629 /*
2630 * Lock relation against concurrent extensions, unless requested not to.
2631 *
2632 * We use the same extension lock for all forks. That's unnecessarily
2633 * restrictive, but currently extensions for forks don't happen often
2634 * enough to make it worth locking more granularly.
2635 *
2636 * Note that another backend might have extended the relation by the time
2637 * we get the lock.
2638 */
2639 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2640 LockRelationForExtension(bmr.rel, ExclusiveLock);
2641
2642 /*
2643 * If requested, invalidate size cache, so that smgrnblocks asks the
2644 * kernel.
2645 */
2646 if (flags & EB_CLEAR_SIZE_CACHE)
2647 bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2648
2649 first_block = smgrnblocks(bmr.smgr, fork);
2650
2651 /*
2652 * Now that we have the accurate relation size, check if the caller wants
2653 * us to extend only up to a specific size. If there were concurrent
2654 * extensions, we might have acquired too many buffers and need to release
2655 * them.
2656 */
2657 if (extend_upto != InvalidBlockNumber)
2658 {
2659 uint32 orig_extend_by = extend_by;
2660
2661 if (first_block > extend_upto)
2662 extend_by = 0;
2663 else if ((uint64) first_block + extend_by > extend_upto)
2664 extend_by = extend_upto - first_block;
2665
2666 for (uint32 i = extend_by; i < orig_extend_by; i++)
2667 {
2668 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2669
2670 UnpinBuffer(buf_hdr);
2671 }
2672
2673 if (extend_by == 0)
2674 {
2675 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2676 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2677 *extended_by = extend_by;
2678 return first_block;
2679 }
2680 }
2681
2682 /* Fail if relation is already at maximum possible length */
2683 if ((uint64) first_block + extend_by >= MaxBlockNumber)
2684 ereport(ERROR,
2685 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2686 errmsg("cannot extend relation %s beyond %u blocks",
2687 relpath(bmr.smgr->smgr_rlocator, fork).str,
2688 MaxBlockNumber)));
2689
2690 /*
2691 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2692 *
2693 * This needs to happen before we extend the relation, because as soon as
2694 * we do, other backends can start to read in those pages.
2695 */
2696 for (uint32 i = 0; i < extend_by; i++)
2697 {
2698 Buffer victim_buf = buffers[i];
2699 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2700 BufferTag tag;
2701 uint32 hash;
2702 LWLock *partition_lock;
2703 int existing_id;
2704
2705 /* in case we need to pin an existing buffer below */
2706 ResourceOwnerEnlarge(CurrentResourceOwner);
2707 ReservePrivateRefCountEntry();
2708
2709 InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2710 hash = BufTableHashCode(&tag);
2711 partition_lock = BufMappingPartitionLock(hash);
2712
2713 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2714
2715 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2716
2717 /*
2718 * We get here only in the corner case where we are trying to extend
2719 * the relation but we found a pre-existing buffer. This can happen
2720 * because a prior attempt at extending the relation failed, and
2721 * because mdread doesn't complain about reads beyond EOF (when
2722 * zero_damaged_pages is ON) and so a previous attempt to read a block
2723 * beyond EOF could have left a "valid" zero-filled buffer.
2724 *
2725 * This has also been observed when the relation was overwritten by an
2726 * external process. Since the legitimate cases should always have
2727 * left a zero-filled buffer, complain if not PageIsNew.
2728 */
2729 if (existing_id >= 0)
2730 {
2731 BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2732 Block buf_block;
2733 bool valid;
2734
2735 /*
2736 * Pin the existing buffer before releasing the partition lock,
2737 * preventing it from being evicted.
2738 */
2739 valid = PinBuffer(existing_hdr, strategy);
2740
2741 LWLockRelease(partition_lock);
2742 UnpinBuffer(victim_buf_hdr);
2743
2744 buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2745 buf_block = BufHdrGetBlock(existing_hdr);
2746
2747 if (valid && !PageIsNew((Page) buf_block))
2748 ereport(ERROR,
2749 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2750 existing_hdr->tag.blockNum,
2751 relpath(bmr.smgr->smgr_rlocator, fork).str)));
2752
2753 /*
2754 * We *must* do smgr[zero]extend before succeeding, else the page
2755 * will not be reserved by the kernel, and the next P_NEW call
2756 * will decide to return the same page. Clear the BM_VALID bit,
2757 * do StartBufferIO() and proceed.
2758 *
2759 * Loop to handle the very small possibility that someone re-sets
2760 * BM_VALID between our clearing it and StartBufferIO inspecting
2761 * it.
2762 */
2763 do
2764 {
2765 uint32 buf_state = LockBufHdr(existing_hdr);
2766
2767 buf_state &= ~BM_VALID;
2768 UnlockBufHdr(existing_hdr, buf_state);
2769 } while (!StartBufferIO(existing_hdr, true, false));
2770 }
2771 else
2772 {
2773 uint32 buf_state;
2774
2775 buf_state = LockBufHdr(victim_buf_hdr);
2776
2777 /* some sanity checks while we hold the buffer header lock */
2778 Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2779 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2780
2781 victim_buf_hdr->tag = tag;
2782
2783 buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2784 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2785 buf_state |= BM_PERMANENT;
2786
2787 UnlockBufHdr(victim_buf_hdr, buf_state);
2788
2789 LWLockRelease(partition_lock);
2790
2791 /* XXX: could combine the locked operations in it with the above */
2792 StartBufferIO(victim_buf_hdr, true, false);
2793 }
2794 }
2795
2796 io_start = pgstat_prepare_io_time(track_io_timing);
2797
2798 /*
2799 * Note: if smgrzeroextend fails, we will end up with buffers that are
2800 * allocated but not marked BM_VALID. The next relation extension will
2801 * still select the same block number (because the relation didn't get any
2802 * longer on disk) and so future attempts to extend the relation will find
2803 * the same buffers (if they have not been recycled) but come right back
2804 * here to try smgrzeroextend again.
2805 *
2806 * We don't need to set checksum for all-zero pages.
2807 */
2808 smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2809
2810 /*
2811 * Release the file-extension lock; it's now OK for someone else to extend
2812 * the relation some more.
2813 *
2814 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2815 * take noticeable time.
2816 */
2817 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2818 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2819
2820 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2821 io_start, 1, extend_by * BLCKSZ);
2822
2823 /* Set BM_VALID, terminate IO, and wake up any waiters */
2824 for (uint32 i = 0; i < extend_by; i++)
2825 {
2826 Buffer buf = buffers[i];
2827 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2828 bool lock = false;
2829
2830 if (flags & EB_LOCK_FIRST && i == 0)
2831 lock = true;
2832 else if (flags & EB_LOCK_TARGET)
2833 {
2834 Assert(extend_upto != InvalidBlockNumber);
2835 if (first_block + i + 1 == extend_upto)
2836 lock = true;
2837 }
2838
2839 if (lock)
2840 LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2841
2842 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2843 }
2844
2845 pgBufferUsage.shared_blks_written += extend_by;
2846
2847 *extended_by = extend_by;
2848
2849 return first_block;
2850}
#define MaxBlockNumber
Definition: block.h:35
#define BM_JUST_DIRTIED
Definition: buf_internals.h:74
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:72
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2531
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:6010
void * Block
Definition: bufmgr.h:26
@ EB_LOCK_TARGET
Definition: bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:90
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:75
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
#define MemSet(start, val, len)
Definition: c.h:1020
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
#define ExclusiveLock
Definition: lockdefs.h:42
@ IOOP_EXTEND
Definition: pgstat.h:312
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
#define relpath(rlocator, forknum)
Definition: relpath.h:150
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:819
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:649
int64 shared_blks_written
Definition: instrument.h:29
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:47

References Assert(), buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errmsg(), ERROR, ExclusiveLock, GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), RelFileLocatorBackend::locator, LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), BufferManagerRelation::rel, relpath, BufferManagerRelation::relpersistence, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_written, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, SMgrRelationData::smgr_rlocator, smgrnblocks(), smgrzeroextend(), StartBufferIO(), BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 922 of file bufmgr.c.

928{
929 BlockNumber current_size;
930 uint32 extended_by = 0;
931 Buffer buffer = InvalidBuffer;
932 Buffer buffers[64];
933
934 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
935 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
936 Assert(extend_to != InvalidBlockNumber && extend_to > 0);
937
938 if (bmr.smgr == NULL)
939 {
940 bmr.smgr = RelationGetSmgr(bmr.rel);
941 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
942 }
943
944 /*
945 * If desired, create the file if it doesn't exist. If
946 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
947 * an smgrexists call.
948 */
949 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
950 (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
951 bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
952 !smgrexists(bmr.smgr, fork))
953 {
954 LockRelationForExtension(bmr.rel, ExclusiveLock);
955
956 /* recheck, fork might have been created concurrently */
957 if (!smgrexists(bmr.smgr, fork))
958 smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
959
960 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
961 }
962
963 /*
964 * If requested, invalidate size cache, so that smgrnblocks asks the
965 * kernel.
966 */
967 if (flags & EB_CLEAR_SIZE_CACHE)
968 bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
969
970 /*
971 * Estimate how many pages we'll need to extend by. This avoids acquiring
972 * unnecessarily many victim buffers.
973 */
974 current_size = smgrnblocks(bmr.smgr, fork);
975
976 /*
977 * Since no-one else can be looking at the page contents yet, there is no
978 * difference between an exclusive lock and a cleanup-strength lock. Note
979 * that we pass the original mode to ReadBuffer_common() below, when
980 * falling back to reading the buffer due to a concurrent relation extension.
981 */
982 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
983 flags |= EB_LOCK_TARGET;
984
985 while (current_size < extend_to)
986 {
987 uint32 num_pages = lengthof(buffers);
988 BlockNumber first_block;
989
990 if ((uint64) current_size + num_pages > extend_to)
991 num_pages = extend_to - current_size;
992
993 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
994 num_pages, extend_to,
995 buffers, &extended_by);
996
997 current_size = first_block + extended_by;
998 Assert(num_pages != 0 || current_size >= extend_to);
999
1000 for (uint32 i = 0; i < extended_by; i++)
1001 {
1002 if (first_block + i != extend_to - 1)
1003 ReleaseBuffer(buffers[i]);
1004 else
1005 buffer = buffers[i];
1006 }
1007 }
1008
1009 /*
1010 * It's possible that another backend concurrently extended the relation.
1011 * In that case read the buffer.
1012 *
1013 * XXX: Should we control this via a flag?
1014 */
1015 if (buffer == InvalidBuffer)
1016 {
1017 Assert(extended_by == 0);
1018 buffer = ReadBuffer_common(bmr.rel, bmr.smgr, bmr.relpersistence,
1019 fork, extend_to - 1, mode, strategy);
1020 }
1021
1022 return buffer;
1023}
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1193
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5338
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:84
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:47
#define lengthof(array)
Definition: c.h:788
static PgChecksumMode mode
Definition: pg_checksums.c:55
static int64 current_size
Definition: pg_checksums.c:63

References Assert(), PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RelationData::rd_rel, ReadBuffer_common(), BufferManagerRelation::rel, RelationGetSmgr(), ReleaseBuffer(), BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().
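A minimal usage sketch (hypothetical helper, following the fsm_extend()/vm_extend() pattern): make sure a given block exists, creating the fork first if needed, and get that page back locked.

static Buffer
extend_to_sketch(Relation rel, BlockNumber target_block)
{
    /*
     * Ensure the relation reaches target_block + 1 blocks; the flags and
     * mode here are illustrative choices, not the only valid combination.
     */
    return ExtendBufferedRelTo(BMR_REL(rel), MAIN_FORKNUM, NULL,
                               EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                               target_block + 1,
                               RBM_ZERO_AND_LOCK);
}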

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 4799 of file bufmgr.c.

4802{
4803 BlockNumber curBlock;
4804
4805 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4806 {
4807 uint32 bufHash; /* hash value for tag */
4808 BufferTag bufTag; /* identity of requested block */
4809 LWLock *bufPartitionLock; /* buffer partition lock for it */
4810 int buf_id;
4811 BufferDesc *bufHdr;
4812 uint32 buf_state;
4813
4814 /* create a tag so we can lookup the buffer */
4815 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4816
4817 /* determine its hash code and partition lock ID */
4818 bufHash = BufTableHashCode(&bufTag);
4819 bufPartitionLock = BufMappingPartitionLock(bufHash);
4820
4821 /* Check that it is in the buffer pool. If not, do nothing. */
4822 LWLockAcquire(bufPartitionLock, LW_SHARED);
4823 buf_id = BufTableLookup(&bufTag, bufHash);
4824 LWLockRelease(bufPartitionLock);
4825
4826 if (buf_id < 0)
4827 continue;
4828
4829 bufHdr = GetBufferDescriptor(buf_id);
4830
4831 /*
4832 * We need to lock the buffer header and recheck if the buffer is
4833 * still associated with the same block because the buffer could be
4834 * evicted by some other backend loading blocks for a different
4835 * relation after we release lock on the BufMapping table.
4836 */
4837 buf_state = LockBufHdr(bufHdr);
4838
4839 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4840 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4841 bufHdr->tag.blockNum >= firstDelBlock)
4842 InvalidateBuffer(bufHdr); /* releases spinlock */
4843 else
4844 UnlockBufHdr(bufHdr, buf_state);
4845 }
4846}

References buftag::blockNum, BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), BufferDesc::tag, and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc *  buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 4258 of file bufmgr.c.

4260{
4261 XLogRecPtr recptr;
4262 ErrorContextCallback errcallback;
4263 instr_time io_start;
4264 Block bufBlock;
4265 char *bufToWrite;
4266 uint32 buf_state;
4267
4268 /*
4269 * Try to start an I/O operation. If StartBufferIO returns false, then
4270 * someone else flushed the buffer before we could, so we need not do
4271 * anything.
4272 */
4273 if (!StartBufferIO(buf, false, false))
4274 return;
4275
4276 /* Setup error traceback support for ereport() */
4277 errcallback.callback = shared_buffer_write_error_callback;
4278 errcallback.arg = buf;
4279 errcallback.previous = error_context_stack;
4280 error_context_stack = &errcallback;
4281
4282 /* Find smgr relation for buffer */
4283 if (reln == NULL)
4284 reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
4285
4286 TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4287 buf->tag.blockNum,
4288 reln->smgr_rlocator.locator.spcOid,
4289 reln->smgr_rlocator.locator.dbOid,
4290 reln->smgr_rlocator.locator.relNumber);
4291
4292 buf_state = LockBufHdr(buf);
4293
4294 /*
4295 * Run PageGetLSN while holding header lock, since we don't have the
4296 * buffer locked exclusively in all cases.
4297 */
4298 recptr = BufferGetLSN(buf);
4299
4300 /* To check if block content changes while flushing. - vadim 01/17/97 */
4301 buf_state &= ~BM_JUST_DIRTIED;
4302 UnlockBufHdr(buf, buf_state);
4303
4304 /*
4305 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4306 * rule that log updates must hit disk before any of the data-file changes
4307 * they describe do.
4308 *
4309 * However, this rule does not apply to unlogged relations, which will be
4310 * lost after a crash anyway. Most unlogged relation pages do not bear
4311 * LSNs since we never emit WAL records for them, and therefore flushing
4312 * up through the buffer LSN would be useless, but harmless. However,
4313 * GiST indexes use LSNs internally to track page-splits, and therefore
4314 * unlogged GiST pages bear "fake" LSNs generated by
4315 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4316 * LSN counter could advance past the WAL insertion point; and if it did
4317 * happen, attempting to flush WAL through that location would fail, with
4318 * disastrous system-wide consequences. To make sure that can't happen,
4319 * skip the flush if the buffer isn't permanent.
4320 */
4321 if (buf_state & BM_PERMANENT)
4322 XLogFlush(recptr);
4323
4324 /*
4325 * Now it's safe to write the buffer to disk. Note that no one else should
4326 * have been able to write it while we were busy with log flushing,
4327 * because we got the exclusive right to perform I/O by setting the
4328 * BM_IO_IN_PROGRESS bit.
4329 */
4330 bufBlock = BufHdrGetBlock(buf);
4331
4332 /*
4333 * Update page checksum if desired. Since we have only shared lock on the
4334 * buffer, other processes might be updating hint bits in it, so we must
4335 * copy the page to private storage if we do checksumming.
4336 */
4337 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4338
4339 io_start = pgstat_prepare_io_time(track_io_timing);
4340
4341 /*
4342 * bufToWrite is either the shared buffer or a copy, as appropriate.
4343 */
4344 smgrwrite(reln,
4345 BufTagGetForkNum(&buf->tag),
4346 buf->tag.blockNum,
4347 bufToWrite,
4348 false);
4349
4350 /*
4351 * When a strategy is in use, only flushes of dirty buffers already in the
4352 * strategy ring are counted as strategy writes (IOCONTEXT
4353 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4354 * statistics tracking.
4355 *
4356 * If a shared buffer initially added to the ring must be flushed before
4357 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4358 *
4359 * If a shared buffer which was added to the ring later because the
4360 * current strategy buffer is pinned or in use or because all strategy
4361 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4362 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4363 * (from_ring will be false).
4364 *
4365 * When a strategy is not in use, the write can only be a "regular" write
4366 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4367 */
4368 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
4369 IOOP_WRITE, io_start, 1, BLCKSZ);
4370
4371 pgBufferUsage.shared_blks_written++;
4372
4373 /*
4374 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4375 * end the BM_IO_IN_PROGRESS state.
4376 */
4377 TerminateBufferIO(buf, true, 0, true, false);
4378
4379 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4380 buf->tag.blockNum,
4381 reln->smgr_rlocator.locator.spcOid,
4382 reln->smgr_rlocator.locator.dbOid,
4383 reln->smgr_rlocator.locator.relNumber);
4384
4385 /* Pop the error context stack */
4386 error_context_stack = errcallback.previous;
4387}
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:73
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6165
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1509
ErrorContextCallback * error_context_stack
Definition: elog.c:95
@ IOOP_WRITE
Definition: pgstat.h:314
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:131
struct ErrorContextCallback * previous
Definition: elog.h:297
void(* callback)(void *arg)
Definition: elog.h:298
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2780

References ErrorContextCallback::arg, BM_PERMANENT, buf, BufferGetLSN, BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, RelFileLocator::dbOid, error_context_stack, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITE, RelFileLocatorBackend::locator, LockBufHdr(), PageSetChecksumCopy(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, RelFileLocator::relNumber, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rlocator, smgropen(), smgrwrite(), RelFileLocator::spcOid, StartBufferIO(), TerminateBufferIO(), track_io_timing, UnlockBufHdr(), and XLogFlush().

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), and SyncOneBuffer().
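For reference, the calling convention used at the FlushDatabaseBuffers()/FlushRelationBuffers() call sites (a fragment, with bufHdr and buf_state as those callers' locals): the buffer must be pinned and its content lock held at least in share mode before FlushBuffer() runs, since StartBufferIO() and the checksum copy assume a stable page.

    buf_state = LockBufHdr(bufHdr);
    if ((buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    {
        PinBuffer_Locked(bufHdr);   /* releases the header spinlock */
        LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
        FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
        LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
        UnpinBuffer(bufHdr);
    }
    else
        UnlockBufHdr(bufHdr, buf_state);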

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 5276 of file bufmgr.c.

5277{
5278 int i;
5279 BufferDesc *bufHdr;
5280
5281 for (i = 0; i < NBuffers; i++)
5282 {
5283 uint32 buf_state;
5284
5285 bufHdr = GetBufferDescriptor(i);
5286
5287 /*
5288 * As in DropRelationBuffers, an unlocked precheck should be safe and
5289 * saves some cycles.
5290 */
5291 if (bufHdr->tag.dbOid != dbid)
5292 continue;
5293
5294 /* Make sure we can handle the pin */
5295 ReservePrivateRefCountEntry();
5296 ResourceOwnerEnlarge(CurrentResourceOwner);
5297
5298 buf_state = LockBufHdr(bufHdr);
5299 if (bufHdr->tag.dbOid == dbid &&
5300 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5301 {
5302 PinBuffer_Locked(bufHdr);
5303 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
5304 FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5305 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
5306 UnpinBuffer(bufHdr);
5307 }
5308 else
5309 UnlockBufHdr(bufHdr, buf_state);
5310 }
5311}

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), CurrentResourceOwner, buftag::dbOid, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().
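A minimal usage sketch (hypothetical helper; cf. dbase_redo()):

static void
flush_db_sketch(Oid dbid)
{
    /*
     * Write back every dirty page of the given database. The pages stay in
     * shared buffers; only their on-disk copies are brought up to date.
     */
    FlushDatabaseBuffers(dbid);
}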

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 5318 of file bufmgr.c.

5319{
5320 BufferDesc *bufHdr;
5321
5322 /* currently not needed, but no fundamental reason not to support */
5323 Assert(!BufferIsLocal(buffer));
5324
5325 Assert(BufferIsPinned(buffer));
5326
5327 bufHdr = GetBufferDescriptor(buffer - 1);
5328
5329 Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
5330
5331 FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5332}

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), invalidate_rel_block(), and XLogReadBufferForRedoExtended().
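A minimal calling fragment (buffer is assumed to be a valid, pinned shared buffer): the caller must already hold the buffer's content lock, as the assertion above enforces.

    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    FlushOneBuffer(buffer);
    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);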

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 4908 of file bufmgr.c.

4909{
4910 int i;
4911 BufferDesc *bufHdr;
4912 SMgrRelation srel = RelationGetSmgr(rel);
4913
4914 if (RelationUsesLocalBuffers(rel))
4915 {
4916 for (i = 0; i < NLocBuffer; i++)
4917 {
4918 uint32 buf_state;
4919
4920 bufHdr = GetLocalBufferDescriptor(i);
4921 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4922 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4923 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4924 {
4925 ErrorContextCallback errcallback;
4926
4927 /* Setup error traceback support for ereport() */
4928 errcallback.callback = local_buffer_write_error_callback;
4929 errcallback.arg = bufHdr;
4930 errcallback.previous = error_context_stack;
4931 error_context_stack = &errcallback;
4932
4933 /* Make sure we can handle the pin */
4934 ReservePrivateRefCountEntry();
4935 ResourceOwnerEnlarge(CurrentResourceOwner);
4936
4937 /*
4938 * Pin/unpin mostly to make valgrind work, but it also seems
4939 * like the right thing to do.
4940 */
4941 PinLocalBuffer(bufHdr, false);
4942
4943
4944 FlushLocalBuffer(bufHdr, srel);
4945
4947
4948 /* Pop the error context stack */
4949 error_context_stack = errcallback.previous;
4950 }
4951 }
4952
4953 return;
4954 }
4955
4956 for (i = 0; i < NBuffers; i++)
4957 {
4958 uint32 buf_state;
4959
4960 bufHdr = GetBufferDescriptor(i);
4961
4962 /*
4963 * As in DropRelationBuffers, an unlocked precheck should be safe and
4964 * saves some cycles.
4965 */
4966 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4967 continue;
4968
4969 /* Make sure we can handle the pin */
4970 ReservePrivateRefCountEntry();
4971 ResourceOwnerEnlarge(CurrentResourceOwner);
4972
4973 buf_state = LockBufHdr(bufHdr);
4974 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4975 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4976 {
4977 PinBuffer_Locked(bufHdr);
4978 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4979 FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4980 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4981 UnpinBuffer(bufHdr);
4982 }
4983 else
4984 UnlockBufHdr(bufHdr, buf_state);
4985 }
4986}
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6181
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition: localbuf.c:182
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:839
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:803
int NLocBuffer
Definition: localbuf.c:44

References ErrorContextCallback::arg, BM_DIRTY, BM_VALID, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), FlushLocalBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, local_buffer_write_error_callback(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, pg_atomic_read_u32(), PinBuffer_Locked(), PinLocalBuffer(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().
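A minimal usage sketch (hypothetical helper; cf. heapam_relation_copy_data()):

static void
flush_rel_sketch(Relation rel)
{
    /*
     * Write back all dirty pages of the relation (local or shared buffers)
     * so its on-disk file is current, e.g. before copying it block by block.
     */
    FlushRelationBuffers(rel);
}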

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation *  smgrs,
int  nrels 
)

Definition at line 4998 of file bufmgr.c.

4999{
5000 int i;
5001 SMgrSortArray *srels;
5002 bool use_bsearch;
5003
5004 if (nrels == 0)
5005 return;
5006
5007 /* fill-in array for qsort */
5008 srels = palloc(sizeof(SMgrSortArray) * nrels);
5009
5010 for (i = 0; i < nrels; i++)
5011 {
5012 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5013
5014 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5015 srels[i].srel = smgrs[i];
5016 }
5017
5018 /*
5019 * Save the bsearch overhead for a low number of relations to sync. See
5020 * DropRelationsAllBuffers for details.
5021 */
5022 use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
5023
5024 /* sort the list of SMgrRelations if necessary */
5025 if (use_bsearch)
5026 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5027
5028 for (i = 0; i < NBuffers; i++)
5029 {
5030 SMgrSortArray *srelent = NULL;
5031 BufferDesc *bufHdr = GetBufferDescriptor(i);
5032 uint32 buf_state;
5033
5034 /*
5035 * As in DropRelationBuffers, an unlocked precheck should be safe and
5036 * saves some cycles.
5037 */
5038
5039 if (!use_bsearch)
5040 {
5041 int j;
5042
5043 for (j = 0; j < nrels; j++)
5044 {
5045 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5046 {
5047 srelent = &srels[j];
5048 break;
5049 }
5050 }
5051 }
5052 else
5053 {
5054 RelFileLocator rlocator;
5055
5056 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5057 srelent = bsearch(&rlocator,
5058 srels, nrels, sizeof(SMgrSortArray),
5059 rlocator_comparator);
5060 }
5061
5062 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5063 if (srelent == NULL)
5064 continue;
5065
5066 /* Make sure we can handle the pin */
5067 ReservePrivateRefCountEntry();
5068 ResourceOwnerEnlarge(CurrentResourceOwner);
5069
5070 buf_state = LockBufHdr(bufHdr);
5071 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5072 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5073 {
5074 PinBuffer_Locked(bufHdr);
5075 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
5076 FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5077 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
5078 UnpinBuffer(bufHdr);
5079 }
5080 else
5081 UnlockBufHdr(bufHdr, buf_state);
5082 }
5083
5084 pfree(srels);
5085}
SMgrRelation srel
Definition: bufmgr.c:140
RelFileLocator rlocator
Definition: bufmgr.c:139

References Assert(), BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, RelFileLocatorBackend::locator, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, palloc(), pfree(), PinBuffer_Locked(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrSortArray::rlocator, rlocator_comparator(), SMgrRelationData::smgr_rlocator, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().
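A minimal usage sketch (hypothetical helper; cf. smgrdosyncall()): flushing several relations in one call costs a single buffer-pool pass instead of one scan per relation.

static void
flush_two_rels_sketch(SMgrRelation a, SMgrRelation b)
{
    SMgrRelation srels[2] = {a, b};

    /* Neither relation may be temporary; one NBuffers scan covers both. */
    FlushRelationsAllBuffers(srels, 2);
}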

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry *  ref)
static

Definition at line 448 of file bufmgr.c.

449{
450 Assert(ref->refcount == 0);
451
452 if (ref >= &PrivateRefCountArray[0] &&
453 ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
454 {
455 ref->buffer = InvalidBuffer;
456
457 /*
458 * Mark the just used entry as reserved - in many scenarios that
459 * allows us to avoid ever having to search the array/hash for free
460 * entries.
461 */
462 ReservedRefCountEntry = ref;
463 }
464 else
465 {
466 bool found;
467 Buffer buffer = ref->buffer;
468
469 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
470 Assert(found);
471 Assert(PrivateRefCountOverflowed > 0);
472 PrivateRefCountOverflowed--;
473 }
474}
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:219
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:952
@ HASH_REMOVE
Definition: hsearch.h:115

References Assert(), PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by UnpinBufferNoOwner().

◆ GetAdditionalPinLimit()

uint32 GetAdditionalPinLimit ( void  )

Definition at line 2505 of file bufmgr.c.

2506{
2507 uint32 estimated_pins_held;
2508
2509 /*
2510 * We get the number of "overflowed" pins for free, but don't know the
2511 * number of pins in PrivateRefCountArray. The cost of calculating that
2512 * exactly doesn't seem worth it, so just assume the max.
2513 */
2514 estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2515
2516 /* Is this backend already holding more than its fair share? */
2517 if (estimated_pins_held > MaxProportionalPins)
2518 return 0;
2519
2520 return MaxProportionalPins - estimated_pins_held;
2521}
static uint32 MaxProportionalPins
Definition: bufmgr.c:221

References MaxProportionalPins, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by LimitAdditionalPins(), and read_stream_start_pending_read().
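A minimal usage sketch (hypothetical helper, in the style of read_stream_start_pending_read()): clamp a batch of prospective pins to this backend's remaining fair share.

static uint32
clamp_pin_batch_sketch(uint32 pins_wanted)
{
    uint32      remaining = GetAdditionalPinLimit();

    /* Always permit one pin so the caller can make progress. */
    return Max(1, Min(pins_wanted, remaining));
}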

◆ GetPinLimit()

uint32 GetPinLimit ( void  )

Definition at line 2493 of file bufmgr.c.

2494{
2495 return MaxProportionalPins;
2496}

References MaxProportionalPins.

Referenced by GetAccessStrategy(), and read_stream_begin_impl().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 425 of file bufmgr.c.

426{
427 PrivateRefCountEntry *ref;
428
429 Assert(BufferIsValid(buffer));
430 Assert(!BufferIsLocal(buffer));
431
432 /*
433 * Not moving the entry - that's ok for the current users, but we might
434 * want to change this one day.
435 */
436 ref = GetPrivateRefCountEntry(buffer, false);
437
438 if (ref == NULL)
439 return 0;
440 return ref->refcount;
441}
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:351

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), DebugPrintBufferRefcount(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), MarkBufferDirtyHint(), and ReadRecentBuffer().
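A fragment in the style of its caller CheckBufferIsPinnedOnce() inside bufmgr.c (the function is static, so it is only usable there; buffer is a pinned shared buffer):

    /* e.g. refuse a cleanup-strength operation unless this is our only pin */
    if (GetPrivateRefCount(buffer) != 1)
        elog(ERROR, "incorrect local pin count: %d",
             GetPrivateRefCount(buffer));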

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 351 of file bufmgr.c.

352{
353 PrivateRefCountEntry *res;
354 int i;
355
356 Assert(BufferIsValid(buffer));
357 Assert(!BufferIsLocal(buffer));
358
359 /*
360 * First search for references in the array, that'll be sufficient in the
361 * majority of cases.
362 */
363 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
364 {
365 res = &PrivateRefCountArray[i];
366
367 if (res->buffer == buffer)
368 return res;
369 }
370
371 /*
372 * By here we know that the buffer, if already pinned, isn't residing in
373 * the array.
374 *
375 * Only look up the buffer in the hashtable if we've previously overflowed
376 * into it.
377 */
378 if (PrivateRefCountOverflowed == 0)
379 return NULL;
380
381 res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
382
383 if (res == NULL)
384 return NULL;
385 else if (!do_move)
386 {
387 /* caller doesn't want us to move the hash entry into the array */
388 return res;
389 }
390 else
391 {
392 /* move buffer from hashtable into the free array slot */
393 bool found;
394 PrivateRefCountEntry *free;
395
396 /* Ensure there's a free array slot */
397 ReservePrivateRefCountEntry();
398
399 /* Use up the reserved slot */
400 Assert(ReservedRefCountEntry != NULL);
401 free = ReservedRefCountEntry;
402 ReservedRefCountEntry = NULL;
403 Assert(free->buffer == InvalidBuffer);
404
405 /* and fill it */
406 free->buffer = buffer;
407 free->refcount = res->refcount;
408
409 /* delete from hashtable */
410 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
411 Assert(found);
412 Assert(PrivateRefCountOverflowed > 0);
413 PrivateRefCountOverflowed--;
414
415 return free;
416 }
417}
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBufferNoOwner().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 2333 of file bufmgr.c.

2334{
2335 BufferDesc *buf_hdr;
2336 Buffer buf;
2337 uint32 buf_state;
2338 bool from_ring;
2339
2340 /*
2341 * Ensure, while the spinlock's not yet held, that there's a free refcount
2342 * entry, and a resource owner slot for the pin.
2343 */
2344 ReservePrivateRefCountEntry();
2345 ResourceOwnerEnlarge(CurrentResourceOwner);
2346
2347 /* we return here if a prospective victim buffer gets used concurrently */
2348again:
2349
2350 /*
2351 * Select a victim buffer. The buffer is returned with its header
2352 * spinlock still held!
2353 */
2354 buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2355 buf = BufferDescriptorGetBuffer(buf_hdr);
2356
2357 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
2358
2359 /* Pin the buffer and then release the buffer spinlock */
2360 PinBuffer_Locked(buf_hdr);
2361
2362 /*
2363 * We shouldn't have any other pins for this buffer.
2364 */
2365 CheckBufferIsPinnedOnce(buf);
2366
2367 /*
2368 * If the buffer was dirty, try to write it out. There is a race
2369 * condition here, in that someone might dirty it after we released the
2370 * buffer header lock above, or even while we are writing it out (since
2371 * our share-lock won't prevent hint-bit updates). We will recheck the
2372 * dirty bit after re-locking the buffer header.
2373 */
2374 if (buf_state & BM_DIRTY)
2375 {
2376 LWLock *content_lock;
2377
2378 Assert(buf_state & BM_TAG_VALID);
2379 Assert(buf_state & BM_VALID);
2380
2381 /*
2382 * We need a share-lock on the buffer contents to write it out (else
2383 * we might write invalid data, eg because someone else is compacting
2384 * the page contents while we write). We must use a conditional lock
2385 * acquisition here to avoid deadlock. Even though the buffer was not
2386 * pinned (and therefore surely not locked) when StrategyGetBuffer
2387 * returned it, someone else could have pinned and exclusive-locked it
2388 * by the time we get here. If we try to get the lock unconditionally,
2389 * we'd block waiting for them; if they later block waiting for us,
2390 * deadlock ensues. (This has been observed to happen when two
2391 * backends are both trying to split btree index pages, and the second
2392 * one just happens to be trying to split the page the first one got
2393 * from StrategyGetBuffer.)
2394 */
2395 content_lock = BufferDescriptorGetContentLock(buf_hdr);
2396 if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2397 {
2398 /*
2399 * Someone else has locked the buffer, so give it up and loop back
2400 * to get another one.
2401 */
2402 UnpinBuffer(buf_hdr);
2403 goto again;
2404 }
2405
2406 /*
2407 * If using a nondefault strategy, and writing the buffer would
2408 * require a WAL flush, let the strategy decide whether to go ahead
2409 * and write/reuse the buffer or to choose another victim. We need a
2410 * lock to inspect the page LSN, so this can't be done inside
2411 * StrategyGetBuffer.
2412 */
2413 if (strategy != NULL)
2414 {
2415 XLogRecPtr lsn;
2416
2417 /* Read the LSN while holding buffer header lock */
2418 buf_state = LockBufHdr(buf_hdr);
2419 lsn = BufferGetLSN(buf_hdr);
2420 UnlockBufHdr(buf_hdr, buf_state);
2421
2422 if (XLogNeedsFlush(lsn)
2423 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2424 {
2425 LWLockRelease(content_lock);
2426 UnpinBuffer(buf_hdr);
2427 goto again;
2428 }
2429 }
2430
2431 /* OK, do the I/O */
2432 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2433 LWLockRelease(content_lock);
2434
2435 ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2436 &buf_hdr->tag);
2437 }
2438
2439
2440 if (buf_state & BM_VALID)
2441 {
2442 /*
2443 * When a BufferAccessStrategy is in use, blocks evicted from shared
2444 * buffers are counted as IOOP_EVICT in the corresponding context
2445 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2446 * strategy in two cases: 1) while initially claiming buffers for the
2447 * strategy ring 2) to replace an existing strategy ring buffer
2448 * because it is pinned or in use and cannot be reused.
2449 *
2450 * Blocks evicted from buffers already in the strategy ring are
2451 * counted as IOOP_REUSE in the corresponding strategy context.
2452 *
2453 * At this point, we can accurately count evictions and reuses,
2454 * because we have successfully claimed the valid buffer. Previously,
2455 * we may have been forced to release the buffer due to concurrent
2456 * pinners or erroring out.
2457 */
2458 pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2459 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2460 }
2461
2462 /*
2463 * If the buffer has an entry in the buffer mapping table, delete it. This
2464 * can fail because another backend could have pinned or dirtied the
2465 * buffer.
2466 */
2467 if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2468 {
2469 UnpinBuffer(buf_hdr);
2470 goto again;
2471 }
2472
2473 /* a final set of sanity checks */
2474#ifdef USE_ASSERT_CHECKING
2475 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2476
2477 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2478 Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2479
2480 CheckBufferIsPinnedOnce(buf);
2481#endif
2482
2483 return buf;
2484}
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5619
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:6378
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:171
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:723
@ IOOP_EVICT
Definition: pgstat.h:305
@ IOOP_REUSE
Definition: pgstat.h:308
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3124

References Assert(), BackendWritebackContext, BM_DIRTY, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferGetLSN, CheckBufferIsPinnedOnce(), CurrentResourceOwner, FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBufHdr(), LW_SHARED, LWLockConditionalAcquire(), LWLockRelease(), pg_atomic_read_u32(), pgstat_count_io_op(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), BufferDesc::state, StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().
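A fragment sketching the caller's contract (victim_buf and victim_buf_hdr are locals as in BufferAlloc(); the function is static to bufmgr.c):

    /* The result is pinned, not in the mapping table, and ready to re-tag. */
    victim_buf = GetVictimBuffer(strategy, io_context);
    victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);

    /* ... BufTableInsert() the new tag, then StartBufferIO() ... */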

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 5794 of file bufmgr.c.

5795{
5796 int bufid = GetStartupBufferPinWaitBufId();
5797
5798 /*
5799 * If we get woken slowly then it's possible that the Startup process was
5800 * already woken by other backends before we got here. It's also possible
5801 * that we got here via multiple interrupts or interrupts at inappropriate
5802 * times, so make sure we do nothing if the bufid is not set.
5803 */
5804 if (bufid < 0)
5805 return false;
5806
5807 if (GetPrivateRefCount(bufid + 1) > 0)
5808 return true;
5809
5810 return false;
5811}
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:766

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().
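
The callers use this as a cheap re-check after being signalled. A minimal sketch of that caller-side pattern (the handler name is hypothetical; this is not the actual ProcessRecoveryConflictInterrupt() code):

 static void
 handle_buffer_pin_conflict(void)
 {
     /* Ignore stale or misdirected wakeups; only act if we really hold
      * the pin the Startup process is waiting for. */
     if (!HoldingBufferPinThatDelaysRecovery())
         return;

     /* We are the blocker: resolve the conflict, e.g. by erroring out. */
 }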

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

◆ InitBufferManagerAccess()

void InitBufferManagerAccess ( void  )

Definition at line 3982 of file bufmgr.c.

3983{
3984 HASHCTL hash_ctl;
3985
3986 /*
3987 * An advisory limit on the number of pins each backend should hold, based
3988 * on shared_buffers and the maximum number of connections possible.
3989 * That's very pessimistic, but outside toy-sized shared_buffers it should
3990 * allow plenty of pins. LimitAdditionalPins() and
3991 * GetAdditionalPinLimit() can be used to check the remaining balance.
3992 */
3993 MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
3994
3995 memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3996
3997 hash_ctl.keysize = sizeof(int32);
3998 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3999
4000 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4001 HASH_ELEM | HASH_BLOBS);
4002
4003 /*
4004 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4005 * the corresponding phase of backend shutdown.
4006 */
4007 Assert(MyProc != NULL);
4008 on_shmem_exit(AtProcExit_Buffers, 0);
4009}
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:4016
struct PrivateRefCountEntry PrivateRefCountEntry
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:358
int MaxBackends
Definition: globals.c:146
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
#define NUM_AUXILIARY_PROCS
Definition: proc.h:463
PGPROC * MyProc
Definition: proc.c:66
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76

References Assert(), AtProcExit_Buffers(), HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MaxBackends, MaxProportionalPins, MyProc, NBuffers, NUM_AUXILIARY_PROCS, on_shmem_exit(), PrivateRefCountArray, and PrivateRefCountHash.

Referenced by BaseInit().
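
The PrivateRefCountHash setup above follows the standard dynahash recipe. A self-contained sketch of that recipe with hypothetical names (MyEntry, my_hash), assuming an int32 key stored as the first field of the entry:

 typedef struct MyEntry
 {
     int32   key;        /* hash key; must be the first field */
     int32   value;
 } MyEntry;

 HASHCTL     ctl;
 HTAB       *my_hash;

 ctl.keysize = sizeof(int32);
 ctl.entrysize = sizeof(MyEntry);
 my_hash = hash_create("MyHash", 100, &ctl,
                       HASH_ELEM | HASH_BLOBS);  /* HASH_BLOBS: binary key */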

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 2171 of file bufmgr.c.

2172{
2173 BufferTag oldTag;
2174 uint32 oldHash; /* hash value for oldTag */
2175 LWLock *oldPartitionLock; /* buffer partition lock for it */
2176 uint32 oldFlags;
2177 uint32 buf_state;
2178
2179 /* Save the original buffer tag before dropping the spinlock */
2180 oldTag = buf->tag;
2181
2182 buf_state = pg_atomic_read_u32(&buf->state);
2183 Assert(buf_state & BM_LOCKED);
2184 UnlockBufHdr(buf, buf_state);
2185
2186 /*
2187 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2188 * worth storing the hashcode in BufferDesc so we need not recompute it
2189 * here? Probably not.
2190 */
2191 oldHash = BufTableHashCode(&oldTag);
2192 oldPartitionLock = BufMappingPartitionLock(oldHash);
2193
2194retry:
2195
2196 /*
2197 * Acquire exclusive mapping lock in preparation for changing the buffer's
2198 * association.
2199 */
2200 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2201
2202 /* Re-lock the buffer header */
2203 buf_state = LockBufHdr(buf);
2204
2205 /* If it's changed while we were waiting for lock, do nothing */
2206 if (!BufferTagsEqual(&buf->tag, &oldTag))
2207 {
2208 UnlockBufHdr(buf, buf_state);
2209 LWLockRelease(oldPartitionLock);
2210 return;
2211 }
2212
2213 /*
2214 * We assume the reason for it to be pinned is that either we were
2215 * asynchronously reading the page in before erroring out or someone else
2216 * is flushing the page out. Wait for the IO to finish. (This could be
2217 * an infinite loop if the refcount is messed up... it would be nice to
2218 * time out after awhile, but there seems no way to be sure how many loops
2219 * may be needed. Note that if the other guy has pinned the buffer but
2220 * not yet done StartBufferIO, WaitIO will fall through and we'll
2221 * effectively be busy-looping here.)
2222 */
2223 if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2224 {
2225 UnlockBufHdr(buf, buf_state);
2226 LWLockRelease(oldPartitionLock);
2227 /* safety check: should definitely not be our *own* pin */
2228 if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
2229 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2230 WaitIO(buf);
2231 goto retry;
2232 }
2233
2234 /*
2235 * Clear out the buffer's tag and flags. We must do this to ensure that
2236 * linear scans of the buffer array don't think the buffer is valid.
2237 */
2238 oldFlags = buf_state & BUF_FLAG_MASK;
2239 ClearBufferTag(&buf->tag);
2240 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2241 UnlockBufHdr(buf, buf_state);
2242
2243 /*
2244 * Remove the buffer from the lookup hashtable, if it was in there.
2245 */
2246 if (oldFlags & BM_TAG_VALID)
2247 BufTableDelete(&oldTag, oldHash);
2248
2249 /*
2250 * Done with mapping lock.
2251 */
2252 LWLockRelease(oldPartitionLock);
2253}
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:53
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5931

References Assert(), BM_LOCKED, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), UnlockBufHdr(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc buf_hdr)
static

Definition at line 2265 of file bufmgr.c.

2266{
2267 uint32 buf_state;
2268 uint32 hash;
2269 LWLock *partition_lock;
2270 BufferTag tag;
2271
2272 Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
2273
2274 /* have buffer pinned, so it's safe to read tag without lock */
2275 tag = buf_hdr->tag;
2276
2277 hash = BufTableHashCode(&tag);
2278 partition_lock = BufMappingPartitionLock(hash);
2279
2280 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2281
2282 /* lock the buffer header */
2283 buf_state = LockBufHdr(buf_hdr);
2284
2285 /*
2286 * We have the buffer pinned, so nobody else should have been able to
2287 * unset this concurrently.
2288 */
2289 Assert(buf_state & BM_TAG_VALID);
2290 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2291 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2292
2293 /*
2294 * If somebody else pinned the buffer since, or even worse, dirtied it,
2295 * give up on this buffer: It's clearly in use.
2296 */
2297 if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2298 {
2299 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2300
2301 UnlockBufHdr(buf_hdr, buf_state);
2302 LWLockRelease(partition_lock);
2303
2304 return false;
2305 }
2306
2307 /*
2308 * Clear out the buffer's tag and flags and usagecount. This is not
2309 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2310 * doing anything with the buffer. But currently it's beneficial, as the
2311 * cheaper pre-check used by several linear scans of shared buffers relies
2312 * on the tag (see e.g. FlushDatabaseBuffers()).
2313 */
2314 ClearBufferTag(&buf_hdr->tag);
2315 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2316 UnlockBufHdr(buf_hdr, buf_state);
2317
2318 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2319
2320 /* finally delete buffer from the buffer mapping table */
2321 BufTableDelete(&tag, hash);
2322
2323 LWLockRelease(partition_lock);
2324
2325 Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2326 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2327 Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
2328
2329 return true;
2330}

References Assert(), BM_DIRTY, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by EvictUnpinnedBufferInternal(), and GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 5878 of file bufmgr.c.

5879{
5880 BufferDesc *bufHdr;
5881 uint32 buf_state;
5882
5883 Assert(BufferIsValid(buffer));
5884
5885 /* see AIO related comment in LockBufferForCleanup() */
5886
5887 if (BufferIsLocal(buffer))
5888 {
5889 /* There should be exactly one pin */
5890 if (LocalRefCount[-buffer - 1] != 1)
5891 return false;
5892 /* Nobody else to wait for */
5893 return true;
5894 }
5895
5896 /* There should be exactly one local pin */
5897 if (GetPrivateRefCount(buffer) != 1)
5898 return false;
5899
5900 bufHdr = GetBufferDescriptor(buffer - 1);
5901
5902 /* caller must hold exclusive lock on buffer */
5903 Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5904 LW_EXCLUSIVE));
5905
5906 buf_state = LockBufHdr(bufHdr);
5907
5908 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5909 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5910 {
5911 /* pincount is OK. */
5912 UnlockBufHdr(bufHdr, buf_state);
5913 return true;
5914 }
5915
5916 UnlockBufHdr(bufHdr, buf_state);
5917 return false;
5918}

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsValid(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().
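
A sketch of the typical call site (cf. hashbucketcleanup()): with the content lock held exclusively and one local pin, a cleanup-strength action is attempted only if no other backend holds a pin. buf is assumed to be a valid, pinned Buffer:

 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 if (IsBufferCleanupOK(buf))
 {
     /* no concurrent pins: safe to compact the page, remove line
      * pointers, etc. */
 }
 LockBuffer(buf, BUFFER_LOCK_UNLOCK);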

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext wb_context,
IOContext  io_context 
)

Definition at line 6428 of file bufmgr.c.

6429{
6430 instr_time io_start;
6431 int i;
6432
6433 if (wb_context->nr_pending == 0)
6434 return;
6435
6436 /*
6437 * Executing the writes in-order can make them a lot faster, and allows
6438 * merging writeback requests to consecutive blocks into larger writebacks.
6439 */
6440 sort_pending_writebacks(wb_context->pending_writebacks,
6441 wb_context->nr_pending);
6442
6443 io_start = pgstat_prepare_io_time(track_io_timing);
6444
6445 /*
6446 * Coalesce neighbouring writes, but nothing else. For that we iterate
6447 * through the now-sorted array of pending flushes, and look forward to
6448 * find all neighbouring (or identical) writes.
6449 */
6450 for (i = 0; i < wb_context->nr_pending; i++)
6451 {
6452 PendingWriteback *cur;
6453 PendingWriteback *next;
6454 SMgrRelation reln;
6455 int ahead;
6456 BufferTag tag;
6457 RelFileLocator currlocator;
6458 Size nblocks = 1;
6459
6460 cur = &wb_context->pending_writebacks[i];
6461 tag = cur->tag;
6462 currlocator = BufTagGetRelFileLocator(&tag);
6463
6464 /*
6465 * Peek ahead, into following writeback requests, to see if they can
6466 * be combined with the current one.
6467 */
6468 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6469 {
6470
6471 next = &wb_context->pending_writebacks[i + ahead + 1];
6472
6473 /* different file, stop */
6474 if (!RelFileLocatorEquals(currlocator,
6475 BufTagGetRelFileLocator(&next->tag)) ||
6476 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6477 break;
6478
6479 /* ok, block queued twice, skip */
6480 if (cur->tag.blockNum == next->tag.blockNum)
6481 continue;
6482
6483 /* only merge consecutive writes */
6484 if (cur->tag.blockNum + 1 != next->tag.blockNum)
6485 break;
6486
6487 nblocks++;
6488 cur = next;
6489 }
6490
6491 i += ahead;
6492
6493 /* and finally tell the kernel to write the data to storage */
6494 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6495 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6496 }
6497
6498 /*
6499 * Assume that writeback requests are only issued for buffers containing
6500 * blocks of permanent relations.
6501 */
6502 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
6503 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6504
6505 wb_context->nr_pending = 0;
6506}
static int32 next
Definition: blutils.c:224
struct cursor * cur
Definition: ecpg.c:29
@ IOOP_WRITEBACK
Definition: pgstat.h:309
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, i, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITEBACK, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), smgrwriteback(), and track_io_timing.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().
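
A sketch of the producer/consumer flow, assuming an initialized WritebackContext wb_context and a BufferTag tag for a buffer that was just written out: flushed buffers are queued with ScheduleBufferTagForWriteback(), and the queue is drained (sorted, coalesced, and handed to smgrwriteback()) either when it fills up or explicitly, as BufferSync() does at the end of a checkpoint cycle:

 ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &tag);
 /* ... write more buffers, scheduling each one ... */
 IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);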

◆ LimitAdditionalPins()

void LimitAdditionalPins ( uint32 additional_pins)

Definition at line 2531 of file bufmgr.c.

2532{
2533 uint32 limit;
2534
2535 if (*additional_pins <= 1)
2536 return;
2537
2538 limit = GetAdditionalPinLimit();
2539 limit = Max(limit, 1);
2540 if (limit < *additional_pins)
2541 *additional_pins = limit;
2542}
uint32 GetAdditionalPinLimit(void)
Definition: bufmgr.c:2505
#define Max(x, y)
Definition: c.h:998

References GetAdditionalPinLimit(), and Max.

Referenced by ExtendBufferedRelShared().
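
A sketch of the intended use (cf. ExtendBufferedRelShared()): clamp a desired batch of new pins to this backend's remaining fair share before acquiring them. The batch size 64 is hypothetical:

 uint32      extend_by = 64;     /* desired number of new buffers */

 LimitAdditionalPins(&extend_by);
 /* extend_by is now at least 1 and at most the advisory pin budget */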

◆ local_buffer_readv_complete()

static PgAioResult local_buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 7397 of file bufmgr.c.

7399{
7400 return buffer_readv_complete(ioh, prior_result, cb_data, true);
7401}
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition: bufmgr.c:7142

References buffer_readv_complete().

◆ local_buffer_readv_stage()

static void local_buffer_readv_stage ( PgAioHandle ioh,
uint8  cb_data 
)
static

Definition at line 7391 of file bufmgr.c.

7392{
7393 buffer_stage_common(ioh, false, true);
7394}
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition: bufmgr.c:6752

References buffer_stage_common().

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 6181 of file bufmgr.c.

6182{
6183 BufferDesc *bufHdr = (BufferDesc *) arg;
6184
6185 if (bufHdr != NULL)
6186 errcontext("writing block %u of relation \"%s\"",
6187 bufHdr->tag.blockNum,
6190 BufTagGetForkNum(&bufHdr->tag)).str);
6191}
#define errcontext
Definition: elog.h:198
void * arg

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, MyProcNumber, relpathbackend, and BufferDesc::tag.

Referenced by FlushRelationBuffers().
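
A sketch of how such a callback is put to work, following the standard error-context protocol (push the callback, perform the write, pop it); bufHdr is assumed to point at the local buffer being written:

 ErrorContextCallback errcallback;

 errcallback.callback = local_buffer_write_error_callback;
 errcallback.arg = bufHdr;
 errcallback.previous = error_context_stack;
 error_context_stack = &errcallback;

 /* ... perform the write; any error report now carries the context ... */

 error_context_stack = errcallback.previous;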

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 5572 of file bufmgr.c.

5573{
5574 BufferDesc *buf;
5575
5576 Assert(BufferIsPinned(buffer));
5577 if (BufferIsLocal(buffer))
5578 return; /* local buffers need no lock */
5579
5580 buf = GetBufferDescriptor(buffer - 1);
5581
5582 if (mode == BUFFER_LOCK_UNLOCK)
5583 LWLockRelease(BufferDescriptorGetContentLock(buf));
5584 else if (mode == BUFFER_LOCK_SHARE)
5585 LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
5586 else if (mode == BUFFER_LOCK_EXCLUSIVE)
5587 LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
5588 else
5589 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5590}
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:197
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:198

References Assert(), buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), and mode.

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), BitmapHeapScanNextBlock(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), create_toy_buffer(), entryLoadMoreItems(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishOldSplit(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_lock(), heap_inplace_unlock(), heap_inplace_update_and_unlock(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_prepare_pagescan(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgettup(), initBloomState(), invalidate_rel_block(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), LockBufferForCleanup(), log_newpage_range(), modify_rel_block(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), ScanSourceDatabasePgClass(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), update_most_recent_deletion_info(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_clear(), 
visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferForRedoExtended(), XLogRecordPageWithFreeSpace(), and ZeroAndLockBuffer().
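
A minimal read-access sketch, assuming rel and blkno identify an existing block: pin, lock shared, inspect the page, then unlock and unpin:

 Buffer      buf = ReadBuffer(rel, blkno);
 Page        page;

 LockBuffer(buf, BUFFER_LOCK_SHARE);
 page = BufferGetPage(buf);
 /* ... examine tuples on the page ... */
 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 ReleaseBuffer(buf);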

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 5652 of file bufmgr.c.

5653{
5654 BufferDesc *bufHdr;
5655 TimestampTz waitStart = 0;
5656 bool waiting = false;
5657 bool logged_recovery_conflict = false;
5658
5659 Assert(BufferIsPinned(buffer));
5660 Assert(PinCountWaitBuf == NULL);
5661
5662 CheckBufferIsPinnedOnce(buffer);
5663
5664 /*
5665 * We do not yet need to be worried about in-progress AIOs holding a pin,
5666 * as we, so far, only support doing reads via AIO and this function can
5667 * only be called once the buffer is valid (i.e. no read can be in
5668 * flight).
5669 */
5670
5671 /* Nobody else to wait for */
5672 if (BufferIsLocal(buffer))
5673 return;
5674
5675 bufHdr = GetBufferDescriptor(buffer - 1);
5676
5677 for (;;)
5678 {
5679 uint32 buf_state;
5680
5681 /* Try to acquire lock */
5682 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5683 buf_state = LockBufHdr(bufHdr);
5684
5685 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5686 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5687 {
5688 /* Successfully acquired exclusive lock with pincount 1 */
5689 UnlockBufHdr(bufHdr, buf_state);
5690
5691 /*
5692 * Emit the log message if recovery conflict on buffer pin was
5693 * resolved but the startup process waited longer than
5694 * deadlock_timeout for it.
5695 */
5696 if (logged_recovery_conflict)
5697 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5698 waitStart, GetCurrentTimestamp(),
5699 NULL, false);
5700
5701 if (waiting)
5702 {
5703 /* reset ps display to remove the suffix if we added one */
5704 set_ps_display_remove_suffix();
5705 waiting = false;
5706 }
5707 return;
5708 }
5709 /* Failed, so mark myself as waiting for pincount 1 */
5710 if (buf_state & BM_PIN_COUNT_WAITER)
5711 {
5712 UnlockBufHdr(bufHdr, buf_state);
5713 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5714 elog(ERROR, "multiple backends attempting to wait for pincount 1");
5715 }
5716 bufHdr->wait_backend_pgprocno = MyProcNumber;
5717 PinCountWaitBuf = bufHdr;
5718 buf_state |= BM_PIN_COUNT_WAITER;
5719 UnlockBufHdr(bufHdr, buf_state);
5720 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5721
5722 /* Wait to be signaled by UnpinBuffer() */
5723 if (InHotStandby)
5724 {
5725 if (!waiting)
5726 {
5727 /* adjust the process title to indicate that it's waiting */
5728 set_ps_display_suffix("waiting");
5729 waiting = true;
5730 }
5731
5732 /*
5733 * Emit the log message if the startup process is waiting longer
5734 * than deadlock_timeout for recovery conflict on buffer pin.
5735 *
5736 * Skip this if first time through because the startup process has
5737 * not started waiting yet in this case. So, the wait start
5738 * timestamp is set after this logic.
5739 */
5740 if (waitStart != 0 && !logged_recovery_conflict)
5741 {
5742 TimestampTz now = GetCurrentTimestamp();
5743
5744 if (TimestampDifferenceExceeds(waitStart, now,
5745 DeadlockTimeout))
5746 {
5747 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5748 waitStart, now, NULL, true);
5749 logged_recovery_conflict = true;
5750 }
5751 }
5752
5753 /*
5754 * Set the wait start timestamp if logging is enabled and first
5755 * time through.
5756 */
5757 if (log_recovery_conflict_waits && waitStart == 0)
5758 waitStart = GetCurrentTimestamp();
5759
5760 /* Publish the bufid that Startup process waits on */
5761 SetStartupBufferPinWaitBufId(buffer - 1);
5762 /* Set alarm and then wait to be signaled by UnpinBuffer() */
5763 ResolveRecoveryConflictWithBufferPin();
5764 /* Reset the published bufid */
5765 SetStartupBufferPinWaitBufId(-1);
5766 }
5767 else
5768 ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5769
5770 /*
5771 * Remove flag marking us as waiter. Normally this will not be set
5772 * anymore, but ProcWaitForSignal() can return for other signals as
5773 * well. We take care to only reset the flag if we're the waiter, as
5774 * theoretically another backend could have started waiting. That's
5775 * impossible with the current usages due to table level locking, but
5776 * better be safe.
5777 */
5778 buf_state = LockBufHdr(bufHdr);
5779 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5780 bufHdr->wait_backend_pgprocno == MyProcNumber)
5781 buf_state &= ~BM_PIN_COUNT_WAITER;
5782 UnlockBufHdr(bufHdr, buf_state);
5783
5784 PinCountWaitBuf = NULL;
5785 /* Loop back and try again */
5786 }
5787}
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:75
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:183
int64 TimestampTz
Definition: timestamp.h:39
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:439
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:387
int DeadlockTimeout
Definition: proc.c:57
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:754
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1974
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
int wait_backend_pgprocno
static volatile sig_atomic_t waiting
Definition: waiteventset.c:171
#define InHotStandby
Definition: xlogutils.h:60

References Assert(), BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog, ERROR, GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProcNumber, now(), PinCountWaitBuf, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), BufferDesc::wait_backend_pgprocno, and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), XLogReadBufferForRedoExtended(), and ZeroAndLockBuffer().

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc desc)

Definition at line 6224 of file bufmgr.c.

6225{
6226 SpinDelayStatus delayStatus;
6227 uint32 old_buf_state;
6228
6229 Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
6230
6231 init_local_spin_delay(&delayStatus);
6232
6233 while (true)
6234 {
6235 /* set BM_LOCKED flag */
6236 old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6237 /* if it wasn't set before we're OK */
6238 if (!(old_buf_state & BM_LOCKED))
6239 break;
6240 perform_spin_delay(&delayStatus);
6241 }
6242 finish_spin_delay(&delayStatus);
6243 return old_buf_state | BM_LOCKED;
6244}
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:408
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:733

References Assert(), BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), buffer_stage_common(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), create_toy_buffer(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), GetVictimBuffer(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_numa_pages(), pg_buffercache_pages(), ReadRecentBuffer(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), WaitIO(), and WakePinCountWaiter().
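
A sketch of the header-lock protocol: the returned value is the state word with BM_LOCKED set, and the caller hands a (possibly modified) state word back to UnlockBufHdr(), which stores it with BM_LOCKED cleared. bufHdr is assumed to be a shared-buffer descriptor:

 uint32      buf_state = LockBufHdr(bufHdr);

 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
 {
     /* only one pin exists; flag bits can be inspected/changed here */
 }
 UnlockBufHdr(bufHdr, buf_state);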

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 2921 of file bufmgr.c.

2922{
2923 BufferDesc *bufHdr;
2924 uint32 buf_state;
2925 uint32 old_buf_state;
2926
2927 if (!BufferIsValid(buffer))
2928 elog(ERROR, "bad buffer ID: %d", buffer);
2929
2930 if (BufferIsLocal(buffer))
2931 {
2932 MarkLocalBufferDirty(buffer);
2933 return;
2934 }
2935
2936 bufHdr = GetBufferDescriptor(buffer - 1);
2937
2938 Assert(BufferIsPinned(buffer));
2939 Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2940 LW_EXCLUSIVE));
2941
2942 old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2943 for (;;)
2944 {
2945 if (old_buf_state & BM_LOCKED)
2946 old_buf_state = WaitBufHdrUnlocked(bufHdr);
2947
2948 buf_state = old_buf_state;
2949
2950 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2951 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2952
2953 if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2954 buf_state))
2955 break;
2956 }
2957
2958 /*
2959 * If the buffer was not dirty already, do vacuum accounting.
2960 */
2961 if (!(old_buf_state & BM_DIRTY))
2962 {
2963 pgBufferUsage.shared_blks_dirtied++;
2964 if (VacuumCostActive)
2965 VacuumCostBalance += VacuumCostPageDirty;
2966 }
2967}
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:347
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:6254
int VacuumCostPageDirty
Definition: globals.c:153
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:489
int64 shared_blks_dirtied
Definition: instrument.h:28

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), dataExecPlaceToPageInternal(), dataExecPlaceToPageLeaf(), do_setval(), doPickSplit(), entryExecPlaceToPage(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update_and_unlock(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune_and_freeze(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_new_or_empty(), lazy_scan_prune(), lazy_vacuum_heap_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().
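
A sketch of the canonical page-modification protocol this function participates in (see access/transam/README; WAL record assembly is elided here): the buffer is locked exclusively, the change and MarkBufferDirty() happen inside a critical section, and the page LSN is set from the WAL record before the lock is released:

 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 START_CRIT_SECTION();
 /* ... modify the page contents ... */
 MarkBufferDirty(buf);
 /* recptr = XLogInsert(rmid, info);
  * PageSetLSN(BufferGetPage(buf), recptr); */
 END_CRIT_SECTION();
 LockBuffer(buf, BUFFER_LOCK_UNLOCK);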

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 5402 of file bufmgr.c.

5403{
5404 BufferDesc *bufHdr;
5405 Page page = BufferGetPage(buffer);
5406
5407 if (!BufferIsValid(buffer))
5408 elog(ERROR, "bad buffer ID: %d", buffer);
5409
5410 if (BufferIsLocal(buffer))
5411 {
5412 MarkLocalBufferDirty(buffer);
5413 return;
5414 }
5415
5416 bufHdr = GetBufferDescriptor(buffer - 1);
5417
5418 Assert(GetPrivateRefCount(buffer) > 0);
5419 /* here, either share or exclusive lock is OK */
5420 Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
5421
5422 /*
5423 * This routine might get called many times on the same page, if we are
5424 * making the first scan after commit of an xact that added/deleted many
5425 * tuples. So, be as quick as we can if the buffer is already dirty. We
5426 * do this by not acquiring spinlock if it looks like the status bits are
5427 * already set. Since we make this test unlocked, there's a chance we
5428 * might fail to notice that the flags have just been cleared, and failed
5429 * to reset them, due to memory-ordering issues. But since this function
5430 * is only intended to be used in cases where failing to write out the
5431 * data would be harmless anyway, it doesn't really matter.
5432 */
5433 if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5434 (BM_DIRTY | BM_JUST_DIRTIED))
5435 {
5436 XLogRecPtr lsn = InvalidXLogRecPtr;
5437 bool dirtied = false;
5438 bool delayChkptFlags = false;
5439 uint32 buf_state;
5440
5441 /*
5442 * If we need to protect hint bit updates from torn writes, WAL-log a
5443 * full page image of the page. This full page image is only necessary
5444 * if the hint bit update is the first change to the page since the
5445 * last checkpoint.
5446 *
5447 * We don't check full_page_writes here because that logic is included
5448 * when we call XLogInsert() since the value changes dynamically.
5449 */
5450 if (XLogHintBitIsNeeded() &&
5451 (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
5452 {
5453 /*
5454 * If we must not write WAL, due to a relfilelocator-specific
5455 * condition or being in recovery, don't dirty the page. We can
5456 * set the hint, just not dirty the page as a result so the hint
5457 * is lost when we evict the page or shutdown.
5458 *
5459 * See src/backend/storage/page/README for longer discussion.
5460 */
5461 if (RecoveryInProgress() ||
5462 RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
5463 return;
5464
5465 /*
5466 * If the block is already dirty because we either made a change
5467 * or set a hint already, then we don't need to write a full page
5468 * image. Note that aggressive cleaning of blocks dirtied by hint
5469 * bit setting would increase the call rate. Bulk setting of hint
5470 * bits would reduce the call rate...
5471 *
5472 * We must issue the WAL record before we mark the buffer dirty.
5473 * Otherwise we might write the page before we write the WAL. That
5474 * causes a race condition, since a checkpoint might occur between
5475 * writing the WAL record and marking the buffer dirty. We solve
5476 * that with a kluge, but one that is already in use during
5477 * transaction commit to prevent race conditions. Basically, we
5478 * simply prevent the checkpoint WAL record from being written
5479 * until we have marked the buffer dirty. We don't start the
5480 * checkpoint flush until we have marked dirty, so our checkpoint
5481 * must flush the change to disk successfully or the checkpoint
5482 * never gets written, so crash recovery will fix it.
5483 *
5484 * It's possible we may enter here without an xid, so it is
5485 * essential that CreateCheckPoint waits for virtual transactions
5486 * rather than full transactionids.
5487 */
5488 Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
5489 MyProc->delayChkptFlags |= DELAY_CHKPT_START;
5490 delayChkptFlags = true;
5491 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5492 }
5493
5494 buf_state = LockBufHdr(bufHdr);
5495
5496 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5497
5498 if (!(buf_state & BM_DIRTY))
5499 {
5500 dirtied = true; /* Means "will be dirtied by this action" */
5501
5502 /*
5503 * Set the page LSN if we wrote a backup block. We aren't supposed
5504 * to set this when only holding a share lock but as long as we
5505 * serialise it somehow we're OK. We choose to set LSN while
5506 * holding the buffer header lock, which causes any reader of an
5507 * LSN who holds only a share lock to also obtain a buffer header
5508 * lock before using PageGetLSN(), which is enforced in
5509 * BufferGetLSNAtomic().
5510 *
5511 * If checksums are enabled, you might think we should reset the
5512 * checksum here. That will happen when the page is written
5513 * sometime later in this checkpoint cycle.
5514 */
5515 if (!XLogRecPtrIsInvalid(lsn))
5516 PageSetLSN(page, lsn);
5517 }
5518
5519 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5520 UnlockBufHdr(bufHdr, buf_state);
5521
5522 if (delayChkptFlags)
5523 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5524
5525 if (dirtied)
5526 {
5527 pgBufferUsage.shared_blks_dirtied++;
5528 if (VacuumCostActive)
5529 VacuumCostBalance += VacuumCostPageDirty;
5530 }
5531 }
5532}
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
#define DELAY_CHKPT_START
Definition: proc.h:135
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:573
int delayChkptFlags
Definition: proc.h:257
bool RecoveryInProgress(void)
Definition: xlog.c:6383
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1077

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferGetPage(), BufferIsLocal, BufferIsValid(), BufTagGetRelFileLocator(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog, ERROR, GetBufferDescriptor(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN(), pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune_and_freeze(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
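
A sketch of a hint-style update (cf. SetHintBits()): the change is safe to lose, so no critical section or explicit WAL record is needed, only a pin and at least a share lock on the content. buffer and tuple (a HeapTupleHeader on the pinned page) are assumed:

 tuple->t_infomask |= HEAP_XMIN_COMMITTED;   /* recoverable hint bit */
 MarkBufferDirtyHint(buffer, true);          /* true = standard page layout */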

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 325 of file bufmgr.c.

 326{
 327 PrivateRefCountEntry *res;
 328
 329 /* only allowed to be called when a reservation has been made */
 330 Assert(ReservedRefCountEntry != NULL);
 331
 332 /* use up the reserved entry */
 333 res = ReservedRefCountEntry;
 334 ReservedRefCountEntry = NULL;
 335
336 /* and fill it */
337 res->buffer = buffer;
338 res->refcount = 0;
339
340 return res;
341}

References Assert(), PrivateRefCountEntry::buffer, PrivateRefCountEntry::refcount, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 3041 of file bufmgr.c.

3042{
3043 Buffer b = BufferDescriptorGetBuffer(buf);
3044 bool result;
3045 PrivateRefCountEntry *ref;
3046
3047 Assert(!BufferIsLocal(b));
3048 Assert(ReservedRefCountEntry != NULL);
3049
3050 ref = GetPrivateRefCountEntry(b, true);
3051
3052 if (ref == NULL)
3053 {
3054 uint32 buf_state;
3055 uint32 old_buf_state;
3056
3057 ref = NewPrivateRefCountEntry(b);
3058
3059 old_buf_state = pg_atomic_read_u32(&buf->state);
3060 for (;;)
3061 {
3062 if (old_buf_state & BM_LOCKED)
3063 old_buf_state = WaitBufHdrUnlocked(buf);
3064
3065 buf_state = old_buf_state;
3066
3067 /* increase refcount */
3068 buf_state += BUF_REFCOUNT_ONE;
3069
3070 if (strategy == NULL)
3071 {
3072 /* Default case: increase usagecount unless already max. */
3073 if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
3074 buf_state += BUF_USAGECOUNT_ONE;
3075 }
3076 else
3077 {
3078 /*
3079 * Ring buffers shouldn't evict others from pool. Thus we
3080 * don't make usagecount more than 1.
3081 */
3082 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3083 buf_state += BUF_USAGECOUNT_ONE;
3084 }
3085
3086 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3087 buf_state))
3088 {
3089 result = (buf_state & BM_VALID) != 0;
3090
3091 /*
3092 * Assume that we acquired a buffer pin for the purposes of
3093 * Valgrind buffer client checks (even in !result case) to
3094 * keep things simple. Buffers that are unsafe to access are
3095 * not generally guaranteed to be marked undefined or
3096 * non-accessible in any case.
3097 */
3098 VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
3099 break;
3100 }
3101 }
3102 }
3103 else
3104 {
3105 /*
3106 * If we previously pinned the buffer, it is likely to be valid, but
3107 * it may not be if StartReadBuffers() was called and
3108 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3109 * the flags without locking. This is racy, but it's OK to return
3110 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3111 * it'll see that it's now valid.
3112 *
3113 * Note: We deliberately avoid a Valgrind client request here.
3114 * Individual access methods can optionally superimpose buffer page
3115 * client requests on top of our client requests to enforce that
3116 * buffers are only accessed while locked (and pinned). It's possible
3117 * that the buffer page is legitimately non-accessible here. We
3118 * cannot meddle with that.
3119 */
3120 result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3121 }
3122
3123 ref->refcount++;
3124 Assert(ref->refcount > 0);
3125 ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
3126 return result;
3127}
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:86
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:60
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:325

References Assert(), b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservedRefCountEntry, ResourceOwnerRememberBuffer(), VALGRIND_MAKE_MEM_DEFINED, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 3152 of file bufmgr.c.

3153{
3154 Buffer b;
3155 PrivateRefCountEntry *ref;
3156 uint32 buf_state;
3157
3158 /*
3159 * As explained, we don't expect any preexisting pins. That allows us to
3160 * manipulate the PrivateRefCount after releasing the spinlock.
3161 */
3162 Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
3163
3164 /*
3165 * Buffer can't have a preexisting pin, so mark its page as defined to
3166 * Valgrind (this is similar to the PinBuffer() case where the backend
3167 * doesn't already have a buffer pin)
3168 */
3169 VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
3170
3171 /*
3172 * Since we hold the buffer spinlock, we can update the buffer state and
3173 * release the lock in one operation.
3174 */
3175 buf_state = pg_atomic_read_u32(&buf->state);
3176 Assert(buf_state & BM_LOCKED);
3177 buf_state += BUF_REFCOUNT_ONE;
3178 UnlockBufHdr(buf, buf_state);
3179
3180 b = BufferDescriptorGetBuffer(buf);
3181
3182 ref = NewPrivateRefCountEntry(b);
3183 ref->refcount++;
3184
3185 ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
3186}

References Assert(), b, BM_LOCKED, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), UnlockBufHdr(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), ReadRecentBuffer(), and SyncOneBuffer().

◆ PinBufferForBlock()

static pg_attribute_always_inline Buffer PinBufferForBlock ( Relation  rel,
SMgrRelation  smgr,
char  persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr 
)
static

Definition at line 1110 of file bufmgr.c.

1117{
1118 BufferDesc *bufHdr;
1119 IOContext io_context;
1120 IOObject io_object;
1121
1122 Assert(blockNum != P_NEW);
1123
1124 /* Persistence should be set before */
1125 Assert((persistence == RELPERSISTENCE_TEMP ||
1126 persistence == RELPERSISTENCE_PERMANENT ||
1127 persistence == RELPERSISTENCE_UNLOGGED));
1128
1129 if (persistence == RELPERSISTENCE_TEMP)
1130 {
1131 io_context = IOCONTEXT_NORMAL;
1132 io_object = IOOBJECT_TEMP_RELATION;
1133 }
1134 else
1135 {
1136 io_context = IOContextForStrategy(strategy);
1137 io_object = IOOBJECT_RELATION;
1138 }
1139
1140 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1141 smgr->smgr_rlocator.locator.spcOid,
1142 smgr->smgr_rlocator.locator.dbOid,
1143 smgr->smgr_rlocator.locator.relNumber,
1144 smgr->smgr_rlocator.backend);
1145
1146 if (persistence == RELPERSISTENCE_TEMP)
1147 {
1148 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1149 if (*foundPtr)
1150 pgBufferUsage.local_blks_hit++;
1151 }
1152 else
1153 {
1154 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1155 strategy, foundPtr, io_context);
1156 if (*foundPtr)
1157 pgBufferUsage.shared_blks_hit++;
1158 }
1159 if (rel)
1160 {
1161 /*
1162 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1163 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1164 * zeroed instead), the per-relation stats always count them.
1165 */
1166 pgstat_count_buffer_read(rel);
1167 if (*foundPtr)
1168 pgstat_count_buffer_hit(rel);
1169 }
1170 if (*foundPtr)
1171 {
1172 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1173 if (VacuumCostActive)
1174 VacuumCostBalance += VacuumCostPageHit;
1175
1176 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1177 smgr->smgr_rlocator.locator.spcOid,
1178 smgr->smgr_rlocator.locator.dbOid,
1179 smgr->smgr_rlocator.locator.relNumber,
1180 smgr->smgr_rlocator.backend,
1181 true);
1182 }
1183
1184 return BufferDescriptorGetBuffer(bufHdr);
1185}
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:2000
#define P_NEW
Definition: bufmgr.h:191
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:118
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:705

References Assert(), RelFileLocatorBackend::backend, BufferAlloc(), BufferDescriptorGetBuffer(), RelFileLocator::dbOid, IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_HIT, BufferUsage::local_blks_hit, LocalBufferAlloc(), RelFileLocatorBackend::locator, P_NEW, pgBufferUsage, pgstat_count_buffer_hit, pgstat_count_buffer_read, pgstat_count_io_op(), RelFileLocator::relNumber, BufferUsage::shared_blks_hit, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, VacuumCostActive, VacuumCostBalance, and VacuumCostPageHit.

Referenced by ReadBuffer_common(), and StartReadBuffersImpl().

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 651 of file bufmgr.c.

652{
653 Assert(RelationIsValid(reln));
654 Assert(BlockNumberIsValid(blockNum));
655
656 if (RelationUsesLocalBuffers(reln))
657 {
658 /* see comments in ReadBufferExtended */
659 if (RELATION_IS_OTHER_TEMP(reln))
660 ereport(ERROR,
661 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
662 errmsg("cannot access temporary tables of other sessions")));
663
664 /* pass it off to localbuf.c */
665 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
666 }
667 else
668 {
669 /* pass it to the shared buffer version */
670 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
671 }
672}
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:561
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:71
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:667
#define RelationIsValid(relation)
Definition: rel.h:489

References Assert(), BlockNumberIsValid(), ereport, errcode(), errmsg(), ERROR, PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by count_nondeletable_pages(), invalidate_rel_block(), and pg_prewarm().
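
A sketch of a simple prefetch pass (cf. pg_prewarm's prefetch mode), assuming rel is open and nblocks is the relation's size in blocks; the result can be discarded when the caller only wants the read-ahead side effect:

 for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
     (void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno);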

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 561 of file bufmgr.c.

564{
565 PrefetchBufferResult result = {InvalidBuffer, false};
566 BufferTag newTag; /* identity of requested block */
567 uint32 newHash; /* hash value for newTag */
568 LWLock *newPartitionLock; /* buffer partition lock for it */
569 int buf_id;
570
571 Assert(BlockNumberIsValid(blockNum));
572
573 /* create a tag so we can lookup the buffer */
574 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
575 forkNum, blockNum);
576
577 /* determine its hash code and partition lock ID */
578 newHash = BufTableHashCode(&newTag);
579 newPartitionLock = BufMappingPartitionLock(newHash);
580
581 /* see if the block is in the buffer pool already */
582 LWLockAcquire(newPartitionLock, LW_SHARED);
583 buf_id = BufTableLookup(&newTag, newHash);
584 LWLockRelease(newPartitionLock);
585
586 /* If not in buffers, initiate prefetch */
587 if (buf_id < 0)
588 {
589#ifdef USE_PREFETCH
590 /*
591 * Try to initiate an asynchronous read. This returns false in
592 * recovery if the relation file doesn't exist.
593 */
594 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
595 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
596 {
597 result.initiated_io = true;
598 }
599#endif /* USE_PREFETCH */
600 }
601 else
602 {
603 /*
604 * Report the buffer it was in at that time. The caller may be able
605 * to avoid a buffer table lookup, but it's not pinned and it must be
606 * rechecked!
607 */
608 result.recent_buffer = buf_id + 1;
609 }
610
611 /*
612 * If the block *is* in buffers, we do nothing. This is not really ideal:
613 * the block might be just about to be evicted, which would be stupid
614 * since we know we are going to need it soon. But the only easy answer
615 * is to bump the usage_count, which does not seem like a great solution:
616 * when the caller does ultimately touch the block, usage_count would get
617 * bumped again, resulting in too much favoritism for blocks that are
618 * involved in a prefetch sequence. A real fix would involve some
619 * additional per-buffer state, and it's not clear that there's enough of
620 * a problem to justify that.
621 */
622
623 return result;
624}
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
Buffer recent_buffer
Definition: bufmgr.h:61

References Assert(), BlockNumberIsValid(), BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), InitBufferTag(), PrefetchBufferResult::initiated_io, InvalidBuffer, IO_DIRECT_DATA, io_direct_flags, RelFileLocatorBackend::locator, LW_SHARED, LWLockAcquire(), LWLockRelease(), PrefetchBufferResult::recent_buffer, SMgrRelationData::smgr_rlocator, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().
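
A sketch of consuming the result (cf. XLogPrefetcherNextBlock()): recent_buffer is an unpinned hint that must be re-verified, for example with ReadRecentBuffer(), and initiated_io merely means an asynchronous read was started. smgr, forknum, and blkno are assumed:

 PrefetchBufferResult r = PrefetchSharedBuffer(smgr, forknum, blkno);

 if (BufferIsValid(r.recent_buffer))
 {
     /* hint only: recheck via ReadRecentBuffer(rlocator, forknum,
      * blkno, r.recent_buffer) before trusting it */
 }
 else if (r.initiated_io)
 {
     /* read-ahead started; a later read should find the block cached */
 }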

◆ ProcessReadBuffersResult()

static void ProcessReadBuffersResult ( ReadBuffersOperation operation)
static

Definition at line 1593 of file bufmgr.c.

1594{
1595 PgAioReturn *aio_ret = &operation->io_return;
1596 PgAioResultStatus rs = aio_ret->result.status;
1597 int newly_read_blocks = 0;
1598
1599 Assert(pgaio_wref_valid(&operation->io_wref));
1600 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1601
1602 /*
1603 * SMGR reports the number of blocks successfully read as the result of
1604 * the IO operation. Thus we can simply add that to ->nblocks_done.
1605 */
1606
1607 if (likely(rs != PGAIO_RS_ERROR))
1608 newly_read_blocks = aio_ret->result.result;
1609
1610 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1611 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1612 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1613 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1614 {
1615 /*
1616 * We'll retry, so we just emit a debug message to the server log (or
1617 * not even that in prod scenarios).
1618 */
1619 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1620 elog(DEBUG3, "partial read, will retry");
1621 }
1622
1623 Assert(newly_read_blocks > 0);
1624 Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1625
1626 operation->nblocks_done += newly_read_blocks;
1627
1628 Assert(operation->nblocks_done <= operation->nblocks);
1629}
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:968
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
#define likely(x)
Definition: c.h:402
#define DEBUG3
Definition: elog.h:28
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133

References Assert(), DEBUG1, DEBUG3, elog, ERROR, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, likely, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PGAIO_RS_UNKNOWN, PGAIO_RS_WARNING, pgaio_wref_valid(), PgAioResult::result, PgAioReturn::result, PgAioResult::status, PgAioReturn::target_data, and WARNING.

Referenced by WaitReadBuffers().

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 758 of file bufmgr.c.

759{
760 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
761}
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:805
@ RBM_NORMAL
Definition: bufmgr.h:46

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_allocbuf(), _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().

◆ ReadBuffer_common()

static pg_attribute_always_inline Buffer ReadBuffer_common ( Relation  rel,
SMgrRelation  smgr,
char  smgr_persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
static

Definition at line 1193 of file bufmgr.c.

1197{
1198 ReadBuffersOperation operation;
1199 Buffer buffer;
1200 int flags;
1201 char persistence;
1202
1203 /*
1204 * Backward compatibility path, most code should use ExtendBufferedRel()
1205 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1206 * scales a lot better.
1207 */
1208 if (unlikely(blockNum == P_NEW))
1209 {
1210 uint32 flags = EB_SKIP_EXTENSION_LOCK;
1211
1212 /*
1213 * Since no-one else can be looking at the page contents yet, there is
1214 * no difference between an exclusive lock and a cleanup-strength
1215 * lock.
1216 */
1217 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1218 flags |= EB_LOCK_FIRST;
1219
1220 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1221 }
1222
1223 if (rel)
1224 persistence = rel->rd_rel->relpersistence;
1225 else
1226 persistence = smgr_persistence;
1227
1228 if (mode == RBM_ZERO_AND_CLEANUP_LOCK ||
1229 mode == RBM_ZERO_AND_LOCK)
1230 {
1231 bool found;
1232
1233 buffer = PinBufferForBlock(rel, smgr, persistence,
1234 forkNum, blockNum, strategy, &found);
1235 ZeroAndLockBuffer(buffer, mode, found);
1236 return buffer;
1237 }
1238
1239 /*
1240 * Signal that we are going to immediately wait. If we're immediately
1241 * waiting, there is no benefit in actually executing the IO
1242 * asynchronously, it would just add dispatch overhead.
1243 */
1244 flags = READ_BUFFERS_SYNCHRONOUSLY;
1245 if (mode == RBM_ZERO_ON_ERROR)
1246 flags |= READ_BUFFERS_ZERO_ON_ERROR;
1247 operation.smgr = smgr;
1248 operation.rel = rel;
1249 operation.persistence = persistence;
1250 operation.forknum = forkNum;
1251 operation.strategy = strategy;
1252 if (StartReadBuffer(&operation,
1253 &buffer,
1254 blockNum,
1255 flags))
1256 WaitReadBuffers(&operation);
1257
1258 return buffer;
1259}
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:858
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1031
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1110
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1632
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1508
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:51
#define BMR_REL(p_rel)
Definition: bufmgr.h:111

References BMR_REL, PrivateRefCountEntry::buffer, EB_LOCK_FIRST, EB_SKIP_EXTENSION_LOCK, ExtendBufferedRel(), ReadBuffersOperation::forknum, mode, P_NEW, ReadBuffersOperation::persistence, PinBufferForBlock(), RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelationData::rd_rel, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, ReadBuffersOperation::rel, ReadBuffersOperation::smgr, StartReadBuffer(), ReadBuffersOperation::strategy, unlikely, WaitReadBuffers(), and ZeroAndLockBuffer().

Referenced by ExtendBufferedRelTo(), ReadBufferExtended(), and ReadBufferWithoutRelcache().

◆ ReadBufferExtended()

Buffer ReadBufferExtended ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
inline

Definition at line 805 of file bufmgr.c.

807{
808 Buffer buf;
809
810 /*
811 * Reject attempts to read non-local temporary relations; we would be
812 * likely to get wrong data since we have no visibility into the owning
813 * session's local buffers.
814 */
815 if (RELATION_IS_OTHER_TEMP(reln))
816 ereport(ERROR,
817 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
818 errmsg("cannot access temporary tables of other sessions")));
819
820 /*
821 * Read the buffer, and update pgstat counters to reflect a cache hit or
822 * miss.
823 */
824 buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
825 forkNum, blockNum, mode, strategy);
826
827 return buf;
828}

References buf, ereport, errcode(), errmsg(), ERROR, mode, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationGetSmgr().

Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), blbulkdelete(), blgetbitmap(), BloomInitMetapage(), blvacuumcleanup(), brin_vacuum_scan(), bt_recheck_sibling_links(), btvacuumpage(), count_nondeletable_pages(), create_toy_buffer(), fsm_readbuf(), get_raw_page_internal(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbulkdelete(), ginDeletePage(), ginScanToDelete(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbulkdelete(), heapam_scan_sample_next_block(), log_newpage_range(), modify_rel_block(), palloc_btree_page(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstathashindex(), pgstatindex_impl(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), statapprox_heap(), and vm_readbuf().
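
Bulk scans normally pass a BufferAccessStrategy so they recycle a small ring of buffers instead of flooding shared_buffers. A sketch of that pattern, assuming a backend context (the helper name is hypothetical):

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical full-fork scan using a BAS_BULKREAD ring, mirroring the
 * vacuum/statistics callers listed above. */
static void
scan_main_fork(Relation rel)
{
    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

    for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                        RBM_NORMAL, strategy);

        /* ... inspect BufferGetPage(buf) under a content lock ... */
        ReleaseBuffer(buf);
    }

    FreeAccessStrategy(strategy);
}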

◆ ReadBuffersCanStartIO()

static bool ReadBuffersCanStartIO ( Buffer  buffer,
bool  nowait 
)
inlinestatic

Definition at line 1564 of file bufmgr.c.

1565{
1566 /*
1567 * If this backend currently has staged IO, we need to submit the pending
1568 * IO before waiting for the right to issue IO, to avoid the potential for
1569 * deadlocks (and, more commonly, unnecessary delays for other backends).
1570 */
1571 if (!nowait && pgaio_have_staged())
1572 {
1573 if (ReadBuffersCanStartIOOnce(buffer, true))
1574 return true;
1575
1576 /*
1577 * Unfortunately StartBufferIO() returning false doesn't allow to
1578 * distinguish between the buffer already being valid and IO already
1579 * being in progress. Since IO already being in progress is quite
1580 * rare, this approach seems fine.
1581 */
1582 pgaio_submit_staged();
1583 }
1584
1585 return ReadBuffersCanStartIOOnce(buffer, nowait);
1586}
bool pgaio_have_staged(void)
Definition: aio.c:1104
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition: bufmgr.c:1551

References PrivateRefCountEntry::buffer, pgaio_have_staged(), pgaio_submit_staged(), and ReadBuffersCanStartIOOnce().

Referenced by AsyncReadBuffers().

◆ ReadBuffersCanStartIOOnce()

static bool ReadBuffersCanStartIOOnce ( Buffer  buffer,
bool  nowait 
)
inlinestatic

Definition at line 1551 of file bufmgr.c.

1552{
1553 if (BufferIsLocal(buffer))
1554 return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
1555 true, nowait);
1556 else
1557 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1558}
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition: localbuf.c:521

References PrivateRefCountEntry::buffer, BufferIsLocal, GetBufferDescriptor(), GetLocalBufferDescriptor(), StartBufferIO(), and StartLocalBufferIO().

Referenced by ReadBuffersCanStartIO().

◆ ReadBufferWithoutRelcache()

Buffer ReadBufferWithoutRelcache ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool  permanent 
)

Definition at line 842 of file bufmgr.c.

845{
846 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
847
848 return ReadBuffer_common(NULL, smgr,
849 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
850 forkNum, blockNum,
851 mode, strategy);
852}

References INVALID_PROC_NUMBER, mode, ReadBuffer_common(), and smgropen().

Referenced by RelationCopyStorageUsingBuffer(), ScanSourceDatabasePgClass(), and XLogReadBufferExtended().
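
This variant is for callers that have no Relation at hand, such as WAL redo or cross-database copies; the block is addressed by RelFileLocator, and the caller must supply the persistence because the relcache cannot be consulted. A minimal sketch (function name and inputs are assumptions):

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical redo-side helper: read a block with no relcache entry.
 * "permanent" must come from the caller, e.g. recorded in WAL. */
static Buffer
read_block_by_locator(RelFileLocator rlocator, BlockNumber blkno,
                      bool permanent)
{
    return ReadBufferWithoutRelcache(rlocator, MAIN_FORKNUM, blkno,
                                     RBM_NORMAL, NULL /* strategy */,
                                     permanent);
}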

◆ ReadRecentBuffer()

bool ReadRecentBuffer ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
Buffer  recent_buffer 
)

Definition at line 682 of file bufmgr.c.

684{
685 BufferDesc *bufHdr;
686 BufferTag tag;
687 uint32 buf_state;
688 bool have_private_ref;
689
690 Assert(BufferIsValid(recent_buffer));
691
692 ResourceOwnerEnlarge(CurrentResourceOwner);
693 ReservePrivateRefCountEntry();
694 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
695
696 if (BufferIsLocal(recent_buffer))
697 {
698 int b = -recent_buffer - 1;
699
700 bufHdr = GetLocalBufferDescriptor(b);
701 buf_state = pg_atomic_read_u32(&bufHdr->state);
702
703 /* Is it still valid and holding the right tag? */
704 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
705 {
706 PinLocalBuffer(bufHdr, true);
707
708 pgBufferUsage.local_blks_hit++;
709
710 return true;
711 }
712 }
713 else
714 {
715 bufHdr = GetBufferDescriptor(recent_buffer - 1);
716 have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
717
718 /*
719 * Do we already have this buffer pinned with a private reference? If
720 * so, it must be valid and it is safe to check the tag without
721 * locking. If not, we have to lock the header first and then check.
722 */
723 if (have_private_ref)
724 buf_state = pg_atomic_read_u32(&bufHdr->state);
725 else
726 buf_state = LockBufHdr(bufHdr);
727
728 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
729 {
730 /*
731 * It's now safe to pin the buffer. We can't pin first and ask
732 * questions later, because it might confuse code paths like
733 * InvalidateBuffer() if we pinned a random non-matching buffer.
734 */
735 if (have_private_ref)
736 PinBuffer(bufHdr, NULL); /* bump pin count */
737 else
738 PinBuffer_Locked(bufHdr); /* pin for first time */
739
740 pgBufferUsage.shared_blks_hit++;
741
742 return true;
743 }
744
745 /* If we locked the header above, now unlock. */
746 if (!have_private_ref)
747 UnlockBufHdr(bufHdr, buf_state);
748 }
749
750 return false;
751}

References Assert(), b, BM_VALID, BufferIsLocal, BufferIsValid(), BufferTagsEqual(), CurrentResourceOwner, GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), InitBufferTag(), BufferUsage::local_blks_hit, LockBufHdr(), pg_atomic_read_u32(), pgBufferUsage, PinBuffer(), PinBuffer_Locked(), PinLocalBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_hit, BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by invalidate_rel_block(), and XLogReadBufferExtended().
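
ReadRecentBuffer() lets a caller that remembered where a block was last found try to re-pin that same buffer, skipping the buffer-mapping lookup; it returns false when the hint has gone stale. A sketch of the hint pattern used by XLogReadBufferExtended(), with hypothetical names:

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hypothetical hint cache: retry the buffer that held this block last
 * time, falling back to the normal lookup when the hint is stale. */
static Buffer
read_with_recent_hint(Relation rel, BlockNumber blkno, Buffer *hint)
{
    if (BufferIsValid(*hint) &&
        ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, *hint))
        return *hint;               /* re-pinned cheaply */

    *hint = ReadBuffer(rel, blkno); /* slow path; remember for next time */
    return *hint;
}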

◆ RelationCopyStorageUsingBuffer()

static void RelationCopyStorageUsingBuffer ( RelFileLocator  srclocator,
RelFileLocator  dstlocator,
ForkNumber  forkNum,
bool  permanent 
)
static

Definition at line 5098 of file bufmgr.c.

5101{
5102 Buffer srcBuf;
5103 Buffer dstBuf;
5104 Page srcPage;
5105 Page dstPage;
5106 bool use_wal;
5107 BlockNumber nblocks;
5108 BlockNumber blkno;
5109 PGIOAlignedBlock buf;
5110 BufferAccessStrategy bstrategy_src;
5111 BufferAccessStrategy bstrategy_dst;
5112 BlockRangeReadStreamPrivate p;
5113 ReadStream *src_stream;
5114 SMgrRelation src_smgr;
5115
5116 /*
5117 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5118 * can skip it when copying any fork of an unlogged relation other than
5119 * the init fork.
5120 */
5121 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5122
5123 /* Get number of blocks in the source relation. */
5124 nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5125 forkNum);
5126
5127 /* Nothing to copy; just return. */
5128 if (nblocks == 0)
5129 return;
5130
5131 /*
5132 * Bulk extend the destination relation of the same size as the source
5133 * relation before starting to copy block by block.
5134 */
5135 memset(buf.data, 0, BLCKSZ);
5136 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5137 buf.data, true);
5138
5139 /* This is a bulk operation, so use buffer access strategies. */
5140 bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5141 bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5142
5143 /* Initialize streaming read */
5144 p.current_blocknum = 0;
5145 p.last_exclusive = nblocks;
5146 src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5147
5148 /*
5149 * It is safe to use batchmode as block_range_read_stream_cb takes no
5150 * locks.
5151 */
5152 src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
5153 READ_STREAM_USE_BATCHING,
5154 bstrategy_src,
5155 src_smgr,
5156 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5157 forkNum,
5158 block_range_read_stream_cb,
5159 &p,
5160 0);
5161
5162 /* Iterate over each block of the source relation file. */
5163 for (blkno = 0; blkno < nblocks; blkno++)
5164 {
5165 CHECK_FOR_INTERRUPTS();
5166
5167 /* Read block from source relation. */
5168 srcBuf = read_stream_next_buffer(src_stream, NULL);
5169 LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
5170 srcPage = BufferGetPage(srcBuf);
5171
5172 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5173 BufferGetBlockNumber(srcBuf),
5174 RBM_ZERO_AND_LOCK, bstrategy_dst,
5175 permanent);
5176 dstPage = BufferGetPage(dstBuf);
5177
5178 START_CRIT_SECTION();
5179
5180 /* Copy page data from the source to the destination. */
5181 memcpy(dstPage, srcPage, BLCKSZ);
5182 MarkBufferDirty(dstBuf);
5183
5184 /* WAL-log the copied page. */
5185 if (use_wal)
5186 log_newpage_buffer(dstBuf, true);
5187
5188 END_CRIT_SECTION();
5189
5190 UnlockReleaseBuffer(dstBuf);
5191 UnlockReleaseBuffer(srcBuf);
5192 }
5193 Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5194 read_stream_end(src_stream);
5195
5196 FreeAccessStrategy(bstrategy_src);
5197 FreeAccessStrategy(bstrategy_dst);
5198}
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5355
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2921
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:842
@ BAS_BULKREAD
Definition: bufmgr.h:37
@ BAS_BULKWRITE
Definition: bufmgr.h:39
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:424
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:606
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:761
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:791
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1089
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
#define XLogIsNeeded()
Definition: xlog.h:109
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1249

References Assert(), BAS_BULKREAD, BAS_BULKWRITE, block_range_read_stream_cb(), buf, BUFFER_LOCK_SHARE, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BlockRangeReadStreamPrivate::current_blocknum, END_CRIT_SECTION, FreeAccessStrategy(), GetAccessStrategy(), INIT_FORKNUM, INVALID_PROC_NUMBER, InvalidBuffer, BlockRangeReadStreamPrivate::last_exclusive, LockBuffer(), log_newpage_buffer(), MarkBufferDirty(), RBM_ZERO_AND_LOCK, read_stream_begin_smgr_relation(), read_stream_end(), READ_STREAM_FULL, read_stream_next_buffer(), READ_STREAM_USE_BATCHING, ReadBufferWithoutRelcache(), smgrextend(), smgrnblocks(), smgropen(), START_CRIT_SECTION, UnlockReleaseBuffer(), and XLogIsNeeded.

Referenced by CreateAndCopyRelationData().

◆ RelationGetNumberOfBlocksInFork()

BlockNumber RelationGetNumberOfBlocksInFork ( Relation  relation,
ForkNumber  forkNum 
)

Definition at line 4398 of file bufmgr.c.

4399{
4400 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4401 {
4402 /*
4403 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4404 * tableam returns the size in bytes - but for the purpose of this
4405 * routine, we want the number of blocks. Therefore divide, rounding
4406 * up.
4407 */
4408 uint64 szbytes;
4409
4410 szbytes = table_relation_size(relation, forkNum);
4411
4412 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4413 }
4414 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4415 {
4416 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4417 }
4418 else
4419 Assert(false);
4420
4421 return 0; /* keep compiler quiet */
4422}
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1837

References Assert(), RelationData::rd_rel, RelationGetSmgr(), smgrnblocks(), and table_relation_size().

Referenced by _hash_getnewbuf(), _hash_init(), autoprewarm_database_main(), get_raw_page_internal(), and pg_prewarm().
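
Since the fork size is reported in blocks (rounding byte-based table AM sizes up), a typical use is bounds-checking a block range before reading it, as the pg_prewarm-style callers above do. A small sketch with a hypothetical helper:

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical bounds check: clamp a caller-supplied last block number to
 * the current end of the fork. */
static BlockNumber
clamp_last_block(Relation rel, ForkNumber forknum, BlockNumber last)
{
    BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forknum);

    if (nblocks == 0)
        return InvalidBlockNumber;  /* empty fork, nothing to read */
    return Min(last, nblocks - 1);
}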

◆ ReleaseAndReadBuffer()

Buffer ReleaseAndReadBuffer ( Buffer  buffer,
Relation  relation,
BlockNumber  blockNum 
)

Definition at line 2983 of file bufmgr.c.

2986{
2987 ForkNumber forkNum = MAIN_FORKNUM;
2988 BufferDesc *bufHdr;
2989
2990 if (BufferIsValid(buffer))
2991 {
2992 Assert(BufferIsPinned(buffer));
2993 if (BufferIsLocal(buffer))
2994 {
2995 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2996 if (bufHdr->tag.blockNum == blockNum &&
2997 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2998 BufTagGetForkNum(&bufHdr->tag) == forkNum)
2999 return buffer;
3000 UnpinLocalBuffer(buffer);
3001 }
3002 else
3003 {
3004 bufHdr = GetBufferDescriptor(buffer - 1);
3005 /* we have pin, so it's ok to examine tag without spinlock */
3006 if (bufHdr->tag.blockNum == blockNum &&
3007 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3008 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3009 return buffer;
3010 UnpinBuffer(bufHdr);
3011 }
3012 }
3013
3014 return ReadBuffer(relation, blockNum);
3015}
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:758

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), MAIN_FORKNUM, RelationData::rd_locator, ReadBuffer(), BufferDesc::tag, UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_relandgetbuf(), ginFindLeafPage(), and heapam_index_fetch_tuple().
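
ReleaseAndReadBuffer() trades the pin on the old buffer for a pin on the new block, and keeps the existing pin when the same block is requested again; the caller must already have dropped the content lock. A sketch modeled on the _bt_relandgetbuf() usage, with hypothetical names:

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical page-chain walk: hop from the current page to next_blkno,
 * reusing the existing pin when possible. The caller must not hold the
 * content lock on "cur" anymore. */
static Buffer
step_to_block(Relation rel, Buffer cur, BlockNumber next_blkno)
{
    Buffer next = ReleaseAndReadBuffer(cur, rel, next_blkno);

    LockBuffer(next, BUFFER_LOCK_SHARE);
    return next;
}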

◆ ReleaseBuffer()

void ReleaseBuffer ( Buffer  buffer)

Definition at line 5338 of file bufmgr.c.

5339{
5340 if (!BufferIsValid(buffer))
5341 elog(ERROR, "bad buffer ID: %d", buffer);
5342
5343 if (BufferIsLocal(buffer))
5344 UnpinLocalBuffer(buffer);
5345 else
5346 UnpinBuffer(GetBufferDescriptor(buffer - 1));
5347}

References PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_allocbuf(), _bt_pagedel(), _bt_relbuf(), _bt_search_insert(), _bt_unlink_halfdead_page(), _hash_dropbuf(), _hash_getbuf_with_condlock_cleanup(), autoprewarm_database_main(), BitmapHeapScanNextBlock(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brin_vacuum_scan(), bringetbitmap(), brinGetTupleForHeapBlock(), brininsert(), brinRevmapTerminate(), brinsummarize(), buffer_create_toy(), collect_corrupt_items(), collect_visibility_data(), entryLoadMoreItems(), ExecEndIndexOnlyScan(), ExtendBufferedRelTo(), FreeBulkInsertState(), freeGinBtreeStack(), fsm_search(), fsm_vacuum_page(), get_actual_variable_endpoint(), get_raw_page_internal(), GetRecordedFreeSpace(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), ginDeletePage(), ginFindParents(), ginFinishSplit(), ginFreeScanKeys(), ginInsertCleanup(), GinNewBuffer(), ginScanToDelete(), gistdoinsert(), gistFindCorrectParent(), gistNewBuffer(), gistvacuum_delete_empty_pages(), grow_rel(), heap_abort_speculative(), heap_delete(), heap_endscan(), heap_fetch(), heap_fetch_next_buffer(), heap_force_common(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_rescan(), heap_update(), heap_vac_scan_next_block(), heap_xlog_delete(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), heapam_index_fetch_reset(), heapam_scan_sample_next_block(), heapam_tuple_lock(), heapgettup(), heapgettup_pagemode(), invalidate_rel_block(), lazy_scan_heap(), lazy_vacuum_heap_rel(), modify_rel_block(), pg_prewarm(), pg_visibility(), pg_visibility_map(), pg_visibility_map_summary(), pgstatindex_impl(), read_rel_block_ll(), read_stream_reset(), ReadBufferBI(), RelationAddBlocks(), RelationGetBufferForTuple(), ReleaseBulkInsertStatePin(), revmap_get_buffer(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), statapprox_heap(), summarize_range(), terminate_brin_buildstate(), tts_buffer_heap_clear(), tts_buffer_heap_materialize(), tts_buffer_heap_store_tuple(), UnlockReleaseBuffer(), verify_heapam(), visibilitymap_count(), visibilitymap_get_status(), visibilitymap_pin(), and XLogReadBufferExtended().

◆ ReservePrivateRefCountEntry()

static void ReservePrivateRefCountEntry ( void  )
static

Definition at line 259 of file bufmgr.c.

260{
261 /* Already reserved (or freed), nothing to do */
262 if (ReservedRefCountEntry != NULL)
263 return;
264
265 /*
266 * First search for a free entry the array, that'll be sufficient in the
267 * majority of cases.
268 */
269 {
270 int i;
271
272 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
273 {
274 PrivateRefCountEntry *res;
275
276 res = &PrivateRefCountArray[i];
277
278 if (res->buffer == InvalidBuffer)
279 {
280 ReservedRefCountEntry = res;
281
282 }
283 }
284 }
285
286 /*
287 * No luck. All array entries are full. Move one array entry into the hash
288 * table.
289 */
290 {
291 /*
292 * Move entry from the current clock position in the array into the
293 * hashtable. Use that slot.
294 */
295 PrivateRefCountEntry *hashent;
296 bool found;
297
298 /* select victim slot */
299 ReservedRefCountEntry =
300 &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
301
302 /* Better be used, otherwise we shouldn't get here. */
303 Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
304
305 /* enter victim array entry into hashtable */
306 hashent = hash_search(PrivateRefCountHash,
307 &(ReservedRefCountEntry->buffer),
308 HASH_ENTER,
309 &found);
310 Assert(!found);
311 hashent->refcount = ReservedRefCountEntry->refcount;
312
313 /* clear the now free array slot */
314 ReservedRefCountEntry->buffer = InvalidBuffer;
315 ReservedRefCountEntry->refcount = 0;
316
317 PrivateRefCountOverflowed++;
318 }
319}
static uint32 PrivateRefCountClock
Definition: bufmgr.c:218
@ HASH_ENTER
Definition: hsearch.h:114

References Assert(), PrivateRefCountEntry::buffer, HASH_ENTER, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountClock, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by BufferAlloc(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetPrivateRefCountEntry(), GetVictimBuffer(), ReadRecentBuffer(), and SyncOneBuffer().

◆ ResOwnerPrintBufferIO()

static char * ResOwnerPrintBufferIO ( Datum  res)
static

Definition at line 6519 of file bufmgr.c.

6520{
6521 Buffer buffer = DatumGetInt32(res);
6522
6523 return psprintf("lost track of buffer IO on buffer %d", buffer);
6524}
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:212

References PrivateRefCountEntry::buffer, DatumGetInt32(), and psprintf().

◆ ResOwnerPrintBufferPin()

static char * ResOwnerPrintBufferPin ( Datum  res)
static

Definition at line 6542 of file bufmgr.c.

6543{
6544 return DebugPrintBufferRefcount(DatumGetInt32(res));
6545}

References DatumGetInt32(), and DebugPrintBufferRefcount().

◆ ResOwnerReleaseBufferIO()

static void ResOwnerReleaseBufferIO ( Datum  res)
static

Definition at line 6511 of file bufmgr.c.

6512{
6513 Buffer buffer = DatumGetInt32(res);
6514
6515 AbortBufferIO(buffer);
6516}
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:6126

References AbortBufferIO(), PrivateRefCountEntry::buffer, and DatumGetInt32().

◆ ResOwnerReleaseBufferPin()

static void ResOwnerReleaseBufferPin ( Datum  res)
static

Definition at line 6527 of file bufmgr.c.

6528{
6529 Buffer buffer = DatumGetInt32(res);
6530
6531 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6532 if (!BufferIsValid(buffer))
6533 elog(ERROR, "bad buffer ID: %d", buffer);
6534
6535 if (BufferIsLocal(buffer))
6536 UnpinLocalBufferNoOwner(buffer);
6537 else
6538 UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
6539}
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:3242
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:846

References PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), DatumGetInt32(), elog, ERROR, GetBufferDescriptor(), UnpinBufferNoOwner(), and UnpinLocalBufferNoOwner().

◆ rlocator_comparator()

static int rlocator_comparator ( const void *  p1,
const void *  p2 
)
static

Definition at line 6197 of file bufmgr.c.

6198{
6199 RelFileLocator n1 = *(const RelFileLocator *) p1;
6200 RelFileLocator n2 = *(const RelFileLocator *) p2;
6201
6202 if (n1.relNumber < n2.relNumber)
6203 return -1;
6204 else if (n1.relNumber > n2.relNumber)
6205 return 1;
6206
6207 if (n1.dbOid < n2.dbOid)
6208 return -1;
6209 else if (n1.dbOid > n2.dbOid)
6210 return 1;
6211
6212 if (n1.spcOid < n2.spcOid)
6213 return -1;
6214 else if (n1.spcOid > n2.spcOid)
6215 return 1;
6216 else
6217 return 0;
6218}

References RelFileLocator::dbOid, RelFileLocator::relNumber, and RelFileLocator::spcOid.

Referenced by buffertag_comparator(), DropRelationsAllBuffers(), and FlushRelationsAllBuffers().

◆ ScheduleBufferTagForWriteback()

void ScheduleBufferTagForWriteback ( WritebackContext wb_context,
IOContext  io_context,
BufferTag tag 
)

Definition at line 6378 of file bufmgr.c.

6380{
6381 PendingWriteback *pending;
6382
6383 /*
6384 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6385 * point in tracking in that case.
6386 */
6387 if (io_direct_flags & IO_DIRECT_DATA ||
6388 !enableFsync)
6389 return;
6390
6391 /*
6392 * Add buffer to the pending writeback array, unless writeback control is
6393 * disabled.
6394 */
6395 if (*wb_context->max_pending > 0)
6396 {
6397 Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6398
6399 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6400
6401 pending->tag = *tag;
6402 }
6403
6404 /*
6405 * Perform pending flushes if the writeback limit is exceeded. This
6406 * includes the case where previously an item has been added, but control
6407 * is now disabled.
6408 */
6409 if (wb_context->nr_pending >= *wb_context->max_pending)
6410 IssuePendingWritebacks(wb_context, io_context);
6411}
bool enableFsync
Definition: globals.c:129
#define WRITEBACK_MAX_PENDING_FLUSHES

References Assert(), enableFsync, IO_DIRECT_DATA, io_direct_flags, IssuePendingWritebacks(), WritebackContext::max_pending, WritebackContext::nr_pending, WritebackContext::pending_writebacks, PendingWriteback::tag, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by GetVictimBuffer(), and SyncOneBuffer().

◆ shared_buffer_readv_complete()

static PgAioResult shared_buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 7346 of file bufmgr.c.

7348{
7349 return buffer_readv_complete(ioh, prior_result, cb_data, false);
7350}

References buffer_readv_complete().

◆ shared_buffer_readv_complete_local()

static PgAioResult shared_buffer_readv_complete_local ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 7360 of file bufmgr.c.

7362{
7363 bool zeroed_any,
7364 ignored_any;
7365 uint8 zeroed_or_error_count,
7366 checkfail_count,
7367 first_off;
7368
7369 if (prior_result.status == PGAIO_RS_OK)
7370 return prior_result;
7371
7372 buffer_readv_decode_error(prior_result,
7373 &zeroed_any,
7374 &ignored_any,
7375 &zeroed_or_error_count,
7376 &checkfail_count,
7377 &first_off);
7378
7379 if (checkfail_count)
7380 {
7381 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7382
7383 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
7384 checkfail_count);
7385 }
7386
7387 return prior_result;
7388}
@ PGAIO_RS_OK
Definition: aio_types.h:81

References buffer_readv_decode_error(), RelFileLocator::dbOid, pgaio_io_get_target_data(), PGAIO_RS_OK, pgstat_report_checksum_failures_in_db(), PgAioTargetData::rlocator, PgAioTargetData::smgr, and PgAioResult::status.

◆ shared_buffer_readv_stage()

static void shared_buffer_readv_stage ( PgAioHandle ioh,
uint8  cb_data 
)
static

Definition at line 7340 of file bufmgr.c.

7341{
7342 buffer_stage_common(ioh, false, false);
7343}

References buffer_stage_common().

◆ shared_buffer_write_error_callback()

static void shared_buffer_write_error_callback ( void *  arg)
static

Definition at line 6165 of file bufmgr.c.

6166{
6167 BufferDesc *bufHdr = (BufferDesc *) arg;
6168
6169 /* Buffer is pinned, so we can read the tag without locking the spinlock */
6170 if (bufHdr != NULL)
6171 errcontext("writing block %u of relation \"%s\"",
6172 bufHdr->tag.blockNum,
6173 relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
6174 BufTagGetForkNum(&bufHdr->tag)).str);
6175}

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, relpathperm, and BufferDesc::tag.

Referenced by FlushBuffer().

◆ StartBufferIO()

bool StartBufferIO ( BufferDesc buf,
bool  forInput,
bool  nowait 
)

Definition at line 6010 of file bufmgr.c.

6011{
6012 uint32 buf_state;
6013
6014 ResourceOwnerEnlarge(CurrentResourceOwner);
6015
6016 for (;;)
6017 {
6018 buf_state = LockBufHdr(buf);
6019
6020 if (!(buf_state & BM_IO_IN_PROGRESS))
6021 break;
6022 UnlockBufHdr(buf, buf_state);
6023 if (nowait)
6024 return false;
6025 WaitIO(buf);
6026 }
6027
6028 /* Once we get here, there is definitely no I/O active on this buffer */
6029
6030 /* Check if someone else already did the I/O */
6031 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6032 {
6033 UnlockBufHdr(buf, buf_state);
6034 return false;
6035 }
6036
6037 buf_state |= BM_IO_IN_PROGRESS;
6038 UnlockBufHdr(buf, buf_state);
6039
6040 ResourceOwnerRememberBufferIO(CurrentResourceOwner,
6041 BufferDescriptorGetBuffer(buf));
6042
6043 return true;
6044}
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)

References BM_DIRTY, BM_IO_IN_PROGRESS, BM_VALID, buf, BufferDescriptorGetBuffer(), CurrentResourceOwner, LockBufHdr(), ResourceOwnerEnlarge(), ResourceOwnerRememberBufferIO(), UnlockBufHdr(), and WaitIO().

Referenced by buffer_call_start_io(), ExtendBufferedRelShared(), FlushBuffer(), read_rel_block_ll(), ReadBuffersCanStartIOOnce(), and ZeroAndLockBuffer().

◆ StartReadBuffer()

bool StartReadBuffer ( ReadBuffersOperation operation,
Buffer buffer,
BlockNumber  blocknum,
int  flags 
)

Definition at line 1508 of file bufmgr.c.

1512{
1513 int nblocks = 1;
1514 bool result;
1515
1516 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1517 false /* single block, no forwarding */ );
1518 Assert(nblocks == 1); /* single block can't be short */
1519
1520 return result;
1521}
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition: bufmgr.c:1262

References Assert(), PrivateRefCountEntry::buffer, and StartReadBuffersImpl().

Referenced by read_stream_next_buffer(), and ReadBuffer_common().

◆ StartReadBuffers()

bool StartReadBuffers ( ReadBuffersOperation operation,
Buffer buffers,
BlockNumber  blockNum,
int *  nblocks,
int  flags 
)

Definition at line 1489 of file bufmgr.c.

1494{
1495 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1496 true /* expect forwarded buffers */ );
1497}

References StartReadBuffersImpl().

Referenced by read_stream_start_pending_read().
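
StartReadBuffers() and WaitReadBuffers() form a two-phase, vectored read: the start call pins up to *nblocks consecutive blocks (capped by MAX_IO_COMBINE_LIMIT) and possibly issues IO; a true return value means WaitReadBuffers() must be called, and *nblocks may come back smaller if the operation was split. A one-shot sketch, assuming a backend context and hypothetical names:

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hypothetical vectored read: pin up to "want" consecutive blocks in one
 * operation. On return, bufs[0 .. *got - 1] are valid, pinned buffers. */
static void
read_block_range(Relation rel, BlockNumber blkno, Buffer *bufs,
                 int want, int *got)
{
    ReadBuffersOperation op;
    int         nblocks = want;     /* must not exceed MAX_IO_COMBINE_LIMIT */

    for (int i = 0; i < want; i++)
        bufs[i] = InvalidBuffer;    /* no forwarded buffers on a first call */

    op.rel = rel;
    op.smgr = RelationGetSmgr(rel);
    op.persistence = rel->rd_rel->relpersistence;
    op.forknum = MAIN_FORKNUM;
    op.strategy = NULL;

    if (StartReadBuffers(&op, bufs, blkno, &nblocks, 0))
        WaitReadBuffers(&op);       /* complete whatever IO was started */

    *got = nblocks;                 /* may be < want if the read was split */
}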

◆ StartReadBuffersImpl()

static pg_attribute_always_inline bool StartReadBuffersImpl ( ReadBuffersOperation operation,
Buffer buffers,
BlockNumber  blockNum,
int *  nblocks,
int  flags,
bool  allow_forwarding 
)
static

Definition at line 1262 of file bufmgr.c.

1268{
1269 int actual_nblocks = *nblocks;
1270 int maxcombine = 0;
1271 bool did_start_io;
1272
1273 Assert(*nblocks == 1 || allow_forwarding);
1274 Assert(*nblocks > 0);
1275 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1276
1277 for (int i = 0; i < actual_nblocks; ++i)
1278 {
1279 bool found;
1280
1281 if (allow_forwarding && buffers[i] != InvalidBuffer)
1282 {
1283 BufferDesc *bufHdr;
1284
1285 /*
1286 * This is a buffer that was pinned by an earlier call to
1287 * StartReadBuffers(), but couldn't be handled in one operation at
1288 * that time. The operation was split, and the caller has passed
1289 * an already pinned buffer back to us to handle the rest of the
1290 * operation. It must continue at the expected block number.
1291 */
1292 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1293
1294 /*
1295 * It might be an already valid buffer (a hit) that followed the
1296 * final contiguous block of an earlier I/O (a miss) marking the
1297 * end of it, or a buffer that some other backend has since made
1298 * valid by performing the I/O for us, in which case we can handle
1299 * it as a hit now. It is safe to check for a BM_VALID flag with
1300 * a relaxed load, because we got a fresh view of it while pinning
1301 * it in the previous call.
1302 *
1303 * On the other hand if we don't see BM_VALID yet, it must be an
1304 * I/O that was split by the previous call and we need to try to
1305 * start a new I/O from this block. We're also racing against any
1306 * other backend that might start the I/O or even manage to mark
1307 * it BM_VALID after this check, but StartBufferIO() will handle
1308 * those cases.
1309 */
1310 if (BufferIsLocal(buffers[i]))
1311 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1312 else
1313 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1314 Assert(pg_atomic_read_u32(&bufHdr->state) & BM_TAG_VALID);
1315 found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1316 }
1317 else
1318 {
1319 buffers[i] = PinBufferForBlock(operation->rel,
1320 operation->smgr,
1321 operation->persistence,
1322 operation->forknum,
1323 blockNum + i,
1324 operation->strategy,
1325 &found);
1326 }
1327
1328 if (found)
1329 {
1330 /*
1331 * We have a hit. If it's the first block in the requested range,
1332 * we can return it immediately and report that WaitReadBuffers()
1333 * does not need to be called. If the initial value of *nblocks
1334 * was larger, the caller will have to call again for the rest.
1335 */
1336 if (i == 0)
1337 {
1338 *nblocks = 1;
1339
1340#ifdef USE_ASSERT_CHECKING
1341
1342 /*
1343 * Initialize enough of ReadBuffersOperation to make
1344 * CheckReadBuffersOperation() work. Outside of assertions
1345 * that's not necessary when no IO is issued.
1346 */
1347 operation->buffers = buffers;
1348 operation->blocknum = blockNum;
1349 operation->nblocks = 1;
1350 operation->nblocks_done = 1;
1351 CheckReadBuffersOperation(operation, true);
1352#endif
1353 return false;
1354 }
1355
1356 /*
1357 * Otherwise we already have an I/O to perform, but this block
1358 * can't be included as it is already valid. Split the I/O here.
1359 * There may or may not be more blocks requiring I/O after this
1360 * one, we haven't checked, but they can't be contiguous with this
1361 * one in the way. We'll leave this buffer pinned, forwarding it
1362 * to the next call, avoiding the need to unpin it here and re-pin
1363 * it in the next call.
1364 */
1365 actual_nblocks = i;
1366 break;
1367 }
1368 else
1369 {
1370 /*
1371 * Check how many blocks we can cover with the same IO. The smgr
1372 * implementation might e.g. be limited due to a segment boundary.
1373 */
1374 if (i == 0 && actual_nblocks > 1)
1375 {
1376 maxcombine = smgrmaxcombine(operation->smgr,
1377 operation->forknum,
1378 blockNum);
1379 if (unlikely(maxcombine < actual_nblocks))
1380 {
1381 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1382 blockNum, actual_nblocks, maxcombine);
1383 actual_nblocks = maxcombine;
1384 }
1385 }
1386 }
1387 }
1388 *nblocks = actual_nblocks;
1389
1390 /* Populate information needed for I/O. */
1391 operation->buffers = buffers;
1392 operation->blocknum = blockNum;
1393 operation->flags = flags;
1394 operation->nblocks = actual_nblocks;
1395 operation->nblocks_done = 0;
1396 pgaio_wref_clear(&operation->io_wref);
1397
1398 /*
1399 * When using AIO, start the IO in the background. If not, issue prefetch
1400 * requests if desired by the caller.
1401 *
1402 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1403 * de-risk the introduction of AIO somewhat. It's a large architectural
1404 * change, with lots of chances for unanticipated performance effects.
1405 *
1406 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1407 * asynchronously, but without the check here we'd execute IO earlier than
1408 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1409 */
1410 if (io_method != IOMETHOD_SYNC)
1411 {
1412 /*
1413 * Try to start IO asynchronously. It's possible that no IO needs to
1414 * be started, if another backend already performed the IO.
1415 *
1416 * Note that if an IO is started, it might not cover the entire
1417 * requested range, e.g. because an intermediary block has been read
1418 * in by another backend. In that case any "trailing" buffers we
1419 * already pinned above will be "forwarded" by read_stream.c to the
1420 * next call to StartReadBuffers().
1421 *
1422 * This is signalled to the caller by decrementing *nblocks *and*
1423 * reducing operation->nblocks. The latter is done here, but not below
1424 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1425 * overall read size anymore, we need to retry until done in its
1426 * entirety or until failed.
1427 */
1428 did_start_io = AsyncReadBuffers(operation, nblocks);
1429
1430 operation->nblocks = *nblocks;
1431 }
1432 else
1433 {
1434 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1435
1436 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1437 {
1438 /*
1439 * In theory we should only do this if PinBufferForBlock() had to
1440 * allocate new buffers above. That way, if two calls to
1441 * StartReadBuffers() were made for the same blocks before
1442 * WaitReadBuffers(), only the first would issue the advice.
1443 * That'd be a better simulation of true asynchronous I/O, which
1444 * would only start the I/O once, but isn't done here for
1445 * simplicity.
1446 */
1447 smgrprefetch(operation->smgr,
1448 operation->forknum,
1449 blockNum,
1450 actual_nblocks);
1451 }
1452
1453 /*
1454 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1455 * will initiate the necessary IO.
1456 */
1457 did_start_io = true;
1458 }
1459
1460 CheckReadBuffersOperation(operation, !did_start_io);
1461
1462 return did_start_io;
1463}
int io_method
Definition: aio.c:74
@ IOMETHOD_SYNC
Definition: aio.h:34
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition: bufmgr.c:1527
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition: bufmgr.c:1764
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:117
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697

References Assert(), AsyncReadBuffers(), ReadBuffersOperation::blocknum, BM_TAG_VALID, BM_VALID, BufferGetBlockNumber(), BufferIsLocal, ReadBuffersOperation::buffers, CheckReadBuffersOperation(), DEBUG2, elog, ReadBuffersOperation::flags, ReadBuffersOperation::forknum, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, InvalidBuffer, io_method, ReadBuffersOperation::io_wref, IOMETHOD_SYNC, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, pg_atomic_read_u32(), pgaio_wref_clear(), PinBufferForBlock(), READ_BUFFERS_ISSUE_ADVICE, READ_BUFFERS_SYNCHRONOUSLY, ReadBuffersOperation::rel, ReadBuffersOperation::smgr, smgrmaxcombine(), smgrprefetch(), BufferDesc::state, ReadBuffersOperation::strategy, and unlikely.

Referenced by StartReadBuffer(), and StartReadBuffers().

◆ SyncOneBuffer()

static int SyncOneBuffer ( int  buf_id,
bool  skip_recently_used,
WritebackContext wb_context 
)
static

Definition at line 3892 of file bufmgr.c.

3893{
3894 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3895 int result = 0;
3896 uint32 buf_state;
3897 BufferTag tag;
3898
3899 /* Make sure we can handle the pin */
3900 ReservePrivateRefCountEntry();
3901 ResourceOwnerEnlarge(CurrentResourceOwner);
3902
3903 /*
3904 * Check whether buffer needs writing.
3905 *
3906 * We can make this check without taking the buffer content lock so long
3907 * as we mark pages dirty in access methods *before* logging changes with
3908 * XLogInsert(): if someone marks the buffer dirty just after our check we
3909 * don't worry because our checkpoint.redo points before log record for
3910 * upcoming changes and so we are not required to write such dirty buffer.
3911 */
3912 buf_state = LockBufHdr(bufHdr);
3913
3914 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3915 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3916 {
3917 result |= BUF_REUSABLE;
3918 }
3919 else if (skip_recently_used)
3920 {
3921 /* Caller told us not to write recently-used buffers */
3922 UnlockBufHdr(bufHdr, buf_state);
3923 return result;
3924 }
3925
3926 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3927 {
3928 /* It's clean, so nothing to do */
3929 UnlockBufHdr(bufHdr, buf_state);
3930 return result;
3931 }
3932
3933 /*
3934 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3935 * buffer is clean by the time we've locked it.)
3936 */
3937 PinBuffer_Locked(bufHdr);
3938 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3939
3940 FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3941
3942 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3943
3944 tag = bufHdr->tag;
3945
3946 UnpinBuffer(bufHdr);
3947
3948 /*
3949 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3950 * IOContext will always be IOCONTEXT_NORMAL.
3951 */
3952 ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
3953
3954 return result | BUF_WRITTEN;
3955}

References BM_DIRTY, BM_VALID, BUF_REUSABLE, BUF_STATE_GET_REFCOUNT, BUF_STATE_GET_USAGECOUNT, BUF_WRITTEN, BufferDescriptorGetContentLock(), CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by BgBufferSync(), and BufferSync().

◆ TerminateBufferIO()

void TerminateBufferIO ( BufferDesc buf,
bool  clear_dirty,
uint32  set_flag_bits,
bool  forget_owner,
bool  release_aio 
)

Definition at line 6067 of file bufmgr.c.

6069{
6070 uint32 buf_state;
6071
6072 buf_state = LockBufHdr(buf);
6073
6074 Assert(buf_state & BM_IO_IN_PROGRESS);
6075 buf_state &= ~BM_IO_IN_PROGRESS;
6076
6077 /* Clear earlier errors, if this IO failed, it'll be marked again */
6078 buf_state &= ~BM_IO_ERROR;
6079
6080 if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6081 buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
6082
6083 if (release_aio)
6084 {
6085 /* release ownership by the AIO subsystem */
6086 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6087 buf_state -= BUF_REFCOUNT_ONE;
6088 pgaio_wref_clear(&buf->io_wref);
6089 }
6090
6091 buf_state |= set_flag_bits;
6092 UnlockBufHdr(buf, buf_state);
6093
6094 if (forget_owner)
6095 ResourceOwnerForgetBufferIO(CurrentResourceOwner,
6096 BufferDescriptorGetBuffer(buf));
6097
6098 ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
6099
6100 /*
6101 * Support LockBufferForCleanup()
6102 *
6103 * We may have just released the last pin other than the waiter's. In most
6104 * cases, this backend holds another pin on the buffer. But, if, for
6105 * example, this backend is completing an IO issued by another backend, it
6106 * may be time to wake the waiter.
6107 */
6108 if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
6109 WakePinCountWaiter(buf);
6110}
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void WakePinCountWaiter(BufferDesc *buf)
Definition: bufmgr.c:3198
void ConditionVariableBroadcast(ConditionVariable *cv)

References Assert(), BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_IN_PROGRESS, BM_JUST_DIRTIED, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetIOCV(), ConditionVariableBroadcast(), CurrentResourceOwner, LockBufHdr(), pgaio_wref_clear(), ResourceOwnerForgetBufferIO(), UnlockBufHdr(), and WakePinCountWaiter().

Referenced by AbortBufferIO(), buffer_call_terminate_io(), buffer_readv_complete_one(), ExtendBufferedRelShared(), FlushBuffer(), and ZeroAndLockBuffer().
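
StartBufferIO() and TerminateBufferIO() bracket the BM_IO_IN_PROGRESS window; they are only meaningful to code inside the buffer manager, such as the callers listed above. A reduced sketch of the synchronous read path (BufHdrGetBlock is a bufmgr.c-internal macro; error handling via AbortBufferIO() is omitted):

/* Sketch of the BM_IO_IN_PROGRESS protocol, assuming bufmgr.c context
 * and a pin held on bufHdr. */
static void
read_one_block(BufferDesc *bufHdr, SMgrRelation smgr)
{
    if (StartBufferIO(bufHdr, true /* forInput */, false /* nowait */))
    {
        /* we own BM_IO_IN_PROGRESS; concurrent readers block in WaitIO() */
        void       *block = BufHdrGetBlock(bufHdr);

        smgrreadv(smgr, BufTagGetForkNum(&bufHdr->tag),
                  bufHdr->tag.blockNum, &block, 1);

        /* set BM_VALID, clear BM_IO_IN_PROGRESS, wake any waiters */
        TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
    }
    /* else: already valid, or another backend completed the IO first */
}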

◆ ts_ckpt_progress_comparator()

static int ts_ckpt_progress_comparator ( Datum  a,
Datum  b,
void *  arg 
)
static

Definition at line 6343 of file bufmgr.c.

6344{
6345 CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
6346 CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
6347
6348 /* we want a min-heap, so return 1 for the a < b */
6349 if (sa->progress < sb->progress)
6350 return 1;
6351 else if (sa->progress == sb->progress)
6352 return 0;
6353 else
6354 return -1;
6355}

References a, b, DatumGetPointer(), and CkptTsStatus::progress.

Referenced by BufferSync().

◆ UnlockBuffers()

void UnlockBuffers ( void  )

Definition at line 5544 of file bufmgr.c.

5545{
5546 BufferDesc *buf = PinCountWaitBuf;
5547
5548 if (buf)
5549 {
5550 uint32 buf_state;
5551
5552 buf_state = LockBufHdr(buf);
5553
5554 /*
5555 * Don't complain if flag bit not set; it could have been reset but we
5556 * got a cancel/die interrupt before getting the signal.
5557 */
5558 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5559 buf->wait_backend_pgprocno == MyProcNumber)
5560 buf_state &= ~BM_PIN_COUNT_WAITER;
5561
5562 UnlockBufHdr(buf, buf_state);
5563
5564 PinCountWaitBuf = NULL;
5565 }
5566}

References BM_PIN_COUNT_WAITER, buf, LockBufHdr(), MyProcNumber, PinCountWaitBuf, and UnlockBufHdr().

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

◆ UnlockReleaseBuffer()

void UnlockReleaseBuffer ( Buffer  buffer)

Definition at line 5355 of file bufmgr.c.

5356{
5357 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5358 ReleaseBuffer(buffer);
5359}

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, LockBuffer(), and ReleaseBuffer().

Referenced by _bt_clear_incomplete_split(), _bt_restore_meta(), _hash_relbuf(), allocNewBuffer(), AlterSequence(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinGetStats(), brinRevmapDesummarizeRange(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), createPostingTree(), do_setval(), doPickSplit(), entryLoadMoreItems(), fill_seq_fork_with_data(), flushCachedPage(), FreeSpaceMapPrepareTruncateRel(), fsm_search(), fsm_set_and_search(), generic_redo(), gin_refind_parent(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoSplit(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginRedoVacuumPage(), ginScanToDelete(), ginStepRight(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbufferinginserttuples(), gistbuild(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistplacetopage(), gistProcessItup(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_split_page(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_insert(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), heapam_scan_analyze_next_tuple(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), log_newpage_range(), moveLeafs(), nextval_internal(), palloc_btree_page(), pg_get_sequence_data(), pg_sequence_last_value(), pg_visibility(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), ResetSequence(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), scanPostingTree(), ScanSourceDatabasePgClass(), seq_redo(), SequenceChangePersistence(), shiftList(), spgAddNodeAction(), spgbuild(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistUpdateMetaPage(), spgMatchNodeAction(), spgprocesspending(), spgRedoAddLeaf(), spgRedoAddNode(), 
spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), statapprox_heap(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_prepare_truncate(), writeListPage(), xlog_redo(), and XLogRecordPageWithFreeSpace().
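
UnlockReleaseBuffer() is simply the common tail of a page modification. A sketch of the canonical sequence (WAL logging omitted for brevity; the helper name is hypothetical):

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical in-place page update: lock, modify, mark dirty, then
 * unlock and unpin in one call. */
static void
update_page(Relation rel, BlockNumber blkno)
{
    Buffer buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    /* ... modify BufferGetPage(buf) here ... */
    MarkBufferDirty(buf);
    UnlockReleaseBuffer(buf);   /* LockBuffer(BUFFER_LOCK_UNLOCK) + ReleaseBuffer() */
}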

◆ UnpinBuffer()

◆ UnpinBufferNoOwner()

static void UnpinBufferNoOwner ( BufferDesc buf)
static

Definition at line 3242 of file bufmgr.c.

3243{
3244 PrivateRefCountEntry *ref;
3245 Buffer b = BufferDescriptorGetBuffer(buf);
3246
3247 Assert(!BufferIsLocal(b));
3248
3249 /* not moving as we're likely deleting it soon anyway */
3250 ref = GetPrivateRefCountEntry(b, false);
3251 Assert(ref != NULL);
3252 Assert(ref->refcount > 0);
3253 ref->refcount--;
3254 if (ref->refcount == 0)
3255 {
3256 uint32 buf_state;
3257 uint32 old_buf_state;
3258
3259 /*
3260 * Mark buffer non-accessible to Valgrind.
3261 *
3262 * Note that the buffer may have already been marked non-accessible
3263 * within access method code that enforces that buffers are only
3264 * accessed while a buffer lock is held.
3265 */
3266 VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
3267
3268 /* I'd better not still hold the buffer content lock */
3269 Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
3270
3271 /*
3272 * Decrement the shared reference count.
3273 *
3274 * Since buffer spinlock holder can update status using just write,
3275 * it's not safe to use atomic decrement here; thus use a CAS loop.
3276 */
3277 old_buf_state = pg_atomic_read_u32(&buf->state);
3278 for (;;)
3279 {
3280 if (old_buf_state & BM_LOCKED)
3281 old_buf_state = WaitBufHdrUnlocked(buf);
3282
3283 buf_state = old_buf_state;
3284
3285 buf_state -= BUF_REFCOUNT_ONE;
3286
3287 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3288 buf_state))
3289 break;
3290 }
3291
3292 /* Support LockBufferForCleanup() */
3293 if (buf_state & BM_PIN_COUNT_WAITER)
3294 WakePinCountWaiter(buf);
3295
3296 ForgetPrivateRefCountEntry(ref);
3297 }
3298}
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:448

References Assert(), b, BM_LOCKED, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferIsLocal, BufHdrGetBlock, ForgetPrivateRefCountEntry(), GetPrivateRefCountEntry(), LWLockHeldByMe(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, VALGRIND_MAKE_MEM_NOACCESS, WaitBufHdrUnlocked(), and WakePinCountWaiter().

Referenced by ResOwnerReleaseBufferPin(), and UnpinBuffer().

◆ WaitBufHdrUnlocked()

static uint32 WaitBufHdrUnlocked ( BufferDesc buf)
static

Definition at line 6254 of file bufmgr.c.

6255{
6256 SpinDelayStatus delayStatus;
6257 uint32 buf_state;
6258
6259 init_local_spin_delay(&delayStatus);
6260
6261 buf_state = pg_atomic_read_u32(&buf->state);
6262
6263 while (buf_state & BM_LOCKED)
6264 {
6265 perform_spin_delay(&delayStatus);
6266 buf_state = pg_atomic_read_u32(&buf->state);
6267 }
6268
6269 finish_spin_delay(&delayStatus);
6270
6271 return buf_state;
6272}

References BM_LOCKED, buf, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), and pg_atomic_read_u32().

Referenced by MarkBufferDirty(), PinBuffer(), and UnpinBufferNoOwner().

◆ WaitIO()

static void WaitIO ( BufferDesc buf)
static

Definition at line 5931 of file bufmgr.c.

5932{
5933 ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5934
5935 ConditionVariablePrepareToSleep(cv);
5936 for (;;)
5937 {
5938 uint32 buf_state;
5939 PgAioWaitRef iow;
5940
5941 /*
5942 * It may not be necessary to acquire the spinlock to check the flag
5943 * here, but since this test is essential for correctness, we'd better
5944 * play it safe.
5945 */
5946 buf_state = LockBufHdr(buf);
5947
5948 /*
5949 * Copy the wait reference while holding the spinlock. This protects
5950 * against a concurrent TerminateBufferIO() in another backend from
5951 * clearing the wref while it's being read.
5952 */
5953 iow = buf->io_wref;
5954 UnlockBufHdr(buf, buf_state);
5955
5956 /* no IO in progress, we don't need to wait */
5957 if (!(buf_state & BM_IO_IN_PROGRESS))
5958 break;
5959
5960 /*
5961 * The buffer has asynchronous IO in progress, wait for it to
5962 * complete.
5963 */
5964 if (pgaio_wref_valid(&iow))
5965 {
5966 pgaio_wref_wait(&iow);
5967
5968 /*
5969 * The AIO subsystem internally uses condition variables and thus
5970 * might remove this backend from the BufferDesc's CV. While that
5971 * wouldn't cause a correctness issue (the first CV sleep just
5972 * immediately returns if not already registered), it seems worth
5973 * avoiding unnecessary loop iterations, given that we take care
5974 * to do so at the start of the function.
5975 */
5976 ConditionVariablePrepareToSleep(cv);
5977 continue;
5978 }
5979
5980 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
5981 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5982 }
5983 ConditionVariableCancelSleep();
5984}
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:988
bool ConditionVariableCancelSleep(void)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)

References BM_IO_IN_PROGRESS, buf, BufferDescriptorGetIOCV(), ConditionVariableCancelSleep(), ConditionVariablePrepareToSleep(), ConditionVariableSleep(), LockBufHdr(), pgaio_wref_valid(), pgaio_wref_wait(), and UnlockBufHdr().

Referenced by InvalidateBuffer(), and StartBufferIO().

◆ WaitReadBuffers()

void WaitReadBuffers ( ReadBuffersOperation operation)

Definition at line 1632 of file bufmgr.c.

1633{
1634 PgAioReturn *aio_ret = &operation->io_return;
1635 IOContext io_context;
1636 IOObject io_object;
1637
1638 if (operation->persistence == RELPERSISTENCE_TEMP)
1639 {
1640 io_context = IOCONTEXT_NORMAL;
1641 io_object = IOOBJECT_TEMP_RELATION;
1642 }
1643 else
1644 {
1645 io_context = IOContextForStrategy(operation->strategy);
1646 io_object = IOOBJECT_RELATION;
1647 }
1648
1649 /*
1650 * If we get here without an IO operation having been issued, the
1651 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1652 * caller should not have called WaitReadBuffers().
1653 *
1654 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1655 * introducing of AIO - the IO in WaitReadBuffers(). This is done as part
1656 * of the retry logic below, no extra code is required.
1657 *
1658 * This path is expected to eventually go away.
1659 */
1660 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1661 elog(ERROR, "waiting for read operation that didn't read");
1662
1663 /*
1664 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1665 * done. We may need multiple retries, not just because we could get
1666 * multiple partial reads, but also because some of the remaining
1667 * to-be-read buffers may have been read in by other backends, limiting
1668 * the IO size.
1669 */
1670 while (true)
1671 {
1672 int ignored_nblocks_progress;
1673
1674 CheckReadBuffersOperation(operation, false);
1675
1676 /*
1677 * If there is an IO associated with the operation, we may need to
1678 * wait for it.
1679 */
1680 if (pgaio_wref_valid(&operation->io_wref))
1681 {
1682 /*
1683 * Track the time spent waiting for the IO to complete. As
1684 * tracking a wait even if we don't actually need to wait
1685 *
1686 * a) is not cheap, due to the timestamping overhead
1687 *
1688 * b) reports some time as waiting, even if we never waited
1689 *
1690 * we first check if we already know the IO is complete.
1691 */
1692 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1693 !pgaio_wref_check_done(&operation->io_wref))
1694 {
1695 instr_time io_start = pgstat_prepare_io_time(track_io_timing);
1696
1697 pgaio_wref_wait(&operation->io_wref);
1698
1699 /*
1700 * The IO operation itself was already counted earlier, in
1701 * AsyncReadBuffers(), this just accounts for the wait time.
1702 */
1703 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1704 io_start, 0, 0);
1705 }
1706 else
1707 {
1708 Assert(pgaio_wref_check_done(&operation->io_wref));
1709 }
1710
1711 /*
1712 * We now are sure the IO completed. Check the results. This
1713 * includes reporting on errors if there were any.
1714 */
1715 ProcessReadBuffersResult(operation);
1716 }
1717
1718 /*
1719 * Most of the time, the one IO we already started, will read in
1720 * everything. But we need to deal with partial reads and buffers not
1721 * needing IO anymore.
1722 */
1723 if (operation->nblocks_done == operation->nblocks)
1724 break;
1725
1726 CHECK_FOR_INTERRUPTS();
1727
1728 /*
1729 * This may only complete the IO partially, either because some
1730 * buffers were already valid, or because of a partial read.
1731 *
1732 * NB: In contrast to after the AsyncReadBuffers() call in
1733 * StartReadBuffers(), we do *not* reduce
1734 * ReadBuffersOperation->nblocks here, callers expect the full
1735 * operation to be completed at this point (as more operations may
1736 * have been queued).
1737 */
1738 AsyncReadBuffers(operation, &ignored_nblocks_progress);
1739 }
1740
1741 CheckReadBuffersOperation(operation, true);
1742
1743 /* NB: READ_DONE tracepoint was already executed in completion callback */
1744}
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:1002
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition: bufmgr.c:1593

References Assert(), AsyncReadBuffers(), CHECK_FOR_INTERRUPTS, CheckReadBuffersOperation(), elog, ERROR, io_method, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, IOCONTEXT_NORMAL, IOContextForStrategy(), IOMETHOD_SYNC, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_READ, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, PGAIO_RS_UNKNOWN, pgaio_wref_check_done(), pgaio_wref_valid(), pgaio_wref_wait(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), ProcessReadBuffersResult(), PgAioReturn::result, PgAioResult::status, ReadBuffersOperation::strategy, and track_io_timing.

Referenced by read_stream_next_buffer(), and ReadBuffer_common().

◆ WakePinCountWaiter()

static void WakePinCountWaiter ( BufferDesc buf)
static

Definition at line 3198 of file bufmgr.c.

3199{
3200 /*
3201 * Acquire the buffer header lock, re-check that there's a waiter. Another
3202 * backend could have unpinned this buffer, and already woken up the
3203 * waiter.
3204 *
3205 * There's no danger of the buffer being replaced after we unpinned it
3206 * above, as it's pinned by the waiter. The waiter removes
3207 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3208 * backend waking it up.
3209 */
3210 uint32 buf_state = LockBufHdr(buf);
3211
3212 if ((buf_state & BM_PIN_COUNT_WAITER) &&
3213 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3214 {
3215 /* we just released the last pin other than the waiter's */
3216 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3217
3218 buf_state &= ~BM_PIN_COUNT_WAITER;
3219 UnlockBufHdr(buf, buf_state);
3220 ProcSendSignal(wait_backend_pgprocno);
3221 }
3222 else
3223 UnlockBufHdr(buf, buf_state);
3224}
void ProcSendSignal(ProcNumber procNumber)
Definition: proc.c:1986

References BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, LockBufHdr(), ProcSendSignal(), and UnlockBufHdr().

Referenced by TerminateBufferIO(), and UnpinBufferNoOwner().
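
For context, a condensed sketch of the waiter's side of this handshake (the full logic, including the retry loop and content lock, lives in LockBufferForCleanup()): the waiter advertises itself under the buffer header lock and then sleeps until WakePinCountWaiter() signals it:

    buf_state = LockBufHdr(bufHdr);
    bufHdr->wait_backend_pgprocno = MyProcNumber;   /* whom to signal */
    buf_state |= BM_PIN_COUNT_WAITER;
    UnlockBufHdr(bufHdr, buf_state);

    /* woken by WakePinCountWaiter()'s ProcSendSignal() */
    ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);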

◆ WritebackContextInit()

void WritebackContextInit ( WritebackContext *  context,
int *  max_pending 
)

Definition at line 6366 of file bufmgr.c.

6367{
6368 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6369
6370 context->max_pending = max_pending;
6371 context->nr_pending = 0;
6372}

References Assert(), WritebackContext::max_pending, WritebackContext::nr_pending, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by BackgroundWriterMain(), BufferManagerShmemInit(), and BufferSync().
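
A short usage sketch following the checkpoint pattern in BufferSync(); "tag" stands in for the written buffer's BufferTag. The GUC is passed by pointer so a run-time change of checkpoint_flush_after takes effect without re-initializing the context:

    WritebackContext wb_context;

    WritebackContextInit(&wb_context, &checkpoint_flush_after);

    /* queue each written-out buffer; flushes are issued in batches
     * once *max_pending requests have accumulated */
    ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &tag);

    /* force out whatever is still pending at the end of the scan */
    IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);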

◆ ZeroAndLockBuffer()

static void ZeroAndLockBuffer ( Buffer  buffer,
ReadBufferMode  mode,
bool  already_valid 
)
static

Definition at line 1031 of file bufmgr.c.

1032{
1033 BufferDesc *bufHdr;
1034 bool need_to_zero;
1035 bool isLocalBuf = BufferIsLocal(buffer);
1036
1037 Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
1038
1039 if (already_valid)
1040 {
1041 /*
1042 * If the caller already knew the buffer was valid, we can skip some
1043 * header interaction. The caller just wants to lock the buffer.
1044 */
1045 need_to_zero = false;
1046 }
1047 else if (isLocalBuf)
1048 {
1049 /* Simple case for non-shared buffers. */
1050 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1051 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1052 }
1053 else
1054 {
1055 /*
1056 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1057 * concurrently. Even though we aren't doing I/O, that ensures that
1058 * we don't zero a page that someone else has pinned. An exclusive
1059 * content lock wouldn't be enough, because readers are allowed to
1060 * drop the content lock after determining that a tuple is visible
1061 * (see buffer access rules in README).
1062 */
1063 bufHdr = GetBufferDescriptor(buffer - 1);
1064 need_to_zero = StartBufferIO(bufHdr, true, false);
1065 }
1066
1067 if (need_to_zero)
1068 {
1069 memset(BufferGetPage(buffer), 0, BLCKSZ);
1070
1071 /*
1072 * Grab the buffer content lock before marking the page as valid, to
1073 * make sure that no other backend sees the zeroed page before the
1074 * caller has had a chance to initialize it.
1075 *
1076 * Since no-one else can be looking at the page contents yet, there is
1077 * no difference between an exclusive lock and a cleanup-strength
1078 * lock. (Note that we cannot use LockBuffer() or
1079 * LockBufferForCleanup() here, because they assert that the buffer is
1080 * already valid.)
1081 */
1082 if (!isLocalBuf)
1083 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
1084
1085 /* Set BM_VALID, terminate IO, and wake up any waiters */
1086 if (isLocalBuf)
1087 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1088 else
1089 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1090 }
1091 else if (!isLocalBuf)
1092 {
1093 /*
1094 * The buffer is valid, so we can't zero it. The caller still expects
1095 * the page to be locked on return.
1096 */
1097 if (mode == RBM_ZERO_AND_LOCK)
1098 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1099 else
1100 LockBufferForCleanup(buffer);
1101 }
1102}
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5652

References Assert(), BM_VALID, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferDescriptorGetContentLock(), BufferGetPage(), BufferIsLocal, GetBufferDescriptor(), GetLocalBufferDescriptor(), LockBuffer(), LockBufferForCleanup(), LW_EXCLUSIVE, LWLockAcquire(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, StartBufferIO(), StartLocalBufferIO(), TerminateBufferIO(), and TerminateLocalBufferIO().

Referenced by ReadBuffer_common().
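
Callers do not invoke this function directly; it is reached through ReadBuffer_common() for the zeroing modes. A hedged sketch of the usual pattern for initializing a brand-new page ("rel" and "blkno" are assumptions, and WAL logging is omitted):

    Buffer buf;
    Page page;

    /* returns the page zeroed and exclusively locked, with no physical read */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, NULL);
    page = BufferGetPage(buf);
    PageInit(page, BufferGetPageSize(buf), 0);

    MarkBufferDirty(buf);
    UnlockReleaseBuffer(buf);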

Variable Documentation

◆ aio_local_buffer_readv_cb

const PgAioHandleCallbacks aio_local_buffer_readv_cb
Initial value:
= {
.stage = local_buffer_readv_stage,
.complete_local = local_buffer_readv_complete,
.report = buffer_readv_report,
}
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7397
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7391
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition: bufmgr.c:7245

Definition at line 7413 of file bufmgr.c.

◆ aio_shared_buffer_readv_cb

const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Initial value:
= {
.stage = shared_buffer_readv_stage,
.complete_shared = shared_buffer_readv_complete,
.complete_local = shared_buffer_readv_complete_local,
.report = buffer_readv_report,
}
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7360
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7340
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7346

Definition at line 7404 of file bufmgr.c.
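
Roughly how AsyncReadBuffers() attaches these callback tables to an IO handle, via the callback IDs that map to them (a condensed sketch; "ioh", "persistence" and "flags" come from the surrounding code):

    if (persistence == RELPERSISTENCE_TEMP)
        pgaio_io_register_callbacks(ioh, PGAIO_HCB_LOCAL_BUFFER_READV, flags);
    else
        pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, flags);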

◆ backend_flush_after

int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER

Definition at line 180 of file bufmgr.c.

Referenced by BufferManagerShmemInit().

◆ bgwriter_flush_after

int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER

Definition at line 179 of file bufmgr.c.

Referenced by BackgroundWriterMain().

◆ bgwriter_lru_maxpages

int bgwriter_lru_maxpages = 100

Definition at line 145 of file bufmgr.c.

Referenced by BgBufferSync().

◆ bgwriter_lru_multiplier

double bgwriter_lru_multiplier = 2.0

Definition at line 146 of file bufmgr.c.

Referenced by BgBufferSync().
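
A condensed sketch of how BgBufferSync() combines the two LRU GUCs (names other than the GUCs and SyncOneBuffer() are simplified, and the clock-hand advance is omitted): recent allocations are smoothed into a moving average, scaled by the multiplier to estimate upcoming demand, and writes are capped per round:

    /* estimate how many buffers will be allocated before the next round */
    upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);

    while (num_to_scan-- > 0 && reusable_buffers < upcoming_alloc_est)
    {
        int sync_state = SyncOneBuffer(next_to_clean, true, &wb_context);

        if (sync_state & BUF_WRITTEN)
        {
            if (++num_written >= bgwriter_lru_maxpages)
                break;          /* per-round write cap reached */
        }
        if (sync_state & BUF_REUSABLE)
            reusable_buffers++;
    }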

◆ buffer_io_resowner_desc

const ResourceOwnerDesc buffer_io_resowner_desc
Initial value:
=
{
.name = "buffer io",
.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
.release_priority = RELEASE_PRIO_BUFFER_IOS,
.ReleaseResource = ResOwnerReleaseBufferIO,
.DebugPrint = ResOwnerPrintBufferIO
}
static void ResOwnerReleaseBufferIO(Datum res)
Definition: bufmgr.c:6511
static char * ResOwnerPrintBufferIO(Datum res)
Definition: bufmgr.c:6519
#define RELEASE_PRIO_BUFFER_IOS
Definition: resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition: resowner.h:54

Definition at line 235 of file bufmgr.c.

Referenced by ResourceOwnerForgetBufferIO(), and ResourceOwnerRememberBufferIO().

◆ buffer_pin_resowner_desc

const ResourceOwnerDesc buffer_pin_resowner_desc
Initial value:
=
{
.name = "buffer pin",
.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
.release_priority = RELEASE_PRIO_BUFFER_PINS,
.ReleaseResource = ResOwnerReleaseBufferPin,
.DebugPrint = ResOwnerPrintBufferPin
}
static char * ResOwnerPrintBufferPin(Datum res)
Definition: bufmgr.c:6542
static void ResOwnerReleaseBufferPin(Datum res)
Definition: bufmgr.c:6527
#define RELEASE_PRIO_BUFFER_PINS
Definition: resowner.h:63

Definition at line 244 of file bufmgr.c.

Referenced by ResourceOwnerForgetBuffer(), and ResourceOwnerRememberBuffer().
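
These two descriptors plug buffer pins and in-progress buffer IOs into the generic ResourceOwner machinery. The wrappers reduce to calls like the following (a sketch of the pin case, mirroring the static inlines in bufmgr.h):

    static inline void
    ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
    {
        ResourceOwnerRemember(owner, Int32GetDatum(buffer),
                              &buffer_pin_resowner_desc);
    }

    static inline void
    ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
    {
        ResourceOwnerForget(owner, Int32GetDatum(buffer),
                            &buffer_pin_resowner_desc);
    }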

◆ checkpoint_flush_after

int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER

Definition at line 178 of file bufmgr.c.

Referenced by BufferSync().

◆ effective_io_concurrency

◆ io_combine_limit

◆ io_combine_limit_guc

int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT

Definition at line 171 of file bufmgr.c.

Referenced by assign_io_max_combine_limit().

◆ io_max_combine_limit

◆ maintenance_io_concurrency

◆ MaxProportionalPins

uint32 MaxProportionalPins
static

Definition at line 221 of file bufmgr.c.

Referenced by GetAdditionalPinLimit(), GetPinLimit(), and InitBufferManagerAccess().

◆ PinCountWaitBuf

BufferDesc* PinCountWaitBuf = NULL
static

Definition at line 183 of file bufmgr.c.

Referenced by LockBufferForCleanup(), and UnlockBuffers().

◆ PrivateRefCountArray

◆ PrivateRefCountClock

uint32 PrivateRefCountClock = 0
static

Definition at line 218 of file bufmgr.c.

Referenced by ReservePrivateRefCountEntry().

◆ PrivateRefCountHash

◆ PrivateRefCountOverflowed

◆ ReservedRefCountEntry

◆ track_io_timing

◆ zero_damaged_pages

bool zero_damaged_pages = false

Definition at line 144 of file bufmgr.c.

Referenced by AsyncReadBuffers(), mdreadv(), and read_rel_block_ll().