Thanks to visit codestin.com
Credit goes to doxygen.postgresql.org

PostgreSQL Source Code git master
xlogrecovery.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * xlogrecovery.c
4 * Functions for WAL recovery, standby mode
5 *
6 * This source file contains functions controlling WAL recovery.
7 * InitWalRecovery() initializes the system for crash or archive recovery,
8 * or standby mode, depending on configuration options and the state of
9 * the control file and possible backup label file. PerformWalRecovery()
10 * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 * and prepares information needed to initialize the WAL for writes. In
13 * addition to these three main functions, there are a bunch of functions
14 * for interrogating recovery state and controlling the recovery process.
15 *
16 *
17 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 * src/backend/access/transam/xlogrecovery.c
21 *
22 *-------------------------------------------------------------------------
23 */
24
25#include "postgres.h"
26
27#include <ctype.h>
28#include <math.h>
29#include <time.h>
30#include <sys/stat.h>
31#include <sys/time.h>
32#include <unistd.h>
33
34#include "access/timeline.h"
35#include "access/transam.h"
36#include "access/xact.h"
38#include "access/xlogarchive.h"
40#include "access/xlogreader.h"
41#include "access/xlogrecovery.h"
42#include "access/xlogutils.h"
43#include "backup/basebackup.h"
44#include "catalog/pg_control.h"
45#include "commands/tablespace.h"
46#include "common/file_utils.h"
47#include "miscadmin.h"
48#include "nodes/miscnodes.h"
49#include "pgstat.h"
50#include "postmaster/bgwriter.h"
51#include "postmaster/startup.h"
52#include "replication/slot.h"
55#include "storage/fd.h"
56#include "storage/ipc.h"
57#include "storage/latch.h"
58#include "storage/pmsignal.h"
59#include "storage/procarray.h"
60#include "storage/spin.h"
61#include "utils/datetime.h"
62#include "utils/fmgrprotos.h"
63#include "utils/guc_hooks.h"
65#include "utils/pg_lsn.h"
66#include "utils/ps_status.h"
67#include "utils/pg_rusage.h"
68
69/* Unsupported old recovery command file names (relative to $PGDATA) */
70#define RECOVERY_COMMAND_FILE "recovery.conf"
71#define RECOVERY_COMMAND_DONE "recovery.done"
72
73/*
74 * GUC support
75 */
77 {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
78 {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
79 {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
80 {NULL, 0, false}
81};
82
83/* options formerly taken from recovery.conf for archive recovery */
85char *recoveryEndCommand = NULL;
96
97/* options formerly taken from recovery.conf for XLOG streaming */
98char *PrimaryConnInfo = NULL;
99char *PrimarySlotName = NULL;
101
102/*
103 * recoveryTargetTimeLineGoal: what the user requested, if any
104 *
105 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
106 *
107 * recoveryTargetTLI: the currently understood target timeline; may change during recovery
108 *
109 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
110 * the timelines of its known parents, newest first (so recoveryTargetTLI is
111 * always the first list member). Only these TLIs are expected to be seen in
112 * the WAL segments we read, and indeed only these TLIs will be considered as
113 * candidate WAL files to open at all.
114 *
115 * curFileTLI: the TLI appearing in the name of the current input WAL file.
116 * (This is not necessarily the same as the timeline from which we are
117 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
118 * scanning data that was copied from an ancestor timeline when the current
119 * file was created.) During a sequential scan we do not allow this value
120 * to decrease.
121 */
127
128/*
129 * When ArchiveRecoveryRequested is set, archive recovery was requested,
130 * i.e. signal files were present. When InArchiveRecovery is set, we are
131 * currently recovering using offline XLOG archives. These variables are only
132 * valid in the startup process.
133 *
134 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
135 * currently performing crash recovery using only XLOG files in pg_wal, but
136 * will switch to using offline XLOG archives as soon as we reach the end of
137 * WAL in pg_wal.
138 */
140bool InArchiveRecovery = false;
141
142/*
143 * When StandbyModeRequested is set, standby mode was requested, i.e.
144 * standby.signal file was present. When StandbyMode is set, we are currently
145 * in standby mode. These variables are only valid in the startup process.
146 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
147 */
148static bool StandbyModeRequested = false;
149bool StandbyMode = false;
150
151/* was a signal file present at startup? */
152static bool standby_signal_file_found = false;
153static bool recovery_signal_file_found = false;
154
155/*
156 * CheckPointLoc is the position of the checkpoint record that determines
157 * where to start the replay. It comes from the backup label file or the
158 * control file.
159 *
160 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
161 * file or the control file. In standby mode, XLOG streaming usually starts
162 * from the position where an invalid record was found. But if we fail to
163 * read even the initial checkpoint record, we use the REDO location instead
164 * of the checkpoint location as the start position of XLOG streaming.
165 * Otherwise we would have to jump backwards to the REDO location after
166 * reading the checkpoint record, because the REDO record can precede the
167 * checkpoint record.
168 */
173
174/*
175 * Local copy of SharedHotStandbyActive variable. False actually means "not
176 * known, need to check the shared state".
177 */
178static bool LocalHotStandbyActive = false;
179
180/*
181 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
182 * known, need to check the shared state".
183 */
184static bool LocalPromoteIsTriggered = false;
185
186/* Has the recovery code requested a walreceiver wakeup? */
188
189/* XLogReader object used to parse the WAL records */
191
192/* XLogPrefetcher object used to consume WAL records with read-ahead */
194
195/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
197{
198 int emode;
199 bool fetching_ckpt; /* are we fetching a checkpoint record? */
203
204/* flag to tell XLogPageRead that we have started replaying */
205static bool InRedo = false;
206
207/*
208 * Codes indicating where we got a WAL file from during recovery, or where
209 * to attempt to get one.
210 */
211typedef enum
212{
213 XLOG_FROM_ANY = 0, /* request to read WAL from any source */
214 XLOG_FROM_ARCHIVE, /* restored using restore_command */
215 XLOG_FROM_PG_WAL, /* existing file in pg_wal */
216 XLOG_FROM_STREAM, /* streamed from primary */
217} XLogSource;
218
219/* human-readable names for XLogSources, in XLogSource enum order, for debugging output */
220static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
221
222/*
223 * readFile is -1 or a kernel FD for the log file segment that's currently
224 * open for reading. readSegNo identifies the segment. readOff is the offset
225 * of the page just read, readLen indicates how much of it has been read into
226 * readBuf, and readSource indicates where we got the currently open file from.
227 *
228 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
229 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
230 * worthwhile, since the XLOG is not read by general-purpose sessions.
231 */
232static int readFile = -1;
234static uint32 readOff = 0;
235static uint32 readLen = 0;
237
238/*
239 * Keeps track of which source we're currently reading from. This is
240 * different from readSource in that this is always set, even when we don't
241 * currently have a WAL file open. If lastSourceFailed is set, our last
242 * attempt to read from currentSource failed, and we should try another source
243 * next.
244 *
245 * pendingWalRcvRestart is set when a config change occurs that requires a
246 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
247 */
249static bool lastSourceFailed = false;
250static bool pendingWalRcvRestart = false;
251
252/*
253 * These variables track when we last obtained some WAL data to process,
254 * and where we got it from. (XLogReceiptSource is initially the same as
255 * readSource, but readSource gets reset to zero when we don't have data
256 * to process right now. It is also different from currentSource, which
257 * also changes when we try to read from a source and fail, while
258 * XLogReceiptSource tracks where we last successfully read some WAL.)
259 */
262
263/* Local copy of WalRcv->flushedUpto */
266
267/*
268 * Copy of minRecoveryPoint and backupEndPoint from the control file.
269 *
270 * In order to reach consistency, we must replay the WAL up to
271 * minRecoveryPoint. If backupEndRequired is true, we must also reach
272 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
273 * to backupStartPoint.
274 *
275 * Note: In archive recovery, after consistency has been reached, the
276 * functions in xlog.c will start updating minRecoveryPoint in the control
277 * file. But this copy of minRecoveryPoint variable reflects the value at the
278 * beginning of recovery, and is *not* updated after consistency is reached.
279 */
282
285static bool backupEndRequired = false;
286
287/*
288 * Have we reached a consistent database state? In crash recovery, we have
289 * to replay all the WAL, so reachedConsistency is never set. During archive
290 * recovery, the database is consistent once minRecoveryPoint is reached.
291 *
292 * Consistent state means that the system is internally consistent, all
293 * the WAL has been replayed up to a certain point, and importantly, there
294 * is no trace of later actions on disk.
295 *
296 * This flag is used only by the startup process and postmaster. When
297 * minRecoveryPoint is reached, the startup process sets it to true and
298 * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
299 * which then sets it to true upon receiving the signal.
300 */
302
303/* Buffers dedicated to consistency checks of size BLCKSZ */
304static char *replay_image_masked = NULL;
305static char *primary_image_masked = NULL;
306
307
308/*
309 * Shared-memory state for WAL recovery.
310 */
312{
313 /*
314 * SharedHotStandbyActive indicates if we allow hot standby queries to be
315 * run. Protected by info_lck.
316 */
318
319 /*
320 * SharedPromoteIsTriggered indicates if a standby promotion has been
321 * triggered. Protected by info_lck.
322 */
324
325 /*
326 * recoveryWakeupLatch is used to wake up the startup process to continue
327 * WAL replay, if it is waiting for WAL to arrive or promotion to be
328 * requested.
329 *
330 * Note that the startup process also uses another latch, its procLatch,
331 * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
332 * for signaling the startup process in favor of using its procLatch, which
333 * would comport better with possible generic signal handlers using that
334 * latch. But we should not do that because the startup process doesn't
335 * assume that it's woken up by walreceiver process or SIGHUP signal handler
336 * while it's waiting for recovery conflict. The separate latches,
337 * recoveryWakeupLatch and procLatch, should be used for inter-process
338 * communication for WAL replay and recovery conflict, respectively.
339 */
341
342 /*
343 * Last record successfully replayed.
344 */
345 XLogRecPtr lastReplayedReadRecPtr; /* start position */
346 XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
348
349 /*
350 * When we're currently replaying a record, i.e. in a redo function,
351 * replayEndRecPtr points to the end+1 of the record being replayed,
352 * otherwise it's equal to lastReplayedEndRecPtr.
353 */
356 /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
358
359 /*
360 * timestamp of when we started replaying the current chunk of WAL data,
361 * only relevant for replication or archive recovery
362 */
364 /* Recovery pause state */
367
368 slock_t info_lck; /* locks shared variables shown above */
370
372
373/*
374 * abortedRecPtr is the start pointer of a broken record at end of WAL when
375 * recovery completes; missingContrecPtr is the location of the first
376 * contrecord that went missing. See CreateOverwriteContrecordRecord for
377 * details.
378 */
381
382/*
383 * if recoveryStopsBefore/After returns true, it saves information of the stop
384 * point here
385 */
391
392/* prototypes for local functions */
393static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
394
395static void EnableStandbyMode(void);
396static void readRecoverySignalFile(void);
397static void validateRecoveryParameters(void);
398static bool read_backup_label(XLogRecPtr *checkPointLoc,
399 TimeLineID *backupLabelTLI,
400 bool *backupEndRequired, bool *backupFromStandby);
401static bool read_tablespace_map(List **tablespaces);
402
403static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
404static void CheckRecoveryConsistency(void);
405static void rm_redo_error_callback(void *arg);
406#ifdef WAL_DEBUG
407static void xlog_outrec(StringInfo buf, XLogReaderState *record);
408#endif
409static void xlog_block_info(StringInfo buf, XLogReaderState *record);
410static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
411 TimeLineID prevTLI, TimeLineID replayTLI);
412static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
414
415static bool recoveryStopsBefore(XLogReaderState *record);
416static bool recoveryStopsAfter(XLogReaderState *record);
417static char *getRecoveryStopReason(void);
418static void recoveryPausesHere(bool endOfRecovery);
419static bool recoveryApplyDelay(XLogReaderState *record);
420static void ConfirmRecoveryPaused(void);
421
423 int emode, bool fetching_ckpt,
424 TimeLineID replayTLI);
425
426static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
427 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
429 bool randAccess,
430 bool fetching_ckpt,
431 XLogRecPtr tliRecPtr,
432 TimeLineID replayTLI,
433 XLogRecPtr replayLSN,
434 bool nonblocking);
435static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
437 XLogRecPtr RecPtr, TimeLineID replayTLI);
438static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
439static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
440 XLogSource source, bool notfoundOk);
442
443static bool CheckForStandbyTrigger(void);
444static void SetPromoteIsTriggered(void);
445static bool HotStandbyActiveInReplay(void);
446
447static void SetCurrentChunkStartTime(TimestampTz xtime);
448static void SetLatestXTime(TimestampTz xtime);
449
450/*
451 * Initialization of shared memory for WAL recovery
452 */
453Size
455{
456 Size size;
457
458 /* XLogRecoveryCtl */
459 size = sizeof(XLogRecoveryCtlData);
460
461 return size;
462}
463
464void
466{
467 bool found;
468
470 ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
471 if (found)
472 return;
473 memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
474
478}
479
480/*
481 * A thin wrapper to enable StandbyMode and do other preparatory work as
482 * needed.
483 */
484static void
486{
487 StandbyMode = true;
488
489 /*
490 * To avoid server log bloat, we don't report recovery progress in a
491 * standby as it will always be in recovery unless promoted. We disable
492 * startup progress timeout in standby mode to avoid calling
493 * startup_progress_timeout_handler() unnecessarily.
494 */
496}
497
498/*
499 * Prepare the system for WAL recovery, if needed.
500 *
501 * This is called by StartupXLOG() which coordinates the server startup
502 * sequence. This function analyzes the control file and the backup label
503 * file, if any, and figures out whether we need to perform crash recovery or
504 * archive recovery, and how far we need to replay the WAL to reach a
505 * consistent state.
506 *
507 * This doesn't yet change the on-disk state, except for creating the symlinks
508 * from table space map file if any, and for fetching WAL files needed to find
509 * the checkpoint record. On entry, the caller has already read the control
510 * file into memory, and passes it as argument. This function updates it to
511 * reflect the recovery state, and the caller is expected to write it back to
512 * disk after initializing other subsystems, but before calling
513 * PerformWalRecovery().
514 *
515 * This initializes some global variables like ArchiveRecoveryRequested,
516 * StandbyModeRequested and InRecovery.
517 */
518void
520 bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
521{
522 XLogPageReadPrivate *private;
523 struct stat st;
524 bool wasShutdown;
525 XLogRecord *record;
526 DBState dbstate_at_startup;
527 bool haveTblspcMap = false;
528 bool haveBackupLabel = false;
529 CheckPoint checkPoint;
530 bool backupFromStandby = false;
531
532 dbstate_at_startup = ControlFile->state;
533
534 /*
535 * Initialize on the assumption we want to recover to the latest timeline
536 * that's active according to pg_control.
537 */
541 else
543
544 /*
545 * Check for signal files, and if so set up state for offline recovery
546 */
549
550 /*
551 * Take ownership of the wakeup latch if we're going to sleep during
552 * recovery, if required.
553 */
556
557 /*
558 * Set the WAL reading processor now, as it will be needed when reading
559 * the checkpoint record required (backup_label or not).
560 */
561 private = palloc0(sizeof(XLogPageReadPrivate));
562 xlogreader =
564 XL_ROUTINE(.page_read = &XLogPageRead,
565 .segment_open = NULL,
566 .segment_close = wal_segment_close),
567 private);
568 if (!xlogreader)
570 (errcode(ERRCODE_OUT_OF_MEMORY),
571 errmsg("out of memory"),
572 errdetail("Failed while allocating a WAL reading processor.")));
574
575 /*
576 * Set the WAL decode buffer size. This limits how far ahead we can read
577 * in the WAL.
578 */
580
581 /* Create a WAL prefetcher. */
583
584 /*
585 * Allocate two page buffers dedicated to WAL consistency checks. We do
586 * it this way, rather than just making static arrays, for two reasons:
587 * (1) no need to waste the storage in most instantiations of the backend;
588 * (2) a static char array isn't guaranteed to have any particular
589 * alignment, whereas palloc() will provide MAXALIGN'd storage.
590 */
591 replay_image_masked = (char *) palloc(BLCKSZ);
592 primary_image_masked = (char *) palloc(BLCKSZ);
593
594 /*
595 * Read the backup_label file. We want to run this part of the recovery
596 * process after checking for signal files and after performing validation
597 * of the recovery parameters.
598 */
600 &backupFromStandby))
601 {
602 List *tablespaces = NIL;
603
604 /*
605 * Archive recovery was requested, and thanks to the backup label
606 * file, we know how far we need to replay to reach consistency. Enter
607 * archive recovery directly.
608 */
609 InArchiveRecovery = true;
612
613 /*
614 * Omitting backup_label when creating a new replica, PITR node etc.
615 * unfortunately is a common cause of corruption. Logging that
616 * backup_label was used makes it a bit easier to exclude that as the
617 * cause of observed corruption.
618 *
619 * Do so before we try to read the checkpoint record (which can fail),
620 * as otherwise it can be hard to understand why a checkpoint other
621 * than ControlFile->checkPoint is used.
622 */
623 ereport(LOG,
624 errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
628
629 /*
630 * When a backup_label file is present, we want to roll forward from
631 * the checkpoint it identifies, rather than using pg_control.
632 */
635 if (record != NULL)
636 {
637 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
638 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
640 errmsg_internal("checkpoint record is at %X/%08X",
642 InRecovery = true; /* force recovery even if SHUTDOWNED */
643
644 /*
645 * Make sure that REDO location exists. This may not be the case
646 * if there was a crash during an online backup, which left a
647 * backup_label around that references a WAL segment that's
648 * already been archived.
649 */
650 if (checkPoint.redo < CheckPointLoc)
651 {
653 if (!ReadRecord(xlogprefetcher, LOG, false,
654 checkPoint.ThisTimeLineID))
656 errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
658 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
659 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
660 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
662 }
663 }
664 else
665 {
667 errmsg("could not locate required checkpoint record at %X/%08X",
669 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
670 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
671 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
673 wasShutdown = false; /* keep compiler quiet */
674 }
675
676 /* Read the tablespace_map file if present and create symlinks. */
677 if (read_tablespace_map(&tablespaces))
678 {
679 ListCell *lc;
680
681 foreach(lc, tablespaces)
682 {
683 tablespaceinfo *ti = lfirst(lc);
684 char *linkloc;
685
686 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
687
688 /*
689 * Remove the existing symlink, if any, and create the symlink
690 * under PGDATA.
691 */
693
694 if (symlink(ti->path, linkloc) < 0)
697 errmsg("could not create symbolic link \"%s\": %m",
698 linkloc)));
699
700 pfree(ti->path);
701 pfree(ti);
702 }
703
704 /* tell the caller to delete it later */
705 haveTblspcMap = true;
706 }
707
708 /* tell the caller to delete it later */
709 haveBackupLabel = true;
710 }
711 else
712 {
713 /* No backup_label file has been found if we are here. */
714
715 /*
716 * If a tablespace_map file is present without a backup_label file, it
717 * is of no use. There is no harm in retaining it, but it
718 * is better to get rid of the map file so that we don't have any
719 * redundant file in data directory and it will avoid any sort of
720 * confusion. It seems prudent though to just rename the file out of
721 * the way rather than delete it completely, also we ignore any error
722 * that occurs in rename operation as even if map file is present
723 * without backup_label file, it is harmless.
724 */
725 if (stat(TABLESPACE_MAP, &st) == 0)
726 {
727 unlink(TABLESPACE_MAP_OLD);
729 ereport(LOG,
730 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
732 errdetail("File \"%s\" was renamed to \"%s\".",
734 else
735 ereport(LOG,
736 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
738 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
740 }
741
742 /*
743 * It's possible that archive recovery was requested, but we don't
744 * know how far we need to replay the WAL before we reach consistency.
745 * This can happen for example if a base backup is taken from a
746 * running server using an atomic filesystem snapshot, without calling
747 * pg_backup_start/stop. Or if you just kill a running primary server
748 * and put it into archive recovery by creating a recovery signal
749 * file.
750 *
751 * Our strategy in that case is to perform crash recovery first,
752 * replaying all the WAL present in pg_wal, and only enter archive
753 * recovery after that.
754 *
755 * But usually we already know how far we need to replay the WAL (up
756 * to minRecoveryPoint, up to backupEndPoint, or until we see an
757 * end-of-backup record), and we can enter archive recovery directly.
758 */
764 {
765 InArchiveRecovery = true;
768 }
769
770 /*
771 * For the same reason as when starting up with backup_label present,
772 * emit a log message when we continue initializing from a base
773 * backup.
774 */
776 ereport(LOG,
777 errmsg("restarting backup recovery with redo LSN %X/%08X",
779
780 /* Get the last valid checkpoint record. */
787 if (record != NULL)
788 {
790 errmsg_internal("checkpoint record is at %X/%08X",
792 }
793 else
794 {
795 /*
796 * We used to attempt to go back to a secondary checkpoint record
797 * here, but only when not in standby mode. We now just fail if we
798 * can't read the last checkpoint because this allows us to
799 * simplify processing around checkpoints.
800 */
802 errmsg("could not locate a valid checkpoint record at %X/%08X",
804 }
805 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
806 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
807 }
808
810 {
812 ereport(LOG,
813 (errmsg("entering standby mode")));
815 ereport(LOG,
816 (errmsg("starting point-in-time recovery to XID %u",
819 ereport(LOG,
820 (errmsg("starting point-in-time recovery to %s",
823 ereport(LOG,
824 (errmsg("starting point-in-time recovery to \"%s\"",
827 ereport(LOG,
828 errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
831 ereport(LOG,
832 (errmsg("starting point-in-time recovery to earliest consistent point")));
833 else
834 ereport(LOG,
835 (errmsg("starting archive recovery")));
836 }
837
838 /*
839 * If the location of the checkpoint record is not on the expected
840 * timeline in the history of the requested timeline, we cannot proceed:
841 * the backup is not part of the history of the requested timeline.
842 */
843 Assert(expectedTLEs); /* was initialized by reading checkpoint
844 * record */
847 {
848 XLogRecPtr switchpoint;
849
850 /*
851 * tliSwitchPoint will throw an error if the checkpoint's timeline is
852 * not in expectedTLEs at all.
853 */
854 switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
856 (errmsg("requested timeline %u is not a child of this server's history",
858 /* translator: %s is a backup_label file or a pg_control file */
859 errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
860 haveBackupLabel ? "backup_label" : "pg_control",
863 LSN_FORMAT_ARGS(switchpoint))));
864 }
865
866 /*
867 * The min recovery point should be part of the requested timeline's
868 * history, too.
869 */
874 errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
878
880 errmsg_internal("redo record is at %X/%08X; shutdown %s",
881 LSN_FORMAT_ARGS(checkPoint.redo),
882 wasShutdown ? "true" : "false"));
884 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
886 checkPoint.nextOid)));
888 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
889 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
891 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
892 checkPoint.oldestXid, checkPoint.oldestXidDB)));
894 (errmsg_internal("oldest MultiXactId: %u, in database %u",
895 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
897 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
898 checkPoint.oldestCommitTsXid,
899 checkPoint.newestCommitTsXid)));
902 (errmsg("invalid next transaction ID")));
903
904 /* sanity check */
905 if (checkPoint.redo > CheckPointLoc)
907 (errmsg("invalid redo in checkpoint record")));
908
909 /*
910 * Check whether we need to force recovery from WAL. If it appears to
911 * have been a clean shutdown and we did not have a recovery signal file,
912 * then assume no recovery needed.
913 */
914 if (checkPoint.redo < CheckPointLoc)
915 {
916 if (wasShutdown)
918 (errmsg("invalid redo record in shutdown checkpoint")));
919 InRecovery = true;
920 }
921 else if (ControlFile->state != DB_SHUTDOWNED)
922 InRecovery = true;
924 {
925 /* force recovery due to presence of recovery signal file */
926 InRecovery = true;
927 }
928
929 /*
930 * If recovery is needed, update our in-memory copy of pg_control to show
931 * that we are recovering and to show the selected checkpoint as the place
932 * we are starting from. We also mark pg_control with any minimum recovery
933 * stop point obtained from a backup history file.
934 *
935 * We don't write the changes to disk yet, though. Only do that after
936 * initializing various subsystems.
937 */
938 if (InRecovery)
939 {
941 {
943 }
944 else
945 {
946 ereport(LOG,
947 (errmsg("database system was not properly shut down; "
948 "automatic recovery in progress")));
950 ereport(LOG,
951 (errmsg("crash recovery starts in timeline %u "
952 "and has target timeline %u",
956 }
958 ControlFile->checkPointCopy = checkPoint;
960 {
961 /* initialize minRecoveryPoint if not set yet */
962 if (ControlFile->minRecoveryPoint < checkPoint.redo)
963 {
964 ControlFile->minRecoveryPoint = checkPoint.redo;
966 }
967 }
968
969 /*
970 * Set backupStartPoint if we're starting recovery from a base backup.
971 *
972 * Also set backupEndPoint and use minRecoveryPoint as the backup end
973 * location if we're starting recovery from a base backup which was
974 * taken from a standby. In this case, the database system status in
975 * pg_control must indicate that the database was already in recovery.
976 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
977 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
978 * before reaching this point; e.g. because restore_command or
979 * primary_conninfo were faulty.
980 *
981 * Any other state indicates that the backup somehow became corrupted
982 * and we can't sensibly continue with recovery.
983 */
984 if (haveBackupLabel)
985 {
986 ControlFile->backupStartPoint = checkPoint.redo;
988
989 if (backupFromStandby)
990 {
991 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
992 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
994 (errmsg("backup_label contains data inconsistent with control file"),
995 errhint("This means that the backup is corrupted and you will "
996 "have to use another backup for recovery.")));
998 }
999 }
1000 }
1001
1002 /* remember these, so that we know when we have reached consistency */
1007 {
1010 }
1011 else
1012 {
1015 }
1016
1017 /*
1018 * Start recovery assuming that the final record isn't lost.
1019 */
1022
1023 *wasShutdown_ptr = wasShutdown;
1024 *haveBackupLabel_ptr = haveBackupLabel;
1025 *haveTblspcMap_ptr = haveTblspcMap;
1026}
1027
1028/*
1029 * See if there are any recovery signal files and if so, set state for
1030 * recovery.
1031 *
1032 * See if there is a recovery command file (recovery.conf), and if so
1033 * throw a FATAL error, since as of PG12 we no longer recognize that.
1034 */
1035static void
1037{
1038 struct stat stat_buf;
1039
1041 return;
1042
1043 /*
1044 * Check for old recovery API file: recovery.conf
1045 */
1046 if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1047 ereport(FATAL,
1049 errmsg("using recovery command file \"%s\" is not supported",
1051
1052 /*
1053 * Remove unused .done file, if present. Ignore if absent.
1054 */
1055 unlink(RECOVERY_COMMAND_DONE);
1056
1057 /*
1058 * Check for recovery signal files and if found, fsync them since they
1059 * represent server state information. We don't sweat too much about the
1060 * possibility of fsync failure, however.
1061 *
1062 * If present, standby signal file takes precedence. If neither is present
1063 * then we won't enter archive recovery.
1064 */
1065 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1066 {
1067 int fd;
1068
1070 S_IRUSR | S_IWUSR);
1071 if (fd >= 0)
1072 {
1073 (void) pg_fsync(fd);
1074 close(fd);
1075 }
1077 }
1078 else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1079 {
1080 int fd;
1081
1083 S_IRUSR | S_IWUSR);
1084 if (fd >= 0)
1085 {
1086 (void) pg_fsync(fd);
1087 close(fd);
1088 }
1090 }
1091
1092 StandbyModeRequested = false;
1095 {
1096 StandbyModeRequested = true;
1098 }
1100 {
1101 StandbyModeRequested = false;
1103 }
1104 else
1105 return;
1106
1107 /*
1108 * We don't support standby mode in standalone backends; that requires
1109 * other processes such as the WAL receiver to be alive.
1110 */
1112 ereport(FATAL,
1113 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1114 errmsg("standby mode is not supported by single-user servers")));
1115}
1116
1117static void
1119{
1121 return;
1122
1123 /*
1124 * Check for compulsory parameters
1125 */
1127 {
1128 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1129 (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1131 (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1132 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1133 }
1134 else
1135 {
1136 if (recoveryRestoreCommand == NULL ||
1137 strcmp(recoveryRestoreCommand, "") == 0)
1138 ereport(FATAL,
1139 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1140 errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1141 }
1142
1143 /*
1144 * Override any inconsistent requests. Note that this is a change of
1145 * behaviour in 9.5; prior to this we simply ignored a request to pause if
1146 * hot_standby = off, which was surprising behaviour.
1147 */
1151
1152 /*
1153 * Final parsing of recovery_target_time string; see also
1154 * check_recovery_target_time().
1155 */
1157 {
1161 Int32GetDatum(-1)));
1162 }
1163
1164 /*
1165 * If user specified recovery_target_timeline, validate it or compute the
1166 * "latest" value. We can't do this until after we've gotten the restore
1167 * command and set InArchiveRecovery, because we need to fetch timeline
1168 * history files from the archive.
1169 */
1171 {
1173
1174 /* Timeline 1 does not have a history file, all else should */
1175 if (rtli != 1 && !existsTimeLineHistory(rtli))
1176 ereport(FATAL,
1177 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1178 errmsg("recovery target timeline %u does not exist",
1179 rtli)));
1180 recoveryTargetTLI = rtli;
1181 }
1183 {
1184 /* We start the "latest" search from pg_control's timeline */
1186 }
1187 else
1188 {
1189 /*
1190 * else we just use the recoveryTargetTLI as already read from
1191 * ControlFile
1192 */
1194 }
1195}
1196
1197/*
1198 * read_backup_label: check to see if a backup_label file is present
1199 *
1200 * If we see a backup_label during recovery, we assume that we are recovering
1201 * from a backup dump file, and we therefore roll forward from the checkpoint
1202 * identified by the label file, NOT what pg_control says. This avoids the
1203 * problem that pg_control might have been archived one or more checkpoints
1204 * later than the start of the dump, and so if we rely on it as the start
1205 * point, we will fail to restore a consistent database state.
1206 *
1207 * Returns true if a backup_label was found (and fills the checkpoint
1208 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1209 * returns false if not. If this backup_label came from a streamed backup,
1210 * *backupEndRequired is set to true. If this backup_label was created during
1211 * recovery, *backupFromStandby is set to true.
1212 *
1213 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1214 * and TLI read from the backup file.
 *
 * All four output parameters are reset (InvalidXLogRecPtr, 0, false, false)
 * before the file is opened, so they hold defined values even when false is
 * returned.
 *
 * Reports FATAL if the file exists but cannot be opened or read, if the
 * mandatory START WAL LOCATION or CHECKPOINT LOCATION line is malformed, if
 * a START TIMELINE line disagrees with the timeline parsed from the WAL
 * segment file name, or if an INCREMENTAL FROM LSN line is present.
1215 */
1216static bool
1217read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1218 bool *backupEndRequired, bool *backupFromStandby)
1219{
1220 char startxlogfilename[MAXFNAMELEN];
1221 TimeLineID tli_from_walseg,
1222 tli_from_file;
1223 FILE *lfp;
1224 char ch; /* character following a mandatory line; must be '\n' */
1225 char backuptype[20];
1226 char backupfrom[20];
1227 char backuplabel[MAXPGPATH];
1228 char backuptime[128];
1229 uint32 hi,
1230 lo;
1231
1232 /* suppress possible uninitialized-variable warnings */
1233 *checkPointLoc = InvalidXLogRecPtr;
1234 *backupLabelTLI = 0;
1235 *backupEndRequired = false;
1236 *backupFromStandby = false;
1237
1238 /*
1239 * See if label file is present
1240 */
1241 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1242 if (!lfp)
1243 {
1244 if (errno != ENOENT)
1245 ereport(FATAL,
1247 errmsg("could not read file \"%s\": %m",
1249 return false; /* it's not there, all is fine */
1250 }
1251
1252 /*
1253 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1254 * is pretty crude, but we are not expecting any variability in the file
1255 * format).
 *
 * In the START WAL LOCATION line, the first 8 hex digits of the file name
 * are read as the timeline ID and up to 16 following characters are stored
 * in startxlogfilename. In both lines the trailing %c must yield a newline,
 * which rejects lines carrying unexpected trailing text.
1256 */
1257 if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1258 &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1259 ereport(FATAL,
1260 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1261 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1262 RedoStartLSN = ((uint64) hi) << 32 | lo;
1263 RedoStartTLI = tli_from_walseg;
1264 if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1265 &hi, &lo, &ch) != 3 || ch != '\n')
1266 ereport(FATAL,
1267 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1268 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1269 *checkPointLoc = ((uint64) hi) << 32 | lo;
 /* The label's TLI is the one parsed from the WAL segment file name above. */
1270 *backupLabelTLI = tli_from_walseg;
1271
1272 /*
1273 * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1274 * which could mean either pg_basebackup or the pg_backup_start/stop
1275 * method was used) or if this label came from somewhere else (the only
1276 * other option today being from pg_rewind). If this was a streamed
1277 * backup then we know that we need to play through until we get to the
1278 * end of the WAL which was generated during the backup (at which point we
1279 * will have reached consistency and backupEndRequired will be reset to be
1280 * false).
1281 */
1282 if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1283 {
1284 if (strcmp(backuptype, "streamed") == 0)
1285 *backupEndRequired = true;
1286 }
1287
1288 /*
1289 * BACKUP FROM lets us know if this was from a primary or a standby. If
1290 * it was from a standby, we'll double-check that the control file state
1291 * matches that of a standby.
1292 */
1293 if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1294 {
1295 if (strcmp(backupfrom, "standby") == 0)
1296 *backupFromStandby = true;
1297 }
1298
1299 /*
1300 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1301 * but checking for their presence is useful for debugging and the next
1302 * sanity checks. Note that the result buffers have a fixed size, so if
1303 * the backup_label file was generated with strings longer than the
1304 * maximum assumed here, the values are parsed incorrectly. That's fine
1305 * as only minor consistency checks are done
1306 * afterwards.
1307 */
1308 if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1310 (errmsg_internal("backup time %s in file \"%s\"",
1311 backuptime, BACKUP_LABEL_FILE)));
1312
1313 if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1315 (errmsg_internal("backup label %s in file \"%s\"",
1316 backuplabel, BACKUP_LABEL_FILE)));
1317
1318 /*
1319 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1320 * it as a sanity check if present.
1321 */
1322 if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1323 {
1324 if (tli_from_walseg != tli_from_file)
1325 ereport(FATAL,
1326 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1327 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1328 errdetail("Timeline ID parsed is %u, but expected %u.",
1329 tli_from_file, tli_from_walseg)));
1330
1332 (errmsg_internal("backup timeline %u in file \"%s\"",
1333 tli_from_file, BACKUP_LABEL_FILE)));
1334 }
1335
 /*
 * An INCREMENTAL FROM LSN line marks an incremental backup that has not
 * been reconstructed into a data directory; refuse to recover from it.
 * The parsed LSN values themselves are not used.
 */
1336 if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1337 ereport(FATAL,
1338 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1339 errmsg("this is an incremental backup, not a data directory"),
1340 errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1341
 /*
 * A stream error left behind by the reads above, or a nonzero result from
 * FreeFile(), is reported as a failure to read the file.
 */
1342 if (ferror(lfp) || FreeFile(lfp))
1343 ereport(FATAL,
1345 errmsg("could not read file \"%s\": %m",
1347
1348 return true;
1349}
1350
1351/*
1352 * read_tablespace_map: check to see if a tablespace_map file is present
1353 *
1354 * If we see a tablespace_map file during recovery, we assume that we are
1355 * recovering from a backup dump file, and we therefore need to create symlinks
1356 * as per the information present in tablespace_map file.
1357 *
1358 * Returns true if a tablespace_map file was found (and fills *tablespaces
1359 * with a tablespaceinfo struct for each tablespace listed in the file);
1360 * returns false if not.
 *
 * Each parsed entry is appended to *tablespaces with lappend(), so entries
 * already present in the list are kept. Characters that would not fit in
 * the MAXPGPATH-sized line buffer are silently discarded.
 *
 * Reports FATAL if the file exists but cannot be opened or read, if a line
 * is not of the form "<decimal OID> <path>", or if the last line is not
 * terminated by a newline or carriage return.
1361 */
1362static bool
1364{
1365 tablespaceinfo *ti;
1366 FILE *lfp;
1367 char str[MAXPGPATH];
1368 int ch,
1369 i,
1370 n;
1371 bool was_backslash;
1372
1373 /*
1374 * See if tablespace_map file is present
1375 */
1376 lfp = AllocateFile(TABLESPACE_MAP, "r");
1377 if (!lfp)
1378 {
1379 if (errno != ENOENT)
1380 ereport(FATAL,
1382 errmsg("could not read file \"%s\": %m",
1383 TABLESPACE_MAP)));
1384 return false; /* it's not there, all is fine */
1385 }
1386
1387 /*
1388 * Read and parse the link name and path lines from tablespace_map file
1389 * (this code is pretty crude, but we are not expecting any variability in
1390 * the file format). De-escape any backslashes that were inserted.
 *
 * 'i' is the number of de-escaped characters collected in 'str' for the
 * current line; 'was_backslash' is true when the previous character was an
 * unescaped backslash, in which case the current character is taken
 * literally (even if it is a newline, carriage return or backslash).
1391 */
1392 i = 0;
1393 was_backslash = false;
1394 while ((ch = fgetc(lfp)) != EOF)
1395 {
 /* An unescaped newline or carriage return ends the current line. */
1396 if (!was_backslash && (ch == '\n' || ch == '\r'))
1397 {
1398 char *endp;
1399
1400 if (i == 0)
1401 continue; /* empty line, e.g. \r immediately followed by \n */
1402
1403 /*
1404 * The de-escaped line should contain an OID followed by exactly
1405 * one space followed by a path. The path might start with
1406 * spaces, so don't be too liberal about parsing.
1407 */
1408 str[i] = '\0';
1409 n = 0;
1410 while (str[n] && str[n] != ' ')
1411 n++;
 /*
 * Reject an empty OID field (n < 1) and a missing or empty path (no
 * space found, or the space is the last character of the line).
 */
1412 if (n < 1 || n >= i - 1)
1413 ereport(FATAL,
1414 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1415 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
 /* Split in place: str is now the OID text, str + n is the path. */
1416 str[n++] = '\0';
1417
1418 ti = palloc0(sizeof(tablespaceinfo));
1419 errno = 0;
1420 ti->oid = strtoul(str, &endp, 10);
 /* The whole OID field must have been consumed as a decimal number. */
1421 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1422 ereport(FATAL,
1423 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1424 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1425 ti->path = pstrdup(str + n);
1426 *tablespaces = lappend(*tablespaces, ti);
1427
 /* Start collecting the next line. */
1428 i = 0;
1429 continue;
1430 }
1431 else if (!was_backslash && ch == '\\')
 /* Start of an escape: drop the backslash itself. */
1432 was_backslash = true;
1433 else
1434 {
 /*
 * Ordinary or escaped character: store it, dropping anything that
 * would overflow str (one byte is reserved for the terminator).
 */
1435 if (i < sizeof(str) - 1)
1436 str[i++] = ch;
1437 was_backslash = false;
1438 }
1439 }
1440
1441 if (i != 0 || was_backslash) /* last line not terminated? */
1442 ereport(FATAL,
1443 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1444 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1445
 /*
 * A stream error left behind by fgetc(), or a nonzero result from
 * FreeFile(), is reported as a failure to read the file.
 */
1446 if (ferror(lfp) || FreeFile(lfp))
1447 ereport(FATAL,
1449 errmsg("could not read file \"%s\": %m",
1450 TABLESPACE_MAP)));
1451
1452 return true;
1453}
1454
1455/*
1456 * Finish WAL recovery.
1457 *
1458 * This does not close the 'xlogreader' yet, because in some cases the caller
1459 * still wants to re-read the last checkpoint record by calling
1460 * ReadCheckpointRecord().
1461 *
1462 * Returns the position of the last valid or applied record, after which new
1463 * WAL should be appended, information about why recovery was ended, and some
1464 * other things. See the EndOfWalRecoveryInfo struct for details.
1465 */
1468{
1470 XLogRecPtr lastRec;
1471 TimeLineID lastRecTLI;
1472 XLogRecPtr endOfLog;
1473
1474 /*
1475 * Kill WAL receiver, if it's still running, before we continue to write
1476 * the startup checkpoint and aborted-contrecord records. Otherwise it
1477 * could trample over these records and subsequent ones if it's still
1478 * alive when we start writing WAL.
1479 */
1481
1482 /*
1483 * Shut down the slot sync worker to drop any temporary slots acquired by
1484 * it and to prevent it from continuing to try to fetch the failover slots.
1485 *
1486 * We do not update the 'synced' column in 'pg_replication_slots' system
1487 * view from true to false here, as any failed update could leave 'synced'
1488 * column false for some slots. This could cause issues during slot sync
1489 * after restarting the server as a standby. While updating the 'synced'
1490 * column after switching to the new timeline is an option, it does not
1491 * simplify the handling for the 'synced' column. Therefore, we retain the
1492 * 'synced' column as true after promotion as it may provide useful
1493 * information about the slot origin.
1494 */
1496
1497 /*
1498 * We are now done reading the xlog from stream. Turn off streaming
1499 * recovery to force fetching the files (which would be required at end of
1500 * recovery, e.g., timeline history file) from archive or pg_wal.
1501 *
1502 * Note that standby mode must be turned off after killing WAL receiver,
1503 * i.e., calling XLogShutdownWalRcv().
1504 */
1506 StandbyMode = false;
1507
1508 /*
1509 * Determine where to start writing WAL next.
1510 *
1511 * Re-fetch the last valid or last applied record, so we can identify the
1512 * exact endpoint of what we consider the valid portion of WAL. There may
1513 * be an incomplete continuation record after that, in which case
1514 * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1515 * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1516 * it is intentionally missing. See CreateOverwriteContrecordRecord().
1517 *
1518 * An important side-effect of this is to load the last page into
1519 * xlogreader. The caller uses it to initialize the WAL for writing.
1520 */
1521 if (!InRecovery)
1522 {
1523 lastRec = CheckPointLoc;
1524 lastRecTLI = CheckPointTLI;
1525 }
1526 else
1527 {
1529 lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1530 }
1532 (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1533 endOfLog = xlogreader->EndRecPtr;
1534
1535 /*
1536 * Remember the TLI in the filename of the XLOG segment containing the
1537 * end-of-log. It could be different from the timeline that endOfLog
1538 * nominally belongs to, if there was a timeline switch in that segment,
1539 * and we were reading the old WAL from a segment belonging to a higher
1540 * timeline.
1541 */
1542 result->endOfLogTLI = xlogreader->seg.ws_tli;
1543
1545 {
1546 /*
1547 * We are no longer in archive recovery state.
1548 *
1549 * We are now done reading the old WAL. Turn off archive fetching if
1550 * it was active.
1551 */
1553 InArchiveRecovery = false;
1554
1555 /*
1556 * If the ending log segment is still open, close it (to avoid
1557 * problems on Windows with trying to rename or delete an open file).
1558 */
1559 if (readFile >= 0)
1560 {
1561 close(readFile);
1562 readFile = -1;
1563 }
1564 }
1565
1566 /*
1567 * Copy the last partial block to the caller, for initializing the WAL
1568 * buffer for appending new WAL.
1569 */
1570 if (endOfLog % XLOG_BLCKSZ != 0)
1571 {
1572 char *page;
1573 int len;
1574 XLogRecPtr pageBeginPtr;
1575
1576 pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1578
1579 /* Copy the valid part of the last block */
1580 len = endOfLog % XLOG_BLCKSZ;
1581 page = palloc(len);
1582 memcpy(page, xlogreader->readBuf, len);
1583
1584 result->lastPageBeginPtr = pageBeginPtr;
1585 result->lastPage = page;
1586 }
1587 else
1588 {
1589 /* There is no partial block to copy. */
1590 result->lastPageBeginPtr = endOfLog;
1591 result->lastPage = NULL;
1592 }
1593
1594 /*
1595 * Create a comment for the history file to explain why and where timeline
1596 * changed.
1597 */
1599
1600 result->lastRec = lastRec;
1601 result->lastRecTLI = lastRecTLI;
1602 result->endOfLog = endOfLog;
1603
1604 result->abortedRecPtr = abortedRecPtr;
1606
1609
1610 return result;
1611}
1612
1613/*
1614 * Clean up the WAL reader and leftovers from restoring WAL from archive
1615 */
1616void
1618{
1619 char recoveryPath[MAXPGPATH];
1620
1621 /* Final update of pg_stat_recovery_prefetch. */
1623
1624 /* Shut down xlogreader */
1625 if (readFile >= 0)
1626 {
1627 close(readFile);
1628 readFile = -1;
1629 }
1633
1635 {
1636 /*
1637 * Since there might be a partial WAL segment named RECOVERYXLOG, get
1638 * rid of it.
1639 */
1640 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1641 unlink(recoveryPath); /* ignore any error */
1642
1643 /* Get rid of any remaining recovered timeline-history file, too */
1644 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1645 unlink(recoveryPath); /* ignore any error */
1646 }
1647
1648 /*
1649 * We don't need the latch anymore. It's not strictly necessary to disown
1650 * it, but let's do it for the sake of tidiness.
1651 */
1654}
1655
1656/*
1657 * Perform WAL recovery.
1658 *
1659 * If the system was shut down cleanly, this is never called.
1660 */
1661void
1663{
1664 XLogRecord *record;
1665 bool reachedRecoveryTarget = false;
1666 TimeLineID replayTLI;
1667
1668 /*
1669 * Initialize shared variables for tracking progress of WAL replay, as if
1670 * we had just replayed the record before the REDO location (or the
1671 * checkpoint record itself, if it's a shutdown checkpoint).
1672 */
1675 {
1679 }
1680 else
1681 {
1685 }
1692
1693 /* Also ensure XLogReceiptTime has a sane value */
1695
1696 /*
1697 * Let postmaster know we've started redo now, so that it can launch the
1698 * archiver if necessary.
1699 */
1702
1703 /*
1704 * Allow read-only connections immediately if we're consistent already.
1705 */
1707
1708 /*
1709 * Find the first record that logically follows the checkpoint --- it
1710 * might physically precede it, though.
1711 */
1713 {
1714 /* back up to find the record */
1715 replayTLI = RedoStartTLI;
1717 record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1718
1719 /*
1720 * If a checkpoint record's redo pointer points back to an earlier
1721 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1722 * record.
1723 */
1724 if (record->xl_rmid != RM_XLOG_ID ||
1726 ereport(FATAL,
1727 errmsg("unexpected record type found at redo point %X/%08X",
1729 }
1730 else
1731 {
1732 /* just have to read next record after CheckPoint */
1734 replayTLI = CheckPointTLI;
1735 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1736 }
1737
1738 if (record != NULL)
1739 {
1740 TimestampTz xtime;
1741 PGRUsage ru0;
1742
1743 pg_rusage_init(&ru0);
1744
1745 InRedo = true;
1746
1747 RmgrStartup();
1748
1749 ereport(LOG,
1750 errmsg("redo starts at %X/%08X",
1752
1753 /* Prepare to report progress of the redo phase. */
1754 if (!StandbyMode)
1756
1757 /*
1758 * main redo apply loop
1759 */
1760 do
1761 {
1762 if (!StandbyMode)
1763 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1765
1766#ifdef WAL_DEBUG
1767 if (XLOG_DEBUG)
1768 {
1770
1772 appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1775 xlog_outrec(&buf, xlogreader);
1776 appendStringInfoString(&buf, " - ");
1778 elog(LOG, "%s", buf.data);
1779 pfree(buf.data);
1780 }
1781#endif
1782
1783 /* Handle interrupt signals of startup process */
1785
1786 /*
1787 * Pause WAL replay, if requested by a hot-standby session via
1788 * SetRecoveryPause().
1789 *
1790 * Note that we intentionally don't take the info_lck spinlock
1791 * here. We might therefore read a slightly stale value of the
1792 * recoveryPause flag, but it can't be very stale (no worse than
1793 * the last spinlock we did acquire). Since a pause request is a
1794 * pretty asynchronous thing anyway, possibly responding to it one
1795 * WAL record later than we otherwise would is a minor issue, so
1796 * it doesn't seem worth adding another spinlock cycle to prevent
1797 * that.
1798 */
1799 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1801 recoveryPausesHere(false);
1802
1803 /*
1804 * Have we reached our recovery target?
1805 */
1807 {
1808 reachedRecoveryTarget = true;
1809 break;
1810 }
1811
1812 /*
1813 * If we've been asked to lag the primary, wait on latch until
1814 * enough time has passed.
1815 */
1817 {
1818 /*
1819 * We test for paused recovery again here. If user sets
1820 * delayed apply, it may be because they expect to pause
1821 * recovery in case of problems, so we must test again here
1822 * otherwise pausing during the delay-wait wouldn't work.
1823 */
1824 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1826 recoveryPausesHere(false);
1827 }
1828
1829 /*
1830 * Apply the record
1831 */
1832 ApplyWalRecord(xlogreader, record, &replayTLI);
1833
1834 /* Exit loop if we reached inclusive recovery target */
1836 {
1837 reachedRecoveryTarget = true;
1838 break;
1839 }
1840
1841 /* Else, try to fetch the next WAL record */
1842 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1843 } while (record != NULL);
1844
1845 /*
1846 * end of main redo apply loop
1847 */
1848
1849 if (reachedRecoveryTarget)
1850 {
1851 if (!reachedConsistency)
1852 ereport(FATAL,
1853 (errmsg("requested recovery stop point is before consistent recovery point")));
1854
1855 /*
1856 * This is the last point where we can restart recovery with a new
1857 * recovery target, if we shut down and begin again. After this,
1858 * Resource Managers may choose to do permanent corrective actions
1859 * at end of recovery.
1860 */
1861 switch (recoveryTargetAction)
1862 {
1864
1865 /*
1866 * exit with special return code to request shutdown of
1867 * postmaster. Log messages issued from postmaster.
1868 */
1869 proc_exit(3);
1870
1872 SetRecoveryPause(true);
1873 recoveryPausesHere(true);
1874
1875 /* drop into promote */
1876
1878 break;
1879 }
1880 }
1881
1882 RmgrCleanup();
1883
1884 ereport(LOG,
1885 errmsg("redo done at %X/%08X system usage: %s",
1887 pg_rusage_show(&ru0)));
1888 xtime = GetLatestXTime();
1889 if (xtime)
1890 ereport(LOG,
1891 (errmsg("last completed transaction was at log time %s",
1892 timestamptz_to_str(xtime))));
1893
1894 InRedo = false;
1895 }
1896 else
1897 {
1898 /* there are no WAL records following the checkpoint */
1899 ereport(LOG,
1900 (errmsg("redo is not required")));
1901 }
1902
1903 /*
1904 * This check is intentionally after the above log messages that indicate
1905 * how far recovery went.
1906 */
1909 !reachedRecoveryTarget)
1910 ereport(FATAL,
1911 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1912 errmsg("recovery ended before configured recovery target was reached")));
1913}
1914
1915/*
1916 * Subroutine of PerformWalRecovery, to apply one WAL record.
1917 */
1918static void
1920{
1921 ErrorContextCallback errcallback;
1922 bool switchedTLI = false;
1923
1924 /* Setup error traceback support for ereport() */
1925 errcallback.callback = rm_redo_error_callback;
1926 errcallback.arg = xlogreader;
1927 errcallback.previous = error_context_stack;
1928 error_context_stack = &errcallback;
1929
1930 /*
1931 * TransamVariables->nextXid must be beyond record's xid.
1932 */
1934
1935 /*
1936 * Before replaying this record, check if this record causes the current
1937 * timeline to change. The record is already considered to be part of the
1938 * new timeline, so we update replayTLI before replaying it. That's
1939 * important so that replayEndTLI, which is recorded as the minimum
1940 * recovery point's TLI if recovery stops after this record, is set
1941 * correctly.
1942 */
1943 if (record->xl_rmid == RM_XLOG_ID)
1944 {
1945 TimeLineID newReplayTLI = *replayTLI;
1946 TimeLineID prevReplayTLI = *replayTLI;
1947 uint8 info = record->xl_info & ~XLR_INFO_MASK;
1948
1949 if (info == XLOG_CHECKPOINT_SHUTDOWN)
1950 {
1951 CheckPoint checkPoint;
1952
1953 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1954 newReplayTLI = checkPoint.ThisTimeLineID;
1955 prevReplayTLI = checkPoint.PrevTimeLineID;
1956 }
1957 else if (info == XLOG_END_OF_RECOVERY)
1958 {
1959 xl_end_of_recovery xlrec;
1960
1961 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1962 newReplayTLI = xlrec.ThisTimeLineID;
1963 prevReplayTLI = xlrec.PrevTimeLineID;
1964 }
1965
1966 if (newReplayTLI != *replayTLI)
1967 {
1968 /* Check that it's OK to switch to this TLI */
1970 newReplayTLI, prevReplayTLI, *replayTLI);
1971
1972 /* Following WAL records should be run with new TLI */
1973 *replayTLI = newReplayTLI;
1974 switchedTLI = true;
1975 }
1976 }
1977
1978 /*
1979 * Update shared replayEndRecPtr before replaying this record, so that
1980 * XLogFlush will update minRecoveryPoint correctly.
1981 */
1984 XLogRecoveryCtl->replayEndTLI = *replayTLI;
1986
1987 /*
1988 * If we are attempting to enter Hot Standby mode, process XIDs we see
1989 */
1993
1994 /*
1995 * Some XLOG record types that are related to recovery are processed
1996 * directly here, rather than in xlog_redo()
1997 */
1998 if (record->xl_rmid == RM_XLOG_ID)
1999 xlogrecovery_redo(xlogreader, *replayTLI);
2000
2001 /* Now apply the WAL record itself */
2003
2004 /*
2005 * After redo, check whether the backup pages associated with the WAL
2006 * record are consistent with the existing pages. This check is done only
2007 * if consistency check is enabled for this record.
2008 */
2009 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2011
2012 /* Pop the error context stack */
2013 error_context_stack = errcallback.previous;
2014
2015 /*
2016 * Update lastReplayedEndRecPtr after this record has been successfully
2017 * replayed.
2018 */
2022 XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2024
2025 /* ------
2026 * Wakeup walsenders:
2027 *
2028 * On the standby, the WAL is flushed first (which will only wake up
2029 * physical walsenders) and then applied, which will only wake up logical
2030 * walsenders.
2031 *
2032 * Indeed, logical walsenders on standby can't decode and send data until
2033 * it's been applied.
2034 *
2035 * Physical walsenders don't need to be woken up during replay unless
2036 * cascading replication is allowed and a timeline change occurred (so that
2037 * they can notice that they are on a new timeline).
2038 *
2039 * That's why the wake up conditions are for:
2040 *
2041 * - physical walsenders in case of a new timeline and cascade
2042 * replication is allowed
2043 * - logical walsenders in case cascade replication is allowed (could not
2044 * be created otherwise)
2045 * ------
2046 */
2048 WalSndWakeup(switchedTLI, true);
2049
2050 /*
2051 * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2052 * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2053 * a reply to the primary.
2054 */
2056 {
2059 }
2060
2061 /* Allow read-only connections if we're consistent now */
2063
2064 /* Is this a timeline switch? */
2065 if (switchedTLI)
2066 {
2067 /*
2068 * Before we continue on the new timeline, clean up any (possibly
2069 * bogus) future WAL segments on the old timeline.
2070 */
2072
2073 /* Reset the prefetcher. */
2075 }
2076}
2077
2078/*
2079 * Some XLOG RM record types that are directly related to WAL recovery are
2080 * handled here rather than in xlog_redo().
 *
 * Only XLOG_OVERWRITE_CONTRECORD and XLOG_BACKUP_END are acted upon; any
 * other record type is ignored here. The record must belong to RM_XLOG_ID
 * (asserted).
2081 */
2082static void
2084{
2085 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2086 XLogRecPtr lsn = record->EndRecPtr; /* used as backupEndPoint below */
2087
2088 Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2089
2090 if (info == XLOG_OVERWRITE_CONTRECORD)
2091 {
2092 /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2094
2095 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
 /*
 * The LSN stored in the record payload must match the one noted by the
 * reader in record->overwrittenRecPtr; a mismatch is FATAL.
 */
2096 if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2097 elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2100
2101 /* We have safely skipped the aborted record */
2104
2105 ereport(LOG,
2106 errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2109
2110 /* Verifying the record should only happen once */
2112 }
2113 else if (info == XLOG_BACKUP_END)
2114 {
2115 XLogRecPtr startpoint;
2116
 /*
 * The payload is an LSN; per the messages below it is the start point
 * of the backup that this record terminates.
 */
2117 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2118
2119 if (backupStartPoint == startpoint)
2120 {
2121 /*
2122 * We have reached the end of base backup, the point where
2123 * pg_backup_stop() was done. The data on disk is now consistent
2124 * (assuming we have also reached minRecoveryPoint). Set
2125 * backupEndPoint to the current LSN, so that the next call to
2126 * CheckRecoveryConsistency() will notice it and do the
2127 * end-of-backup processing.
2128 */
2129 elog(DEBUG1, "end of backup record reached");
2130
2131 backupEndPoint = lsn;
2132 }
2133 else
 /*
 * The record's start point differs from backupStartPoint: only log it
 * at DEBUG1 and leave backupEndPoint untouched.
 */
2134 elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2136 }
2137}
2138
2139/*
2140 * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2141 * directories.
2142 *
2143 * Replay of database creation XLOG records for databases that were later
2144 * dropped can create fake directories in pg_tblspc. By the time consistency
2145 * is reached these directories should have been removed; here we verify
2146 * that this did indeed happen. This is to be called at the point where
2147 * consistent state is reached.
2148 *
2149 * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2150 * useful for testing purposes, and also allows for an escape hatch in case
2151 * things go south.
2152 */
2153static void
2155{
2156 DIR *dir;
2157 struct dirent *de;
2158
2160 while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2161 {
2162 char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2163
2164 /* Skip entries of non-oid names */
2165 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2166 continue;
2167
2168 snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2169
2170 if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2173 errmsg("unexpected directory entry \"%s\" found in %s",
2174 de->d_name, PG_TBLSPC_DIR),
2175 errdetail("All directory entries in %s/ should be symbolic links.",
2177 errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2178 }
2179}
2180
2181/*
2182 * Checks if recovery has reached a consistent state. When consistency is
2183 * reached and we have a valid starting standby snapshot, tell postmaster
2184 * that it can start accepting read-only connections.
2185 */
2186static void
2188{
2189 XLogRecPtr lastReplayedEndRecPtr;
2190 TimeLineID lastReplayedTLI;
2191
2192 /*
2193 * During crash recovery, we don't reach a consistent state until we've
2194 * replayed all the WAL.
2195 */
2197 return;
2198
2200
2201 /*
2202 * assume that we are called in the startup process, and hence don't need
2203 * a lock to read lastReplayedEndRecPtr
2204 */
2205 lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2206 lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2207
2208 /*
2209 * Have we reached the point where our base backup was completed?
2210 */
2212 backupEndPoint <= lastReplayedEndRecPtr)
2213 {
2214 XLogRecPtr saveBackupStartPoint = backupStartPoint;
2215 XLogRecPtr saveBackupEndPoint = backupEndPoint;
2216
2217 elog(DEBUG1, "end of backup reached");
2218
2219 /*
2220 * We have reached the end of base backup, as indicated by pg_control.
2221 * Update the control file accordingly.
2222 */
2223 ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2226 backupEndRequired = false;
2227
2228 ereport(LOG,
2229 errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2230 LSN_FORMAT_ARGS(saveBackupStartPoint),
2231 LSN_FORMAT_ARGS(saveBackupEndPoint)));
2232 }
2233
2234 /*
2235 * Have we passed our safe starting point? Note that minRecoveryPoint is
2236 * known to be incorrectly set if recovering from a backup, until the
2237 * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2238 * All we know prior to that is that we're not consistent yet.
2239 */
2241 minRecoveryPoint <= lastReplayedEndRecPtr)
2242 {
2243 /*
2244 * Check to see if the XLOG sequence contained any unresolved
2245 * references to uninitialized pages.
2246 */
2248
2249 /*
2250 * Check that pg_tblspc doesn't contain any real directories. Replay
2251 * of Database/CREATE_* records may have created fictitious tablespace
2252 * directories that should have been removed by the time consistency
2253 * was reached.
2254 */
2256
2257 reachedConsistency = true;
2259 ereport(LOG,
2260 errmsg("consistent recovery state reached at %X/%08X",
2261 LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2262 }
2263
2264 /*
2265 * Have we got a valid starting snapshot that will allow queries to be
2266 * run? If so, we can tell postmaster that the database is consistent now,
2267 * enabling connections.
2268 */
2273 {
2277
2278 LocalHotStandbyActive = true;
2279
2281 }
2282}
2283
2284/*
2285 * Error context callback for errors occurring during rm_redo().
2286 */
2287static void
2289{
2290 XLogReaderState *record = (XLogReaderState *) arg;
2292
2294 xlog_outdesc(&buf, record);
2295 xlog_block_info(&buf, record);
2296
2297 /* translator: %s is a WAL record description */
2298 errcontext("WAL redo at %X/%08X for %s",
2299 LSN_FORMAT_ARGS(record->ReadRecPtr),
2300 buf.data);
2301
2302 pfree(buf.data);
2303}
2304
2305/*
2306 * Appends to buf a description of an XLogRecord: its identity, a colon and
2307 * a space, followed by whatever the rmgr's rm_desc callback adds.
2308 */
2309void
2311{
2312 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2313 uint8 info = XLogRecGetInfo(record);
2314 const char *id;
2315
2318
     /*
      * rm_identify() yields the identity string for these info bits. If it
      * returns NULL, print the info bits outside XLR_INFO_MASK in hex
      * instead.
      */
2319 id = rmgr.rm_identify(info);
2320 if (id == NULL)
2321 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2322 else
2323 appendStringInfo(buf, "%s: ", id);
2324
     /* Let the resource manager append its own description of the record. */
2325 rmgr.rm_desc(buf, record);
2326}
2327
2328#ifdef WAL_DEBUG
2329
2330static void
2331xlog_outrec(StringInfo buf, XLogReaderState *record)
2332{
2333 appendStringInfo(buf, "prev %X/%08X; xid %u",
2335 XLogRecGetXid(record));
2336
2337 appendStringInfo(buf, "; len %u",
2338 XLogRecGetDataLen(record));
2339
2340 xlog_block_info(buf, record);
2341}
2342#endif /* WAL_DEBUG */
2343
2344/*
2345 * Appends to buf a "; blkref #N: ..." entry for each block reference in an
2346 * XLogRecord. Nothing is returned; the text is added to the end of buf.
2347 */
2348static void
2350{
2351 int block_id;
2352
2353 /* decode block references */
2354 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2355 {
2356 RelFileLocator rlocator;
2357 ForkNumber forknum;
2358 BlockNumber blk;
2359
     /* Skip block ids for which the record yields no block tag. */
2360 if (!XLogRecGetBlockTagExtended(record, block_id,
2361 &rlocator, &forknum, &blk, NULL))
2362 continue;
2363
     /* The fork number is printed only when it is not the main fork. */
2364 if (forknum != MAIN_FORKNUM)
2365 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2366 block_id,
2367 rlocator.spcOid, rlocator.dbOid,
2368 rlocator.relNumber,
2369 forknum,
2370 blk);
2371 else
2372 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2373 block_id,
2374 rlocator.spcOid, rlocator.dbOid,
2375 rlocator.relNumber,
2376 blk);
     /* Flag block references that carry a block image. */
2377 if (XLogRecHasBlockImage(record, block_id))
2378 appendStringInfoString(buf, " FPW");
2379 }
2380}
2381
2382
2383/*
2384 * Check that it's OK to switch to a new timeline during recovery.
2385 *
2386 * 'lsn' is the address of the shutdown checkpoint record we're about to
2387 * replay. (Currently, the timeline can only change at a shutdown checkpoint.)
     *
     * Every failed check below is reported with ereport(PANIC); the function
     * falls through to its end only when all of them pass.
2388 */
2389static void
2391 TimeLineID replayTLI)
2392{
2393 /* Check that the record agrees on what the current (old) timeline is */
2394 if (prevTLI != replayTLI)
2395 ereport(PANIC,
2396 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2397 prevTLI, replayTLI)));
2398
2399 /*
2400 * The new timeline had better be in the list of timelines we expect to
2401 * see, according to the timeline history. It should also not decrease.
2402 */
2403 if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2404 ereport(PANIC,
2405 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2406 newTLI, replayTLI)));
2407
2408 /*
2409 * If we have not yet reached the min recovery point, and we're about to
2410 * switch to a timeline greater than the timeline of the min recovery
2411 * point: trouble. After switching to the new timeline, we could not
2412 * possibly visit the min recovery point on the correct timeline anymore.
2413 * This can happen if there is a newer timeline in the archive that
2414 * branched before the timeline the min recovery point is on, and you
2415 * attempt to do PITR to the new timeline.
2416 */
2418 lsn < minRecoveryPoint &&
2419 newTLI > minRecoveryPointTLI)
2420 ereport(PANIC,
2421 errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2422 newTLI,
2425
2426 /* Looks good */
2427}
2428
2429
2430/*
2431 * Extract the timestamp from a WAL record.
2432 *
2433 * If the record contains a timestamp, returns true, and saves the timestamp
2434 * in *recordXtime. If the record type has no timestamp, returns false and
2435 * leaves *recordXtime untouched. Currently, only transaction commit/abort
2436 * records (including their PREPARED variants) and restore points contain
     * timestamps.
2437 */
2438static bool
2440{
     /*
      * 'info' is the record's info byte with the XLR_INFO_MASK bits cleared.
      * 'xact_info' is computed unconditionally but is only consulted below
      * when the record belongs to RM_XACT_ID.
      */
2441 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2442 uint8 xact_info = info & XLOG_XACT_OPMASK;
2443 uint8 rmid = XLogRecGetRmid(record);
2444
     /* Restore point: the timestamp is the rp_time field of the record data. */
2445 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2446 {
2447 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2448 return true;
2449 }
     /* Commit or commit-prepared: xact_time of the commit record. */
2450 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2451 xact_info == XLOG_XACT_COMMIT_PREPARED))
2452 {
2453 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2454 return true;
2455 }
     /* Abort or abort-prepared: xact_time of the abort record. */
2456 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2457 xact_info == XLOG_XACT_ABORT_PREPARED))
2458 {
2459 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2460 return true;
2461 }
     /* Any other record type carries no timestamp. */
2462 return false;
2463}
2464
2465/*
2466 * Checks whether the current buffer page and the backup page stored in the
2467 * WAL record are consistent. Before comparing the two pages, a mask
2468 * can be applied to the pages to ignore certain areas like hint bits and
2469 * unused space between pd_lower and pd_upper, among other things. This
2470 * function should be called once WAL replay has been completed for a
2471 * given record.
     *
     * Block references are skipped when the record has no tag for the block
     * id, when the block image was itself applied during replay, or when no
     * valid buffer is obtained for the block. A failure to restore the block
     * image from the record is reported with ereport(ERROR); a mismatch
     * between the two masked images is reported with elog(FATAL).
2472 */
2473static void
2475{
2476 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2477 RelFileLocator rlocator;
2478 ForkNumber forknum;
2479 BlockNumber blkno;
2480 int block_id;
2481
2482 /* Records with no backup blocks have no need for consistency checks. */
2483 if (!XLogRecHasAnyBlockRefs(record))
2484 return;
2485
2487
2488 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2489 {
2490 Buffer buf;
2491 Page page;
2492
2493 if (!XLogRecGetBlockTagExtended(record, block_id,
2494 &rlocator, &forknum, &blkno, NULL))
2495 {
2496 /*
2497 * WAL record doesn't contain a block reference with the given id.
2498 * Do nothing.
2499 */
2500 continue;
2501 }
2502
2503 Assert(XLogRecHasBlockImage(record, block_id));
2504
2505 if (XLogRecBlockImageApply(record, block_id))
2506 {
2507 /*
2508 * WAL record has already applied the page, so bypass the
2509 * consistency check as that would result in comparing the full
2510 * page stored in the record with itself.
2511 */
2512 continue;
2513 }
2514
2515 /*
2516 * Read the contents from the current buffer and store them in a
2517 * temporary page.
2518 */
2519 buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2522 if (!BufferIsValid(buf))
2523 continue;
2524
2526 page = BufferGetPage(buf);
2527
2528 /*
2529 * Take a copy of the local page where WAL has been applied to have a
2530 * comparison base before masking it...
2531 */
2532 memcpy(replay_image_masked, page, BLCKSZ);
2533
2534 /* No need for this page anymore now that a copy is in. */
2536
2537 /*
2538 * If the block LSN is already ahead of this WAL record, we can't
2539 * expect contents to match. This can happen if recovery is
2540 * restarted.
2541 */
2543 continue;
2544
2545 /*
2546 * Read the contents from the backup copy, stored in WAL record and
2547 * store it in a temporary page. There is no need to allocate a new
2548 * page here, a local buffer is fine to hold its contents and a mask
2549 * can be directly applied on it.
2550 */
2551 if (!RestoreBlockImage(record, block_id, primary_image_masked))
2552 ereport(ERROR,
2553 (errcode(ERRCODE_INTERNAL_ERROR),
2554 errmsg_internal("%s", record->errormsg_buf)));
2555
2556 /*
2557 * If a masking function is defined, mask both the primary and replay
2558 * images.
2559 */
2560 if (rmgr.rm_mask != NULL)
2561 {
2562 rmgr.rm_mask(replay_image_masked, blkno);
2563 rmgr.rm_mask(primary_image_masked, blkno);
2564 }
2565
2566 /* Time to compare the primary and replay images. */
2567 if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2568 {
2569 elog(FATAL,
2570 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2571 rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2572 forknum, blkno);
2573 }
2574 }
2575}
2576
2577/*
2578 * For point-in-time recovery, this function decides whether we want to
2579 * stop applying the XLOG before the current record.
2580 *
2581 * Returns true if we are stopping, false otherwise. If stopping, some
2582 * information is saved in recoveryStopXid et al for use in annotating the
2583 * new timeline's history file.
     *
     * After the consistency and LSN checks, only transaction commit and abort
     * records (including their PREPARED variants) are examined; for any other
     * record the function returns false.
2584 */
2585static bool
2587{
2588 bool stopsHere = false;
2589 uint8 xact_info;
2590 bool isCommit;
2591 TimestampTz recordXtime = 0;
2592 TransactionId recordXid;
2593
2594 /*
2595 * Ignore recovery target settings when not in archive recovery (meaning
2596 * we are in crash recovery).
2597 */
2599 return false;
2600
2601 /* Check if we should stop as soon as reaching consistency */
2603 {
2604 ereport(LOG,
2605 (errmsg("recovery stopping after reaching consistency")));
2606
2607 recoveryStopAfter = false;
2610 recoveryStopTime = 0;
2611 recoveryStopName[0] = '\0';
2612 return true;
2613 }
2614
2615 /* Check if target LSN has been reached */
2618 record->ReadRecPtr >= recoveryTargetLSN)
2619 {
2620 recoveryStopAfter = false;
2622 recoveryStopLSN = record->ReadRecPtr;
2623 recoveryStopTime = 0;
2624 recoveryStopName[0] = '\0';
2625 ereport(LOG,
2626 errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2628 return true;
2629 }
2630
2631 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2632 if (XLogRecGetRmid(record) != RM_XACT_ID)
2633 return false;
2634
2635 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2636
     /*
      * Work out whether this is a commit or an abort, and which transaction
      * it ends. For the PREPARED variants the XID is taken from the parsed
      * record's twophase_xid rather than from the record header.
      */
2637 if (xact_info == XLOG_XACT_COMMIT)
2638 {
2639 isCommit = true;
2640 recordXid = XLogRecGetXid(record);
2641 }
2642 else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2643 {
2644 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2645 xl_xact_parsed_commit parsed;
2646
2647 isCommit = true;
2649 xlrec,
2650 &parsed);
2651 recordXid = parsed.twophase_xid;
2652 }
2653 else if (xact_info == XLOG_XACT_ABORT)
2654 {
2655 isCommit = false;
2656 recordXid = XLogRecGetXid(record);
2657 }
2658 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2659 {
2660 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2661 xl_xact_parsed_abort parsed;
2662
2663 isCommit = false;
2665 xlrec,
2666 &parsed);
2667 recordXid = parsed.twophase_xid;
2668 }
2669 else
2670 return false;
2671
2673 {
2674 /*
2675 * There can be only one transaction end record with this exact
2676 * transaction ID.
2677 *
2678 * When testing for an xid, we MUST test for equality only, since
2679 * transactions are numbered in the order they start, not the order
2680 * they complete. A higher-numbered xid will complete before the target
2681 * xid about 50% of the time...
2682 */
2683 stopsHere = (recordXid == recoveryTargetXid);
2684 }
2685
2686 /*
2687 * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2688 * We don't expect getRecordTimestamp ever to fail, since we already know
2689 * this is a commit or abort record; but test its result anyway.
2690 */
2691 if (getRecordTimestamp(record, &recordXtime) &&
2693 {
2694 /*
2695 * There can be many transactions that share the same commit time, so
2696 * we stop after the last one, if we are inclusive, or stop at the
2697 * first one if we are exclusive
2698 */
2700 stopsHere = (recordXtime > recoveryTargetTime);
2701 else
2702 stopsHere = (recordXtime >= recoveryTargetTime);
2703 }
2704
2705 if (stopsHere)
2706 {
2707 recoveryStopAfter = false;
2708 recoveryStopXid = recordXid;
2709 recoveryStopTime = recordXtime;
2711 recoveryStopName[0] = '\0';
2712
2713 if (isCommit)
2714 {
2715 ereport(LOG,
2716 (errmsg("recovery stopping before commit of transaction %u, time %s",
2719 }
2720 else
2721 {
2722 ereport(LOG,
2723 (errmsg("recovery stopping before abort of transaction %u, time %s",
2726 }
2727 }
2728
2729 return stopsHere;
2730}
2731
2732/*
2733 * Same as recoveryStopsBefore, but called after applying the record.
2734 *
2735 * We also track the timestamp of the latest applied COMMIT/ABORT
2736 * record in XLogRecoveryCtl->recoveryLastXTime.
2737 */
2738static bool
2740{
2741 uint8 info;
2742 uint8 xact_info;
2743 uint8 rmid;
2744 TimestampTz recordXtime = 0;
2745
2746 /*
2747 * Ignore recovery target settings when not in archive recovery (meaning
2748 * we are in crash recovery).
2749 */
2751 return false;
2752
2753 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2754 rmid = XLogRecGetRmid(record);
2755
2756 /*
2757 * There can be many restore points that share the same name; we stop at
2758 * the first one.
2759 */
2761 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2762 {
2763 xl_restore_point *recordRestorePointData;
2764
2765 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2766
2767 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2768 {
2769 recoveryStopAfter = true;
2772 (void) getRecordTimestamp(record, &recoveryStopTime);
2773 strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2774
2775 ereport(LOG,
2776 (errmsg("recovery stopping at restore point \"%s\", time %s",
2779 return true;
2780 }
2781 }
2782
2783 /* Check if the target LSN has been reached */
2786 record->ReadRecPtr >= recoveryTargetLSN)
2787 {
2788 recoveryStopAfter = true;
2790 recoveryStopLSN = record->ReadRecPtr;
2791 recoveryStopTime = 0;
2792 recoveryStopName[0] = '\0';
2793 ereport(LOG,
2794 errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2796 return true;
2797 }
2798
     /* The remaining transaction checks apply only to RM_XACT_ID records. */
2799 if (rmid != RM_XACT_ID)
2800 return false;
2801
2802 xact_info = info & XLOG_XACT_OPMASK;
2803
2804 if (xact_info == XLOG_XACT_COMMIT ||
2805 xact_info == XLOG_XACT_COMMIT_PREPARED ||
2806 xact_info == XLOG_XACT_ABORT ||
2807 xact_info == XLOG_XACT_ABORT_PREPARED)
2808 {
2809 TransactionId recordXid;
2810
2811 /* Update the last applied transaction timestamp */
2812 if (getRecordTimestamp(record, &recordXtime))
2813 SetLatestXTime(recordXtime);
2814
2815 /* Extract the XID of the committed/aborted transaction */
2816 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2817 {
2818 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2819 xl_xact_parsed_commit parsed;
2820
2822 xlrec,
2823 &parsed);
2824 recordXid = parsed.twophase_xid;
2825 }
2826 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2827 {
2828 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2829 xl_xact_parsed_abort parsed;
2830
2832 xlrec,
2833 &parsed);
2834 recordXid = parsed.twophase_xid;
2835 }
2836 else
2837 recordXid = XLogRecGetXid(record);
2838
2839 /*
2840 * There can be only one transaction end record with this exact
2841 * transaction ID.
2842 *
2843 * When testing for an xid, we MUST test for equality only, since
2844 * transactions are numbered in the order they start, not the order
2845 * they complete. A higher-numbered xid will complete before the target
2846 * xid about 50% of the time...
2847 */
2849 recordXid == recoveryTargetXid)
2850 {
2851 recoveryStopAfter = true;
2852 recoveryStopXid = recordXid;
2853 recoveryStopTime = recordXtime;
2855 recoveryStopName[0] = '\0';
2856
2857 if (xact_info == XLOG_XACT_COMMIT ||
2858 xact_info == XLOG_XACT_COMMIT_PREPARED)
2859 {
2860 ereport(LOG,
2861 (errmsg("recovery stopping after commit of transaction %u, time %s",
2864 }
2865 else if (xact_info == XLOG_XACT_ABORT ||
2866 xact_info == XLOG_XACT_ABORT_PREPARED)
2867 {
2868 ereport(LOG,
2869 (errmsg("recovery stopping after abort of transaction %u, time %s",
2872 }
2873 return true;
2874 }
2875 }
2876
2877 /* Check if we should stop as soon as reaching consistency */
2879 {
2880 ereport(LOG,
2881 (errmsg("recovery stopping after reaching consistency")));
2882
2883 recoveryStopAfter = true;
2885 recoveryStopTime = 0;
2887 recoveryStopName[0] = '\0';
2888 return true;
2889 }
2890
2891 return false;
2892}
2893
2894/*
2895 * Create a comment for the history file to explain why and where
2896 * timeline changed.
2897 */
2898static char *
2900{
2901 char reason[200];
2902
2904 snprintf(reason, sizeof(reason),
2905 "%s transaction %u",
2906 recoveryStopAfter ? "after" : "before",
2909 snprintf(reason, sizeof(reason),
2910 "%s %s\n",
2911 recoveryStopAfter ? "after" : "before",
2914 snprintf(reason, sizeof(reason),
2915 "%s LSN %X/%08X\n",
2916 recoveryStopAfter ? "after" : "before",
2919 snprintf(reason, sizeof(reason),
2920 "at restore point \"%s\"",
2923 snprintf(reason, sizeof(reason), "reached consistency");
2924 else
2925 snprintf(reason, sizeof(reason), "no recovery target specified");
2926
2927 return pstrdup(reason);
2928}
2929
2930/*
2931 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2932 *
2933 * endOfRecovery is true if the recovery target is reached and
2934 * the paused state starts at the end of recovery because of
2935 * recovery_target_action=pause, and false otherwise.
2936 */
2937static void
2938recoveryPausesHere(bool endOfRecovery)
2939{
2940 /* Don't pause unless users can connect! */
2942 return;
2943
2944 /* Don't pause after standby promotion has been triggered */
2946 return;
2947
2948 if (endOfRecovery)
2949 ereport(LOG,
2950 (errmsg("pausing at the end of recovery"),
2951 errhint("Execute pg_wal_replay_resume() to promote.")));
2952 else
2953 ereport(LOG,
2954 (errmsg("recovery has paused"),
2955 errhint("Execute pg_wal_replay_resume() to continue.")));
2956
2957 /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2959 {
2962 return;
2963
2964 /*
2965 * If recovery pause is requested then set it paused. While we are in
2966 * the loop, user might resume and pause again so set this every time.
2967 */
2969
2970 /*
2971 * We wait on a condition variable that will wake us as soon as the
2972 * pause ends, but we use a timeout so we can check the above exit
2973 * condition periodically too.
2974 */
2976 WAIT_EVENT_RECOVERY_PAUSE);
2977 }
2979}
2980
2981/*
2982 * When recovery_min_apply_delay is set, we wait long enough to make sure
2983 * certain record types are applied at least that interval behind the primary.
2984 *
2985 * Returns true if we waited.
2986 *
2987 * Note that the delay is calculated between the WAL record log time and
2988 * the current time on standby. We would prefer to keep track of when this
2989 * standby received each WAL record, which would allow a more consistent
2990 * approach and one not affected by time synchronisation issues, but that
2991 * is significantly more effort and complexity for little actual gain in
2992 * usability.
2993 */
2994static bool
2996{
2997 uint8 xact_info;
2998 TimestampTz xtime;
2999 TimestampTz delayUntil;
3000 long msecs;
3001
3002 /* nothing to do if no delay configured */
3003 if (recovery_min_apply_delay <= 0)
3004 return false;
3005
3006 /* no delay is applied on a database not yet consistent */
3007 if (!reachedConsistency)
3008 return false;
3009
3010 /* nothing to do if crash recovery is requested */
3012 return false;
3013
3014 /*
3015 * Is it a COMMIT record?
3016 *
3017 * We deliberately choose not to delay aborts since they have no effect on
3018 * MVCC. We already allow replay of records that don't have a timestamp,
3019 * so there is already opportunity for issues caused by early conflicts on
3020 * standbys.
3021 */
3022 if (XLogRecGetRmid(record) != RM_XACT_ID)
3023 return false;
3024
3025 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3026
3027 if (xact_info != XLOG_XACT_COMMIT &&
3028 xact_info != XLOG_XACT_COMMIT_PREPARED)
3029 return false;
3030
3031 if (!getRecordTimestamp(record, &xtime))
3032 return false;
3033
3035
3036 /*
3037 * Exit without arming the latch if it's already past time to apply this
3038 * record
3039 */
3041 if (msecs <= 0)
3042 return false;
3043
3044 while (true)
3045 {
3047
3048 /* This might change recovery_min_apply_delay. */
3050
3052 break;
3053
3054 /*
3055 * Recalculate delayUntil as recovery_min_apply_delay could have
3056 * changed while waiting in this loop.
3057 */
3059
3060 /*
3061 * Wait for difference between GetCurrentTimestamp() and delayUntil.
3062 */
3064 delayUntil);
3065
3066 if (msecs <= 0)
3067 break;
3068
3069 elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3070
3073 msecs,
3074 WAIT_EVENT_RECOVERY_APPLY_DELAY);
3075 }
3076 return true;
3077}
3078
3079/*
3080 * Get the current state of the recovery pause request.
3081 */
3084{
3086
3090
3091 return state;
3092}
3093
3094/*
3095 * Set the recovery pause state.
3096 *
3097 * If a recovery pause is requested, sets the recovery pause state to
3098 * 'pause requested' unless it is already 'paused'. Otherwise, sets it
3099 * to 'not paused' to resume recovery. The pause itself is later
3100 * confirmed by ConfirmRecoveryPaused().
3101 */
3102void
3103SetRecoveryPause(bool recoveryPause)
3104{
3106
3107 if (!recoveryPause)
3111
3113
3114 if (!recoveryPause)
3116}
3117
3118/*
3119 * Confirm the recovery pause by setting the recovery pause state to
3120 * RECOVERY_PAUSED.
3121 */
3122static void
3124{
3125 /* If recovery pause is requested then set it paused */
3130}
3131
3132
3133/*
3134 * Attempt to read the next XLOG record.
3135 *
3136 * Before first call, the reader needs to be positioned to the first record
3137 * by calling XLogPrefetcherBeginRead().
3138 *
3139 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3140 * (emode must be either PANIC or LOG.) In standby mode, retries until a
3141 * valid record is available.
3142 */
3143static XLogRecord *
3145 bool fetching_ckpt, TimeLineID replayTLI)
3146{
3147 XLogRecord *record;
3150
3151 /* Pass through parameters to XLogPageRead */
3152 private->fetching_ckpt = fetching_ckpt;
3153 private->emode = emode;
3154 private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3155 private->replayTLI = replayTLI;
3156
3157 /* This is the first attempt to read this page. */
3158 lastSourceFailed = false;
3159
3160 for (;;)
3161 {
3162 char *errormsg;
3163
3164 record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3165 if (record == NULL)
3166 {
3167 /*
3168 * When we find that WAL ends in an incomplete record, keep track
3169 * of that record. After recovery is done, we'll write a record
3170 * to indicate to downstream WAL readers that that portion is to
3171 * be ignored.
3172 *
3173 * However, when ArchiveRecoveryRequested = true, we're going to
3174 * switch to a new timeline at the end of recovery. We will only
3175 * copy WAL over to the new timeline up to the end of the last
3176 * complete record, so if we did this, we would later create an
3177 * overwrite contrecord in the wrong place, breaking everything.
3178 */
3181 {
3184 }
3185
3186 if (readFile >= 0)
3187 {
3188 close(readFile);
3189 readFile = -1;
3190 }
3191
3192 /*
3193 * We only end up here without a message when XLogPageRead()
3194 * failed - in that case we already logged something. In
3195 * StandbyMode that only happens if we have been triggered, so we
3196 * shouldn't loop anymore in that case.
3197 */
3198 if (errormsg)
3200 (errmsg_internal("%s", errormsg) /* already translated */ ));
3201 }
3202
3203 /*
3204 * Check page TLI is one of the expected values.
3205 */
3207 {
3208 char fname[MAXFNAMELEN];
3209 XLogSegNo segno;
3210 int32 offset;
3211
3215 XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3218 errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3220 fname,
3222 offset));
3223 record = NULL;
3224 }
3225
3226 if (record)
3227 {
3228 /* Great, got a record */
3229 return record;
3230 }
3231 else
3232 {
3233 /* No valid record available from this source */
3234 lastSourceFailed = true;
3235
3236 /*
3237 * If archive recovery was requested, but we were still doing
3238 * crash recovery, switch to archive recovery and retry using the
3239 * offline archive. We have now replayed all the valid WAL in
3240 * pg_wal, so we are presumably now consistent.
3241 *
3242 * We require that there's at least some valid WAL present in
3243 * pg_wal, however (!fetching_ckpt). We could recover using the
3244 * WAL from the archive, even if pg_wal is completely empty, but
3245 * we'd have no idea how far we'd have to replay to reach
3246 * consistency. So err on the safe side and give up.
3247 */
3249 !fetching_ckpt)
3250 {
3252 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3253 InArchiveRecovery = true;
3256
3259 minRecoveryPointTLI = replayTLI;
3260
3262
3263 /*
3264 * Before we retry, reset lastSourceFailed and currentSource
3265 * so that we will check the archive next.
3266 */
3267 lastSourceFailed = false;
3269
3270 continue;
3271 }
3272
3273 /* In standby mode, loop back to retry. Otherwise, give up. */
3275 continue;
3276 else
3277 return NULL;
3278 }
3279 }
3280}
3281
3282/*
3283 * Read the XLOG page containing targetPagePtr into readBuf (if not read
3284 * already). Returns number of bytes read, if the page is read successfully,
3285 * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3286 * but only if they have not been previously reported.
3287 *
3288 * See XLogReaderRoutine.page_read for more details.
3289 *
3290 * While prefetching, xlogreader->nonblocking may be set. In that case,
3291 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3292 *
3293 * This is responsible for restoring files from archive as needed, as well
3294 * as for waiting for the requested WAL record to arrive in standby mode.
3295 *
3296 * xlogreader->private_data->emode specifies the log level used for reporting
3297 * "file not found" or "end of WAL" situations in archive recovery, or in
3298 * standby mode when promotion is triggered. If set to WARNING or below,
3299 * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3300 * levels the ereport() won't return.
3301 *
3302 * In standby mode, if after a successful return of XLogPageRead() the
3303 * caller finds the record it's interested in to be broken, it should
3304 * ereport the error with the level determined by
3305 * emode_for_corrupt_record(), and then set lastSourceFailed
3306 * and call XLogPageRead() again with the same arguments. This lets
3307 * XLogPageRead() try fetching the record from another source, or
3308 * sleep and retry.
3309 */
3310static int
3312 XLogRecPtr targetRecPtr, char *readBuf)
3313{
3314 XLogPageReadPrivate *private =
3316 int emode = private->emode;
3317 uint32 targetPageOff;
3319 int r;
3320 instr_time io_start;
3321
3322 XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3323 targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3324
3325 /*
3326 * See if we need to switch to a new segment because the requested record
3327 * is not in the currently open one.
3328 */
3329 if (readFile >= 0 &&
3330 !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3331 {
3332 /*
3333 * Request a restartpoint if we've replayed too much xlog since the
3334 * last one.
3335 */
3337 {
3339 {
3340 (void) GetRedoRecPtr();
3343 }
3344 }
3345
3346 close(readFile);
3347 readFile = -1;
3349 }
3350
3351 XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3352
3353retry:
3354 /* See if we need to retrieve more data */
3355 if (readFile < 0 ||
3357 flushedUpto < targetPagePtr + reqLen))
3358 {
3359 if (readFile >= 0 &&
3362 flushedUpto < targetPagePtr + reqLen)
3363 return XLREAD_WOULDBLOCK;
3364
3365 switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3366 private->randAccess,
3367 private->fetching_ckpt,
3368 targetRecPtr,
3369 private->replayTLI,
3372 {
3373 case XLREAD_WOULDBLOCK:
3374 return XLREAD_WOULDBLOCK;
3375 case XLREAD_FAIL:
3376 if (readFile >= 0)
3377 close(readFile);
3378 readFile = -1;
3379 readLen = 0;
3381 return XLREAD_FAIL;
3382 case XLREAD_SUCCESS:
3383 break;
3384 }
3385 }
3386
3387 /*
3388 * At this point, we have the right segment open and if we're streaming we
3389 * know the requested record is in it.
3390 */
3391 Assert(readFile != -1);
3392
3393 /*
3394 * If the current segment is being streamed from the primary, calculate
3395 * how much of the current page we have received already. We know the
3396 * requested record has been received, but this is for the benefit of
3397 * future calls, to allow quick exit at the top of this function.
3398 */
3400 {
3401 if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3402 readLen = XLOG_BLCKSZ;
3403 else
3405 targetPageOff;
3406 }
3407 else
3408 readLen = XLOG_BLCKSZ;
3409
3410 /* Read the requested page */
3411 readOff = targetPageOff;
3412
3413 /* Measure I/O timing when reading segment */
3415
3416 pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3417 r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3418 if (r != XLOG_BLCKSZ)
3419 {
3420 char fname[MAXFNAMELEN];
3421 int save_errno = errno;
3422
3424
3426 io_start, 1, r);
3427
3429 if (r < 0)
3430 {
3431 errno = save_errno;
3432 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3434 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3435 fname, LSN_FORMAT_ARGS(targetPagePtr),
3436 readOff)));
3437 }
3438 else
3439 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3441 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3442 fname, LSN_FORMAT_ARGS(targetPagePtr),
3443 readOff, r, (Size) XLOG_BLCKSZ)));
3444 goto next_record_is_invalid;
3445 }
3447
3449 io_start, 1, r);
3450
3451 Assert(targetSegNo == readSegNo);
3452 Assert(targetPageOff == readOff);
3453 Assert(reqLen <= readLen);
3454
3456
3457 /*
3458 * Check the page header immediately, so that we can retry immediately if
3459 * it's not valid. This may seem unnecessary, because ReadPageInternal()
3460 * validates the page header anyway, and would propagate the failure up to
3461 * ReadRecord(), which would retry. However, there's a corner case with
3462 * continuation records, if a record is split across two pages such that
3463 * we would need to read the two pages from different sources across two
3464 * WAL segments.
3465 *
3466 * The first page is only available locally, in pg_wal, because it's
3467 * already been recycled on the primary. The second page, however, is not
3468 * present in pg_wal, and we should stream it from the primary. There is a
3469 * recycled WAL segment present in pg_wal, with garbage contents, however.
3470 * We would read the first page from the local WAL segment, but when
3471 * reading the second page, we would read the bogus, recycled, WAL
3472 * segment. If we didn't catch that case here, we would never recover,
3473 * because ReadRecord() would retry reading the whole record from the
3474 * beginning.
3475 *
3476 * Of course, this only catches errors in the page header, which is what
3477 * happens in the case of a recycled WAL segment. Other kinds of errors or
3478 * corruption still have the same problem. But this at least fixes the
3479 * common case, which can happen as part of normal operation.
3480 *
3481 * Validating the page header is cheap enough that doing it twice
3482 * shouldn't be a big deal from a performance point of view.
3483 *
3484 * When not in standby mode, an invalid page header should cause recovery
3485 * to end, not retry reading the page, so we don't need to validate the
3486 * page header here for the retry. Instead, ReadPageInternal() is
3487 * responsible for the validation.
3488 */
3489 if (StandbyMode &&
3490 (targetPagePtr % wal_segment_size) == 0 &&
3491 !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3492 {
3493 /*
3494 * Emit this error right now then retry this page immediately. Use
3495 * errmsg_internal() because the message was already translated.
3496 */
3497 if (xlogreader->errormsg_buf[0])
3500
3501 /* reset any error XLogReaderValidatePageHeader() might have set */
3503 goto next_record_is_invalid;
3504 }
3505
3506 return readLen;
3507
3508next_record_is_invalid:
3509
3510 /*
3511 * If we're reading ahead, give up fast. Retries and error reporting will
3512 * be handled by a later read when recovery catches up to this point.
3513 */
3515 return XLREAD_WOULDBLOCK;
3516
3517 lastSourceFailed = true;
3518
3519 if (readFile >= 0)
3520 close(readFile);
3521 readFile = -1;
3522 readLen = 0;
3524
3525 /* In standby-mode, keep trying */
3526 if (StandbyMode)
3527 goto retry;
3528 else
3529 return XLREAD_FAIL;
3530}
3531
3532/*
3533 * Open the WAL segment containing WAL location 'RecPtr'.
3534 *
3535 * The segment can be fetched via restore_command, or via walreceiver having
3536 * streamed the record, or it can already be present in pg_wal. Checking
3537 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3538 * too, in case someone copies a new segment directly to pg_wal. That is not
3539 * documented or recommended, though.
3540 *
3541 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3542 * prepare to read WAL starting from RedoStartLSN after this.
3543 *
3544 * 'RecPtr' might not point to the beginning of the record we're interested
3545 * in, it might also point to the page or segment header. In that case,
3546 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3547 * used to decide which timeline to stream the requested WAL from.
3548 *
3549 * 'replayLSN' is the current replay LSN, so that if we scan for new
3550 * timelines, we can reject a switch to a timeline that branched off before
3551 * this point.
3552 *
3553 * If the record is not immediately available, the function returns
3554 * XLREAD_FAIL if we're not in standby mode. In standby mode, it waits for
3555 * the record to become available.
3556 *
3557 * When the requested record becomes available, the function opens the file
3558 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3559 * of standby mode is triggered by the user, and there is no more WAL
3560 * available, returns XLREAD_FAIL.
3561 *
3562 * If nonblocking is true, then give up immediately if we can't satisfy the
3563 * request, returning XLREAD_WOULDBLOCK instead of waiting.
3564 */
3565static XLogPageReadResult
3567 bool fetching_ckpt, XLogRecPtr tliRecPtr,
3568 TimeLineID replayTLI, XLogRecPtr replayLSN,
3569 bool nonblocking)
3570{
3571 static TimestampTz last_fail_time = 0;
3573 bool streaming_reply_sent = false;
3574
3575 /*-------
3576 * Standby mode is implemented by a state machine:
3577 *
3578 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3579 * pg_wal (XLOG_FROM_PG_WAL)
3580 * 2. Check for promotion trigger request
3581 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3582 * 4. Rescan timelines
3583 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3584 *
3585 * Failure to read from the current source advances the state machine to
3586 * the next state.
3587 *
3588 * 'currentSource' indicates the current state. There are no currentSource
3589 * values for "check trigger", "rescan timelines", and "sleep" states,
3590 * those actions are taken when reading from the previous source fails, as
3591 * part of advancing to the next state.
3592 *
3593 * If standby mode is turned off while reading WAL from stream, we move
3594 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3595 * the files (which would be required at end of recovery, e.g., timeline
3596 * history file) from archive or pg_wal. We don't need to kill WAL receiver
3597 * here because it's already stopped when standby mode is turned off at
3598 * the end of recovery.
3599 *-------
3600 */
3601 if (!InArchiveRecovery)
3603 else if (currentSource == XLOG_FROM_ANY ||
3605 {
3606 lastSourceFailed = false;
3608 }
3609
3610 for (;;)
3611 {
3612 XLogSource oldSource = currentSource;
3613 bool startWalReceiver = false;
3614
3615 /*
3616 * First check if we failed to read from the current source, and
3617 * advance the state machine if so. The failure to read might've
3618 * happened outside this function, e.g. when a CRC check fails on a
3619 * record, or within this loop.
3620 */
3621 if (lastSourceFailed)
3622 {
3623 /*
3624 * Don't allow any retry loops to occur during nonblocking
3625 * readahead. Let the caller process everything that has been
3626 * decoded already first.
3627 */
3628 if (nonblocking)
3629 return XLREAD_WOULDBLOCK;
3630
3631 switch (currentSource)
3632 {
3633 case XLOG_FROM_ARCHIVE:
3634 case XLOG_FROM_PG_WAL:
3635
3636 /*
3637 * Check to see if promotion is requested. Note that we do
3638 * this only after failure, so when you promote, we still
3639 * finish replaying as much as we can from archive and
3640 * pg_wal before failover.
3641 */
3643 {
3645 return XLREAD_FAIL;
3646 }
3647
3648 /*
3649 * Not in standby mode, and we've now tried the archive
3650 * and pg_wal.
3651 */
3652 if (!StandbyMode)
3653 return XLREAD_FAIL;
3654
3655 /*
3656 * Move to XLOG_FROM_STREAM state, and set to start a
3657 * walreceiver if necessary.
3658 */
3660 startWalReceiver = true;
3661 break;
3662
3663 case XLOG_FROM_STREAM:
3664
3665 /*
3666 * Failure while streaming. Most likely, we got here
3667 * because streaming replication was terminated, or
3668 * promotion was triggered. But we also get here if we
3669 * find an invalid record in the WAL streamed from the
3670 * primary, in which case something is seriously wrong.
3671 * There's little chance that the problem will just go
3672 * away, but PANIC is not good for availability either,
3673 * especially in hot standby mode. So, we treat that the
3674 * same as disconnection, and retry from archive/pg_wal
3675 * again. The WAL in the archive should be identical to
3676 * what was streamed, so it's unlikely that it helps, but
3677 * one can hope...
3678 */
3679
3680 /*
3681 * We should be able to move to XLOG_FROM_STREAM only in
3682 * standby mode.
3683 */
3685
3686 /*
3687 * Before we leave XLOG_FROM_STREAM state, make sure that
3688 * walreceiver is not active, so that it won't overwrite
3689 * WAL that we restore from archive.
3690 */
3692
3693 /*
3694 * Before we sleep, re-scan for possible new timelines if
3695 * we were requested to recover to the latest timeline.
3696 */
3698 {
3699 if (rescanLatestTimeLine(replayTLI, replayLSN))
3700 {
3702 break;
3703 }
3704 }
3705
3706 /*
3707 * XLOG_FROM_STREAM is the last state in our state
3708 * machine, so we've exhausted all the options for
3709 * obtaining the requested WAL. We're going to loop back
3710 * and retry from the archive, but if it hasn't been long
3711 * since last attempt, sleep wal_retrieve_retry_interval
3712 * milliseconds to avoid busy-waiting.
3713 */
3715 if (!TimestampDifferenceExceeds(last_fail_time, now,
3717 {
3718 long wait_time;
3719
3720 wait_time = wal_retrieve_retry_interval -
3721 TimestampDifferenceMilliseconds(last_fail_time, now);
3722
3723 elog(LOG, "waiting for WAL to become available at %X/%08X",
3724 LSN_FORMAT_ARGS(RecPtr));
3725
3726 /* Do background tasks that might benefit us later. */
3728
3732 wait_time,
3733 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3736
3737 /* Handle interrupt signals of startup process */
3739 }
3740 last_fail_time = now;
3742 break;
3743
3744 default:
3745 elog(ERROR, "unexpected WAL source %d", currentSource);
3746 }
3747 }
3748 else if (currentSource == XLOG_FROM_PG_WAL)
3749 {
3750 /*
3751 * We just successfully read a file in pg_wal. We prefer files in
3752 * the archive over ones in pg_wal, so try the next file again
3753 * from the archive first.
3754 */
3757 }
3758
3759 if (currentSource != oldSource)
3760 elog(DEBUG2, "switched WAL source from %s to %s after %s",
3762 lastSourceFailed ? "failure" : "success");
3763
3764 /*
3765 * We've now handled possible failure. Try to read from the chosen
3766 * source.
3767 */
3768 lastSourceFailed = false;
3769
3770 switch (currentSource)
3771 {
3772 case XLOG_FROM_ARCHIVE:
3773 case XLOG_FROM_PG_WAL:
3774
3775 /*
3776 * WAL receiver must not be running when reading WAL from
3777 * archive or pg_wal.
3778 */
3780
3781 /* Close any old file we might have open. */
3782 if (readFile >= 0)
3783 {
3784 close(readFile);
3785 readFile = -1;
3786 }
3787 /* Reset curFileTLI if random fetch. */
3788 if (randAccess)
3789 curFileTLI = 0;
3790
3791 /*
3792 * Try to restore the file from archive, or read an existing
3793 * file from pg_wal.
3794 */
3798 if (readFile >= 0)
3799 return XLREAD_SUCCESS; /* success! */
3800
3801 /*
3802 * Nope, not found in archive or pg_wal.
3803 */
3804 lastSourceFailed = true;
3805 break;
3806
3807 case XLOG_FROM_STREAM:
3808 {
3809 bool havedata;
3810
3811 /*
3812 * We should be able to move to XLOG_FROM_STREAM only in
3813 * standby mode.
3814 */
3816
3817 /*
3818 * First, shutdown walreceiver if its restart has been
3819 * requested -- but no point if we're already slated for
3820 * starting it.
3821 */
3822 if (pendingWalRcvRestart && !startWalReceiver)
3823 {
3825
3826 /*
3827 * Re-scan for possible new timelines if we were
3828 * requested to recover to the latest timeline.
3829 */
3832 rescanLatestTimeLine(replayTLI, replayLSN);
3833
3834 startWalReceiver = true;
3835 }
3836 pendingWalRcvRestart = false;
3837
3838 /*
3839 * Launch walreceiver if needed.
3840 *
3841 * If fetching_ckpt is true, RecPtr points to the initial
3842 * checkpoint location. In that case, we use RedoStartLSN
3843 * as the streaming start position instead of RecPtr, so
3844 * that when we later jump backwards to start redo at
3845 * RedoStartLSN, we will have the logs streamed already.
3846 */
3847 if (startWalReceiver &&
3848 PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3849 {
3850 XLogRecPtr ptr;
3851 TimeLineID tli;
3852
3853 if (fetching_ckpt)
3854 {
3855 ptr = RedoStartLSN;
3856 tli = RedoStartTLI;
3857 }
3858 else
3859 {
3860 ptr = RecPtr;
3861
3862 /*
3863 * Use the record begin position to determine the
3864 * TLI, rather than the position we're reading.
3865 */
3866 tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3867
3868 if (curFileTLI > 0 && tli < curFileTLI)
3869 elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3870 LSN_FORMAT_ARGS(tliRecPtr),
3871 tli, curFileTLI);
3872 }
3873 curFileTLI = tli;
3878 flushedUpto = 0;
3879 }
3880
3881 /*
3882 * Check if WAL receiver is active or wait to start up.
3883 */
3884 if (!WalRcvStreaming())
3885 {
3886 lastSourceFailed = true;
3887 break;
3888 }
3889
3890 /*
3891 * Walreceiver is active, so see if new data has arrived.
3892 *
3893 * We only advance XLogReceiptTime when we obtain fresh
3894 * WAL from walreceiver and observe that we had already
3895 * processed everything before the most recent "chunk"
3896 * that it flushed to disk. In steady state where we are
3897 * keeping up with the incoming data, XLogReceiptTime will
3898 * be updated on each cycle. When we are behind,
3899 * XLogReceiptTime will not advance, so the grace time
3900 * allotted to conflicting queries will decrease.
3901 */
3902 if (RecPtr < flushedUpto)
3903 havedata = true;
3904 else
3905 {
3906 XLogRecPtr latestChunkStart;
3907
3908 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3909 if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3910 {
3911 havedata = true;
3912 if (latestChunkStart <= RecPtr)
3913 {
3916 }
3917 }
3918 else
3919 havedata = false;
3920 }
3921 if (havedata)
3922 {
3923 /*
3924 * Great, streamed far enough. Open the file if it's
3925 * not open already. Also read the timeline history
3926 * file if we haven't initialized timeline history
3927 * yet; it should be streamed over and present in
3928 * pg_wal by now. Use XLOG_FROM_STREAM so that source
3929 * info is set correctly and XLogReceiptTime isn't
3930 * changed.
3931 *
3932 * NB: We must set readTimeLineHistory based on
3933 * recoveryTargetTLI, not receiveTLI. Normally they'll
3934 * be the same, but if recovery_target_timeline is
3935 * 'latest' and archiving is configured, then it's
3936 * possible that we managed to retrieve one or more
3937 * new timeline history files from the archive,
3938 * updating recoveryTargetTLI.
3939 */
3940 if (readFile < 0)
3941 {
3942 if (!expectedTLEs)
3945 XLOG_FROM_STREAM, false);
3946 Assert(readFile >= 0);
3947 }
3948 else
3949 {
3950 /* just make sure source info is correct... */
3953 return XLREAD_SUCCESS;
3954 }
3955 break;
3956 }
3957
3958 /* In nonblocking mode, return rather than sleeping. */
3959 if (nonblocking)
3960 return XLREAD_WOULDBLOCK;
3961
3962 /*
3963 * Data not here yet. Check for trigger, then wait for
3964 * walreceiver to wake us up when new WAL arrives.
3965 */
3967 {
3968 /*
3969 * Note that we don't return XLREAD_FAIL immediately
3970 * here. After being triggered, we still want to
3971 * replay all the WAL that was already streamed. It's
3972 * in pg_wal now, so we just treat this as a failure,
3973 * and the state machine will move on to replay the
3974 * streamed WAL from pg_wal, and then recheck the
3975 * trigger and exit replay.
3976 */
3977 lastSourceFailed = true;
3978 break;
3979 }
3980
3981 /*
3982 * Since we have replayed everything we have received so
3983 * far and are about to start waiting for more WAL, let's
3984 * tell the upstream server our replay location now so
3985 * that pg_stat_replication doesn't show stale
3986 * information.
3987 */
3988 if (!streaming_reply_sent)
3989 {
3991 streaming_reply_sent = true;
3992 }
3993
3994 /* Do any background tasks that might benefit us later. */
3996
3997 /* Update pg_stat_recovery_prefetch before sleeping. */
3999
4000 /*
4001 * Wait for more WAL to arrive, when we will be woken
4002 * immediately by the WAL receiver.
4003 */
4006 -1L,
4007 WAIT_EVENT_RECOVERY_WAL_STREAM);
4009 break;
4010 }
4011
4012 default:
4013 elog(ERROR, "unexpected WAL source %d", currentSource);
4014 }
4015
4016 /*
4017 * Check for recovery pause here so that we can confirm more quickly
4018 * that a requested pause has actually taken effect.
4019 */
4020 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4022 recoveryPausesHere(false);
4023
4024 /*
4025 * This possibly-long loop needs to handle interrupts of startup
4026 * process.
4027 */
4029 }
4030
4031 return XLREAD_FAIL; /* not reached */
4032}
4033
4034
4035/*
4036 * Determine what log level should be used to report a corrupt WAL record
4037 * in the current WAL page, previously read by XLogPageRead().
4038 *
4039 * 'emode' is the error mode that would be used to report a file-not-found
4040 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4041 * we're retrying the exact same record that we've tried previously, only
4042 * complain the first time to keep the noise down. However, we only do this when
4043 * reading from pg_wal, because we don't expect any invalid records in archive
4044 * or in records streamed from the primary. Files in the archive should be complete,
4045 * and we should never hit the end of WAL because we stop and wait for more WAL
4046 * to arrive before replaying it.
4047 *
4048 * NOTE: This function remembers the RecPtr value it was last called with,
4049 * to suppress repeated messages about the same record. Only call this when
4050 * you are about to ereport(), or you might cause a later message to be
4051 * erroneously suppressed.
4052 */
4053static int
4055{
4056 static XLogRecPtr lastComplaint = 0; /* RecPtr of the last complaint; static, so it persists across calls */
4057
4058 if (readSource == XLOG_FROM_PG_WAL && emode == LOG) /* only LOG-level reports about WAL read from pg_wal are ever demoted */
4059 {
4060 if (RecPtr == lastComplaint)
4061 emode = DEBUG1; /* same RecPtr as the previous complaint: demote to DEBUG1 */
4062 else
4063 lastComplaint = RecPtr; /* new RecPtr: keep emode as-is and remember it for next time */
4064 }
4065 return emode; /* unchanged unless demoted above */
4066}
4067
4068
4069/*
4070 * Subroutine to try to fetch and validate a prior checkpoint record.
4071 */
4072static XLogRecord *
4074 TimeLineID replayTLI)
4075{
4076 XLogRecord *record;
4077 uint8 info;
4078
4079 Assert(xlogreader != NULL);
4080
4081 if (!XRecOffIsValid(RecPtr))
4082 {
4083 ereport(LOG,
4084 (errmsg("invalid checkpoint location")));
4085 return NULL;
4086 }
4087
4089 record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4090
4091 if (record == NULL)
4092 {
4093 ereport(LOG,
4094 (errmsg("invalid checkpoint record")));
4095 return NULL;
4096 }
4097 if (record->xl_rmid != RM_XLOG_ID)
4098 {
4099 ereport(LOG,
4100 (errmsg("invalid resource manager ID in checkpoint record")));
4101 return NULL;
4102 }
4103 info = record->xl_info & ~XLR_INFO_MASK;
4104 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4105 info != XLOG_CHECKPOINT_ONLINE)
4106 {
4107 ereport(LOG,
4108 (errmsg("invalid xl_info in checkpoint record")));
4109 return NULL;
4110 }
4112 {
4113 ereport(LOG,
4114 (errmsg("invalid length of checkpoint record")));
4115 return NULL;
4116 }
4117 return record;
4118}
4119
4120/*
4121 * Scan for new timelines that might have appeared in the archive since we
4122 * started recovery.
4123 *
4124 * If there are any, the function changes recovery target TLI to the latest
4125 * one and returns 'true'.
4126 */
4127static bool
4129{
4130 List *newExpectedTLEs;
4131 bool found;
4132 ListCell *cell;
4133 TimeLineID newtarget;
4134 TimeLineID oldtarget = recoveryTargetTLI;
4135 TimeLineHistoryEntry *currentTle = NULL;
4136
4138 if (newtarget == recoveryTargetTLI)
4139 {
4140 /* No new timelines found */
4141 return false;
4142 }
4143
4144 /*
4145 * Determine the list of expected TLIs for the new TLI
4146 */
4147
4148 newExpectedTLEs = readTimeLineHistory(newtarget);
4149
4150 /*
4151 * If the current timeline is not part of the history of the new timeline,
4152 * we cannot proceed to it.
4153 */
4154 found = false;
4155 foreach(cell, newExpectedTLEs)
4156 {
4157 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4158
4159 if (currentTle->tli == recoveryTargetTLI)
4160 {
4161 found = true;
4162 break;
4163 }
4164 }
4165 if (!found)
4166 {
4167 ereport(LOG,
4168 (errmsg("new timeline %u is not a child of database system timeline %u",
4169 newtarget,
4170 replayTLI)));
4171 return false;
4172 }
4173
4174 /*
4175 * The current timeline was found in the history file, but check that the
4176 * next timeline was forked off from it *after* the current recovery
4177 * location.
4178 */
4179 if (currentTle->end < replayLSN)
4180 {
4181 ereport(LOG,
4182 errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4183 newtarget,
4184 replayTLI,
4185 LSN_FORMAT_ARGS(replayLSN)));
4186 return false;
4187 }
4188
4189 /* The new timeline history seems valid. Switch target */
4190 recoveryTargetTLI = newtarget;
4192 expectedTLEs = newExpectedTLEs;
4193
4194 /*
4195 * As in StartupXLOG(), try to ensure we have all the history files
4196 * between the old target and new target in pg_wal.
4197 */
4198 restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4199
4200 ereport(LOG,
4201 (errmsg("new target timeline is %u",
4203
4204 return true;
4205}
4206
4207
4208/*
4209 * Open a logfile segment for reading (during recovery).
4210 *
4211 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4212 * Otherwise, it's assumed to be already available in pg_wal.
4213 */
4214static int
4216 XLogSource source, bool notfoundOk)
4217{
4218 char xlogfname[MAXFNAMELEN];
4219 char activitymsg[MAXFNAMELEN + 16];
4220 char path[MAXPGPATH];
4221 int fd;
4222
4223 XLogFileName(xlogfname, tli, segno, wal_segment_size);
4224
4225 switch (source)
4226 {
4227 case XLOG_FROM_ARCHIVE:
4228 /* Report recovery progress in PS display */
4229 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4230 xlogfname);
4231 set_ps_display(activitymsg);
4232
4233 if (!RestoreArchivedFile(path, xlogfname,
4234 "RECOVERYXLOG",
4236 InRedo))
4237 return -1;
4238 break;
4239
4240 case XLOG_FROM_PG_WAL:
4241 case XLOG_FROM_STREAM:
4242 XLogFilePath(path, tli, segno, wal_segment_size);
4243 break;
4244
4245 default:
4246 elog(ERROR, "invalid XLogFileRead source %d", source);
4247 }
4248
4249 /*
4250 * If the segment was fetched from archival storage, replace the existing
4251 * xlog segment (if any) with the archival version.
4252 */
4254 {
4256 KeepFileRestoredFromArchive(path, xlogfname);
4257
4258 /*
4259 * Set path to point at the new file in pg_wal.
4260 */
4261 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4262 }
4263
4264 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4265 if (fd >= 0)
4266 {
4267 /* Success! */
4268 curFileTLI = tli;
4269
4270 /* Report recovery progress in PS display */
4271 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4272 xlogfname);
4273 set_ps_display(activitymsg);
4274
4275 /* Track source of data in assorted state variables */
4278 /* In FROM_STREAM case, caller tracks receipt time, not me */
4279 if (source != XLOG_FROM_STREAM)
4281
4282 return fd;
4283 }
4284 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4285 ereport(PANIC,
4287 errmsg("could not open file \"%s\": %m", path)));
4288 return -1;
4289}
4290
4291/*
4292 * Open a logfile segment for reading (during recovery).
4293 *
4294 * This version searches for the segment with any TLI listed in expectedTLEs.
4295 */
4296static int
4298{
4299 char path[MAXPGPATH];
4300 ListCell *cell;
4301 int fd;
4302 List *tles;
4303
4304 /*
4305 * Loop looking for a suitable timeline ID: we might need to read any of
4306 * the timelines listed in expectedTLEs.
4307 *
4308 * We expect curFileTLI on entry to be the TLI of the preceding file in
4309 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4310 * to go backwards; this prevents us from picking up the wrong file when a
4311 * parent timeline extends to higher segment numbers than the child we
4312 * want to read.
4313 *
4314 * If we haven't read the timeline history file yet, read it now, so that
4315 * we know which TLIs to scan. We don't save the list in expectedTLEs,
4316 * however, unless we actually find a valid segment. That way if there is
4317 * neither a timeline history file nor a WAL segment in the archive, and
4318 * streaming replication is set up, we'll read the timeline history file
4319 * streamed from the primary when we start streaming, instead of
4320 * recovering with a dummy history generated here.
4321 */
4322 if (expectedTLEs)
4323 tles = expectedTLEs;
4324 else
4326
4327 foreach(cell, tles)
4328 {
4330 TimeLineID tli = hent->tli;
4331
4332 if (tli < curFileTLI)
4333 break; /* don't bother looking at too-old TLIs */
4334
4335 /*
4336 * Skip scanning the timeline ID that the logfile segment to read
4337 * doesn't belong to
4338 */
4339 if (hent->begin != InvalidXLogRecPtr)
4340 {
4341 XLogSegNo beginseg = 0;
4342
4343 XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4344
4345 /*
4346 * The logfile segment that doesn't belong to the timeline is
4347 * older or newer than the segment that the timeline started or
4348 * ended at, respectively. It's sufficient to check only the
4349 * starting segment of the timeline here. Since the timelines are
4350 * scanned in descending order in this loop, any segments newer
4351 * than the ending segment should belong to newer timeline and
4352 * have already been read before. So it's not necessary to check
4353 * the ending segment of the timeline here.
4354 */
4355 if (segno < beginseg)
4356 continue;
4357 }
4358
4360 {
4361 fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4362 if (fd != -1)
4363 {
4364 elog(DEBUG1, "got WAL segment from archive");
4365 if (!expectedTLEs)
4366 expectedTLEs = tles;
4367 return fd;
4368 }
4369 }
4370
4372 {
4373 fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4374 if (fd != -1)
4375 {
4376 if (!expectedTLEs)
4377 expectedTLEs = tles;
4378 return fd;
4379 }
4380 }
4381 }
4382
4383 /* Couldn't find it. For simplicity, complain about front timeline */
4385 errno = ENOENT;
4388 errmsg("could not open file \"%s\": %m", path)));
4389 return -1;
4390}
4391
4392/*
4393 * Set flag to signal the walreceiver to restart. (The startup process calls
4394 * this on noticing a relevant configuration change.)
4395 */
4396void
4398{
4400 {
4401 ereport(LOG,
4402 (errmsg("WAL receiver process shutdown requested")));
4403
4404 pendingWalRcvRestart = true;
4405 }
4406}
4407
4408
4409/*
4410 * Has a standby promotion already been triggered?
4411 *
4412 * Unlike CheckForStandbyTrigger(), this works in any process
4413 * that's connected to shared memory.
4414 */
4415bool
4417{
4418 /*
4419 * We check shared state each time only until a standby promotion is
4420 * triggered. We can't trigger a promotion again, so there's no need to
4421 * keep checking after the shared variable has once been seen true.
4422 */
4424 return true;
4425
4429
4431}
4432
4433static void
4435{
4439
4440 /*
4441 * Mark the recovery pause state as 'not paused' because the paused state
4442 * ends and promotion continues if a promotion is triggered while recovery
4443 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4444 * return 'paused' while a promotion is ongoing.
4445 */
4446 SetRecoveryPause(false);
4447
4449}
4450
4451/*
4452 * Check whether a promote request has arrived.
4453 */
4454static bool
4456{
4458 return true;
4459
4461 {
4462 ereport(LOG, (errmsg("received promote request")));
4466 return true;
4467 }
4468
4469 return false;
4470}
4471
4472/*
4473 * Remove the file signaling a standby promotion request (PROMOTE_SIGNAL_FILE).
4474 */
4475void
4477{
4478 unlink(PROMOTE_SIGNAL_FILE); /* unlink() result is not checked */
4479}
4480
4481/*
4482 * Check to see if a promote request has arrived, i.e. whether stat() on PROMOTE_SIGNAL_FILE succeeds. Returns true if it does, false otherwise; nothing here removes the file.
4483 */
4484bool
4486{
4487 struct stat stat_buf; /* required by stat(); only the return code is examined */
4488
4489 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4490 return true;
4491
4492 return false; /* any stat() failure is treated as "no promote request" */
4493}
4494
4495/*
4496 * Wake up startup process to replay newly arrived WAL, or to notice that
4497 * failover has been requested.
4498 */
4499void
4501{
4503}
4504
4505/*
4506 * Schedule a walreceiver wakeup in the main recovery loop.
4507 */
4508void
4510{
4512}
4513
4514/*
4515 * Is HotStandby active yet? This is only important in special backends
4516 * since normal backends won't ever be able to connect until this returns
4517 * true. Postmaster knows this by way of signal, not via shared memory.
4518 *
4519 * Unlike testing standbyState, this works in any process that's connected to
4520 * shared memory. (And note that standbyState alone doesn't tell the truth
4521 * anyway.)
4522 */
4523bool
4525{
4526 /*
4527 * We check shared state each time only until Hot Standby is active. We
4528 * can't de-activate Hot Standby, so there's no need to keep checking
4529 * after the shared variable has once been seen true.
4530 */
4532 return true;
4533 else
4534 {
4535 /* spinlock is essential on machines with weak memory ordering! */
4539
4540 return LocalHotStandbyActive;
4541 }
4542}
4543
4544/*
4545 * Like HotStandbyActive(), but to be used only in WAL replay code,
4546 * where we don't need to ask any other process what the state is.
4547 */
4548static bool
4550{
4552 return LocalHotStandbyActive;
4553}
4554
4555/*
4556 * Get latest redo apply position.
4557 *
4558 * Exported to allow WALReceiver to read the pointer directly.
4559 */
4562{
4563 XLogRecPtr recptr;
4564 TimeLineID tli;
4565
4570
4571 if (replayTLI)
4572 *replayTLI = tli;
4573 return recptr;
4574}
4575
4576
4577/*
4578 * Get position of last applied, or the record being applied.
4579 *
4580 * This is different from GetXLogReplayRecPtr() in that if a WAL
4581 * record is currently being applied, this includes that record.
4582 */
4585{
4586 XLogRecPtr recptr;
4587 TimeLineID tli;
4588
4593
4594 if (replayEndTLI)
4595 *replayEndTLI = tli;
4596 return recptr;
4597}
4598
4599/*
4600 * Save timestamp of latest processed commit/abort record.
4601 *
4602 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4603 * seen by processes other than the startup process. Note in particular
4604 * that CreateRestartPoint is executed in the checkpointer.
4605 */
4606static void
4608{
4612}
4613
4614/*
4615 * Fetch timestamp of latest processed commit/abort record.
4616 */
4619{
4620 TimestampTz xtime;
4621
4625
4626 return xtime;
4627}
4628
4629/*
4630 * Save timestamp of the next chunk of WAL records to apply.
4631 *
4632 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4633 * seen by all backends.
4634 */
4635static void
4637{
4641}
4642
4643/*
4644 * Fetch the saved timestamp of the next chunk of WAL records to apply.
4645 * Startup process maintains an accurate local copy in XLogReceiptTime.
4646 */
4649{
4650 TimestampTz xtime;
4651
4655
4656 return xtime;
4657}
4658
4659/*
4660 * Returns time of receipt of current chunk of XLOG data, as well as
4661 * whether it was received from streaming replication or from archives.
4662 */
4663void
4664GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4665{
4666 /*
4667 * This must be executed in the startup process, since we don't export the
4668 * relevant state to shared memory.
4669 */
4671
4672 *rtime = XLogReceiptTime;
4673 *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4674}
4675
4676/*
4677 * Note that text field supplied is a parameter name and does not require
4678 * translation
4679 */
4680void
4681RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4682{
4683 if (currValue < minValue)
4684 {
4686 {
4687 bool warned_for_promote = false;
4688
4690 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4691 errmsg("hot standby is not possible because of insufficient parameter settings"),
4692 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4693 param_name,
4694 currValue,
4695 minValue)));
4696
4697 SetRecoveryPause(true);
4698
4699 ereport(LOG,
4700 (errmsg("recovery has paused"),
4701 errdetail("If recovery is unpaused, the server will shut down."),
4702 errhint("You can then restart the server after making the necessary configuration changes.")));
4703
4705 {
4707
4709 {
4710 if (!warned_for_promote)
4712 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4713 errmsg("promotion is not possible because of insufficient parameter settings"),
4714
4715 /*
4716 * Repeat the detail from above so it's easy to find
4717 * in the log.
4718 */
4719 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4720 param_name,
4721 currValue,
4722 minValue),
4723 errhint("Restart the server after making the necessary configuration changes.")));
4724 warned_for_promote = true;
4725 }
4726
4727 /*
4728 * If recovery pause is requested then set it paused. While
4729 * we are in the loop, user might resume and pause again so
4730 * set this every time.
4731 */
4733
4734 /*
4735 * We wait on a condition variable that will wake us as soon
4736 * as the pause ends, but we use a timeout so we can check the
4737 * above conditions periodically too.
4738 */
4740 WAIT_EVENT_RECOVERY_PAUSE);
4741 }
4743 }
4744
4745 ereport(FATAL,
4746 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4747 errmsg("recovery aborted because of insufficient parameter settings"),
4748 /* Repeat the detail from above so it's easy to find in the log. */
4749 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4750 param_name,
4751 currValue,
4752 minValue),
4753 errhint("You can restart the server after making the necessary configuration changes.")));
4754 }
4755}
4756
4757
4758/*
4759 * GUC check_hook for primary_slot_name
4760 */
4761bool
4763{
4764 if (*newval && strcmp(*newval, "") != 0 &&
4766 return false;
4767
4768 return true;
4769}
4770
4771/*
4772 * Recovery target settings: Only one of the several recovery_target* settings
4773 * may be set. Setting a second one results in an error. The global variable
4774 * recoveryTarget tracks which kind of recovery target was chosen. Other
4775 * variables store the actual target value (for example a string or a xid).
4776 * The assign functions of the parameters check whether a competing parameter
4777 * was already set. But we want to allow setting the same parameter multiple
4778 * times. We also want to allow unsetting a parameter and setting a different
4779 * one, so we unset recoveryTarget when the parameter is set to an empty
4780 * string.
4781 *
4782 * XXX this code is broken by design. Throwing an error from a GUC assign
4783 * hook breaks fundamental assumptions of guc.c. So long as all the variables
4784 * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4785 * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4786 * that we have odd behaviors such as unexpected GUC ordering dependencies.
4787 */
4788
4789pg_noreturn static void
4791{
4792 ereport(ERROR,
4793 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4794 errmsg("multiple recovery targets specified"),
4795 errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4796}
4797
4798/*
4799 * GUC check_hook for recovery_target
4800 */
4801bool
4803{
4804 if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4805 {
4806 GUC_check_errdetail("The only allowed value is \"immediate\".");
4807 return false;
4808 }
4809 return true;
4810}
4811
4812/*
4813 * GUC assign_hook for recovery_target
4814 */
4815void
4816assign_recovery_target(const char *newval, void *extra)
4817{
4821
4822 if (newval && strcmp(newval, "") != 0)
4824 else
4826}
4827
4828/*
4829 * GUC check_hook for recovery_target_lsn
4830 */
4831bool
4833{
4834 if (strcmp(*newval, "") != 0)
4835 {
4836 XLogRecPtr lsn;
4837 XLogRecPtr *myextra;
4838 ErrorSaveContext escontext = {T_ErrorSaveContext};
4839
4840 lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4841 if (escontext.error_occurred)
4842 return false;
4843
4844 myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4845 if (!myextra)
4846 return false;
4847 *myextra = lsn;
4848 *extra = myextra;
4849 }
4850 return true;
4851}
4852
4853/*
4854 * GUC assign_hook for recovery_target_lsn
4855 */
4856void
4857assign_recovery_target_lsn(const char *newval, void *extra)
4858{
4862
4863 if (newval && strcmp(newval, "") != 0)
4864 {
4866 recoveryTargetLSN = *((XLogRecPtr *) extra);
4867 }
4868 else
4870}
4871
4872/*
4873 * GUC check_hook for recovery_target_name
4874 */
4875bool
4877{
4878 /* Use the value of newval directly */
4879 if (strlen(*newval) >= MAXFNAMELEN)
4880 {
4881 GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4882 "recovery_target_name", MAXFNAMELEN - 1);
4883 return false;
4884 }
4885 return true;
4886}
4887
4888/*
4889 * GUC assign_hook for recovery_target_name
4890 */
4891void
4892assign_recovery_target_name(const char *newval, void *extra)
4893{
4897
4898 if (newval && strcmp(newval, "") != 0)
4899 {
4902 }
4903 else
4905}
4906
4907/*
4908 * GUC check_hook for recovery_target_time
4909 *
4910 * The interpretation of the recovery_target_time string can depend on the
4911 * time zone setting, so we need to wait until after all GUC processing is
4912 * done before we can do the final parsing of the string. This check function
4913 * only does a parsing pass to catch syntax errors, but we store the string
4914 * and parse it again when we need to use it.
4915 */
4916bool
4918{
4919 if (strcmp(*newval, "") != 0)
4920 {
4921 /* reject some special values */
4922 if (strcmp(*newval, "now") == 0 ||
4923 strcmp(*newval, "today") == 0 ||
4924 strcmp(*newval, "tomorrow") == 0 ||
4925 strcmp(*newval, "yesterday") == 0)
4926 {
4927 return false;
4928 }
4929
4930 /*
4931 * parse timestamp value (see also timestamptz_in())
4932 */
4933 {
4934 char *str = *newval;
4935 fsec_t fsec;
4936 struct pg_tm tt,
4937 *tm = &tt;
4938 int tz;
4939 int dtype;
4940 int nf;
4941 int dterr;
4942 char *field[MAXDATEFIELDS];
4943 int ftype[MAXDATEFIELDS];
4944 char workbuf[MAXDATELEN + MAXDATEFIELDS];
4945 DateTimeErrorExtra dtextra;
4947
4948 dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4949 field, ftype, MAXDATEFIELDS, &nf);
4950 if (dterr == 0)
4951 dterr = DecodeDateTime(field, ftype, nf,
4952 &dtype, tm, &fsec, &tz, &dtextra);
4953 if (dterr != 0)
4954 return false;
4955 if (dtype != DTK_DATE)
4956 return false;
4957
4958 if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4959 {
4960 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4961 return false;
4962 }
4963 }
4964 }
4965 return true;
4966}
4967
4968/*
4969 * GUC assign_hook for recovery_target_time
4970 */
4971void
4972assign_recovery_target_time(const char *newval, void *extra)
4973{
4977
4978 if (newval && strcmp(newval, "") != 0)
4980 else
4982}
4983
4984/*
4985 * GUC check_hook for recovery_target_timeline
4986 */
4987bool
4989{
4992
4993 if (strcmp(*newval, "current") == 0)
4995 else if (strcmp(*newval, "latest") == 0)
4997 else
4998 {
4999 char *endp;
5000 uint64 timeline;
5001
5003
5004 errno = 0;
5005 timeline = strtou64(*newval, &endp, 0);
5006
5007 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5008 {
5009 GUC_check_errdetail("\"%s\" is not a valid number.",
5010 "recovery_target_timeline");
5011 return false;
5012 }
5013
5014 if (timeline < 1 || timeline > PG_UINT32_MAX)
5015 {
5016 GUC_check_errdetail("\"%s\" must be between %u and %u.",
5017 "recovery_target_timeline", 1, UINT_MAX);
5018 return false;
5019 }
5020 }
5021
5023 if (!myextra)
5024 return false;
5025 *myextra = rttg;
5026 *extra = myextra;
5027
5028 return true;
5029}
5030
5031/*
5032 * GUC assign_hook for recovery_target_timeline
5033 */
5034void
5036{
5039 recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5040 else
5042}
5043
5044/*
5045 * GUC check_hook for recovery_target_xid
5046 */
5047bool
5049{
5050 if (strcmp(*newval, "") != 0)
5051 {
5052 TransactionId xid;
5053 TransactionId *myextra;
5054
5055 errno = 0;
5056 xid = (TransactionId) strtou64(*newval, NULL, 0);
5057 if (errno == EINVAL || errno == ERANGE)
5058 return false;
5059
5060 myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5061 if (!myextra)
5062 return false;
5063 *myextra = xid;
5064 *extra = myextra;
5065 }
5066 return true;
5067}
5068
5069/*
5070 * GUC assign_hook for recovery_target_xid
5071 */
5072void
5073assign_recovery_target_xid(const char *newval, void *extra)
5074{
5078
5079 if (newval && strcmp(newval, "") != 0)
5080 {
5082 recoveryTargetXid = *((TransactionId *) extra);
5083 }
5084 else
5086}
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:883
bool allow_in_place_tablespaces
Definition: tablespace.c:85
void disable_startup_progress_timeout(void)
Definition: startup.c:309
bool IsPromoteSignaled(void)
Definition: startup.c:288
void begin_startup_progress_phase(void)
Definition: startup.c:343
void ProcessStartupProcInterrupts(void)
Definition: startup.c:154
void ResetPromoteSignaled(void)
Definition: startup.c:294
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:773
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:997
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1757
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:2006
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:418
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1862
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5355
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5572
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:417
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:198
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:52
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:368
PageData * Page
Definition: bufpage.h:82
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
uint8_t uint8
Definition: c.h:537
#define PG_UINT32_MAX
Definition: c.h:596
#define pg_noreturn
Definition: c.h:164
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:223
#define PG_BINARY
Definition: c.h:1273
#define UINT64_FORMAT
Definition: c.h:558
int32_t int32
Definition: c.h:535
uint64_t uint64
Definition: c.h:540
uint32_t uint32
Definition: c.h:539
uint32 TransactionId
Definition: c.h:658
size_t Size
Definition: c.h:611
void RequestCheckpoint(int flags)
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1161
int errcode_for_file_access(void)
Definition: elog.c:877
int errdetail(const char *fmt,...)
Definition: elog.c:1207
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint(const char *fmt,...)
Definition: elog.c:1321
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:198
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1108
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:779
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1086
int FreeFile(FILE *file)
Definition: fd.c:2840
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2904
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2970
int pg_fsync(int fd)
Definition: fd.c:386
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2641
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:547
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:686
bool IsUnderPostmaster
Definition: globals.c:120
char * DataDir
Definition: globals.c:71
bool IsPostmasterEnvironment
Definition: globals.c:119
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:639
#define newval
#define GUC_check_errdetail
Definition: guc.h:505
GucSource
Definition: guc.h:112
Assert(PointerIsAligned(start, uint64))
const char * str
#define MAXDATEFIELDS
Definition: datetime.h:202
#define DTK_DATE
Definition: datetime.h:144
#define MAXDATELEN
Definition: datetime.h:200
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:77
void OwnLatch(Latch *latch)
Definition: latch.c:126
void DisownLatch(Latch *latch)
Definition: latch.c:144
void InitSharedLatch(Latch *latch)
Definition: latch.c:93
void SetLatch(Latch *latch)
Definition: latch.c:290
void ResetLatch(Latch *latch)
Definition: latch.c:374
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:172
List * lappend(List *list, void *datum)
Definition: list.c:339
void list_free_deep(List *list)
Definition: list.c:1560
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:1759
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc0(Size size)
Definition: mcxt.c:1395
void * palloc(Size size)
Definition: mcxt.c:1365
#define AmStartupProcess()
Definition: miscadmin.h:389
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:476
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:75
#define XLOG_CHECKPOINT_REDO
Definition: pg_control.h:82
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:81
DBState
Definition: pg_control.h:90
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:96
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:93
@ DB_SHUTDOWNED
Definition: pg_control.h:92
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:95
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:68
#define XLOG_BACKUP_END
Definition: pg_control.h:73
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:69
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:77
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
XLogRecPtr pg_lsn_in_safe(const char *str, Node *escontext)
Definition: pg_lsn.c:32
static rewind_source * source
Definition: pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:72
@ IOOBJECT_WAL
Definition: pgstat.h:277
@ IOCONTEXT_NORMAL
Definition: pgstat.h:287
@ IOOP_READ
Definition: pgstat.h:313
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:122
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:165
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:37
@ PMSIGNAL_RECOVERY_CONSISTENT
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:226
#define snprintf
Definition: port.h:239
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:360
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:222
#define InvalidOid
Definition: postgres_ext.h:37
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4365
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4526
static void set_ps_display(const char *activity)
Definition: ps_status.h:40
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
#define PG_TBLSPC_DIR
Definition: relpath.h:41
void RmgrStartup(void)
Definition: rmgr.c:58
void RmgrCleanup(void)
Definition: rmgr.c:74
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
bool ReplicationSlotValidateName(const char *name, bool allow_reserved_name, int elevel)
Definition: slot.c:272
void ShutDownSlotSync(void)
Definition: slotsync.c:1580
#define SpinLockInit(lock)
Definition: spin.h:57
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:145
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:230
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:242
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97
Oid oldestMultiDB
Definition: pg_control.h:51
MultiXactId oldestMulti
Definition: pg_control.h:50
MultiXactOffset nextMultiOffset
Definition: pg_control.h:47
TransactionId newestCommitTsXid
Definition: pg_control.h:55
TransactionId oldestXid
Definition: pg_control.h:48
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:45
MultiXactId nextMulti
Definition: pg_control.h:46
FullTransactionId nextXid
Definition: pg_control.h:44
TransactionId oldestCommitTsXid
Definition: pg_control.h:53
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:49
XLogRecPtr backupStartPoint
Definition: pg_control.h:170
bool backupEndRequired
Definition: pg_control.h:172
CheckPoint checkPointCopy
Definition: pg_control.h:135
XLogRecPtr backupEndPoint
Definition: pg_control.h:171
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:168
XLogRecPtr checkPoint
Definition: pg_control.h:133
uint64 system_identifier
Definition: pg_control.h:110
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:169
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:121
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:130
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:131
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:119
struct ErrorContextCallback * previous
Definition: elog.h:297
void(* callback)(void *arg)
Definition: elog.h:298
bool error_occurred
Definition: miscnodes.h:47
Definition: latch.h:114
Definition: pg_list.h:54
Definition: nodes.h:135
RelFileNumber relNumber
const char *(* rm_identify)(uint8 info)
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
TimeLineID replayTLI
Definition: xlogrecovery.c:201
XLogRecPtr missingContrecPtr
Definition: xlogreader.h:215
char * errormsg_buf
Definition: xlogreader.h:311
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
uint64 system_identifier
Definition: xlogreader.h:191
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:206
XLogRecPtr abortedRecPtr
Definition: xlogreader.h:214
TimeLineID latestPageTLI
Definition: xlogreader.h:280
XLogRecPtr overwrittenRecPtr
Definition: xlogreader.h:217
XLogRecPtr latestPagePtr
Definition: xlogreader.h:279
WALOpenSegment seg
Definition: xlogreader.h:272
void * private_data
Definition: xlogreader.h:196
uint8 xl_info
Definition: xlogrecord.h:46
uint32 xl_tot_len
Definition: xlogrecord.h:43
TransactionId xl_xid
Definition: xlogrecord.h:44
RmgrId xl_rmid
Definition: xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
Definition: xlogrecovery.c:366
XLogRecPtr lastReplayedEndRecPtr
Definition: xlogrecovery.c:346
TimeLineID replayEndTLI
Definition: xlogrecovery.c:355
TimeLineID lastReplayedTLI
Definition: xlogrecovery.c:347
TimestampTz currentChunkStartTime
Definition: xlogrecovery.c:363
XLogRecPtr replayEndRecPtr
Definition: xlogrecovery.c:354
TimestampTz recoveryLastXTime
Definition: xlogrecovery.c:357
RecoveryPauseState recoveryPauseState
Definition: xlogrecovery.c:365
XLogRecPtr lastReplayedReadRecPtr
Definition: xlogrecovery.c:345
Definition: guc.h:174
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: pgtime.h:35
Definition: regguts.h:323
TimeLineID PrevTimeLineID
TimeLineID ThisTimeLineID
char rp_name[MAXFNAMELEN]
TransactionId twophase_xid
Definition: xact.h:428
TransactionId twophase_xid
Definition: xact.h:398
#define InvalidTransactionId
Definition: transam.h:31
#define U64FromFullTransactionId(x)
Definition: transam.h:49
#define XidFromFullTransactionId(x)
Definition: transam.h:48
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: timestamp.h:85
static TimestampTz DatumGetTimestampTz(Datum X)
Definition: timestamp.h:34
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:69
static void pgstat_report_wait_end(void)
Definition: wait_event.h:85
#define WL_TIMEOUT
Definition: waiteventset.h:37
#define WL_EXIT_ON_PM_DEATH
Definition: waiteventset.h:39
#define WL_LATCH_SET
Definition: waiteventset.h:34
void WalRcvForceReply(void)
Definition: walreceiver.c:1350
#define AllowCascadeReplication()
Definition: walreceiver.h:40
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
bool WalRcvStreaming(void)
void RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, const char *slotname, bool create_temp_slot)
bool WalRcvRunning(void)
void WalSndWakeup(bool physical, bool logical)
Definition: walsender.c:3779
#define stat
Definition: win32_port.h:274
#define S_IRUSR
Definition: win32_port.h:279
#define symlink(oldpath, newpath)
Definition: win32_port.h:225
#define S_IWUSR
Definition: win32_port.h:282
#define XLOG_XACT_COMMIT_PREPARED
Definition: xact.h:173
#define XLOG_XACT_COMMIT
Definition: xact.h:170
#define XLOG_XACT_OPMASK
Definition: xact.h:180
#define XLOG_XACT_ABORT
Definition: xact.h:172
#define XLOG_XACT_ABORT_PREPARED
Definition: xact.h:174
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition: xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition: xactdesc.c:141
int wal_decode_buffer_size
Definition: xlog.c:137
bool EnableHotStandby
Definition: xlog.c:122
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:6486
void SetInstallXLogFileSegmentActive(void)
Definition: xlog.c:9525
bool IsInstallXLogFileSegmentActive(void)
Definition: xlog.c:9533
int wal_segment_size
Definition: xlog.c:144
void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
Definition: xlog.c:6258
void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
Definition: xlog.c:3953
void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
Definition: xlog.c:6296
int wal_retrieve_retry_interval
Definition: xlog.c:135
bool track_wal_io_timing
Definition: xlog.c:138
static ControlFileData * ControlFile
Definition: xlog.c:574
void XLogShutdownWalRcv(void)
Definition: xlog.c:9514
bool XLogCheckpointNeeded(XLogSegNo new_segno)
Definition: xlog.c:2280
#define TABLESPACE_MAP_OLD
Definition: xlog.h:307
#define TABLESPACE_MAP
Definition: xlog.h:306
#define STANDBY_SIGNAL_FILE
Definition: xlog.h:302
#define CHECKPOINT_CAUSE_XLOG
Definition: xlog.h:148
#define PROMOTE_SIGNAL_FILE
Definition: xlog.h:310
#define BACKUP_LABEL_FILE
Definition: xlog.h:303
#define RECOVERY_SIGNAL_FILE
Definition: xlog.h:301
static RmgrData GetRmgr(RmgrId rmid)
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes)
#define MAXFNAMELEN
#define XLOGDIR
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
static void XLogFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XRecOffIsValid(xlrp)
static void XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled)
Definition: xlogarchive.c:54
void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
Definition: xlogarchive.c:358
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:46
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint32 TimeLineID
Definition: xlogdefs.h:62
uint64 XLogSegNo
Definition: xlogdefs.h:51
void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
XLogPrefetcher * XLogPrefetcherAllocate(XLogReaderState *reader)
void XLogPrefetchReconfigure(void)
XLogRecord * XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
XLogReaderState * XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
void XLogPrefetcherFree(XLogPrefetcher *prefetcher)
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
Definition: xlogreader.c:2017
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition: xlogreader.c:107
void XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
Definition: xlogreader.c:91
void XLogReaderResetError(XLogReaderState *state)
Definition: xlogreader.c:1376
bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr)
Definition: xlogreader.c:1235
void XLogReaderFree(XLogReaderState *state)
Definition: xlogreader.c:162
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
Definition: xlogreader.c:2076
#define XLogRecGetDataLen(decoder)
Definition: xlogreader.h:416
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecBlockImageApply(decoder, block_id)
Definition: xlogreader.h:425
#define XLogRecGetRmid(decoder)
Definition: xlogreader.h:411
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:412
#define XL_ROUTINE(...)
Definition: xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition: xlogreader.h:418
XLogPageReadResult
Definition: xlogreader.h:350
@ XLREAD_WOULDBLOCK
Definition: xlogreader.h:353
@ XLREAD_SUCCESS
Definition: xlogreader.h:351
@ XLREAD_FAIL
Definition: xlogreader.h:352
#define XLogRecHasBlockImage(decoder, block_id)
Definition: xlogreader.h:423
#define XLogRecGetPrev(decoder)
Definition: xlogreader.h:409
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:417
#define SizeOfXLogRecordDataHeaderShort
Definition: xlogrecord.h:217
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
#define SizeOfXLogRecord
Definition: xlogrecord.h:55
#define XLR_CHECK_CONSISTENCY
Definition: xlogrecord.h:91
bool reachedConsistency
Definition: xlogrecovery.c:301
bool check_primary_slot_name(char **newval, void **extra, GucSource source)
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
static XLogRecPtr recoveryStopLSN
Definition: xlogrecovery.c:388
static bool recoveryStopsBefore(XLogReaderState *record)
static TimestampTz recoveryStopTime
Definition: xlogrecovery.c:387
void assign_recovery_target_xid(const char *newval, void *extra)
static bool CheckForStandbyTrigger(void)
int recovery_min_apply_delay
Definition: xlogrecovery.c:95
bool check_recovery_target(char **newval, void **extra, GucSource source)
static bool backupEndRequired
Definition: xlogrecovery.c:285
bool HotStandbyActive(void)
static char * getRecoveryStopReason(void)
void ShutdownWalRecovery(void)
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
Definition: xlogrecovery.c:122
int recoveryTargetAction
Definition: xlogrecovery.c:89
static void rm_redo_error_callback(void *arg)
static bool recoveryApplyDelay(XLogReaderState *record)
bool ArchiveRecoveryRequested
Definition: xlogrecovery.c:139
const char * recoveryTargetName
Definition: xlogrecovery.c:93
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
bool check_recovery_target_timeline(char **newval, void **extra, GucSource source)
static XLogRecPtr minRecoveryPoint
Definition: xlogrecovery.c:280
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
static XLogRecPtr backupEndPoint
Definition: xlogrecovery.c:284
const struct config_enum_entry recovery_target_action_options[]
Definition: xlogrecovery.c:76
static void validateRecoveryParameters(void)
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, TimeLineID replayTLI)
static XLogRecord * ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, TimeLineID replayTLI)
void StartupRequestWalReceiverRestart(void)
bool InArchiveRecovery
Definition: xlogrecovery.c:140
static bool recoveryStopsAfter(XLogReaderState *record)
void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
char * PrimarySlotName
Definition: xlogrecovery.c:99
static TimeLineID curFileTLI
Definition: xlogrecovery.c:126
static char recoveryStopName[MAXFNAMELEN]
Definition: xlogrecovery.c:389
static void CheckRecoveryConsistency(void)
static bool pendingWalRcvRestart
Definition: xlogrecovery.c:250
void PerformWalRecovery(void)
static XLogSource XLogReceiptSource
Definition: xlogrecovery.c:261
bool CheckPromoteSignal(void)
struct XLogPageReadPrivate XLogPageReadPrivate
static bool recoveryStopAfter
Definition: xlogrecovery.c:390
static const char *const xlogSourceNames[]
Definition: xlogrecovery.c:220
static TimeLineID RedoStartTLI
Definition: xlogrecovery.c:172
char * recoveryRestoreCommand
Definition: xlogrecovery.c:84
static void verifyBackupPageConsistency(XLogReaderState *record)
static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
void assign_recovery_target(const char *newval, void *extra)
void SetRecoveryPause(bool recoveryPause)
static bool lastSourceFailed
Definition: xlogrecovery.c:249
char * archiveCleanupCommand
Definition: xlogrecovery.c:86
XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
static TimeLineID receiveTLI
Definition: xlogrecovery.c:265
void WakeupRecovery(void)
void xlog_outdesc(StringInfo buf, XLogReaderState *record)
static bool LocalPromoteIsTriggered
Definition: xlogrecovery.c:184
bool PromoteIsTriggered(void)
TimestampTz GetCurrentChunkReplayStartTime(void)
static void ConfirmRecoveryPaused(void)
static void readRecoverySignalFile(void)
static XLogRecPtr missingContrecPtr
Definition: xlogrecovery.c:380
static XLogRecoveryCtlData * XLogRecoveryCtl
Definition: xlogrecovery.c:371
static uint32 readOff
Definition: xlogrecovery.c:234
static bool standby_signal_file_found
Definition: xlogrecovery.c:152
char * recovery_target_time_string
Definition: xlogrecovery.c:91
bool StandbyMode
Definition: xlogrecovery.c:149
static int readFile
Definition: xlogrecovery.c:232
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, TimeLineID replayTLI, XLogRecPtr replayLSN, bool nonblocking)
XLogRecPtr recoveryTargetLSN
Definition: xlogrecovery.c:94
RecoveryTargetType recoveryTarget
Definition: xlogrecovery.c:87
static bool read_tablespace_map(List **tablespaces)
static bool doRequestWalReceiverReply
Definition: xlogrecovery.c:187
static bool read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, bool *backupEndRequired, bool *backupFromStandby)
static int XLogFileRead(XLogSegNo segno, TimeLineID tli, XLogSource source, bool notfoundOk)
static XLogSource currentSource
Definition: xlogrecovery.c:248
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
static List * expectedTLEs
Definition: xlogrecovery.c:125
static XLogSegNo readSegNo
Definition: xlogrecovery.c:233
void assign_recovery_target_name(const char *newval, void *extra)
static XLogRecPtr abortedRecPtr
Definition: xlogrecovery.c:379
static char * primary_image_masked
Definition: xlogrecovery.c:305
static TimeLineID minRecoveryPointTLI
Definition: xlogrecovery.c:281
static XLogRecord * ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, bool fetching_ckpt, TimeLineID replayTLI)
EndOfWalRecoveryInfo * FinishWalRecovery(void)
void assign_recovery_target_time(const char *newval, void *extra)
static void SetCurrentChunkStartTime(TimestampTz xtime)
static XLogRecPtr CheckPointLoc
Definition: xlogrecovery.c:169
bool check_recovery_target_xid(char **newval, void **extra, GucSource source)
static bool LocalHotStandbyActive
Definition: xlogrecovery.c:178
struct XLogRecoveryCtlData XLogRecoveryCtlData
static bool HotStandbyActiveInReplay(void)
static bool InRedo
Definition: xlogrecovery.c:205
static TransactionId recoveryStopXid
Definition: xlogrecovery.c:386
bool check_recovery_target_time(char **newval, void **extra, GucSource source)
static XLogSource readSource
Definition: xlogrecovery.c:236
static void SetPromoteIsTriggered(void)
#define RECOVERY_COMMAND_FILE
Definition: xlogrecovery.c:70
TransactionId recoveryTargetXid
Definition: xlogrecovery.c:90
XLogSource
Definition: xlogrecovery.c:212
@ XLOG_FROM_PG_WAL
Definition: xlogrecovery.c:215
@ XLOG_FROM_STREAM
Definition: xlogrecovery.c:216
@ XLOG_FROM_ARCHIVE
Definition: xlogrecovery.c:214
@ XLOG_FROM_ANY
Definition: xlogrecovery.c:213
TimeLineID recoveryTargetTLIRequested
Definition: xlogrecovery.c:123
static pg_noreturn void error_multiple_recovery_targets(void)
void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
Definition: xlogrecovery.c:519
static void xlog_block_info(StringInfo buf, XLogReaderState *record)
static TimestampTz XLogReceiptTime
Definition: xlogrecovery.c:260
static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
Size XLogRecoveryShmemSize(void)
Definition: xlogrecovery.c:454
static char * replay_image_masked
Definition: xlogrecovery.c:304
bool wal_receiver_create_temp_slot
Definition: xlogrecovery.c:100
static void CheckTablespaceDirectory(void)
char * recoveryEndCommand
Definition: xlogrecovery.c:85
RecoveryPauseState GetRecoveryPauseState(void)
TimeLineID recoveryTargetTLI
Definition: xlogrecovery.c:124
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
void assign_recovery_target_lsn(const char *newval, void *extra)
bool check_recovery_target_lsn(char **newval, void **extra, GucSource source)
static XLogRecPtr RedoStartLSN
Definition: xlogrecovery.c:171
static XLogRecPtr flushedUpto
Definition: xlogrecovery.c:264
void XLogRecoveryShmemInit(void)
Definition: xlogrecovery.c:465
static void recoveryPausesHere(bool endOfRecovery)
static uint32 readLen
Definition: xlogrecovery.c:235
static void EnableStandbyMode(void)
Definition: xlogrecovery.c:485
#define RECOVERY_COMMAND_DONE
Definition: xlogrecovery.c:71
static bool recovery_signal_file_found
Definition: xlogrecovery.c:153
TimestampTz recoveryTargetTime
Definition: xlogrecovery.c:92
TimestampTz GetLatestXTime(void)
char * PrimaryConnInfo
Definition: xlogrecovery.c:98
void XLogRequestWalReceiverReply(void)
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
static XLogPrefetcher * xlogprefetcher
Definition: xlogrecovery.c:193
static bool StandbyModeRequested
Definition: xlogrecovery.c:148
bool check_recovery_target_name(char **newval, void **extra, GucSource source)
bool recoveryTargetInclusive
Definition: xlogrecovery.c:88
static XLogReaderState * xlogreader
Definition: xlogrecovery.c:190
void RemovePromoteSignalFiles(void)
void assign_recovery_target_timeline(const char *newval, void *extra)
static XLogRecPtr backupStartPoint
Definition: xlogrecovery.c:283
static void SetLatestXTime(TimestampTz xtime)
static TimeLineID CheckPointTLI
Definition: xlogrecovery.c:170
@ RECOVERY_TARGET_ACTION_PAUSE
Definition: xlogrecovery.h:48
@ RECOVERY_TARGET_ACTION_PROMOTE
Definition: xlogrecovery.h:49
@ RECOVERY_TARGET_ACTION_SHUTDOWN
Definition: xlogrecovery.h:50
RecoveryTargetType
Definition: xlogrecovery.h:24
@ RECOVERY_TARGET_IMMEDIATE
Definition: xlogrecovery.h:30
@ RECOVERY_TARGET_TIME
Definition: xlogrecovery.h:27
@ RECOVERY_TARGET_UNSET
Definition: xlogrecovery.h:25
@ RECOVERY_TARGET_XID
Definition: xlogrecovery.h:26
@ RECOVERY_TARGET_LSN
Definition: xlogrecovery.h:29
@ RECOVERY_TARGET_NAME
Definition: xlogrecovery.h:28
RecoveryTargetTimeLineGoal
Definition: xlogrecovery.h:37
@ RECOVERY_TARGET_TIMELINE_NUMERIC
Definition: xlogrecovery.h:40
@ RECOVERY_TARGET_TIMELINE_CONTROLFILE
Definition: xlogrecovery.h:38
@ RECOVERY_TARGET_TIMELINE_LATEST
Definition: xlogrecovery.h:39
RecoveryPauseState
Definition: xlogrecovery.h:55
@ RECOVERY_PAUSED
Definition: xlogrecovery.h:58
@ RECOVERY_NOT_PAUSED
Definition: xlogrecovery.h:56
@ RECOVERY_PAUSE_REQUESTED
Definition: xlogrecovery.h:57
void wal_segment_close(XLogReaderState *state)
Definition: xlogutils.c:831
Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode, Buffer recent_buffer)
Definition: xlogutils.c:460
HotStandbyState standbyState
Definition: xlogutils.c:53
bool InRecovery
Definition: xlogutils.c:50
void XLogCheckInvalidPages(void)
Definition: xlogutils.c:234
@ STANDBY_SNAPSHOT_READY
Definition: xlogutils.h:55
@ STANDBY_INITIALIZED
Definition: xlogutils.h:53