Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit f64b11f

Browse files
committed
Fix an assertion failure related to an exclusive backup.
Previously multiple sessions could execute pg_start_backup() and pg_stop_backup() to start and stop an exclusive backup at the same time. This could trigger the assertion failure of "FailedAssertion("!(XLogCtl->Insert.exclusiveBackup)". This happend because, even while pg_start_backup() was starting an exclusive backup, other session could run pg_stop_backup() concurrently and mark the backup as not-in-progress unconditionally. This patch introduces ExclusiveBackupState indicating the state of an exclusive backup. This state is used to ensure that there is only one session running pg_start_backup() or pg_stop_backup() at the same time, to avoid the assertion failure. Back-patch to all supported versions. Author: Michael Paquier Reviewed-By: Kyotaro Horiguchi and me Reported-By: Andreas Seltenreich Discussion: <[email protected]>
1 parent 47d32a4 commit f64b11f

File tree

1 file changed

+147
-62
lines changed
  • src/backend/access/transam

1 file changed

+147
-62
lines changed

src/backend/access/transam/xlog.c

Lines changed: 147 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,29 @@ typedef struct XLogwrtResult
348348
XLogRecPtr Flush; /* last byte + 1 flushed */
349349
} XLogwrtResult;
350350

351+
/*
352+
* State of an exclusive backup, necessary to control concurrent activities
353+
* across sessions when working on exclusive backups.
354+
*
355+
* EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
356+
* running, to be more precise pg_start_backup() is not being executed for
357+
* an exclusive backup and there is no exclusive backup in progress.
358+
* EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
359+
* exclusive backup.
360+
* EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
361+
* running and an exclusive backup is in progress. pg_stop_backup() is
362+
* needed to finish it.
363+
* EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
364+
* exclusive backup.
365+
*/
366+
typedef enum ExclusiveBackupState
367+
{
368+
EXCLUSIVE_BACKUP_NONE = 0,
369+
EXCLUSIVE_BACKUP_STARTING,
370+
EXCLUSIVE_BACKUP_IN_PROGRESS,
371+
EXCLUSIVE_BACKUP_STOPPING
372+
} ExclusiveBackupState;
373+
351374
/*
352375
* Shared state data for XLogInsert.
353376
*/
@@ -370,13 +393,15 @@ typedef struct XLogCtlInsert
370393
bool fullPageWrites;
371394

372395
/*
373-
* exclusiveBackup is true if a backup started with pg_start_backup() is
374-
* in progress, and nonExclusiveBackups is a counter indicating the number
375-
* of streaming base backups currently in progress. forcePageWrites is set
376-
* to true when either of these is non-zero. lastBackupStart is the latest
377-
* checkpoint redo location used as a starting point for an online backup.
396+
* exclusiveBackupState indicates the state of an exclusive backup
397+
* (see comments of ExclusiveBackupState for more details).
398+
* nonExclusiveBackups is a counter indicating the number of streaming
399+
* base backups currently in progress. forcePageWrites is set to true
400+
* when either of these is non-zero. lastBackupStart is the latest
401+
* checkpoint redo location used as a starting point for an online
402+
* backup.
378403
*/
379-
bool exclusiveBackup;
404+
ExclusiveBackupState exclusiveBackupState;
380405
int nonExclusiveBackups;
381406
XLogRecPtr lastBackupStart;
382407
} XLogCtlInsert;
@@ -693,6 +718,7 @@ static bool CheckForStandbyTrigger(void);
693718
static void xlog_outrec(StringInfo buf, XLogRecord *record);
694719
#endif
695720
static void pg_start_backup_callback(int code, Datum arg);
721+
static void pg_stop_backup_callback(int code, Datum arg);
696722
static bool read_backup_label(XLogRecPtr *checkPointLoc,
697723
bool *backupEndRequired, bool *backupFromStandby);
698724
static void rm_redo_error_callback(void *arg);
@@ -8699,15 +8725,20 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
86998725
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
87008726
if (exclusive)
87018727
{
8702-
if (XLogCtl->Insert.exclusiveBackup)
8728+
/*
8729+
* At first, mark that we're now starting an exclusive backup,
8730+
* to ensure that there are no other sessions currently running
8731+
* pg_start_backup() or pg_stop_backup().
8732+
*/
8733+
if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
87038734
{
87048735
LWLockRelease(WALInsertLock);
87058736
ereport(ERROR,
87068737
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
87078738
errmsg("a backup is already in progress"),
87088739
errhint("Run pg_stop_backup() and try again.")));
87098740
}
8710-
XLogCtl->Insert.exclusiveBackup = true;
8741+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
87118742
}
87128743
else
87138744
XLogCtl->Insert.nonExclusiveBackups++;
@@ -8867,7 +8898,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
88678898
{
88688899
/*
88698900
* Check for existing backup label --- implies a backup is already
8870-
* running. (XXX given that we checked exclusiveBackup above,
8901+
* running. (XXX given that we checked exclusiveBackupState above,
88718902
* maybe it would be OK to just unlink any such label file?)
88728903
*/
88738904
if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
@@ -8908,6 +8939,16 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
89088939
}
89098940
PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
89108941

8942+
/*
8943+
* Mark that start phase has correctly finished for an exclusive backup.
8944+
*/
8945+
if (exclusive)
8946+
{
8947+
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8948+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
8949+
LWLockRelease(WALInsertLock);
8950+
}
8951+
89118952
/*
89128953
* We're done. As a convenience, return the starting WAL location.
89138954
*/
@@ -8926,23 +8967,41 @@ pg_start_backup_callback(int code, Datum arg)
89268967
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
89278968
if (exclusive)
89288969
{
8929-
Assert(XLogCtl->Insert.exclusiveBackup);
8930-
XLogCtl->Insert.exclusiveBackup = false;
8970+
Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
8971+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
89318972
}
89328973
else
89338974
{
89348975
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
89358976
XLogCtl->Insert.nonExclusiveBackups--;
89368977
}
89378978

8938-
if (!XLogCtl->Insert.exclusiveBackup &&
8979+
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
89398980
XLogCtl->Insert.nonExclusiveBackups == 0)
89408981
{
89418982
XLogCtl->Insert.forcePageWrites = false;
89428983
}
89438984
LWLockRelease(WALInsertLock);
89448985
}
89458986

8987+
/*
8988+
* Error cleanup callback for pg_stop_backup
8989+
*/
8990+
static void
8991+
pg_stop_backup_callback(int code, Datum arg)
8992+
{
8993+
bool exclusive = DatumGetBool(arg);
8994+
8995+
/* Update backup status on failure */
8996+
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8997+
if (exclusive)
8998+
{
8999+
Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
9000+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
9001+
}
9002+
LWLockRelease(WALInsertLock);
9003+
}
9004+
89469005
/*
89479006
* do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
89489007
* function.
@@ -9006,12 +9065,85 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
90069065
errmsg("WAL level not sufficient for making an online backup"),
90079066
errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
90089067

9068+
if (exclusive)
9069+
{
9070+
/*
9071+
* At first, mark that we're now stopping an exclusive backup,
9072+
* to ensure that there are no other sessions currently running
9073+
* pg_start_backup() or pg_stop_backup().
9074+
*/
9075+
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9076+
if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
9077+
{
9078+
LWLockRelease(WALInsertLock);
9079+
ereport(ERROR,
9080+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9081+
errmsg("exclusive backup not in progress")));
9082+
}
9083+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
9084+
LWLockRelease(WALInsertLock);
9085+
9086+
/*
9087+
* Remove backup_label. In case of failure, the state for an exclusive
9088+
* backup is switched back to in-progress.
9089+
*/
9090+
PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
9091+
{
9092+
/*
9093+
* Read the existing label file into memory.
9094+
*/
9095+
struct stat statbuf;
9096+
int r;
9097+
9098+
if (stat(BACKUP_LABEL_FILE, &statbuf))
9099+
{
9100+
if (errno != ENOENT)
9101+
ereport(ERROR,
9102+
(errcode_for_file_access(),
9103+
errmsg("could not stat file \"%s\": %m",
9104+
BACKUP_LABEL_FILE)));
9105+
ereport(ERROR,
9106+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9107+
errmsg("a backup is not in progress")));
9108+
}
9109+
9110+
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9111+
if (!lfp)
9112+
{
9113+
ereport(ERROR,
9114+
(errcode_for_file_access(),
9115+
errmsg("could not read file \"%s\": %m",
9116+
BACKUP_LABEL_FILE)));
9117+
}
9118+
labelfile = palloc(statbuf.st_size + 1);
9119+
r = fread(labelfile, statbuf.st_size, 1, lfp);
9120+
labelfile[statbuf.st_size] = '\0';
9121+
9122+
/*
9123+
* Close and remove the backup label file
9124+
*/
9125+
if (r != 1 || ferror(lfp) || FreeFile(lfp))
9126+
ereport(ERROR,
9127+
(errcode_for_file_access(),
9128+
errmsg("could not read file \"%s\": %m",
9129+
BACKUP_LABEL_FILE)));
9130+
if (unlink(BACKUP_LABEL_FILE) != 0)
9131+
ereport(ERROR,
9132+
(errcode_for_file_access(),
9133+
errmsg("could not remove file \"%s\": %m",
9134+
BACKUP_LABEL_FILE)));
9135+
}
9136+
PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
9137+
}
9138+
90099139
/*
90109140
* OK to update backup counters and forcePageWrites
90119141
*/
90129142
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
90139143
if (exclusive)
9014-
XLogCtl->Insert.exclusiveBackup = false;
9144+
{
9145+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
9146+
}
90159147
else
90169148
{
90179149
/*
@@ -9024,60 +9156,13 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
90249156
XLogCtl->Insert.nonExclusiveBackups--;
90259157
}
90269158

9027-
if (!XLogCtl->Insert.exclusiveBackup &&
9159+
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
90289160
XLogCtl->Insert.nonExclusiveBackups == 0)
90299161
{
90309162
XLogCtl->Insert.forcePageWrites = false;
90319163
}
90329164
LWLockRelease(WALInsertLock);
90339165

9034-
if (exclusive)
9035-
{
9036-
/*
9037-
* Read the existing label file into memory.
9038-
*/
9039-
struct stat statbuf;
9040-
int r;
9041-
9042-
if (stat(BACKUP_LABEL_FILE, &statbuf))
9043-
{
9044-
if (errno != ENOENT)
9045-
ereport(ERROR,
9046-
(errcode_for_file_access(),
9047-
errmsg("could not stat file \"%s\": %m",
9048-
BACKUP_LABEL_FILE)));
9049-
ereport(ERROR,
9050-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9051-
errmsg("a backup is not in progress")));
9052-
}
9053-
9054-
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9055-
if (!lfp)
9056-
{
9057-
ereport(ERROR,
9058-
(errcode_for_file_access(),
9059-
errmsg("could not read file \"%s\": %m",
9060-
BACKUP_LABEL_FILE)));
9061-
}
9062-
labelfile = palloc(statbuf.st_size + 1);
9063-
r = fread(labelfile, statbuf.st_size, 1, lfp);
9064-
labelfile[statbuf.st_size] = '\0';
9065-
9066-
/*
9067-
* Close and remove the backup label file
9068-
*/
9069-
if (r != 1 || ferror(lfp) || FreeFile(lfp))
9070-
ereport(ERROR,
9071-
(errcode_for_file_access(),
9072-
errmsg("could not read file \"%s\": %m",
9073-
BACKUP_LABEL_FILE)));
9074-
if (unlink(BACKUP_LABEL_FILE) != 0)
9075-
ereport(ERROR,
9076-
(errcode_for_file_access(),
9077-
errmsg("could not remove file \"%s\": %m",
9078-
BACKUP_LABEL_FILE)));
9079-
}
9080-
90819166
/*
90829167
* Read and parse the START WAL LOCATION line (this code is pretty crude,
90839168
* but we are not expecting any variability in the file format).
@@ -9318,7 +9403,7 @@ do_pg_abort_backup(void)
93189403
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
93199404
XLogCtl->Insert.nonExclusiveBackups--;
93209405

9321-
if (!XLogCtl->Insert.exclusiveBackup &&
9406+
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
93229407
XLogCtl->Insert.nonExclusiveBackups == 0)
93239408
{
93249409
XLogCtl->Insert.forcePageWrites = false;

0 commit comments

Comments
 (0)