Thanks to visit codestin.com
Credit goes to doxygen.postgresql.org

PostgreSQL Source Code git master
twophase.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * twophase.c
4 * Two-phase commit support functions.
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/access/transam/twophase.c
11 *
12 * NOTES
13 * Each global transaction is associated with a global transaction
14 * identifier (GID). The client assigns a GID to a postgres
15 * transaction with the PREPARE TRANSACTION command.
16 *
17 * We keep all active global transactions in a shared memory array.
18 * When the PREPARE TRANSACTION command is issued, the GID is
19 * reserved for the transaction in the array. This is done before
20 * a WAL entry is made, because the reservation checks for duplicate
21 * GIDs and aborts the transaction if there already is a global
22 * transaction in prepared state with the same GID.
23 *
24 * A global transaction (gxact) also has a dummy PGPROC; this is what keeps
25 * the XID considered running by TransactionIdIsInProgress. It is also
26 * convenient as a PGPROC to hook the gxact's locks to.
27 *
28 * Information to recover prepared transactions in case of crash is
29 * now stored in WAL for the common case. In some cases there will be
30 * an extended period between preparing a GXACT and commit/abort, in
31 * which case we need to separately record prepared transaction data
32 * in permanent storage. This includes locking information, pending
33 * notifications etc. All that state information is written to the
34 * per-transaction state file in the pg_twophase directory.
35 * All prepared transactions will be written prior to shutdown.
36 *
37 * The life cycle of state data is as follows:
38 *
39 * * On PREPARE TRANSACTION, the backend writes state data only to the WAL
40 * and stores a pointer to the start of the WAL record in
41 * gxact->prepare_start_lsn.
42 * * If COMMIT occurs before a checkpoint, the backend reads the data from
43 * WAL using prepare_start_lsn.
44 * * On checkpoint, state data is copied to files in the pg_twophase
45 * directory and fsynced.
46 * * If COMMIT happens after a checkpoint, the backend reads state data
47 * from files.
48 *
49 * During replay and replication, TwoPhaseState also holds information
50 * about active prepared transactions that haven't been moved to disk yet.
51 *
52 * Replay of twophase records happens by the following rules:
53 *
54 * * At the beginning of recovery, pg_twophase is scanned once, filling
55 * TwoPhaseState with entries marked with gxact->inredo and
56 * gxact->ondisk. Two-phase file data older than the XID horizon of
57 * the redo position are discarded.
58 * * On PREPARE redo, the transaction is added to TwoPhaseState->prepXacts.
59 * gxact->inredo is set to true for such entries.
60 * * On Checkpoint we iterate through TwoPhaseState->prepXacts entries
61 * that have gxact->inredo set and are behind the redo_horizon. We
62 * save them to disk and then switch gxact->ondisk to true.
63 * * On COMMIT/ABORT we delete the entry from TwoPhaseState->prepXacts.
64 * If gxact->ondisk is true, the corresponding entry from the disk
65 * is additionally deleted.
66 * * RecoverPreparedTransactions(), StandbyRecoverPreparedTransactions()
67 * and PrescanPreparedTransactions() have been modified to go through
68 * gxact->inredo entries that have not made it to disk.
69 *
70 *-------------------------------------------------------------------------
71 */
72#include "postgres.h"
73
74#include <fcntl.h>
75#include <sys/stat.h>
76#include <time.h>
77#include <unistd.h>
78
79#include "access/commit_ts.h"
80#include "access/htup_details.h"
81#include "access/subtrans.h"
82#include "access/transam.h"
83#include "access/twophase.h"
85#include "access/xact.h"
86#include "access/xlog.h"
87#include "access/xloginsert.h"
88#include "access/xlogreader.h"
89#include "access/xlogrecovery.h"
90#include "access/xlogutils.h"
91#include "catalog/pg_type.h"
92#include "catalog/storage.h"
93#include "funcapi.h"
94#include "miscadmin.h"
95#include "pg_trace.h"
96#include "pgstat.h"
97#include "replication/origin.h"
98#include "replication/syncrep.h"
99#include "storage/fd.h"
100#include "storage/ipc.h"
101#include "storage/md.h"
102#include "storage/predicate.h"
103#include "storage/proc.h"
104#include "storage/procarray.h"
105#include "utils/builtins.h"
107#include "utils/memutils.h"
108#include "utils/timestamp.h"
109
110/*
111 * Directory where Two-phase commit files reside within PGDATA
112 */
113#define TWOPHASE_DIR "pg_twophase"
114
115/* GUC variable, can't be changed after startup */
117
118/*
119 * This struct describes one global transaction that is in prepared state
120 * or attempting to become prepared.
121 *
122 * The lifecycle of a global transaction is:
123 *
124 * 1. After checking that the requested GID is not in use, set up an entry in
125 * the TwoPhaseState->prepXacts array with the correct GID and valid = false,
126 * and mark it as locked by my backend.
127 *
128 * 2. After successfully completing prepare, set valid = true and enter the
129 * referenced PGPROC into the global ProcArray.
130 *
131 * 3. To begin COMMIT PREPARED or ROLLBACK PREPARED, check that the entry is
132 * valid and not locked, then mark the entry as locked by storing my current
133 * proc number into locking_backend. This prevents concurrent attempts to
134 * commit or rollback the same prepared xact.
135 *
136 * 4. On completion of COMMIT PREPARED or ROLLBACK PREPARED, remove the entry
137 * from the ProcArray and the TwoPhaseState->prepXacts array and return it to
138 * the freelist.
139 *
140 * Note that if the preparing transaction fails between steps 1 and 2, the
141 * entry must be removed so that the GID and the GlobalTransaction struct
142 * can be reused. See AtAbort_Twophase().
143 *
144 * typedef struct GlobalTransactionData *GlobalTransaction appears in
145 * twophase.h
146 */
147
149{
150 GlobalTransaction next; /* list link for free list */
151 int pgprocno; /* ID of associated dummy PGPROC */
152 TimestampTz prepared_at; /* time of preparation */
153
154 /*
155 * Note that we need to keep track of two LSNs for each GXACT. We keep
156 * track of the start LSN because this is the address we must use to read
157 * state data back from WAL when committing a prepared GXACT. We keep
158 * track of the end LSN because that is the LSN we need to wait for prior
159 * to commit.
160 */
161 XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */
162 XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */
163 FullTransactionId fxid; /* The GXACT full xid */
164
165 Oid owner; /* ID of user that executed the xact */
166 ProcNumber locking_backend; /* backend currently working on the xact */
167 bool valid; /* true if PGPROC entry is in proc array */
168 bool ondisk; /* true if prepare state file is on disk */
169 bool inredo; /* true if entry was added via xlog_redo */
170 char gid[GIDSIZE]; /* The GID assigned to the prepared xact */
172
173/*
174 * Two Phase Commit shared state. Access to this struct is protected
175 * by TwoPhaseStateLock.
176 */
177typedef struct TwoPhaseStateData
178{
179 /* Head of linked list of free GlobalTransactionData structs */
181
182 /* Number of valid prepXacts entries. */
184
185 /* There are max_prepared_xacts items in this array */
188
190
191/*
192 * Global transaction entry currently locked by us, if any. Note that any
193 * access to the entry pointed to by this variable must be protected by
194 * TwoPhaseStateLock, though obviously the pointer itself doesn't need to be
195 * (since it's just local memory).
196 */
198
199static bool twophaseExitRegistered = false;
200
201static void PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning);
203 int nchildren,
204 TransactionId *children,
205 int nrels,
206 RelFileLocator *rels,
207 int nstats,
208 xl_xact_stats_item *stats,
209 int ninvalmsgs,
210 SharedInvalidationMessage *invalmsgs,
211 bool initfileinval,
212 const char *gid);
214 int nchildren,
215 TransactionId *children,
216 int nrels,
217 RelFileLocator *rels,
218 int nstats,
219 xl_xact_stats_item *stats,
220 const char *gid);
221static void ProcessRecords(char *bufptr, FullTransactionId fxid,
222 const TwoPhaseCallback callbacks[]);
223static void RemoveGXact(GlobalTransaction gxact);
224
225static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len);
227 XLogRecPtr prepare_start_lsn,
228 bool fromdisk, bool setParent, bool setNextXid);
230 const char *gid, TimestampTz prepared_at, Oid owner,
231 Oid databaseid);
232static void RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning);
233static void RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len);
234
235/*
236 * Initialization of shared memory
237 */
238Size
240{
241 Size size;
242
243 /* Need the fixed struct, the array of pointers, and the GTD structs */
244 size = offsetof(TwoPhaseStateData, prepXacts);
246 sizeof(GlobalTransaction)));
247 size = MAXALIGN(size);
249 sizeof(GlobalTransactionData)));
250
251 return size;
252}
253
254void
256{
257 bool found;
258
259 TwoPhaseState = ShmemInitStruct("Prepared Transaction Table",
261 &found);
263 {
264 GlobalTransaction gxacts;
265 int i;
266
267 Assert(!found);
270
271 /*
272 * Initialize the linked list of free GlobalTransactionData structs
273 */
274 gxacts = (GlobalTransaction)
275 ((char *) TwoPhaseState +
276 MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) +
278 for (i = 0; i < max_prepared_xacts; i++)
279 {
280 /* insert into linked list */
281 gxacts[i].next = TwoPhaseState->freeGXacts;
282 TwoPhaseState->freeGXacts = &gxacts[i];
283
284 /* associate it with a PGPROC assigned by InitProcGlobal */
286 }
287 }
288 else
289 Assert(found);
290}
291
292/*
293 * Exit hook to unlock the global transaction entry we're working on.
294 */
295static void
297{
298 /* same logic as abort */
300}
301
302/*
303 * Abort hook to unlock the global transaction entry we're working on.
304 */
305void
307{
308 if (MyLockedGxact == NULL)
309 return;
310
311 /*
312 * What to do with the locked global transaction entry? If we were in the
313 * process of preparing the transaction, but haven't written the WAL
314 * record and state file yet, the transaction must not be considered as
315 * prepared. Likewise, if we are in the process of finishing an
316 * already-prepared transaction, and fail after having already written the
317 * 2nd phase commit or rollback record to the WAL, the transaction should
318 * not be considered as prepared anymore. In those cases, just remove the
319 * entry from shared memory.
320 *
321 * Otherwise, the entry must be left in place so that the transaction can
322 * be finished later, so just unlock it.
323 *
324 * If we abort during prepare, after having written the WAL record, we
325 * might not have transferred all locks and other state to the prepared
326 * transaction yet. Likewise, if we abort during commit or rollback,
327 * after having written the WAL record, we might not have released all the
328 * resources held by the transaction yet. In those cases, the in-memory
329 * state can be wrong, but it's too late to back out.
330 */
331 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
332 if (!MyLockedGxact->valid)
334 else
336 LWLockRelease(TwoPhaseStateLock);
337
338 MyLockedGxact = NULL;
339}
340
341/*
342 * This is called after we have finished transferring state to the prepared
343 * PGPROC entry.
344 */
345void
347{
348 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
350 LWLockRelease(TwoPhaseStateLock);
351
352 MyLockedGxact = NULL;
353}
354
355
356/*
357 * MarkAsPreparing
358 * Reserve the GID for the given transaction.
359 */
361MarkAsPreparing(FullTransactionId fxid, const char *gid,
362 TimestampTz prepared_at, Oid owner, Oid databaseid)
363{
364 GlobalTransaction gxact;
365 int i;
366
367 if (strlen(gid) >= GIDSIZE)
369 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
370 errmsg("transaction identifier \"%s\" is too long",
371 gid)));
372
373 /* fail immediately if feature is disabled */
374 if (max_prepared_xacts == 0)
376 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
377 errmsg("prepared transactions are disabled"),
378 errhint("Set \"max_prepared_transactions\" to a nonzero value.")));
379
380 /* on first call, register the exit hook */
382 {
385 }
386
387 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
388
389 /* Check for conflicting GID */
390 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
391 {
392 gxact = TwoPhaseState->prepXacts[i];
393 if (strcmp(gxact->gid, gid) == 0)
394 {
397 errmsg("transaction identifier \"%s\" is already in use",
398 gid)));
399 }
400 }
401
402 /* Get a free gxact from the freelist */
403 if (TwoPhaseState->freeGXacts == NULL)
405 (errcode(ERRCODE_OUT_OF_MEMORY),
406 errmsg("maximum number of prepared transactions reached"),
407 errhint("Increase \"max_prepared_transactions\" (currently %d).",
409 gxact = TwoPhaseState->freeGXacts;
410 TwoPhaseState->freeGXacts = gxact->next;
411
412 MarkAsPreparingGuts(gxact, fxid, gid, prepared_at, owner, databaseid);
413
414 gxact->ondisk = false;
415
416 /* And insert it into the active array */
419
420 LWLockRelease(TwoPhaseStateLock);
421
422 return gxact;
423}
424
425/*
426 * MarkAsPreparingGuts
427 *
428 * This uses a gxact struct and puts it into the active array.
429 * NOTE: this is also used when reloading a gxact after a crash; so avoid
430 * assuming that we can use very much backend context.
431 *
432 * Note: This function should be called with appropriate locks held.
433 */
434static void
436 const char *gid, TimestampTz prepared_at, Oid owner,
437 Oid databaseid)
438{
439 PGPROC *proc;
440 int i;
442
443 Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
444
445 Assert(gxact != NULL);
446 proc = GetPGProcByNumber(gxact->pgprocno);
447
448 /* Initialize the PGPROC entry */
449 MemSet(proc, 0, sizeof(PGPROC));
450 dlist_node_init(&proc->links);
453 {
454 /* clone VXID, for TwoPhaseGetXidByVirtualXID() to find */
455 proc->vxid.lxid = MyProc->vxid.lxid;
457 }
458 else
459 {
461 /* GetLockConflicts() uses this to specify a wait on the XID */
462 proc->vxid.lxid = xid;
464 }
465 proc->xid = xid;
467 proc->delayChkptFlags = 0;
468 proc->statusFlags = 0;
469 proc->pid = 0;
470 proc->databaseId = databaseid;
471 proc->roleId = owner;
473 proc->isRegularBackend = false;
475 proc->lwWaitMode = 0;
476 proc->waitLock = NULL;
477 proc->waitProcLock = NULL;
478 pg_atomic_init_u64(&proc->waitStart, 0);
479 for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
480 dlist_init(&proc->myProcLocks[i]);
481 /* subxid data must be filled later by GXactLoadSubxactData */
482 proc->subxidStatus.overflowed = false;
483 proc->subxidStatus.count = 0;
484
485 gxact->prepared_at = prepared_at;
486 gxact->fxid = fxid;
487 gxact->owner = owner;
489 gxact->valid = false;
490 gxact->inredo = false;
491 strcpy(gxact->gid, gid);
492
493 /*
494 * Remember that we have this GlobalTransaction entry locked for us. If we
495 * abort after this, we must release it.
496 */
497 MyLockedGxact = gxact;
498}
499
500/*
501 * GXactLoadSubxactData
502 *
503 * If the transaction being persisted had any subtransactions, this must
504 * be called before MarkAsPrepared() to load information into the dummy
505 * PGPROC.
 *
 * At most PGPROC_MAX_CACHED_SUBXIDS child XIDs are copied into the dummy
 * PGPROC; if more are supplied, subxidStatus.overflowed is set and only
 * the first PGPROC_MAX_CACHED_SUBXIDS entries of "children" are kept.
 * With nsubxacts <= 0 the PGPROC is left untouched.
506 */
507static void
509 TransactionId *children)
510{
511 PGPROC *proc = GetPGProcByNumber(gxact->pgprocno);
512
513 /* We need no extra lock since the GXACT isn't valid yet */
 /* Too many children for the cache: flag the overflow and clamp the count */
514 if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS)
515 {
516 proc->subxidStatus.overflowed = true;
517 nsubxacts = PGPROC_MAX_CACHED_SUBXIDS;
518 }
 /* Copy the (possibly clamped) list and record how many XIDs were stored */
519 if (nsubxacts > 0)
520 {
521 memcpy(proc->subxids.xids, children,
522 nsubxacts * sizeof(TransactionId));
523 proc->subxidStatus.count = nsubxacts;
524 }
525}
526
527/*
528 * MarkAsPrepared
529 * Mark the GXACT as fully valid, and enter it into the global ProcArray.
530 *
531 * lock_held indicates whether caller already holds TwoPhaseStateLock.
532 */
533static void
534MarkAsPrepared(GlobalTransaction gxact, bool lock_held)
535{
536 /* Lock here may be overkill, but I'm not convinced of that ... */
537 if (!lock_held)
538 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
539 Assert(!gxact->valid);
540 gxact->valid = true;
541 if (!lock_held)
542 LWLockRelease(TwoPhaseStateLock);
543
544 /*
545 * Put it into the global ProcArray so TransactionIdIsInProgress considers
546 * the XID as still running.
547 */
549}
550
551/*
552 * LockGXact
553 * Locate the prepared transaction and mark it busy for COMMIT PREPARED or ROLLBACK PREPARED.
554 */
556LockGXact(const char *gid, Oid user)
557{
558 int i;
559
560 /* on first call, register the exit hook */
562 {
565 }
566
567 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
568
569 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
570 {
572 PGPROC *proc = GetPGProcByNumber(gxact->pgprocno);
573
574 /* Ignore not-yet-valid GIDs */
575 if (!gxact->valid)
576 continue;
577 if (strcmp(gxact->gid, gid) != 0)
578 continue;
579
580 /* Found it, but has someone else got it locked? */
583 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
584 errmsg("prepared transaction with identifier \"%s\" is busy",
585 gid)));
586
587 if (user != gxact->owner && !superuser_arg(user))
589 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
590 errmsg("permission denied to finish prepared transaction"),
591 errhint("Must be superuser or the user that prepared the transaction.")));
592
593 /*
594 * Note: it probably would be possible to allow committing from
595 * another database; but at the moment NOTIFY is known not to work and
596 * there may be some other issues as well. Hence disallow until
597 * someone gets motivated to make it work.
598 */
599 if (MyDatabaseId != proc->databaseId)
601 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
602 errmsg("prepared transaction belongs to another database"),
603 errhint("Connect to the database where the transaction was prepared to finish it.")));
604
605 /* OK for me to lock it */
607 MyLockedGxact = gxact;
608
609 LWLockRelease(TwoPhaseStateLock);
610
611 return gxact;
612 }
613
614 LWLockRelease(TwoPhaseStateLock);
615
617 (errcode(ERRCODE_UNDEFINED_OBJECT),
618 errmsg("prepared transaction with identifier \"%s\" does not exist",
619 gid)));
620
621 /* NOTREACHED */
622 return NULL;
623}
624
625/*
626 * RemoveGXact
627 * Remove the prepared transaction from the shared memory array.
628 *
629 * NB: caller should have already removed it from ProcArray
630 */
631static void
633{
634 int i;
635
636 Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
637
638 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
639 {
640 if (gxact == TwoPhaseState->prepXacts[i])
641 {
642 /* remove from the active array */
645
646 /* and put it back in the freelist */
647 gxact->next = TwoPhaseState->freeGXacts;
648 TwoPhaseState->freeGXacts = gxact;
649
650 return;
651 }
652 }
653
654 elog(ERROR, "failed to find %p in GlobalTransaction array", gxact);
655}
656
657/*
658 * Returns an array of all prepared transactions for the user-level
659 * function pg_prepared_xact.
660 *
661 * The returned array and all its elements are copies of internal data
662 * structures, to minimize the time we need to hold the TwoPhaseStateLock.
663 *
664 * WARNING -- we return even those transactions that are not fully prepared
665 * yet. The caller should filter them out if he doesn't want them.
666 *
667 * The returned array is palloc'd.
668 */
669static int
671{
672 GlobalTransaction array;
673 int num;
674 int i;
675
676 LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
677
678 if (TwoPhaseState->numPrepXacts == 0)
679 {
680 LWLockRelease(TwoPhaseStateLock);
681
682 *gxacts = NULL;
683 return 0;
684 }
685
687 array = (GlobalTransaction) palloc(sizeof(GlobalTransactionData) * num);
688 *gxacts = array;
689 for (i = 0; i < num; i++)
690 memcpy(array + i, TwoPhaseState->prepXacts[i],
691 sizeof(GlobalTransactionData));
692
693 LWLockRelease(TwoPhaseStateLock);
694
695 return num;
696}
697
698
699/* Working status for pg_prepared_xact */
700typedef struct
701{
706
707/*
708 * pg_prepared_xact
709 * Produce a view with one row per prepared transaction.
710 *
711 * This function is here so we don't have to export the
712 * GlobalTransactionData struct definition.
713 */
714Datum
716{
717 FuncCallContext *funcctx;
718 Working_State *status;
719
720 if (SRF_IS_FIRSTCALL())
721 {
722 TupleDesc tupdesc;
723 MemoryContext oldcontext;
724
725 /* create a function context for cross-call persistence */
726 funcctx = SRF_FIRSTCALL_INIT();
727
728 /*
729 * Switch to memory context appropriate for multiple function calls
730 */
731 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
732
733 /* build tupdesc for result tuples */
734 /* this had better match pg_prepared_xacts view in system_views.sql */
735 tupdesc = CreateTemplateTupleDesc(5);
736 TupleDescInitEntry(tupdesc, (AttrNumber) 1, "transaction",
737 XIDOID, -1, 0);
738 TupleDescInitEntry(tupdesc, (AttrNumber) 2, "gid",
739 TEXTOID, -1, 0);
740 TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepared",
741 TIMESTAMPTZOID, -1, 0);
742 TupleDescInitEntry(tupdesc, (AttrNumber) 4, "ownerid",
743 OIDOID, -1, 0);
744 TupleDescInitEntry(tupdesc, (AttrNumber) 5, "dbid",
745 OIDOID, -1, 0);
746
747 funcctx->tuple_desc = BlessTupleDesc(tupdesc);
748
749 /*
750 * Collect all the 2PC status information that we will format and send
751 * out as a result set.
752 */
753 status = (Working_State *) palloc(sizeof(Working_State));
754 funcctx->user_fctx = status;
755
756 status->ngxacts = GetPreparedTransactionList(&status->array);
757 status->currIdx = 0;
758
759 MemoryContextSwitchTo(oldcontext);
760 }
761
762 funcctx = SRF_PERCALL_SETUP();
763 status = (Working_State *) funcctx->user_fctx;
764
765 while (status->array != NULL && status->currIdx < status->ngxacts)
766 {
767 GlobalTransaction gxact = &status->array[status->currIdx++];
768 PGPROC *proc = GetPGProcByNumber(gxact->pgprocno);
769 Datum values[5] = {0};
770 bool nulls[5] = {0};
771 HeapTuple tuple;
772 Datum result;
773
774 if (!gxact->valid)
775 continue;
776
777 /*
778 * Form tuple with appropriate data.
779 */
780
781 values[0] = TransactionIdGetDatum(proc->xid);
782 values[1] = CStringGetTextDatum(gxact->gid);
784 values[3] = ObjectIdGetDatum(gxact->owner);
786
787 tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
788 result = HeapTupleGetDatum(tuple);
789 SRF_RETURN_NEXT(funcctx, result);
790 }
791
792 SRF_RETURN_DONE(funcctx);
793}
794
795/*
796 * TwoPhaseGetGXact
797 * Get the GlobalTransaction struct for a prepared transaction
798 * specified by full XID (fxid)
799 *
800 * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
801 * caller had better hold it.
 *
 * A lookup that finds no matching entry is reported with elog(ERROR).
 * A successful lookup is remembered in a one-entry static cache, and a
 * repeated call with the same fxid returns the cached pointer without
 * scanning the array or taking the lock.
802 */
805{
806 GlobalTransaction result = NULL;
807 int i;
808
 /* One-entry cache of the last successful lookup; static, so it persists across calls */
809 static FullTransactionId cached_fxid = {InvalidTransactionId};
810 static GlobalTransaction cached_gxact = NULL;
811
812 Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock));
813
814 /*
815 * During a recovery, COMMIT PREPARED, or ROLLBACK PREPARED, we'll be called
816 * repeatedly for the same XID. We can save work with a simple cache.
817 */
818 if (FullTransactionIdEquals(fxid, cached_fxid))
819 return cached_gxact;
820
821 if (!lock_held)
822 LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
823
 /* Linear scan of the active entries for a matching full XID */
824 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
825 {
827
828 if (FullTransactionIdEquals(gxact->fxid, fxid))
829 {
830 result = gxact;
831 break;
832 }
833 }
834
835 if (!lock_held)
836 LWLockRelease(TwoPhaseStateLock);
837
838 if (result == NULL) /* should not happen */
839 elog(ERROR, "failed to find GlobalTransaction for xid %u",
841
 /* Only reached on success: remember the hit for the next call */
842 cached_fxid = fxid;
843 cached_gxact = result;
844
845 return result;
846}
847
848/*
849 * TwoPhaseGetXidByVirtualXID
850 * Lookup VXID among xacts prepared since last startup.
851 *
852 * (This won't find recovered xacts.) If more than one matches, return any
853 * and set "have_more" to true. To witness multiple matches, a single
854 * proc number must consume 2^32 LXIDs, with no intervening database restart.
 *
 * As written, the XID returned is that of the first matching valid entry
 * in TwoPhaseState->prepXacts; the scan stops as soon as a second match
 * is seen.
855 */
858 bool *have_more)
859{
860 int i;
862
864 LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
865
866 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
867 {
869 PGPROC *proc;
870 VirtualTransactionId proc_vxid;
871
 /* Entries that are not (yet) valid are never reported */
872 if (!gxact->valid)
873 continue;
 /* Compare against the VXID stored in the entry's dummy PGPROC */
874 proc = GetPGProcByNumber(gxact->pgprocno);
875 GET_VXID_FROM_PGPROC(proc_vxid, *proc);
876 if (VirtualTransactionIdEquals(vxid, proc_vxid))
877 {
878 /*
879 * Startup process sets proc->vxid.procNumber to
880 * INVALID_PROC_NUMBER.
881 */
882 Assert(!gxact->inredo);
883
 /* A match was already recorded: flag the duplicate and stop scanning */
884 if (result != InvalidTransactionId)
885 {
886 *have_more = true;
887 break;
888 }
889 result = XidFromFullTransactionId(gxact->fxid);
890 }
891 }
892
893 LWLockRelease(TwoPhaseStateLock);
894
895 return result;
896}
897
898/*
899 * TwoPhaseGetDummyProcNumber
900 * Get the dummy proc number for prepared transaction
901 *
902 * Dummy proc numbers are similar to proc numbers of real backends. They
903 * start at MaxBackends, and are unique across all currently active real
904 * backends and prepared transactions. If lock_held is set to true,
905 * TwoPhaseStateLock will not be taken, so the caller had better hold it.
906 */
909{
910 GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held);
911
912 return gxact->pgprocno;
913}
914
915/*
916 * TwoPhaseGetDummyProc
917 * Get the PGPROC that represents a prepared transaction
918 *
919 * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
920 * caller had better hold it.
921 */
922PGPROC *
924{
925 GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held);
926
927 return GetPGProcByNumber(gxact->pgprocno);
928}
929
930/************************************************************************/
931/* State file support */
932/************************************************************************/
933
934/*
935 * Compute the FullTransactionId for the given TransactionId.
936 *
937 * This is safe if the xid has not yet reached COMMIT PREPARED or ROLLBACK
938 * PREPARED. After those commands, concurrent vac_truncate_clog() may make
939 * the xid cease to qualify as allowable. XXX Not all callers limit their
940 * calls accordingly.
941 */
942static inline FullTransactionId
944{
947}
948
949static inline int
951{
952 return snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X%08X",
955}
956
957/*
958 * 2PC state file format:
959 *
960 * 1. TwoPhaseFileHeader
961 * 2. TransactionId[] (subtransactions)
962 * 3. RelFileLocator[] (files to be deleted at commit)
963 * 4. RelFileLocator[] (files to be deleted at abort)
964 * 5. SharedInvalidationMessage[] (inval messages to be sent at commit)
965 * 6. TwoPhaseRecordOnDisk
966 * 7. ...
967 * 8. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID)
968 * 9. checksum (CRC-32C)
969 *
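970 * Each segment except the final checksum is MAXALIGN'd. Note that StartPrepare() also writes the GID string directly after the header, and the commit and abort xl_xact_stats_item[] arrays between items 4 and 5.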
971 */
972
973/*
974 * Header for a 2PC state file
975 */
976#define TWOPHASE_MAGIC 0x57F94534 /* format identifier */
977
979
980/*
981 * Header for each record in a state file
982 *
983 * NOTE: len counts only the rmgr data, not the TwoPhaseRecordOnDisk header.
984 * The rmgr data will be stored starting on a MAXALIGN boundary.
985 */
987{
988 uint32 len; /* length of rmgr data */
989 TwoPhaseRmgrId rmid; /* resource manager for this record */
990 uint16 info; /* flag bits for use by rmgr */
992
993/*
994 * During prepare, the state file is assembled in memory before writing it
995 * to WAL and the actual state file. We use a chain of StateFileChunk blocks
996 * for that.
997 */
998typedef struct StateFileChunk
999{
1000 char *data;
1004
1005static struct xllist
1006{
1007 StateFileChunk *head; /* first data block in the chain */
1008 StateFileChunk *tail; /* last block in chain */
1010 uint32 bytes_free; /* free bytes left in tail block */
1011 uint32 total_len; /* total data bytes in chain */
1013
1014
1015/*
1016 * Append a block of data to the "records" chunk list.
1017 *
1018 * NB: each block is padded to a MAXALIGN multiple. This must be
1019 * accounted for when the file is later read!
1020 *
1021 * The data is copied, so the caller is free to modify it afterwards.
 *
 * Only "len" bytes are copied from "data", but the tail chunk's length,
 * the free-space counter and records.total_len all advance by the padded
 * length MAXALIGN(len).
1022 */
1023static void
1025{
1026 uint32 padlen = MAXALIGN(len);
1027
 /*
 * The padded block does not fit in the space left in the tail chunk, so
 * a fresh tail (len 0, no successor) is set up with a capacity of
 * Max(padlen, 512) bytes.
 */
1028 if (padlen > records.bytes_free)
1029 {
1032 records.tail->len = 0;
1033 records.tail->next = NULL;
1035
1036 records.bytes_free = Max(padlen, 512);
1038 }
1039
 /* Copy the payload at the current end of the tail chunk, then account for the padding */
1040 memcpy(((char *) records.tail->data) + records.tail->len, data, len);
1041 records.tail->len += padlen;
1042 records.bytes_free -= padlen;
1043 records.total_len += padlen;
1044}
1045
1046/*
1047 * Start preparing a state file.
1048 *
1049 * Initializes data structure and inserts the 2PC file header record.
1050 */
1051void
1053{
1054 PGPROC *proc = GetPGProcByNumber(gxact->pgprocno);
1057 TransactionId *children;
1058 RelFileLocator *commitrels;
1059 RelFileLocator *abortrels;
1060 xl_xact_stats_item *abortstats = NULL;
1061 xl_xact_stats_item *commitstats = NULL;
1062 SharedInvalidationMessage *invalmsgs;
1063
1064 /* Initialize linked list */
1066 records.head->len = 0;
1067 records.head->next = NULL;
1068
1069 records.bytes_free = Max(sizeof(TwoPhaseFileHeader), 512);
1071
1073 records.num_chunks = 1;
1074
1075 records.total_len = 0;
1076
1077 /* Create header */
1078 hdr.magic = TWOPHASE_MAGIC;
1079 hdr.total_len = 0; /* EndPrepare will fill this in */
1080 hdr.xid = xid;
1081 hdr.database = proc->databaseId;
1082 hdr.prepared_at = gxact->prepared_at;
1083 hdr.owner = gxact->owner;
1084 hdr.nsubxacts = xactGetCommittedChildren(&children);
1085 hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels);
1086 hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels);
1087 hdr.ncommitstats =
1088 pgstat_get_transactional_drops(true, &commitstats);
1089 hdr.nabortstats =
1090 pgstat_get_transactional_drops(false, &abortstats);
1092 &hdr.initfileinval);
1093 hdr.gidlen = strlen(gxact->gid) + 1; /* Include '\0' */
1094 /* EndPrepare will fill the origin data, if necessary */
1096 hdr.origin_timestamp = 0;
1097
1098 save_state_data(&hdr, sizeof(TwoPhaseFileHeader));
1099 save_state_data(gxact->gid, hdr.gidlen);
1100
1101 /*
1102 * Add the additional info about subxacts, deletable files and cache
1103 * invalidation messages.
1104 */
1105 if (hdr.nsubxacts > 0)
1106 {
1107 save_state_data(children, hdr.nsubxacts * sizeof(TransactionId));
1108 /* While we have the child-xact data, stuff it in the gxact too */
1109 GXactLoadSubxactData(gxact, hdr.nsubxacts, children);
1110 }
1111 if (hdr.ncommitrels > 0)
1112 {
1113 save_state_data(commitrels, hdr.ncommitrels * sizeof(RelFileLocator));
1114 pfree(commitrels);
1115 }
1116 if (hdr.nabortrels > 0)
1117 {
1118 save_state_data(abortrels, hdr.nabortrels * sizeof(RelFileLocator));
1119 pfree(abortrels);
1120 }
1121 if (hdr.ncommitstats > 0)
1122 {
1123 save_state_data(commitstats,
1124 hdr.ncommitstats * sizeof(xl_xact_stats_item));
1125 pfree(commitstats);
1126 }
1127 if (hdr.nabortstats > 0)
1128 {
1129 save_state_data(abortstats,
1130 hdr.nabortstats * sizeof(xl_xact_stats_item));
1131 pfree(abortstats);
1132 }
1133 if (hdr.ninvalmsgs > 0)
1134 {
1135 save_state_data(invalmsgs,
1137 pfree(invalmsgs);
1138 }
1139}
1140
1141/*
1142 * Finish preparing state data and writing it to WAL.
1143 */
1144void
1146{
1147 TwoPhaseFileHeader *hdr;
1148 StateFileChunk *record;
1149 bool replorigin;
1150
1151 /* Add the end sentinel to the list of 2PC records */
1153 NULL, 0);
1154
1155 /* Go back and fill in total_len in the file header record */
1157 Assert(hdr->magic == TWOPHASE_MAGIC);
1158 hdr->total_len = records.total_len + sizeof(pg_crc32c);
1159
1162
1163 if (replorigin)
1164 {
1167 }
1168
1169 /*
1170 * If the data size exceeds MaxAllocSize, we won't be able to read it in
1171 * ReadTwoPhaseFile. Check for that now, rather than fail in the case
1172 * where we write data to file and then re-read at commit time.
1173 */
1174 if (hdr->total_len > MaxAllocSize)
1175 ereport(ERROR,
1176 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1177 errmsg("two-phase state file maximum length exceeded")));
1178
1179 /*
1180 * Now writing 2PC state data to WAL. We let the WAL's CRC protection
1181 * cover us, so no need to calculate a separate CRC.
1182 *
1183 * We have to set DELAY_CHKPT_START here, too; otherwise a checkpoint
1184 * starting immediately after the WAL record is inserted could complete
1185 * without fsync'ing our state file. (This is essentially the same kind
1186 * of race condition as the COMMIT-to-clog-write case that
1187 * RecordTransactionCommit uses DELAY_CHKPT_IN_COMMIT for; see notes
1188 * there.) Note that DELAY_CHKPT_IN_COMMIT is used to find transactions in
1189 * the critical commit section. We need to know about such transactions
1190 * for conflict detection in logical replication. See
1191 * GetOldestActiveTransactionId(true, false) and its use.
1192 *
1193 * We save the PREPARE record's location in the gxact for later use by
1194 * CheckPointTwoPhase.
1195 */
1197
1199
1202
1204 for (record = records.head; record != NULL; record = record->next)
1205 XLogRegisterData(record->data, record->len);
1206
1208
1209 gxact->prepare_end_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE);
1210
1211 if (replorigin)
1212 {
1213 /* Move LSNs forward for this replication origin */
1215 gxact->prepare_end_lsn);
1216 }
1217
1218 XLogFlush(gxact->prepare_end_lsn);
1219
1220 /* If we crash now, we have prepared: WAL replay will fix things */
1221
1222 /* Store record's start location to read that later on Commit */
1224
1225 /*
1226 * Mark the prepared transaction as valid. As soon as xact.c marks MyProc
1227 * as not running our XID (which it will do immediately after this
1228 * function returns), others can commit/rollback the xact.
1229 *
1230 * NB: a side effect of this is to make a dummy ProcArray entry for the
1231 * prepared XID. This must happen before we clear the XID from MyProc /
1232 * ProcGlobal->xids[], else there is a window where the XID is not running
1233 * according to TransactionIdIsInProgress, and onlookers would be entitled
1234 * to assume the xact crashed. Instead we have a window where the same
1235 * XID appears twice in ProcArray, which is OK.
1236 */
1237 MarkAsPrepared(gxact, false);
1238
1239 /*
1240 * Now we can mark ourselves as out of the commit critical section: a
1241 * checkpoint starting after this will certainly see the gxact as a
1242 * candidate for fsyncing.
1243 */
1244 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
1245
1246 /*
1247 * Remember that we have this GlobalTransaction entry locked for us. If
1248 * we crash after this point, it's too late to abort, but we must unlock
1249 * it so that the prepared transaction can be committed or rolled back.
1250 */
1251 MyLockedGxact = gxact;
1252
1254
1255 /*
1256 * Wait for synchronous replication, if required.
1257 *
1258 * Note that at this stage we have marked the prepare, but still show as
1259 * running in the procarray (twice!) and continue to hold locks.
1260 */
1261 SyncRepWaitForLSN(gxact->prepare_end_lsn, false);
1262
1263 records.tail = records.head = NULL;
1264 records.num_chunks = 0;
1265}
1266
1267/*
1268 * Register a 2PC record to be written to state file.
1269 */
1270void
1272 const void *data, uint32 len)
1273{
1274 TwoPhaseRecordOnDisk record;
1275
1276 record.rmid = rmid;
1277 record.info = info;
1278 record.len = len;
1279 save_state_data(&record, sizeof(TwoPhaseRecordOnDisk));
1280 if (len > 0)
1282}
1283
1284
1285/*
1286 * Read and validate the state file for xid.
1287 *
1288 * If it looks OK (has a valid magic number and CRC), return the palloc'd
1289 * contents of the file; an error is raised when corrupted data is found. If
1290 * the file does not exist and missing_ok is true (meaning missing files can
1291 * be safely ignored), return NULL instead. This can happen during recovery
1292 * after discarding two-phase files from frozen epochs.
1293 */
1294static char *
1296{
1297 char path[MAXPGPATH];
1298 char *buf;
1299 TwoPhaseFileHeader *hdr;
1300 int fd;
1301 struct stat stat;
1302 uint32 crc_offset;
1303 pg_crc32c calc_crc,
1304 file_crc;
1305 int r;
1306
1307 TwoPhaseFilePath(path, fxid);
1308
1309 fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
1310 if (fd < 0)
1311 {
1312 if (missing_ok && errno == ENOENT)
1313 return NULL;
1314
1315 ereport(ERROR,
1317 errmsg("could not open file \"%s\": %m", path)));
1318 }
1319
1320 /*
1321 * Check file length. We can determine a lower bound pretty easily. We
1322 * set an upper bound to avoid palloc() failure on a corrupt file, though
1323 * we can't guarantee that we won't get an out of memory error anyway,
1324 * even on a valid file.
1325 */
1326 if (fstat(fd, &stat))
1327 ereport(ERROR,
1329 errmsg("could not stat file \"%s\": %m", path)));
1330
1331 if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
1333 sizeof(pg_crc32c)) ||
1335 ereport(ERROR,
1337 errmsg_plural("incorrect size of file \"%s\": %lld byte",
1338 "incorrect size of file \"%s\": %lld bytes",
1339 (long long int) stat.st_size, path,
1340 (long long int) stat.st_size)));
1341
1342 crc_offset = stat.st_size - sizeof(pg_crc32c);
1343 if (crc_offset != MAXALIGN(crc_offset))
1344 ereport(ERROR,
1346 errmsg("incorrect alignment of CRC offset for file \"%s\"",
1347 path)));
1348
1349 /*
1350 * OK, slurp in the file.
1351 */
1352 buf = (char *) palloc(stat.st_size);
1353
1354 pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_READ);
1355 r = read(fd, buf, stat.st_size);
1356 if (r != stat.st_size)
1357 {
1358 if (r < 0)
1359 ereport(ERROR,
1361 errmsg("could not read file \"%s\": %m", path)));
1362 else
1363 ereport(ERROR,
1364 (errmsg("could not read file \"%s\": read %d of %lld",
1365 path, r, (long long int) stat.st_size)));
1366 }
1367
1369
1370 if (CloseTransientFile(fd) != 0)
1371 ereport(ERROR,
1373 errmsg("could not close file \"%s\": %m", path)));
1374
1375 hdr = (TwoPhaseFileHeader *) buf;
1376 if (hdr->magic != TWOPHASE_MAGIC)
1377 ereport(ERROR,
1379 errmsg("invalid magic number stored in file \"%s\"",
1380 path)));
1381
1382 if (hdr->total_len != stat.st_size)
1383 ereport(ERROR,
1385 errmsg("invalid size stored in file \"%s\"",
1386 path)));
1387
1388 INIT_CRC32C(calc_crc);
1389 COMP_CRC32C(calc_crc, buf, crc_offset);
1390 FIN_CRC32C(calc_crc);
1391
1392 file_crc = *((pg_crc32c *) (buf + crc_offset));
1393
1394 if (!EQ_CRC32C(calc_crc, file_crc))
1395 ereport(ERROR,
1397 errmsg("calculated CRC checksum does not match value stored in file \"%s\"",
1398 path)));
1399
1400 return buf;
1401}
1402
1403
1404/*
1405 * Reads 2PC data from xlog. During checkpoint this data will be moved to
1406 * twophase files and ReadTwoPhaseFile should be used instead.
1407 *
1408 * Note that this function can access WAL during normal operation,
1409 * similarly to the way WALSender or Logical Decoding do.
1410 */
1411static void
1413{
1414 XLogRecord *record;
1416 char *errormsg;
1417
1419 XL_ROUTINE(.page_read = &read_local_xlog_page,
1420 .segment_open = &wal_segment_open,
1421 .segment_close = &wal_segment_close),
1422 NULL);
1423 if (!xlogreader)
1424 ereport(ERROR,
1425 (errcode(ERRCODE_OUT_OF_MEMORY),
1426 errmsg("out of memory"),
1427 errdetail("Failed while allocating a WAL reading processor.")));
1428
1430 record = XLogReadRecord(xlogreader, &errormsg);
1431
1432 if (record == NULL)
1433 {
1434 if (errormsg)
1435 ereport(ERROR,
1437 errmsg("could not read two-phase state from WAL at %X/%08X: %s",
1438 LSN_FORMAT_ARGS(lsn), errormsg)));
1439 else
1440 ereport(ERROR,
1442 errmsg("could not read two-phase state from WAL at %X/%08X",
1443 LSN_FORMAT_ARGS(lsn))));
1444 }
1445
1446 if (XLogRecGetRmid(xlogreader) != RM_XACT_ID ||
1448 ereport(ERROR,
1450 errmsg("expected two-phase state data is not present in WAL at %X/%08X",
1451 LSN_FORMAT_ARGS(lsn))));
1452
1453 if (len != NULL)
1455
1456 *buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader));
1457 memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader));
1458
1460}
1461
1462
1463/*
1464 * Confirms an xid is prepared, during recovery
1465 */
1466bool
1468{
1469 char *buf;
1470 TwoPhaseFileHeader *hdr;
1471 bool result;
1472 FullTransactionId fxid;
1473
1475
1476 if (max_prepared_xacts <= 0)
1477 return false; /* nothing to do */
1478
1479 /* Read and validate file */
1480 fxid = AdjustToFullTransactionId(xid);
1481 buf = ReadTwoPhaseFile(fxid, true);
1482 if (buf == NULL)
1483 return false;
1484
1485 /* Check header also */
1486 hdr = (TwoPhaseFileHeader *) buf;
1487 result = TransactionIdEquals(hdr->xid, xid);
1488 pfree(buf);
1489
1490 return result;
1491}
1492
1493/*
1494 * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED
1495 */
1496void
1497FinishPreparedTransaction(const char *gid, bool isCommit)
1498{
1499 GlobalTransaction gxact;
1500 PGPROC *proc;
1501 FullTransactionId fxid;
1502 TransactionId xid;
1503 bool ondisk;
1504 char *buf;
1505 char *bufptr;
1506 TwoPhaseFileHeader *hdr;
1507 TransactionId latestXid;
1508 TransactionId *children;
1509 RelFileLocator *commitrels;
1510 RelFileLocator *abortrels;
1511 RelFileLocator *delrels;
1512 int ndelrels;
1513 xl_xact_stats_item *commitstats;
1514 xl_xact_stats_item *abortstats;
1515 SharedInvalidationMessage *invalmsgs;
1516
1517 /*
1518 * Validate the GID, and lock the GXACT to ensure that two backends do not
1519 * try to commit the same GID at once.
1520 */
1521 gxact = LockGXact(gid, GetUserId());
1522 proc = GetPGProcByNumber(gxact->pgprocno);
1523 fxid = gxact->fxid;
1524 xid = XidFromFullTransactionId(fxid);
1525
1526 /*
1527 * Read and validate 2PC state data. State data will typically be stored
1528 * in WAL files if the LSN is after the last checkpoint record, or moved
1529 * to disk if for some reason they have lived for a long time.
1530 */
1531 if (gxact->ondisk)
1532 buf = ReadTwoPhaseFile(fxid, false);
1533 else
1535
1536
1537 /*
1538 * Disassemble the header area
1539 */
1540 hdr = (TwoPhaseFileHeader *) buf;
1541 Assert(TransactionIdEquals(hdr->xid, xid));
1542 bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
1543 bufptr += MAXALIGN(hdr->gidlen);
1544 children = (TransactionId *) bufptr;
1545 bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
1546 commitrels = (RelFileLocator *) bufptr;
1547 bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileLocator));
1548 abortrels = (RelFileLocator *) bufptr;
1549 bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileLocator));
1550 commitstats = (xl_xact_stats_item *) bufptr;
1551 bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item));
1552 abortstats = (xl_xact_stats_item *) bufptr;
1553 bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item));
1554 invalmsgs = (SharedInvalidationMessage *) bufptr;
1555 bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage));
1556
1557 /* compute latestXid among all children */
1558 latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children);
1559
1560 /* Prevent cancel/die interrupt while cleaning up */
1562
1563 /*
1564 * The order of operations here is critical: make the XLOG entry for
1565 * commit or abort, then mark the transaction committed or aborted in
1566 * pg_xact, then remove its PGPROC from the global ProcArray (which means
1567 * TransactionIdIsInProgress will stop saying the prepared xact is in
1568 * progress), then run the post-commit or post-abort callbacks. The
1569 * callbacks will release the locks the transaction held.
1570 */
1571 if (isCommit)
1573 hdr->nsubxacts, children,
1574 hdr->ncommitrels, commitrels,
1575 hdr->ncommitstats,
1576 commitstats,
1577 hdr->ninvalmsgs, invalmsgs,
1578 hdr->initfileinval, gid);
1579 else
1581 hdr->nsubxacts, children,
1582 hdr->nabortrels, abortrels,
1583 hdr->nabortstats,
1584 abortstats,
1585 gid);
1586
1587 ProcArrayRemove(proc, latestXid);
1588
1589 /*
1590 * In case we fail while running the callbacks, mark the gxact invalid so
1591 * no one else will try to commit/rollback, and so it will be recycled if
1592 * we fail after this point. It is still locked by our backend so it
1593 * won't go away yet.
1594 *
1595 * (We assume it's safe to do this without taking TwoPhaseStateLock.)
1596 */
1597 gxact->valid = false;
1598
1599 /*
1600 * We have to remove any files that were supposed to be dropped. For
1601 * consistency with the regular xact.c code paths, must do this before
1602 * releasing locks, so do it before running the callbacks.
1603 *
1604 * NB: this code knows that we couldn't be dropping any temp rels ...
1605 */
1606 if (isCommit)
1607 {
1608 delrels = commitrels;
1609 ndelrels = hdr->ncommitrels;
1610 }
1611 else
1612 {
1613 delrels = abortrels;
1614 ndelrels = hdr->nabortrels;
1615 }
1616
1617 /* Make sure files supposed to be dropped are dropped */
1618 DropRelationFiles(delrels, ndelrels, false);
1619
1620 if (isCommit)
1621 pgstat_execute_transactional_drops(hdr->ncommitstats, commitstats, false);
1622 else
1623 pgstat_execute_transactional_drops(hdr->nabortstats, abortstats, false);
1624
1625 /*
1626 * Handle cache invalidation messages.
1627 *
1628 * Relcache init file invalidation requires processing both before and
1629 * after we send the SI messages, only when committing. See
1630 * AtEOXact_Inval().
1631 */
1632 if (isCommit)
1633 {
1634 if (hdr->initfileinval)
1636 SendSharedInvalidMessages(invalmsgs, hdr->ninvalmsgs);
1637 if (hdr->initfileinval)
1639 }
1640
1641 /*
1642 * Acquire the two-phase lock. We want to work on the two-phase callbacks
1643 * while holding it to avoid potential conflicts with other transactions
1644 * attempting to use the same GID, so the lock is released once the shared
1645 * memory state is cleared.
1646 */
1647 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
1648
1649 /* And now do the callbacks */
1650 if (isCommit)
1652 else
1654
1655 PredicateLockTwoPhaseFinish(fxid, isCommit);
1656
1657 /*
1658 * Read this value while holding the two-phase lock, as the on-disk 2PC
1659 * file is physically removed after the lock is released.
1660 */
1661 ondisk = gxact->ondisk;
1662
1663 /* Clear shared memory state */
1664 RemoveGXact(gxact);
1665
1666 /*
1667 * Release the lock as all callbacks are called and shared memory cleanup
1668 * is done.
1669 */
1670 LWLockRelease(TwoPhaseStateLock);
1671
1672 /* Count the prepared xact as committed or aborted */
1673 AtEOXact_PgStat(isCommit, false);
1674
1675 /*
1676 * And now we can clean up any files we may have left.
1677 */
1678 if (ondisk)
1679 RemoveTwoPhaseFile(fxid, true);
1680
1681 MyLockedGxact = NULL;
1682
1684
1685 pfree(buf);
1686}
1687
1688/*
1689 * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record.
1690 */
1691static void
1693 const TwoPhaseCallback callbacks[])
1694{
1695 for (;;)
1696 {
1697 TwoPhaseRecordOnDisk *record = (TwoPhaseRecordOnDisk *) bufptr;
1698
1699 Assert(record->rmid <= TWOPHASE_RM_MAX_ID);
1700 if (record->rmid == TWOPHASE_RM_END_ID)
1701 break;
1702
1703 bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk));
1704
1705 if (callbacks[record->rmid] != NULL)
1706 callbacks[record->rmid] (fxid, record->info, bufptr, record->len);
1707
1708 bufptr += MAXALIGN(record->len);
1709 }
1710}
1711
1712/*
1713 * Remove the 2PC file.
1714 *
1715 * If giveWarning is false, do not complain about file-not-present;
1716 * this is an expected case during WAL replay.
1717 *
1718 * This routine is used at early stages of recovery, where future and
1719 * past orphaned files are checked, hence the FullTransactionId used to
1720 * build a complete file name for the removal.
1721 */
1722static void
1724{
1725 char path[MAXPGPATH];
1726
1727 TwoPhaseFilePath(path, fxid);
1728 if (unlink(path))
1729 if (errno != ENOENT || giveWarning)
1732 errmsg("could not remove file \"%s\": %m", path)));
1733}
1734
1735/*
1736 * Recreates a state file. This is used in WAL replay and during
1737 * checkpoint creation.
1738 *
1739 * Note: content and len don't include CRC.
1740 */
1741static void
1743{
1744 char path[MAXPGPATH];
1745 pg_crc32c statefile_crc;
1746 int fd;
1747
1748 /* Recompute CRC */
1749 INIT_CRC32C(statefile_crc);
1750 COMP_CRC32C(statefile_crc, content, len);
1751 FIN_CRC32C(statefile_crc);
1752
1753 TwoPhaseFilePath(path, fxid);
1754
1755 fd = OpenTransientFile(path,
1756 O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY);
1757 if (fd < 0)
1758 ereport(ERROR,
1760 errmsg("could not recreate file \"%s\": %m", path)));
1761
1762 /* Write content and CRC */
1763 errno = 0;
1764 pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_WRITE);
1765 if (write(fd, content, len) != len)
1766 {
1767 /* if write didn't set errno, assume problem is no disk space */
1768 if (errno == 0)
1769 errno = ENOSPC;
1770 ereport(ERROR,
1772 errmsg("could not write file \"%s\": %m", path)));
1773 }
1774 if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c))
1775 {
1776 /* if write didn't set errno, assume problem is no disk space */
1777 if (errno == 0)
1778 errno = ENOSPC;
1779 ereport(ERROR,
1781 errmsg("could not write file \"%s\": %m", path)));
1782 }
1784
1785 /*
1786 * We must fsync the file because the end-of-replay checkpoint will not do
1787 * so, there being no GXACT in shared memory yet to tell it to.
1788 */
1789 pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_SYNC);
1790 if (pg_fsync(fd) != 0)
1791 ereport(ERROR,
1793 errmsg("could not fsync file \"%s\": %m", path)));
1795
1796 if (CloseTransientFile(fd) != 0)
1797 ereport(ERROR,
1799 errmsg("could not close file \"%s\": %m", path)));
1800}
1801
1802/*
1803 * CheckPointTwoPhase -- handle 2PC component of checkpointing.
1804 *
1805 * We must fsync the state file of any GXACT that is valid or has been
1806 * generated during redo and has a PREPARE LSN <= the checkpoint's redo
1807 * horizon. (If the gxact isn't valid yet, has not been generated in
1808 * redo, or has a later LSN, this checkpoint is not responsible for
1809 * fsyncing it.)
1810 *
1811 * This is deliberately run as late as possible in the checkpoint sequence,
1812 * because GXACTs ordinarily have short lifespans, and so it is quite
1813 * possible that GXACTs that were valid at checkpoint start will no longer
1814 * exist if we wait a little bit. With typical checkpoint settings this
1815 * will be about 3 minutes for an online checkpoint, so as a result we
1816 * expect that there will be no GXACTs that need to be copied to disk.
1817 *
1818 * If a GXACT remains valid across multiple checkpoints, it will already
1819 * be on disk so we don't bother to repeat that write.
1820 */
1821void
1823{
1824 int i;
1825 int serialized_xacts = 0;
1826
1827 if (max_prepared_xacts <= 0)
1828 return; /* nothing to do */
1829
1830 TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();
1831
1832 /*
1833 * We are expecting there to be zero GXACTs that need to be copied to
1834 * disk, so we perform all I/O while holding TwoPhaseStateLock for
1835 * simplicity. This prevents any new xacts from preparing while this
1836 * occurs, which shouldn't be a problem since the presence of long-lived
1837 * prepared xacts indicates the transaction manager isn't active.
1838 *
1839 * It's also possible to move I/O out of the lock, but on every error we
1840 * would have to check whether somebody committed our transaction in a
1841 * different backend. Let's leave this optimization for the future, in
1842 * case somebody finds that this place causes a bottleneck.
1843 *
1844 * Note that it isn't possible for there to be a GXACT with a
1845 * prepare_end_lsn set prior to the last checkpoint yet is marked invalid,
1846 * because of the efforts with delayChkptFlags.
1847 */
1848 LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
1849 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
1850 {
1851 /*
1852 * Note that we are using gxact not PGPROC so this works in recovery
1853 * also
1854 */
1856
1857 if ((gxact->valid || gxact->inredo) &&
1858 !gxact->ondisk &&
1859 gxact->prepare_end_lsn <= redo_horizon)
1860 {
1861 char *buf;
1862 int len;
1863
1865 RecreateTwoPhaseFile(gxact->fxid, buf, len);
1866 gxact->ondisk = true;
1869 pfree(buf);
1870 serialized_xacts++;
1871 }
1872 }
1873 LWLockRelease(TwoPhaseStateLock);
1874
1875 /*
1876 * Flush unconditionally the parent directory to make any information
1877 * durable on disk. Two-phase files could have been removed and those
1878 * removals need to be made persistent as well as any files newly created
1879 * previously since the last checkpoint.
1880 */
1882
1883 TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
1884
1885 if (log_checkpoints && serialized_xacts > 0)
1886 ereport(LOG,
1887 (errmsg_plural("%u two-phase state file was written "
1888 "for a long-running prepared transaction",
1889 "%u two-phase state files were written "
1890 "for long-running prepared transactions",
1891 serialized_xacts,
1892 serialized_xacts)));
1893}
1894
1895/*
1896 * restoreTwoPhaseData
1897 *
1898 * Scan pg_twophase and fill TwoPhaseState depending on the on-disk data.
1899 * This is called once at the beginning of recovery, saving any extra
1900 * lookups in the future. Two-phase files that are newer than the
1901 * minimum XID horizon are discarded on the way.
1902 */
1903void
1905{
1906 DIR *cldir;
1907 struct dirent *clde;
1908
1909 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
1910 cldir = AllocateDir(TWOPHASE_DIR);
1911 while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
1912 {
1913 if (strlen(clde->d_name) == 16 &&
1914 strspn(clde->d_name, "0123456789ABCDEF") == 16)
1915 {
1916 FullTransactionId fxid;
1917 char *buf;
1918
1919 fxid = FullTransactionIdFromU64(strtou64(clde->d_name, NULL, 16));
1920
1922 true, false, false);
1923 if (buf == NULL)
1924 continue;
1925
1928 }
1929 }
1930 LWLockRelease(TwoPhaseStateLock);
1931 FreeDir(cldir);
1932}
1933
1934/*
1935 * PrescanPreparedTransactions
1936 *
1937 * Scan the shared memory entries of TwoPhaseState and determine the range
1938 * of valid XIDs present. This is run during database startup, after we
1939 * have completed reading WAL. TransamVariables->nextXid has been set to
1940 * one more than the highest XID for which evidence exists in WAL.
1941 *
1942 * We throw away any prepared xacts with main XID beyond nextXid --- if any
1943 * are present, it suggests that the DBA has done a PITR recovery to an
1944 * earlier point in time without cleaning out pg_twophase. We dare not
1945 * try to recover such prepared xacts since they likely depend on database
1946 * state that doesn't exist now.
1947 *
1948 * However, we will advance nextXid beyond any subxact XIDs belonging to
1949 * valid prepared xacts. We need to do this since subxact commit doesn't
1950 * write a WAL entry, and so there might be no evidence in WAL of those
1951 * subxact XIDs.
1952 *
1953 * On corrupted two-phase files, fail immediately. Keeping broken entries
1954 * around and letting replay continue would harm the system, so a new
1955 * backup should be rolled in instead.
1956 *
1957 * Our other responsibility is to determine and return the oldest valid XID
1958 * among the prepared xacts (if none, return TransamVariables->nextXid).
1959 * This is needed to synchronize pg_subtrans startup properly.
1960 *
1961 * If xids_p is not NULL (nxids_p must then be non-NULL too), a pointer to a
1962 * palloc'd array of all top-level xids is stored in *xids_p, and the number
1963 * of entries in the array is returned in *nxids_p.
1964 */
1967{
1969 TransactionId origNextXid = XidFromFullTransactionId(nextXid);
1970 TransactionId result = origNextXid;
1971 TransactionId *xids = NULL;
1972 int nxids = 0;
1973 int allocsize = 0;
1974 int i;
1975
1976 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
1977 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
1978 {
1979 TransactionId xid;
1980 char *buf;
1982
1983 Assert(gxact->inredo);
1984
1986 gxact->prepare_start_lsn,
1987 gxact->ondisk, false, true);
1988
1989 if (buf == NULL)
1990 continue;
1991
1992 /*
1993 * OK, we think this file is valid. Incorporate xid into the
1994 * running-minimum result.
1995 */
1996 xid = XidFromFullTransactionId(gxact->fxid);
1997 if (TransactionIdPrecedes(xid, result))
1998 result = xid;
1999
2000 if (xids_p)
2001 {
2002 if (nxids == allocsize)
2003 {
2004 if (nxids == 0)
2005 {
2006 allocsize = 10;
2007 xids = palloc(allocsize * sizeof(TransactionId));
2008 }
2009 else
2010 {
2011 allocsize = allocsize * 2;
2012 xids = repalloc(xids, allocsize * sizeof(TransactionId));
2013 }
2014 }
2015 xids[nxids++] = xid;
2016 }
2017
2018 pfree(buf);
2019 }
2020 LWLockRelease(TwoPhaseStateLock);
2021
2022 if (xids_p)
2023 {
2024 *xids_p = xids;
2025 *nxids_p = nxids;
2026 }
2027
2028 return result;
2029}
2030
2031/*
2032 * StandbyRecoverPreparedTransactions
2033 *
2034 * Scan the shared memory entries of TwoPhaseState and setup all the required
2035 * information to allow standby queries to treat prepared transactions as still
2036 * active.
2037 *
2038 * This is never called at the end of recovery - we use
2039 * RecoverPreparedTransactions() at that point.
2040 *
2041 * This updates pg_subtrans, so that any subtransactions will be correctly
2042 * seen as in-progress in snapshots taken during recovery.
2043 */
2044void
2046{
2047 int i;
2048
2049 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
2050 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
2051 {
2052 char *buf;
2054
2055 Assert(gxact->inredo);
2056
2058 gxact->prepare_start_lsn,
2059 gxact->ondisk, true, false);
2060 if (buf != NULL)
2061 pfree(buf);
2062 }
2063 LWLockRelease(TwoPhaseStateLock);
2064}
2065
2066/*
2067 * RecoverPreparedTransactions
2068 *
2069 * Scan the shared memory entries of TwoPhaseState and reload the state for
2070 * each prepared transaction (reacquire locks, etc).
2071 *
2072 * This is run at the end of recovery, but before we allow backends to write
2073 * WAL.
2074 *
2075 * At the end of recovery the way we take snapshots will change. We now need
2076 * to mark all running transactions with their full SubTransSetParent() info
2077 * to allow normal snapshots to work correctly if snapshots overflow.
2078 * We do this here because by definition prepared transactions are the only
2079 * type of write transaction still running, so this is necessary and
2080 * complete.
2081 */
2082void
2084{
2085 int i;
2086
2087 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
2088 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
2089 {
2090 char *buf;
2092 FullTransactionId fxid = gxact->fxid;
2093 char *bufptr;
2094 TwoPhaseFileHeader *hdr;
2095 TransactionId *subxids;
2096 const char *gid;
2097
2098 /*
2099 * Reconstruct subtrans state for the transaction --- needed because
2100 * pg_subtrans is not preserved over a restart. Note that we are
2101 * linking all the subtransactions directly to the top-level XID;
2102 * there may originally have been a more complex hierarchy, but
2103 * there's no need to restore that exactly. It's possible that
2104 * SubTransSetParent has been set before, if the prepared transaction
2105 * generated xid assignment records.
2106 */
2108 gxact->prepare_start_lsn,
2109 gxact->ondisk, true, false);
2110 if (buf == NULL)
2111 continue;
2112
2113 ereport(LOG,
2114 (errmsg("recovering prepared transaction %u of epoch %u from shared memory",
2117
2118 hdr = (TwoPhaseFileHeader *) buf;
2121 bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
2122 gid = (const char *) bufptr;
2123 bufptr += MAXALIGN(hdr->gidlen);
2124 subxids = (TransactionId *) bufptr;
2125 bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
2126 bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileLocator));
2127 bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileLocator));
2128 bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item));
2129 bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item));
2130 bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage));
2131
2132 /*
2133 * Recreate its GXACT and dummy PGPROC. But, check whether it was
2134 * added in redo and already has a shmem entry for it.
2135 */
2136 MarkAsPreparingGuts(gxact, gxact->fxid, gid,
2137 hdr->prepared_at,
2138 hdr->owner, hdr->database);
2139
2140 /* recovered, so reset the flag for entries generated by redo */
2141 gxact->inredo = false;
2142
2143 GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids);
2144 MarkAsPrepared(gxact, true);
2145
2146 LWLockRelease(TwoPhaseStateLock);
2147
2148 /*
2149 * Recover other state (notably locks) using resource managers.
2150 */
2152
2153 /*
2154 * Release locks held by the standby process after we process each
2155 * prepared transaction. As a result, we don't need too many
2156 * additional locks at any one time.
2157 */
2158 if (InHotStandby)
2159 StandbyReleaseLockTree(hdr->xid, hdr->nsubxacts, subxids);
2160
2161 /*
2162 * We're done with recovering this transaction. Clear MyLockedGxact,
2163 * like we do in PrepareTransaction() during normal operation.
2164 */
2166
2167 pfree(buf);
2168
2169 LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
2170 }
2171
2172 LWLockRelease(TwoPhaseStateLock);
2173}
2174
2175/*
2176 * ProcessTwoPhaseBuffer
2177 *
2178 * Given a FullTransactionId, read its two-phase state either from its file
2179 * on disk or directly from WAL, using the provided "prepare_start_lsn".
2180 *
2181 * If setParent is true, set up subtransaction parent linkages.
2182 *
2183 * If setNextXid is true, set TransamVariables->nextXid to the newest
2184 * value scanned.
2185 */
2186static char *
2188 XLogRecPtr prepare_start_lsn,
2189 bool fromdisk,
2190 bool setParent, bool setNextXid)
2191{
2193 TransactionId *subxids;
2194 char *buf;
2195 TwoPhaseFileHeader *hdr;
2196 int i;
2197
2198 Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
2199
2200 if (!fromdisk)
2201 Assert(prepare_start_lsn != InvalidXLogRecPtr);
2202
2203 /* Already processed? */
2206 {
2207 if (fromdisk)
2208 {
2210 (errmsg("removing stale two-phase state file for transaction %u of epoch %u",
2213 RemoveTwoPhaseFile(fxid, true);
2214 }
2215 else
2216 {
2218 (errmsg("removing stale two-phase state from memory for transaction %u of epoch %u",
2221 PrepareRedoRemoveFull(fxid, true);
2222 }
2223 return NULL;
2224 }
2225
2226 /* Reject XID if too new */
2227 if (FullTransactionIdFollowsOrEquals(fxid, nextXid))
2228 {
2229 if (fromdisk)
2230 {
2232 (errmsg("removing future two-phase state file for transaction %u of epoch %u",
2235 RemoveTwoPhaseFile(fxid, true);
2236 }
2237 else
2238 {
2240 (errmsg("removing future two-phase state from memory for transaction %u of epoch %u",
2243 PrepareRedoRemoveFull(fxid, true);
2244 }
2245 return NULL;
2246 }
2247
2248 if (fromdisk)
2249 {
2250 /* Read and validate file */
2251 buf = ReadTwoPhaseFile(fxid, false);
2252 }
2253 else
2254 {
2255 /* Read xlog data */
2256 XlogReadTwoPhaseData(prepare_start_lsn, &buf, NULL);
2257 }
2258
2259 /* Deconstruct header */
2260 hdr = (TwoPhaseFileHeader *) buf;
2262 {
2263 if (fromdisk)
2264 ereport(ERROR,
2266 errmsg("corrupted two-phase state file for transaction %u of epoch %u",
2269 else
2270 ereport(ERROR,
2272 errmsg("corrupted two-phase state in memory for transaction %u of epoch %u",
2275 }
2276
2277 /*
2278 * Examine subtransaction XIDs ... they should all follow main XID, and
2279 * they may force us to advance nextXid.
2280 */
2281 subxids = (TransactionId *) (buf +
2282 MAXALIGN(sizeof(TwoPhaseFileHeader)) +
2283 MAXALIGN(hdr->gidlen));
2284 for (i = 0; i < hdr->nsubxacts; i++)
2285 {
2286 TransactionId subxid = subxids[i];
2287
2289
2290 /* update nextXid if needed */
2291 if (setNextXid)
2293
2294 if (setParent)
2296 }
2297
2298 return buf;
2299}
2300
2301
2302/*
2303 * RecordTransactionCommitPrepared
2304 *
2305 * This is basically the same as RecordTransactionCommit (q.v. if you change
2306 * this function): in particular, we must set DELAY_CHKPT_IN_COMMIT to avoid a
2307 * race condition.
2308 *
2309 * We know the transaction made at least one XLOG entry (its PREPARE),
2310 * so it is never possible to optimize out the commit record.
 *
 * Arguments, as used by the visible body:
 *   xid                  - transaction ID passed to XactLogCommitRecord(),
 *                          TransactionTreeSetCommitTsData() and
 *                          TransactionIdCommitTree()
 *   nchildren, children  - count and array of XIDs passed next to xid in
 *                          those same three calls
 *   nrels, rels, nstats, stats, ninvalmsgs, invalmsgs, initfileinval, gid
 *                        - forwarded unchanged to XactLogCommitRecord()
 *
 * Visible order of operations: take the commit timestamp, emit the commit
 * record, store commit-timestamp data, XLogFlush() the record, mark the
 * transaction tree committed, clear DELAY_CHKPT_IN_COMMIT, and finally
 * SyncRepWaitForLSN().
 *
 * NOTE(review): the line numbers embedded in this listing are not contiguous
 * (for example 2313 and 2333-2334 are absent), so some statements of this
 * function are not shown here; the description above covers only what is
 * visible.
2311 */
2312static void
2314 int nchildren,
2315 TransactionId *children,
2316 int nrels,
2317 RelFileLocator *rels,
2318 int nstats,
2319 xl_xact_stats_item *stats,
2320 int ninvalmsgs,
2321 SharedInvalidationMessage *invalmsgs,
2322 bool initfileinval,
2323 const char *gid)
2324{
2325 XLogRecPtr recptr;
2326 TimestampTz committs;
2327 bool replorigin;
2328
2329 /*
2330 * Are we using the replication origins feature? Or, in other words, are
2331 * we replaying remote actions?
2332 */
2335
2336 /* Load the injection point before entering the critical section */
2337 INJECTION_POINT_LOAD("commit-after-delay-checkpoint");
2338
2340
2341 /* See notes in RecordTransactionCommit */
2344
2345 INJECTION_POINT_CACHED("commit-after-delay-checkpoint", NULL);
2346
2347 /*
2348 * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible before
2349 * commit time is written.
2350 */
2352
2353 /*
2354 * Note it is important to set committs value after marking ourselves as
2355 * in the commit critical section (DELAY_CHKPT_IN_COMMIT). This is because
2356 * we want to ensure all transactions that have acquired commit timestamp
2357 * are finished before we allow the logical replication client to advance
2358 * its xid which is used to hold back dead rows for conflict detection.
2359 * See comments atop worker.c.
2360 */
2361 committs = GetCurrentTimestamp();
2362
2363 /*
2364 * Emit the XLOG commit record. Note that we mark 2PC commits as
2365 * potentially having AccessExclusiveLocks since we don't know whether or
2366 * not they do.
2367 */
2368 recptr = XactLogCommitRecord(committs,
2369 nchildren, children, nrels, rels,
2370 nstats, stats,
2371 ninvalmsgs, invalmsgs,
2372 initfileinval,
2374 xid, gid);
2375
2376
2377 if (replorigin)
2378 /* Move LSNs forward for this replication origin */
2381
2382 /*
2383 * Record commit timestamp. The value comes from plain commit timestamp
2384 * if replorigin is not enabled, or replorigin already set a value for us
2385 * in replorigin_session_origin_timestamp otherwise.
2386 *
2387 * We don't need to WAL-log anything here, as the commit record written
2388 * above already contains the data.
2389 */
2390 if (!replorigin || replorigin_session_origin_timestamp == 0)
2392
2393 TransactionTreeSetCommitTsData(xid, nchildren, children,
2396
2397 /*
2398 * We don't currently try to sleep before flush here ... nor is there any
2399 * support for async commit of a prepared xact (the very idea is probably
2400 * a contradiction)
2401 */
2402
2403 /* Flush XLOG to disk */
2404 XLogFlush(recptr);
2405
2406 /* Mark the transaction committed in pg_xact */
2407 TransactionIdCommitTree(xid, nchildren, children);
2408
2409 /* Checkpoint can proceed now */
2410 MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT;
2411
2413
2414 /*
2415 * Wait for synchronous replication, if required.
2416 *
2417 * Note that at this stage we have marked clog, but still show as running
2418 * in the procarray and continue to hold locks.
2419 */
 /* Second argument is SyncRepWaitForLSN()'s "commit" flag: true here. */
2420 SyncRepWaitForLSN(recptr, true);
2421}
2422
2423/*
2424 * RecordTransactionAbortPrepared
2425 *
2426 * This is basically the same as RecordTransactionAbort.
2427 *
2428 * We know the transaction made at least one XLOG entry (its PREPARE),
2429 * so it is never possible to optimize out the abort record.
 *
 * Arguments, as used by the visible body:
 *   xid                  - transaction ID being aborted; checked with
 *                          TransactionIdDidCommit() and passed to
 *                          TransactionIdAbortTree()
 *   nchildren, children  - count and array of XIDs passed next to xid to
 *                          TransactionIdAbortTree()
 *   nrels, rels, nstats, stats, gid
 *                        - passed on as arguments of the call that produces
 *                          recptr
 *
 * PANICs if xid is already marked committed.  Otherwise the record is
 * flushed with XLogFlush(), the transaction tree is marked aborted, and the
 * function waits in SyncRepWaitForLSN().
 *
 * NOTE(review): the line numbers embedded in this listing are not contiguous
 * (for example 2432, 2448-2449 and 2466 are absent), so some statements,
 * including the start of the call that assigns recptr, are not shown here.
2430 */
2431static void
2433 int nchildren,
2434 TransactionId *children,
2435 int nrels,
2436 RelFileLocator *rels,
2437 int nstats,
2438 xl_xact_stats_item *stats,
2439 const char *gid)
2440{
2441 XLogRecPtr recptr;
2442 bool replorigin;
2443
2444 /*
2445 * Are we using the replication origins feature? Or, in other words, are
2446 * we replaying remote actions?
2447 */
2450
2451 /*
2452 * Catch the scenario where we aborted partway through
2453 * RecordTransactionCommitPrepared ...
2454 */
2455 if (TransactionIdDidCommit(xid))
2456 elog(PANIC, "cannot abort transaction %u, it was already committed",
2457 xid);
2458
2460
2461 /*
2462 * Emit the XLOG abort record. Note that we mark 2PC aborts as
2463 * potentially having AccessExclusiveLocks since we don't know whether or
2464 * not they do.
2465 */
2467 nchildren, children,
2468 nrels, rels,
2469 nstats, stats,
2471 xid, gid);
2472
2473 if (replorigin)
2474 /* Move LSNs forward for this replication origin */
2477
2478 /* Always flush, since we're about to remove the 2PC state file */
2479 XLogFlush(recptr);
2480
2481 /*
2482 * Mark the transaction aborted in clog. This is not absolutely necessary
2483 * but we may as well do it while we are here.
2484 */
2485 TransactionIdAbortTree(xid, nchildren, children);
2486
2488
2489 /*
2490 * Wait for synchronous replication, if required.
2491 *
2492 * Note that at this stage we have marked clog, but still show as running
2493 * in the procarray and continue to hold locks.
2494 */
 /* Second argument is SyncRepWaitForLSN()'s "commit" flag: false here. */
2495 SyncRepWaitForLSN(recptr, false);
2496}
2497
2498/*
2499 * PrepareRedoAdd
2500 *
2501 * Store pointers to the start/end of the WAL record along with the xid in
2502 * a gxact entry in shared memory TwoPhaseState structure. If caller
2503 * specifies InvalidXLogRecPtr as WAL location to fetch the two-phase
2504 * data, the entry is marked as located on disk.
 *
 * Arguments, as used by the visible body:
 *   fxid       - full transaction ID stored in the new entry; also used to
 *                build the state file path
 *   buf        - two-phase data; the GID string is read from offset
 *                MAXALIGN(sizeof(TwoPhaseFileHeader))
 *   start_lsn  - stored as prepare_start_lsn; an invalid value marks the
 *                entry as on disk
 *   end_lsn    - stored as prepare_end_lsn and passed to replorigin_advance()
 *   origin_id  - if not InvalidRepOriginId, replication progress for that
 *                origin is advanced
 *
 * The function asserts that TwoPhaseStateLock is held in exclusive mode.
 * It returns without adding an entry when start_lsn is valid and the state
 * file for fxid already exists on disk.
 *
 * NOTE(review): the line numbers embedded in this listing are not contiguous
 * (for example 2507, 2511 and 2517 are absent), so some statements,
 * including the declaration of 'hdr', are not shown here.
2505 */
2506void
2508 XLogRecPtr start_lsn, XLogRecPtr end_lsn,
2509 RepOriginId origin_id)
2510{
2512 char *bufptr;
2513 const char *gid;
2514 GlobalTransaction gxact;
2515
2516 Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
2518
2519 if (!FullTransactionIdIsValid(fxid))
2520 {
2523 hdr->xid);
2524 }
2525
 /* The GID string sits right after the max-aligned header inside buf. */
2526 bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
2527 gid = (const char *) bufptr;
2528
2529 /*
2530 * Reserve the GID for the given transaction in the redo code path.
2531 *
2532 * This creates a gxact struct and puts it into the active array.
2533 *
2534 * In redo, this struct is mainly used to track PREPARE/COMMIT entries in
2535 * shared memory. Hence, we only fill up the bare minimum contents here.
2536 * The gxact also gets marked with gxact->inredo set to true to indicate
2537 * that it got added in the redo phase.
2538 */
2539
2540 /*
2541 * In the event of a crash while a checkpoint was running, it may be
2542 * possible that some two-phase data found its way to disk while its
2543 * corresponding record needs to be replayed in the follow-up recovery. As
2544 * the 2PC data was on disk, it has already been restored at the beginning
2545 * of recovery with restoreTwoPhaseData(), so skip this record to avoid
2546 * duplicates in TwoPhaseState. If a consistent state has been reached,
2547 * the record is added to TwoPhaseState and it should have no
2548 * corresponding file in pg_twophase.
2549 */
2550 if (!XLogRecPtrIsInvalid(start_lsn))
2551 {
2552 char path[MAXPGPATH];
2553
2555 TwoPhaseFilePath(path, fxid);
2556
 /* access() succeeding means a state file for fxid already exists. */
2557 if (access(path, F_OK) == 0)
2558 {
2560 (errmsg("could not recover two-phase state file for transaction %u",
2561 hdr->xid),
2562 errdetail("Two-phase state file has been found in WAL record %X/%08X, but this transaction has already been restored from disk.",
2563 LSN_FORMAT_ARGS(start_lsn))));
2564 return;
2565 }
2566
 /* A missing file (ENOENT) is the expected case; anything else is an error. */
2567 if (errno != ENOENT)
2568 ereport(ERROR,
2570 errmsg("could not access file \"%s\": %m", path)));
2571 }
2572
2573 /* Get a free gxact from the freelist */
2574 if (TwoPhaseState->freeGXacts == NULL)
2575 ereport(ERROR,
2576 (errcode(ERRCODE_OUT_OF_MEMORY),
2577 errmsg("maximum number of prepared transactions reached"),
2578 errhint("Increase \"max_prepared_transactions\" (currently %d).",
2580 gxact = TwoPhaseState->freeGXacts;
2581 TwoPhaseState->freeGXacts = gxact->next;
2582
2583 gxact->prepared_at = hdr->prepared_at;
2584 gxact->prepare_start_lsn = start_lsn;
2585 gxact->prepare_end_lsn = end_lsn;
2586 gxact->fxid = fxid;
2587 gxact->owner = hdr->owner;
2589 gxact->valid = false;
 /* No WAL location was given, so the data is taken to live on disk. */
2590 gxact->ondisk = XLogRecPtrIsInvalid(start_lsn);
2591 gxact->inredo = true; /* yes, added in redo */
 /*
  * NOTE(review): unbounded copy; this relies on the GID stored in buf
  * being NUL-terminated and fitting in gxact->gid -- confirm that the
  * length is validated before this point.
  */
2592 strcpy(gxact->gid, gid);
2593
2594 /* And insert it into the active array */
2597
2598 if (origin_id != InvalidRepOriginId)
2599 {
2600 /* recover apply progress */
2601 replorigin_advance(origin_id, hdr->origin_lsn, end_lsn,
2602 false /* backward */ , false /* WAL */ );
2603 }
2604
2605 elog(DEBUG2, "added 2PC data in shared memory for transaction %u of epoch %u",
2608}
2609
2610/*
2611 * PrepareRedoRemoveFull
2612 *
2613 * Remove the corresponding gxact entry from TwoPhaseState. Also remove
2614 * the 2PC file if a prepared transaction was saved via an earlier checkpoint.
2615 *
2616 * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState
2617 * is updated.
 *
 * Arguments, as used by the visible body:
 *   fxid         - full transaction ID compared against each entry's fxid
 *   giveWarning  - forwarded to RemoveTwoPhaseFile() when the matching entry
 *                  is marked ondisk
 *
 * Returns silently when no entry matches fxid.
 *
 * NOTE(review): the line numbers embedded in this listing are not contiguous
 * (2620, 2627 and 2651-2652 are absent), so the declarator line and some
 * statements are not shown here.
2618 */
2619static void
2621{
2622 GlobalTransaction gxact = NULL;
2623 int i;
2624 bool found = false;
2625
2626 Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
2628
 /* Linear search of the active array for the entry carrying fxid. */
2629 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
2630 {
2631 gxact = TwoPhaseState->prepXacts[i];
2632
2633 if (FullTransactionIdEquals(gxact->fxid, fxid))
2634 {
2635 Assert(gxact->inredo);
2636 found = true;
2637 break;
2638 }
2639 }
2640
2641 /*
2642 * Just leave if there is nothing, this is expected during WAL replay.
2643 */
2644 if (!found)
2645 return;
2646
2647 /*
2648 * And now we can clean up any files we may have left.
2649 */
2650 elog(DEBUG2, "removing 2PC data for transaction %u of epoch %u ",
2653
 /* Only entries flagged ondisk have a state file to remove. */
2654 if (gxact->ondisk)
2655 RemoveTwoPhaseFile(fxid, giveWarning);
2656
2657 RemoveGXact(gxact);
2658}
2659
2660/*
2661 * Wrapper of PrepareRedoRemoveFull(), for TransactionIds.
 *
 * Builds a FullTransactionId 'fxid' for 'xid' and forwards it, together with
 * 'giveWarning', to PrepareRedoRemoveFull().  That function asserts that
 * TwoPhaseStateLock is held in exclusive mode, so the same requirement
 * applies to callers of this wrapper.
 *
 * NOTE(review): the initializer expression of 'fxid' (listing line 2667) is
 * not shown here, so how the conversion is performed cannot be confirmed
 * from this listing.
2662 */
2663void
2664PrepareRedoRemove(TransactionId xid, bool giveWarning)
2665{
2666 FullTransactionId fxid =
2668
2669 PrepareRedoRemoveFull(fxid, giveWarning);
2670}
2671
2672/*
2673 * LookupGXact
2674 * Check if the prepared transaction with the given GID, lsn and timestamp
2675 * exists.
2676 *
2677 * Note that we always compare with the LSN where prepare ends because that is
2678 * what is stored as origin_lsn in the 2PC file.
2679 *
2680 * This function is primarily used to check if the prepared transaction
2681 * received from the upstream (remote node) already exists. Checking only GID
2682 * is not sufficient because a different prepared xact with the same GID can
2683 * exist on the same node. So, we are ensuring to match origin_lsn and
2684 * origin_timestamp of prepared xact to avoid the possibility of a match of
2685 * prepared xact from two different nodes.
 *
 * Arguments:
 *   gid                       - GID compared with strcmp() against each
 *                               valid entry
 *   prepare_end_lsn           - compared with origin_lsn in the entry's
 *                               two-phase header
 *   origin_prepare_timestamp  - compared with origin_timestamp in that header
 *
 * Returns true if a valid entry matches on all three, false otherwise.
 * TwoPhaseStateLock is taken in shared mode for the whole scan and released
 * before returning.
 *
 * NOTE(review): listing lines 2697 and 2720 are absent, so the declaration
 * of 'gxact' and the statement in the non-ondisk branch that fills 'buf'
 * are not shown here.
2686 */
2687bool
2688LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn,
2689 TimestampTz origin_prepare_timestamp)
2690{
2691 int i;
2692 bool found = false;
2693
2694 LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
2695 for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
2696 {
2698
2699 /* Ignore not-yet-valid GIDs. */
2700 if (gxact->valid && strcmp(gxact->gid, gid) == 0)
2701 {
2702 char *buf;
2703 TwoPhaseFileHeader *hdr;
2704
2705 /*
2706 * We are not expecting collisions of GXACTs (same gid) between
2707 * publisher and subscribers, so we perform all I/O while holding
2708 * TwoPhaseStateLock for simplicity.
2709 *
2710 * To move the I/O out of the lock, we need to ensure that no
2711 * other backend commits the prepared xact in the meantime. We can
2712 * do this optimization if we encounter many collisions in GID
2713 * between publisher and subscriber.
2714 */
2715 if (gxact->ondisk)
2716 buf = ReadTwoPhaseFile(gxact->fxid, false);
2717 else
2718 {
2719 Assert(gxact->prepare_start_lsn);
2721 }
2722
 /* The two-phase data begins with its header. */
2723 hdr = (TwoPhaseFileHeader *) buf;
2724
2725 if (hdr->origin_lsn == prepare_end_lsn &&
2726 hdr->origin_timestamp == origin_prepare_timestamp)
2727 {
2728 found = true;
 /* buf is freed on the match path as well as on the fall-through path. */
2729 pfree(buf);
2730 break;
2731 }
2732
2733 pfree(buf);
2734 }
2735 }
2736 LWLockRelease(TwoPhaseStateLock);
2737 return found;
2738}
2739
2740/*
2741 * TwoPhaseTransactionGid
2742 * Form the prepared transaction GID for two_phase transactions.
2743 *
2744 * Return the GID in the supplied buffer.
2745 */
2746void
2747TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid_res, int szgid)
2748{
2749 Assert(OidIsValid(subid));
2750
2751 if (!TransactionIdIsValid(xid))
2752 ereport(ERROR,
2753 (errcode(ERRCODE_PROTOCOL_VIOLATION),
2754 errmsg_internal("invalid two-phase transaction ID")));
2755
2756 snprintf(gid_res, szgid, "pg_gid_%u_%u", subid, xid);
2757}
2758
2759/*
2760 * IsTwoPhaseTransactionGidForSubid
2761 * Check whether the given GID (as formed by TwoPhaseTransactionGid) is
2762 * for the specified 'subid'.
2763 */
2764static bool
2766{
2767 int ret;
2768 Oid subid_from_gid;
2769 TransactionId xid_from_gid;
2770 char gid_tmp[GIDSIZE];
2771
2772 /* Extract the subid and xid from the given GID */
2773 ret = sscanf(gid, "pg_gid_%u_%u", &subid_from_gid, &xid_from_gid);
2774
2775 /*
2776 * Check that the given GID has expected format, and at least the subid
2777 * matches.
2778 */
2779 if (ret != 2 || subid != subid_from_gid)
2780 return false;
2781
2782 /*
2783 * Reconstruct a temporary GID based on the subid and xid extracted from
2784 * the given GID and check whether the temporary GID and the given GID
2785 * match.
2786 */
2787 TwoPhaseTransactionGid(subid, xid_from_gid, gid_tmp, sizeof(gid_tmp));
2788
2789 return strcmp(gid, gid_tmp) == 0;
2790}
2791
2792/*
2793 * LookupGXactBySubid
2794 * Check if the prepared transaction done by apply worker exists.
 *
 * Scans the active prepared-transaction array under TwoPhaseStateLock held
 * in shared mode and returns true as soon as a valid entry satisfies the
 * test, false if none does.
 *
 * NOTE(review): listing lines 2797, 2804 and 2808 are absent, so the
 * declarator line, the declaration of 'gxact' and the second half of the
 * match condition are not shown here.
2795 */
2796bool
2798{
2799 bool found = false;
2800
2801 LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
2802 for (int i = 0; i < TwoPhaseState->numPrepXacts; i++)
2803 {
2805
2806 /* Ignore not-yet-valid GIDs. */
2807 if (gxact->valid &&
2809 {
2810 found = true;
2811 break;
2812 }
2813 }
2814 LWLockRelease(TwoPhaseStateLock);
2815
2816 return found;
2817}
2818
2819/*
2820 * TwoPhaseGetOldestXidInCommit
2821 * Return the oldest transaction ID from prepared transactions that are
2822 * currently in the commit critical section.
2823 *
2824 * This function only considers transactions in the currently connected
2825 * database. If no matching transactions are found, it returns
2826 * InvalidTransactionId.
 *
 * The scan runs with TwoPhaseStateLock held in shared mode, and the lock is
 * released before returning.
 *
 * NOTE(review): listing lines 2828-2829, 2837 and 2844 are absent, so the
 * declarator lines, the declaration of 'gxact' and the condition guarding
 * the second 'continue' are not shown here.
2827 */
2830{
2831 TransactionId oldestRunningXid = InvalidTransactionId;
2832
2833 LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
2834
2835 for (int i = 0; i < TwoPhaseState->numPrepXacts; i++)
2836 {
2838 PGPROC *commitproc;
2839 TransactionId xid;
2840
 /* Entries not yet marked valid are skipped. */
2841 if (!gxact->valid)
2842 continue;
2843
2845 continue;
2846
2847 /*
2848 * Get the backend that is handling the transaction. It's safe to
2849 * access this backend while holding TwoPhaseStateLock, as the backend
2850 * can only be destroyed after either removing or unlocking the
2851 * current global transaction, both of which require an exclusive
2852 * TwoPhaseStateLock.
2853 */
2854 commitproc = GetPGProcByNumber(gxact->locking_backend);
2855
 /* Skip entries whose locking backend belongs to another database. */
2856 if (MyDatabaseId != commitproc->databaseId)
2857 continue;
2858
 /* Skip entries whose locking backend does not have DELAY_CHKPT_IN_COMMIT set. */
2859 if ((commitproc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0)
2860 continue;
2861
2862 xid = XidFromFullTransactionId(gxact->fxid);
2863
 /* Keep the first qualifying xid, or any later one that precedes it. */
2864 if (!TransactionIdIsValid(oldestRunningXid) ||
2865 TransactionIdPrecedes(xid, oldestRunningXid))
2866 oldestRunningXid = xid;
2867 }
2868
2869 LWLockRelease(TwoPhaseStateLock);
2870
2871 return oldestRunningXid;
2872}
#define pg_write_barrier()
Definition: atomics.h:155
static void pg_atomic_init_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:451
int16 AttrNumber
Definition: attnum.h:21
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
static Datum values[MAXATTR]
Definition: bootstrap.c:153
#define CStringGetTextDatum(s)
Definition: builtins.h:97
#define MAXALIGN(LEN)
Definition: c.h:811
#define Max(x, y)
Definition: c.h:998
#define PG_BINARY
Definition: c.h:1273
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:471
uint16_t uint16
Definition: c.h:538
uint32_t uint32
Definition: c.h:539
#define MemSet(start, val, len)
Definition: c.h:1020
uint32 TransactionId
Definition: c.h:658
#define OidIsValid(objectId)
Definition: c.h:775
size_t Size
Definition: c.h:611
void TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids, TransactionId *subxids, TimestampTz timestamp, RepOriginId nodeid)
Definition: commit_ts.c:139
int64 TimestampTz
Definition: timestamp.h:39
int errmsg_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1184
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1161
int errcode_for_file_access(void)
Definition: elog.c:877
int errdetail(const char *fmt,...)
Definition: elog.c:1207
int errhint(const char *fmt,...)
Definition: elog.c:1321
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define LOG
Definition: elog.h:31
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
TupleDesc BlessTupleDesc(TupleDesc tupdesc)
Definition: execTuples.c:2260
int FreeDir(DIR *dir)
Definition: fd.c:3022
int CloseTransientFile(int fd)
Definition: fd.c:2868
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:753
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2904
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2970
int pg_fsync(int fd)
Definition: fd.c:386
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2691
#define MaxAllocSize
Definition: fe_memutils.h:22
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define SRF_IS_FIRSTCALL()
Definition: funcapi.h:304
#define SRF_PERCALL_SETUP()
Definition: funcapi.h:308
#define SRF_RETURN_NEXT(_funcctx, _result)
Definition: funcapi.h:310
#define SRF_FIRSTCALL_INIT()
Definition: funcapi.h:306
static Datum HeapTupleGetDatum(const HeapTupleData *tuple)
Definition: funcapi.h:230
#define SRF_RETURN_DONE(_funcctx)
Definition: funcapi.h:328
ProcNumber MyProcNumber
Definition: globals.c:90
bool IsUnderPostmaster
Definition: globals.c:120
bool IsPostmasterEnvironment
Definition: globals.c:119
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: heaptuple.c:1117
static void dlist_init(dlist_head *head)
Definition: ilist.h:314
static void dlist_node_init(dlist_node *node)
Definition: ilist.h:325
#define INJECTION_POINT_CACHED(name, arg)
#define INJECTION_POINT_LOAD(name)
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
int xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs, bool *RelcacheInitFileInval)
Definition: inval.c:1012
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:337
int i
Definition: isn.c:77
#define VirtualTransactionIdIsValid(vxid)
Definition: lock.h:69
#define GET_VXID_FROM_PGPROC(vxid_dst, proc)
Definition: lock.h:79
#define LocalTransactionIdIsValid(lxid)
Definition: lock.h:68
#define VirtualTransactionIdEquals(vxid1, vxid2)
Definition: lock.h:73
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1977
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2021
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
@ LW_WS_NOT_WAITING
Definition: lwlock.h:30
#define NUM_LOCK_PARTITIONS
Definition: lwlock.h:95
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc0(Size size)
Definition: mcxt.c:1395
void * palloc(Size size)
Definition: mcxt.c:1365
void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
Definition: md.c:1587
#define RESUME_INTERRUPTS()
Definition: miscadmin.h:135
#define AmStartupProcess()
Definition: miscadmin.h:389
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define HOLD_INTERRUPTS()
Definition: miscadmin.h:133
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
Oid GetUserId(void)
Definition: miscinit.c:469
TimestampTz replorigin_session_origin_timestamp
Definition: origin.c:165
void replorigin_session_advance(XLogRecPtr remote_commit, XLogRecPtr local_commit)
Definition: origin.c:1255
RepOriginId replorigin_session_origin
Definition: origin.c:163
void replorigin_advance(RepOriginId node, XLogRecPtr remote_commit, XLogRecPtr local_commit, bool go_backward, bool wal_log)
Definition: origin.c:911
XLogRecPtr replorigin_session_origin_lsn
Definition: origin.c:164
#define DoNotReplicateId
Definition: origin.h:34
#define InvalidRepOriginId
Definition: origin.h:33
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
#define MAXPGPATH
uint32 pg_crc32c
Definition: pg_crc32c.h:38
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:153
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:158
const void size_t len
const void * data
while(p+4<=pend)
static char * user
Definition: pg_regress.c:119
static char * buf
Definition: pg_test_fsync.c:72
void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_item *items, bool is_redo)
Definition: pgstat_xact.c:314
void AtEOXact_PgStat(bool isCommit, bool parallel)
Definition: pgstat_xact.c:40
int pgstat_get_transactional_drops(bool isCommit, xl_xact_stats_item **items)
Definition: pgstat_xact.c:272
#define snprintf
Definition: port.h:239
static Datum TransactionIdGetDatum(TransactionId X)
Definition: postgres.h:282
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
void PredicateLockTwoPhaseFinish(FullTransactionId fxid, bool isCommit)
Definition: predicate.c:4882
static int fd(const char *x, int i)
Definition: preproc-init.c:105
short access
Definition: preproc-type.c:36
#define DELAY_CHKPT_IN_COMMIT
Definition: proc.h:137
#define GetPGProcByNumber(n)
Definition: proc.h:440
#define PGPROC_MAX_CACHED_SUBXIDS
Definition: proc.h:39
#define GetNumberFromPGProc(proc)
Definition: proc.h:441
#define DELAY_CHKPT_START
Definition: proc.h:135
@ PROC_WAIT_STATUS_OK
Definition: proc.h:141
void ProcArrayAdd(PGPROC *proc)
Definition: procarray.c:468
void ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
Definition: procarray.c:565
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
void RelationCacheInitFilePostInvalidate(void)
Definition: relcache.c:6885
void RelationCacheInitFilePreInvalidate(void)
Definition: relcache.c:6860
Size add_size(Size s1, Size s2)
Definition: shmem.c:493
Size mul_size(Size s1, Size s2)
Definition: shmem.c:510
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
void SendSharedInvalidMessages(const SharedInvalidationMessage *msgs, int n)
Definition: sinval.c:47
PGPROC * MyProc
Definition: proc.c:66
PGPROC * PreparedXactProcs
Definition: proc.c:80
void StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
Definition: standby.c:1092
int smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr)
Definition: storage.c:893
#define ERRCODE_DUPLICATE_OBJECT
Definition: streamutil.c:30
Definition: dirent.c:26
void * user_fctx
Definition: funcapi.h:82
MemoryContext multi_call_memory_ctx
Definition: funcapi.h:101
TupleDesc tuple_desc
Definition: funcapi.h:112
TimestampTz prepared_at
Definition: twophase.c:152
XLogRecPtr prepare_start_lsn
Definition: twophase.c:161
XLogRecPtr prepare_end_lsn
Definition: twophase.c:162
GlobalTransaction next
Definition: twophase.c:150
FullTransactionId fxid
Definition: twophase.c:163
ProcNumber locking_backend
Definition: twophase.c:166
char gid[GIDSIZE]
Definition: twophase.c:170
Definition: proc.h:179
bool isRegularBackend
Definition: proc.h:230
TransactionId xmin
Definition: proc.h:194
LocalTransactionId lxid
Definition: proc.h:217
PROCLOCK * waitProcLock
Definition: proc.h:250
uint8 lwWaitMode
Definition: proc.h:241
uint8 statusFlags
Definition: proc.h:259
Oid databaseId
Definition: proc.h:224
struct PGPROC::@128 vxid
pg_atomic_uint64 waitStart
Definition: proc.h:254
ProcNumber procNumber
Definition: proc.h:212
int pid
Definition: proc.h:199
XidCacheStatus subxidStatus
Definition: proc.h:280
LOCK * waitLock
Definition: proc.h:249
TransactionId xid
Definition: proc.h:189
struct XidCache subxids
Definition: proc.h:282
int delayChkptFlags
Definition: proc.h:257
dlist_head myProcLocks[NUM_LOCK_PARTITIONS]
Definition: proc.h:278
Oid roleId
Definition: proc.h:225
ProcWaitStatus waitStatus
Definition: proc.h:184
Oid tempNamespaceId
Definition: proc.h:227
dlist_node links
Definition: proc.h:180
uint8 lwWaiting
Definition: proc.h:240
struct StateFileChunk * next
Definition: twophase.c:1002
FullTransactionId nextXid
Definition: transam.h:220
TwoPhaseRmgrId rmid
Definition: twophase.c:989
GlobalTransaction freeGXacts
Definition: twophase.c:180
GlobalTransaction prepXacts[FLEXIBLE_ARRAY_MEMBER]
Definition: twophase.c:186
GlobalTransaction array
Definition: twophase.c:702
bool overflowed
Definition: proc.h:46
uint8 count
Definition: proc.h:44
TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS]
Definition: proc.h:51
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
__int64 st_size
Definition: win32_port.h:263
TimestampTz prepared_at
Definition: xact.h:359
int32 nabortrels
Definition: xact.h:363
int32 ninvalmsgs
Definition: xact.h:366
bool initfileinval
Definition: xact.h:367
int32 ncommitstats
Definition: xact.h:364
TimestampTz origin_timestamp
Definition: xact.h:370
uint16 gidlen
Definition: xact.h:368
uint32 total_len
Definition: xact.h:356
int32 nabortstats
Definition: xact.h:365
Oid database
Definition: xact.h:358
XLogRecPtr origin_lsn
Definition: xact.h:369
uint32 magic
Definition: xact.h:355
int32 ncommitrels
Definition: xact.h:362
TransactionId xid
Definition: xact.h:357
int32 nsubxacts
Definition: xact.h:361
uint32 total_len
Definition: twophase.c:1011
uint32 num_chunks
Definition: twophase.c:1009
StateFileChunk * head
Definition: twophase.c:1007
StateFileChunk * tail
Definition: twophase.c:1008
uint32 bytes_free
Definition: twophase.c:1010
void SubTransSetParent(TransactionId xid, TransactionId parent)
Definition: subtrans.c:84
bool superuser_arg(Oid roleid)
Definition: superuser.c:56
void SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
Definition: syncrep.c:148
TransactionId TransactionIdLatest(TransactionId mainxid, int nxids, const TransactionId *xids)
Definition: transam.c:345
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:126
void TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids)
Definition: transam.c:240
void TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids)
Definition: transam.c:270
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
bool TransactionIdDidAbort(TransactionId transactionId)
Definition: transam.c:188
bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition: transam.c:314
#define FullTransactionIdEquals(a, b)
Definition: transam.h:50
#define InvalidTransactionId
Definition: transam.h:31
static FullTransactionId FullTransactionIdFromAllowableAt(FullTransactionId nextFullXid, TransactionId xid)
Definition: transam.h:381
#define EpochFromFullTransactionId(x)
Definition: transam.h:47
static FullTransactionId FullTransactionIdFromU64(uint64 value)
Definition: transam.h:81
#define FullTransactionIdFollowsOrEquals(a, b)
Definition: transam.h:54
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
#define XidFromFullTransactionId(x)
Definition: transam.h:48
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define FullTransactionIdIsValid(x)
Definition: transam.h:55
TupleDesc CreateTemplateTupleDesc(int natts)
Definition: tupdesc.c:182
void TupleDescInitEntry(TupleDesc desc, AttrNumber attributeNumber, const char *attributeName, Oid oidtypeid, int32 typmod, int attdim)
Definition: tupdesc.c:842
static char * ReadTwoPhaseFile(FullTransactionId fxid, bool missing_ok)
Definition: twophase.c:1295
static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
Definition: twophase.c:1412
ProcNumber TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held)
Definition: twophase.c:908
TransactionId TwoPhaseGetOldestXidInCommit(void)
Definition: twophase.c:2829
static void ProcessRecords(char *bufptr, FullTransactionId fxid, const TwoPhaseCallback callbacks[])
Definition: twophase.c:1692
void TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid_res, int szgid)
Definition: twophase.c:2747
void RecoverPreparedTransactions(void)
Definition: twophase.c:2083
static bool twophaseExitRegistered
Definition: twophase.c:199
void restoreTwoPhaseData(void)
Definition: twophase.c:1904
static GlobalTransaction TwoPhaseGetGXact(FullTransactionId fxid, bool lock_held)
Definition: twophase.c:804
bool LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn, TimestampTz origin_prepare_timestamp)
Definition: twophase.c:2688
Size TwoPhaseShmemSize(void)
Definition: twophase.c:239
#define TWOPHASE_DIR
Definition: twophase.c:113
GlobalTransaction MarkAsPreparing(FullTransactionId fxid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid)
Definition: twophase.c:361
static void RecordTransactionAbortPrepared(TransactionId xid, int nchildren, TransactionId *children, int nrels, RelFileLocator *rels, int nstats, xl_xact_stats_item *stats, const char *gid)
Definition: twophase.c:2432
void RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, const void *data, uint32 len)
Definition: twophase.c:1271
int max_prepared_xacts
Definition: twophase.c:116
static FullTransactionId AdjustToFullTransactionId(TransactionId xid)
Definition: twophase.c:943
static void RecordTransactionCommitPrepared(TransactionId xid, int nchildren, TransactionId *children, int nrels, RelFileLocator *rels, int nstats, xl_xact_stats_item *stats, int ninvalmsgs, SharedInvalidationMessage *invalmsgs, bool initfileinval, const char *gid)
Definition: twophase.c:2313
static void RemoveGXact(GlobalTransaction gxact)
Definition: twophase.c:632
struct TwoPhaseStateData TwoPhaseStateData
static void RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning)
Definition: twophase.c:1723
static GlobalTransaction MyLockedGxact
Definition: twophase.c:197
static TwoPhaseStateData * TwoPhaseState
Definition: twophase.c:189
void AtAbort_Twophase(void)
Definition: twophase.c:306
struct GlobalTransactionData GlobalTransactionData
static void save_state_data(const void *data, uint32 len)
Definition: twophase.c:1024
#define TWOPHASE_MAGIC
Definition: twophase.c:976
void FinishPreparedTransaction(const char *gid, bool isCommit)
Definition: twophase.c:1497
struct TwoPhaseRecordOnDisk TwoPhaseRecordOnDisk
TransactionId TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, bool *have_more)
Definition: twophase.c:857
static char * ProcessTwoPhaseBuffer(FullTransactionId fxid, XLogRecPtr prepare_start_lsn, bool fromdisk, bool setParent, bool setNextXid)
Definition: twophase.c:2187
static void GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts, TransactionId *children)
Definition: twophase.c:508
void PrepareRedoRemove(TransactionId xid, bool giveWarning)
Definition: twophase.c:2664
Datum pg_prepared_xact(PG_FUNCTION_ARGS)
Definition: twophase.c:715
void EndPrepare(GlobalTransaction gxact)
Definition: twophase.c:1145
TransactionId PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
Definition: twophase.c:1966
void StartPrepare(GlobalTransaction gxact)
Definition: twophase.c:1052
static int GetPreparedTransactionList(GlobalTransaction *gxacts)
Definition: twophase.c:670
void TwoPhaseShmemInit(void)
Definition: twophase.c:255
void StandbyRecoverPreparedTransactions(void)
Definition: twophase.c:2045
static void RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len)
Definition: twophase.c:1742
static void AtProcExit_Twophase(int code, Datum arg)
Definition: twophase.c:296
static void PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning)
Definition: twophase.c:2620
static int TwoPhaseFilePath(char *path, FullTransactionId fxid)
Definition: twophase.c:950
static void MarkAsPrepared(GlobalTransaction gxact, bool lock_held)
Definition: twophase.c:534
void PostPrepare_Twophase(void)
Definition: twophase.c:346
bool LookupGXactBySubid(Oid subid)
Definition: twophase.c:2797
PGPROC * TwoPhaseGetDummyProc(FullTransactionId fxid, bool lock_held)
Definition: twophase.c:923
xl_xact_prepare TwoPhaseFileHeader
Definition: twophase.c:978
void CheckPointTwoPhase(XLogRecPtr redo_horizon)
Definition: twophase.c:1822
struct StateFileChunk StateFileChunk
bool StandbyTransactionIdIsPrepared(TransactionId xid)
Definition: twophase.c:1467
static GlobalTransaction LockGXact(const char *gid, Oid user)
Definition: twophase.c:556
static void MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid)
Definition: twophase.c:435
static bool IsTwoPhaseTransactionGidForSubid(Oid subid, char *gid)
Definition: twophase.c:2765
void PrepareRedoAdd(FullTransactionId fxid, char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn, RepOriginId origin_id)
Definition: twophase.c:2507
static struct xllist records
struct GlobalTransactionData * GlobalTransaction
Definition: twophase.h:26
const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID+1]
Definition: twophase_rmgr.c:33
const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID+1]
Definition: twophase_rmgr.c:24
const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID+1]
Definition: twophase_rmgr.c:42
void(* TwoPhaseCallback)(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: twophase_rmgr.h:19
#define TWOPHASE_RM_MAX_ID
Definition: twophase_rmgr.h:31
uint8 TwoPhaseRmgrId
Definition: twophase_rmgr.h:21
#define TWOPHASE_RM_END_ID
Definition: twophase_rmgr.h:26
static Datum TimestampTzGetDatum(TimestampTz X)
Definition: timestamp.h:52
FullTransactionId ReadNextFullTransactionId(void)
Definition: varsup.c:288
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
TransamVariablesData * TransamVariables
Definition: varsup.c:34
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:69
static void pgstat_report_wait_end(void)
Definition: wait_event.h:85
#define fstat
Definition: win32_port.h:273
XLogRecPtr XactLogCommitRecord(TimestampTz commit_time, int nsubxacts, TransactionId *subxacts, int nrels, RelFileLocator *rels, int ndroppedstats, xl_xact_stats_item *droppedstats, int nmsgs, SharedInvalidationMessage *msgs, bool relcacheInval, int xactflags, TransactionId twophase_xid, const char *twophase_gid)
Definition: xact.c:5826
int xactGetCommittedChildren(TransactionId **ptr)
Definition: xact.c:5802
int MyXactFlags
Definition: xact.c:136
XLogRecPtr XactLogAbortRecord(TimestampTz abort_time, int nsubxacts, TransactionId *subxacts, int nrels, RelFileLocator *rels, int ndroppedstats, xl_xact_stats_item *droppedstats, int xactflags, TransactionId twophase_xid, const char *twophase_gid)
Definition: xact.c:5998
#define XLOG_XACT_PREPARE
Definition: xact.h:171
#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK
Definition: xact.h:109
#define XLOG_XACT_OPMASK
Definition: xact.h:180
#define GIDSIZE
Definition: xact.h:31
XLogRecPtr ProcLastRecPtr
Definition: xlog.c:254
bool RecoveryInProgress(void)
Definition: xlog.c:6383
XLogRecPtr XactLastRecEnd
Definition: xlog.c:255
int wal_segment_size
Definition: xlog.c:144
bool log_checkpoints
Definition: xlog.c:130
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2780
#define XLOG_INCLUDE_ORIGIN
Definition: xlog.h:154
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:46
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint16 RepOriginId
Definition: xlogdefs.h:68
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:364
void XLogSetRecordFlags(uint8 flags)
Definition: xloginsert.c:456
void XLogBeginInsert(void)
Definition: xloginsert.c:149
void XLogEnsureRecordSpace(int max_block_id, int ndatas)
Definition: xloginsert.c:175
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition: xlogreader.c:107
XLogRecord * XLogReadRecord(XLogReaderState *state, char **errormsg)
Definition: xlogreader.c:390
void XLogReaderFree(XLogReaderState *state)
Definition: xlogreader.c:162
void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr)
Definition: xlogreader.c:232
#define XLogRecGetDataLen(decoder)
Definition: xlogreader.h:416
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecGetRmid(decoder)
Definition: xlogreader.h:411
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XL_ROUTINE(...)
Definition: xlogreader.h:117
bool reachedConsistency
Definition: xlogrecovery.c:301
static XLogReaderState * xlogreader
Definition: xlogrecovery.c:190
void wal_segment_close(XLogReaderState *state)
Definition: xlogutils.c:831
void wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID *tli_p)
Definition: xlogutils.c:806
bool InRecovery
Definition: xlogutils.c:50
int read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
Definition: xlogutils.c:845
#define InHotStandby
Definition: xlogutils.h:60