Thanks to visit codestin.com
Credit goes to doxygen.postgresql.org

PostgreSQL Source Code git master
multixact.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * multixact.c
4 * PostgreSQL multi-transaction-log manager
5 *
6 * The pg_multixact manager is a pg_xact-like manager that stores an array of
7 * MultiXactMember for each MultiXactId. It is a fundamental part of the
8 * shared-row-lock implementation. Each MultiXactMember is comprised of a
9 * TransactionId and a set of flag bits. The name is a bit historical:
10 * originally, a MultiXactId consisted of more than one TransactionId (except
11 * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12 * legitimate to have MultiXactIds that only include a single Xid.
13 *
14 * The meaning of the flag bits is opaque to this module, but they are mostly
15 * used in heapam.c to identify lock modes that each of the member transactions
16 * is holding on any given tuple. This module just contains support to store
17 * and retrieve the arrays.
18 *
19 * We use two SLRU areas, one for storing the offsets at which the data
20 * starts for each MultiXactId in the other one. This trick allows us to
21 * store variable length arrays of TransactionIds. (We could alternatively
22 * use one area containing counts and TransactionIds, with valid MultiXactId
23 * values pointing at slots containing counts; but that way seems less robust
24 * since it would get completely confused if someone inquired about a bogus
25 * MultiXactId that pointed to an intermediate slot containing an XID.)
26 *
27 * XLOG interactions: this module generates a record whenever a new OFFSETs or
28 * MEMBERs page is initialized to zeroes, as well as an
29 * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30 * This module ignores the WAL rule "write xlog before data," because it
31 * suffices that actions recording a MultiXactId in a heap xmax do follow that
32 * rule. The only way for the MXID to be referenced from any data page is for
33 * heap_lock_tuple() or heap_update() to have put it there, and each generates
34 * an XLOG record that must follow ours. The normal LSN interlock between the
35 * data page and that XLOG record will ensure that our XLOG record reaches
36 * disk first. If the SLRU members/offsets data reaches disk sooner than the
37 * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38 * the flip side, to ensure that all referenced entries _do_ reach disk, this
39 * module's XLOG records completely rebuild the data entered since the last
40 * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41 * before each checkpoint is considered complete.
42 *
43 * Like clog.c, and unlike subtrans.c, we have to preserve state across
44 * crashes and ensure that MXID and offset numbering increases monotonically
45 * across a crash. We do this in the same way as it's done for transaction
46 * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47 * could need to worry about, and we just make sure that at the end of
48 * replay, the next-MXID and next-offset counters are at least as large as
49 * anything we saw during replay.
50 *
51 * We are able to remove segments no longer necessary by carefully tracking
52 * each table's used values: during vacuum, any multixact older than a certain
53 * value is removed; the cutoff value is stored in pg_class. The minimum value
54 * across all tables in each database is stored in pg_database, and the global
55 * minimum across all databases is part of pg_control and is kept in shared
56 * memory. Whenever that minimum is advanced, the SLRUs are truncated.
57 *
58 * When new multixactid values are to be created, care is taken that the
59 * counter does not fall within the wraparound horizon considering the global
60 * minimum value.
61 *
62 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
63 * Portions Copyright (c) 1994, Regents of the University of California
64 *
65 * src/backend/access/transam/multixact.c
66 *
67 *-------------------------------------------------------------------------
68 */
69#include "postgres.h"
70
71#include "access/multixact.h"
72#include "access/slru.h"
73#include "access/twophase.h"
75#include "access/xlog.h"
76#include "access/xloginsert.h"
77#include "access/xlogutils.h"
78#include "miscadmin.h"
79#include "pg_trace.h"
80#include "pgstat.h"
83#include "storage/pmsignal.h"
84#include "storage/proc.h"
85#include "storage/procarray.h"
86#include "utils/guc_hooks.h"
88#include "utils/lsyscache.h"
89#include "utils/memutils.h"
90
91
92/*
93 * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
94 * used everywhere else in Postgres.
95 *
96 * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
97 * MultiXact page numbering also wraps around at
98 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
99 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
100 * take no explicit notice of that fact in this module, except when comparing
101 * segment and page numbers in TruncateMultiXact (see
102 * MultiXactOffsetPagePrecedes).
103 */
104
105/* We need four bytes per offset */
106#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
107
108static inline int64
110{
111 return multi / MULTIXACT_OFFSETS_PER_PAGE;
112}
113
114static inline int
116{
117 return multi % MULTIXACT_OFFSETS_PER_PAGE;
118}
119
120static inline int64
122{
124}
125
126/*
127 * The situation for members is a bit more complex: we store one byte of
128 * additional flag bits for each TransactionId. To do this without getting
129 * into alignment issues, we store four bytes of flags, and then the
130 * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
131 * groups are stored whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
132 * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
133 * performance) trumps space efficiency here.
134 *
135 * Note that the "offset" macros work with byte offset, not array indexes, so
136 * arithmetic must be done using "char *" pointers.
137 */
/*
 * Flag storage: each member gets MXACT_MEMBER_BITS_PER_XACT flag bits, which
 * at eight bits means exactly one member's flags per byte.
 */
#define MXACT_MEMBER_BITS_PER_XACT			8
#define MXACT_MEMBER_FLAGS_PER_BYTE			1
#define MXACT_MEMBER_XACT_BITMASK \
	((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)

/* number of whole flag bytes that lead each group */
#define MULTIXACT_FLAGBYTES_PER_GROUP		4
/* number of members whose flags fit in those flag bytes */
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
	(MXACT_MEMBER_FLAGS_PER_BYTE * MULTIXACT_FLAGBYTES_PER_GROUP)
/* bytes occupied by one group: the flag bytes plus one Xid per member */
#define MULTIXACT_MEMBERGROUP_SIZE \
	(MULTIXACT_FLAGBYTES_PER_GROUP + \
	 MULTIXACT_MEMBERS_PER_MEMBERGROUP * sizeof(TransactionId))
/* whole groups that fit in one page; any remainder bytes go unused */
#define MULTIXACT_MEMBERGROUPS_PER_PAGE \
	(BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
/* members stored per page */
#define MULTIXACT_MEMBERS_PER_PAGE \
	(MULTIXACT_MEMBERS_PER_MEMBERGROUP * MULTIXACT_MEMBERGROUPS_PER_PAGE)
153
154/*
155 * Because the number of items per page is not a divisor of the last item
156 * number (member 0xFFFFFFFF), the last segment does not use the maximum number
157 * of pages, and moreover the last used page therein does not use the same
158 * number of items as previous pages. (Another way to say it is that the
159 * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
160 * has some empty space after that item.)
161 *
162 * This constant is the number of members in the last page of the last segment.
163 */
164#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
165 ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
166
167/* page in which a member is to be found */
168static inline int64
170{
171 return offset / MULTIXACT_MEMBERS_PER_PAGE;
172}
173
174static inline int64
176{
178}
179
180/* Location (byte offset within page) of flag word for a given member */
181static inline int
183{
185 int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
186 int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
187
188 return byteoff;
189}
190
191static inline int
193{
194 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
195 int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
196
197 return bshift;
198}
199
200/* Location (byte offset within page) of TransactionId of given member */
201static inline int
203{
204 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
205
206 return MXOffsetToFlagsOffset(offset) +
208 member_in_group * sizeof(TransactionId);
209}
210
211/* Multixact members wraparound thresholds. */
212#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
213#define MULTIXACT_MEMBER_DANGER_THRESHOLD \
214 (MaxMultiXactOffset - MaxMultiXactOffset / 4)
215
216static inline MultiXactId
218{
219 return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1;
220}
221
222/*
223 * Links to shared-memory data structures for MultiXact control
224 */
227
228#define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
229#define MultiXactMemberCtl (&MultiXactMemberCtlData)
230
231/*
232 * MultiXact state shared across all backends. All this state is protected
233 * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and
234 * MultiXactMember to guard accesses to the two sets of SLRU buffers. For
235 * concurrency's sake, we avoid holding more than one of these locks at a
236 * time.)
237 */
238typedef struct MultiXactStateData
239{
240 /* next-to-be-assigned MultiXactId */
242
243 /* next-to-be-assigned offset */
245
246 /* Have we completed multixact startup? */
248
249 /*
250 * Oldest multixact that is still potentially referenced by a relation.
251 * Anything older than this should not be consulted. These values are
252 * updated by vacuum.
253 */
256
257 /*
258 * Oldest multixact offset that is potentially referenced by a multixact
259 * referenced by a relation. We don't always know this value, so there's
260 * a flag here to indicate whether or not we currently do.
261 */
264
265 /* support for anti-wraparound measures */
270
271 /* support for members anti-wraparound measures */
272 MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
273
274 /*
275 * This is used to sleep until a multixact offset is written when we want
276 * to create the next one.
277 */
279
280 /*
281 * Per-backend data starts here. We have two arrays stored in the area
282 * immediately following the MultiXactStateData struct. Each is indexed by
283 * ProcNumber.
284 *
285 * In both arrays, there's a slot for all normal backends
286 * (0..MaxBackends-1) followed by a slot for max_prepared_xacts prepared
287 * transactions.
288 *
289 * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
290 * transaction(s) could possibly be a member of, or InvalidMultiXactId
291 * when the backend has no live transaction that could possibly be a
292 * member of a MultiXact. Each backend sets its entry to the current
293 * nextMXact counter just before first acquiring a shared lock in a given
294 * transaction, and clears it at transaction end. (This works because only
295 * during or after acquiring a shared lock could an XID possibly become a
296 * member of a MultiXact, and that MultiXact would have to be created
297 * during or after the lock acquisition.)
298 *
299 * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
300 * current transaction(s) think is potentially live, or InvalidMultiXactId
301 * when not in a transaction or not in a transaction that's paid any
302 * attention to MultiXacts yet. This is computed when first needed in a
303 * given transaction, and cleared at transaction end. We can compute it
304 * as the minimum of the valid OldestMemberMXactId[] entries at the time
305 * we compute it (using nextMXact if none are valid). Each backend is
306 * required not to attempt to access any SLRU data for MultiXactIds older
307 * than its own OldestVisibleMXactId[] setting; this is necessary because
308 * the relevant SLRU data can be concurrently truncated away.
309 *
310 * The oldest valid value among all of the OldestMemberMXactId[] and
311 * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
312 * possible value still having any live member transaction -- OldestMxact.
313 * Any value older than that is typically removed from tuple headers, or
314 * "frozen" via being replaced with a new xmax. VACUUM can sometimes even
315 * remove an individual MultiXact xmax whose value is >= its OldestMxact
316 * cutoff, though typically only when no individual member XID is still
317 * running. See FreezeMultiXactId for full details.
318 *
319 * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
320 * or the oldest extant Multi remaining in the table is used as the new
321 * pg_class.relminmxid value (whichever is earlier). The minimum of all
322 * relminmxid values in each database is stored in pg_database.datminmxid.
323 * In turn, the minimum of all of those values is stored in pg_control.
324 * This is used as the truncation point for pg_multixact when unneeded
325 * segments get removed by vac_truncate_clog() during vacuuming.
326 */
329
330/*
331 * Size of OldestMemberMXactId and OldestVisibleMXactId arrays.
332 */
333#define MaxOldestSlot (MaxBackends + max_prepared_xacts)
334
335/* Pointers to the state data in shared memory */
339
340
341/*
342 * Definitions for the backend-local MultiXactId cache.
343 *
344 * We use this cache to store known MultiXacts, so we don't need to go to
345 * SLRU areas every time.
346 *
347 * The cache lasts for the duration of a single transaction, the rationale
348 * for this being that most entries will contain our own TransactionId and
349 * so they will be uninteresting by the time our next transaction starts.
350 * (XXX not clear that this is correct --- other members of the MultiXact
351 * could hang around longer than we did. However, it's not clear what a
352 * better policy for flushing old cache entries would be.) FIXME actually
353 * this is plain wrong now that multixacts may contain update Xids.
354 *
355 * We allocate the cache entries in a memory context that is deleted at
356 * transaction end, so we don't need to do retail freeing of entries.
357 */
358typedef struct mXactCacheEnt
359{
365
366#define MAX_CACHE_ENTRIES 256
369
/*
 * Debug tracing helpers.
 *
 * debug_elogN takes N arguments in total and hands them unchanged to elog()
 * when MULTIXACT_DEBUG is defined.  Otherwise each macro expands to nothing,
 * so its arguments are never evaluated.
 */
#ifndef MULTIXACT_DEBUG
#define debug_elog2(lvl,fmt)
#define debug_elog3(lvl,fmt,p1)
#define debug_elog4(lvl,fmt,p1,p2)
#define debug_elog5(lvl,fmt,p1,p2,p3)
#define debug_elog6(lvl,fmt,p1,p2,p3,p4)
#else
#define debug_elog2(lvl,fmt) elog(lvl,fmt)
#define debug_elog3(lvl,fmt,p1) elog(lvl,fmt,p1)
#define debug_elog4(lvl,fmt,p1,p2) elog(lvl,fmt,p1,p2)
#define debug_elog5(lvl,fmt,p1,p2,p3) elog(lvl,fmt,p1,p2,p3)
#define debug_elog6(lvl,fmt,p1,p2,p3,p4) elog(lvl,fmt,p1,p2,p3,p4)
#endif
383
384/* internal MultiXactId management */
385static void MultiXactIdSetOldestVisible(void);
386static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
387 int nmembers, MultiXactMember *members);
388static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
389
390/* MultiXact cache management */
391static int mxactMemberComparator(const void *arg1, const void *arg2);
392static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
393static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
394static void mXactCachePut(MultiXactId multi, int nmembers,
395 MultiXactMember *members);
396
397/* management of SLRU infrastructure */
398static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
399static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
400static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
401 MultiXactOffset offset2);
402static void ExtendMultiXactOffset(MultiXactId multi);
403static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
404static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
405 MultiXactOffset start, uint32 distance);
406static bool SetOffsetVacuumLimit(bool is_startup);
407static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
408static void WriteMTruncateXlogRec(Oid oldestMultiDB,
409 MultiXactId startTruncOff,
410 MultiXactId endTruncOff,
411 MultiXactOffset startTruncMemb,
412 MultiXactOffset endTruncMemb);
413
414
415/*
416 * MultiXactIdCreate
417 * Construct a MultiXactId representing two TransactionIds.
418 *
419 * The two XIDs must be different, or be requesting different statuses.
420 *
421 * NB - we don't worry about our local MultiXactId cache here, because that
422 * is handled by the lower-level routines.
423 */
426 TransactionId xid2, MultiXactStatus status2)
427{
428 MultiXactId newMulti;
429 MultiXactMember members[2];
430
433
434 Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
435
436 /* MultiXactIdSetOldestMember() must have been called already. */
438
439 /*
440 * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
441 * are still running. In typical usage, xid2 will be our own XID and the
442 * caller just did a check on xid1, so it'd be wasted effort.
443 */
444
445 members[0].xid = xid1;
446 members[0].status = status1;
447 members[1].xid = xid2;
448 members[1].status = status2;
449
450 newMulti = MultiXactIdCreateFromMembers(2, members);
451
452 debug_elog3(DEBUG2, "Create: %s",
453 mxid_to_string(newMulti, 2, members));
454
455 return newMulti;
456}
457
458/*
459 * MultiXactIdExpand
460 * Add a TransactionId to a pre-existing MultiXactId.
461 *
462 * If the TransactionId is already a member of the passed MultiXactId with the
463 * same status, just return it as-is.
464 *
465 * Note that we do NOT actually modify the membership of a pre-existing
466 * MultiXactId; instead we create a new one. This is necessary to avoid
467 * a race condition against code trying to wait for one MultiXactId to finish;
468 * see notes in heapam.c.
469 *
470 * NB - we don't worry about our local MultiXactId cache here, because that
471 * is handled by the lower-level routines.
472 *
473 * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
474 * one upgraded by pg_upgrade from a cluster older than this feature) are not
475 * passed in.
476 */
479{
480 MultiXactId newMulti;
481 MultiXactMember *members;
482 MultiXactMember *newMembers;
483 int nmembers;
484 int i;
485 int j;
486
489
490 /* MultiXactIdSetOldestMember() must have been called already. */
492
493 debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
494 multi, xid, mxstatus_to_string(status));
495
496 /*
497 * Note: we don't allow for old multis here. The reason is that the only
498 * caller of this function does a check that the multixact is no longer
499 * running.
500 */
501 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
502
503 if (nmembers < 0)
504 {
505 MultiXactMember member;
506
507 /*
508 * The MultiXactId is obsolete. This can only happen if all the
509 * MultiXactId members stop running between the caller checking and
510 * passing it to us. It would be better to return that fact to the
511 * caller, but it would complicate the API and it's unlikely to happen
512 * too often, so just deal with it by creating a singleton MultiXact.
513 */
514 member.xid = xid;
515 member.status = status;
516 newMulti = MultiXactIdCreateFromMembers(1, &member);
517
518 debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
519 multi, newMulti);
520 return newMulti;
521 }
522
523 /*
524 * If the TransactionId is already a member of the MultiXactId with the
525 * same status, just return the existing MultiXactId.
526 */
527 for (i = 0; i < nmembers; i++)
528 {
529 if (TransactionIdEquals(members[i].xid, xid) &&
530 (members[i].status == status))
531 {
532 debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
533 xid, multi);
534 pfree(members);
535 return multi;
536 }
537 }
538
539 /*
540 * Determine which of the members of the MultiXactId are still of
541 * interest. This is any running transaction, and also any transaction
542 * that grabbed something stronger than just a lock and was committed. (An
543 * update that aborted is of no interest here; and having more than one
544 * update Xid in a multixact would cause errors elsewhere.)
545 *
546 * Removing dead members is not just an optimization: freezing of tuples
547 * whose Xmax are multis depends on this behavior.
548 *
549 * Note we have the same race condition here as above: j could be 0 at the
550 * end of the loop.
551 */
552 newMembers = (MultiXactMember *)
553 palloc(sizeof(MultiXactMember) * (nmembers + 1));
554
555 for (i = 0, j = 0; i < nmembers; i++)
556 {
557 if (TransactionIdIsInProgress(members[i].xid) ||
558 (ISUPDATE_from_mxstatus(members[i].status) &&
559 TransactionIdDidCommit(members[i].xid)))
560 {
561 newMembers[j].xid = members[i].xid;
562 newMembers[j++].status = members[i].status;
563 }
564 }
565
566 newMembers[j].xid = xid;
567 newMembers[j++].status = status;
568 newMulti = MultiXactIdCreateFromMembers(j, newMembers);
569
570 pfree(members);
571 pfree(newMembers);
572
573 debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
574
575 return newMulti;
576}
577
578/*
579 * MultiXactIdIsRunning
580 * Returns whether a MultiXactId is "running".
581 *
582 * We return true if at least one member of the given MultiXactId is still
583 * running. Note that a "false" result is certain not to change,
584 * because it is not legal to add members to an existing MultiXactId.
585 *
586 * Caller is expected to have verified that the multixact does not come from
587 * a pg_upgraded share-locked tuple.
588 */
589bool
590MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
591{
592 MultiXactMember *members;
593 int nmembers;
594 int i;
595
596 debug_elog3(DEBUG2, "IsRunning %u?", multi);
597
598 /*
599 * "false" here means we assume our callers have checked that the given
600 * multi cannot possibly come from a pg_upgraded database.
601 */
602 nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
603
604 if (nmembers <= 0)
605 {
606 debug_elog2(DEBUG2, "IsRunning: no members");
607 return false;
608 }
609
610 /*
611 * Checking for myself is cheap compared to looking in shared memory;
612 * return true if any live subtransaction of the current top-level
613 * transaction is a member.
614 *
615 * This is not needed for correctness, it's just a fast path.
616 */
617 for (i = 0; i < nmembers; i++)
618 {
619 if (TransactionIdIsCurrentTransactionId(members[i].xid))
620 {
621 debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
622 pfree(members);
623 return true;
624 }
625 }
626
627 /*
628 * This could be made faster by having another entry point in procarray.c,
629 * walking the PGPROC array only once for all the members. But in most
630 * cases nmembers should be small enough that it doesn't much matter.
631 */
632 for (i = 0; i < nmembers; i++)
633 {
634 if (TransactionIdIsInProgress(members[i].xid))
635 {
636 debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
637 i, members[i].xid);
638 pfree(members);
639 return true;
640 }
641 }
642
643 pfree(members);
644
645 debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
646
647 return false;
648}
649
650/*
651 * MultiXactIdSetOldestMember
652 * Save the oldest MultiXactId this transaction could be a member of.
653 *
654 * We set the OldestMemberMXactId for a given transaction the first time it's
655 * going to do some operation that might require a MultiXactId (tuple lock,
656 * update or delete). We need to do this even if we end up using a
657 * TransactionId instead of a MultiXactId, because there is a chance that
658 * another transaction would add our XID to a MultiXactId.
659 *
660 * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
661 * be called just before doing any such possibly-MultiXactId-able operation.
662 */
663void
665{
667 {
668 MultiXactId nextMXact;
669
670 /*
671 * You might think we don't need to acquire a lock here, since
672 * fetching and storing of TransactionIds is probably atomic, but in
673 * fact we do: suppose we pick up nextMXact and then lose the CPU for
674 * a long time. Someone else could advance nextMXact, and then
675 * another someone else could compute an OldestVisibleMXactId that
676 * would be after the value we are going to store when we get control
677 * back. Which would be wrong.
678 *
679 * Note that a shared lock is sufficient, because it's enough to stop
680 * someone from advancing nextMXact; and nobody else could be trying
681 * to write to our OldestMember entry, only reading (and we assume
682 * storing it is atomic.)
683 */
684 LWLockAcquire(MultiXactGenLock, LW_SHARED);
685
686 /*
687 * We have to beware of the possibility that nextMXact is in the
688 * wrapped-around state. We don't fix the counter itself here, but we
689 * must be sure to store a valid value in our array entry.
690 */
691 nextMXact = MultiXactState->nextMXact;
692 if (nextMXact < FirstMultiXactId)
693 nextMXact = FirstMultiXactId;
694
696
697 LWLockRelease(MultiXactGenLock);
698
699 debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
700 MyProcNumber, nextMXact);
701 }
702}
703
704/*
705 * MultiXactIdSetOldestVisible
706 * Save the oldest MultiXactId this transaction considers possibly live.
707 *
708 * We set the OldestVisibleMXactId for a given transaction the first time
709 * it's going to inspect any MultiXactId. Once we have set this, we are
710 * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
711 * won't be truncated away.
712 *
713 * The value to set is the oldest of nextMXact and all the valid per-backend
714 * OldestMemberMXactId[] entries. Because of the locking we do, we can be
715 * certain that no subsequent call to MultiXactIdSetOldestMember can set
716 * an OldestMemberMXactId[] entry older than what we compute here. Therefore
717 * there is no live transaction, now or later, that can be a member of any
718 * MultiXactId older than the OldestVisibleMXactId we compute here.
719 */
720static void
722{
724 {
725 MultiXactId oldestMXact;
726 int i;
727
728 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
729
730 /*
731 * We have to beware of the possibility that nextMXact is in the
732 * wrapped-around state. We don't fix the counter itself here, but we
733 * must be sure to store a valid value in our array entry.
734 */
735 oldestMXact = MultiXactState->nextMXact;
736 if (oldestMXact < FirstMultiXactId)
737 oldestMXact = FirstMultiXactId;
738
739 for (i = 0; i < MaxOldestSlot; i++)
740 {
741 MultiXactId thisoldest = OldestMemberMXactId[i];
742
743 if (MultiXactIdIsValid(thisoldest) &&
744 MultiXactIdPrecedes(thisoldest, oldestMXact))
745 oldestMXact = thisoldest;
746 }
747
748 OldestVisibleMXactId[MyProcNumber] = oldestMXact;
749
750 LWLockRelease(MultiXactGenLock);
751
752 debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
753 MyProcNumber, oldestMXact);
754 }
755}
756
757/*
758 * ReadNextMultiXactId
759 * Return the next MultiXactId to be assigned, but don't allocate it
760 */
763{
764 MultiXactId mxid;
765
766 /* XXX we could presumably do this without a lock. */
767 LWLockAcquire(MultiXactGenLock, LW_SHARED);
769 LWLockRelease(MultiXactGenLock);
770
771 if (mxid < FirstMultiXactId)
772 mxid = FirstMultiXactId;
773
774 return mxid;
775}
776
777/*
778 * ReadMultiXactIdRange
779 * Get the range of IDs that may still be referenced by a relation.
780 */
781void
783{
784 LWLockAcquire(MultiXactGenLock, LW_SHARED);
787 LWLockRelease(MultiXactGenLock);
788
789 if (*oldest < FirstMultiXactId)
790 *oldest = FirstMultiXactId;
791 if (*next < FirstMultiXactId)
793}
794
795
796/*
797 * MultiXactIdCreateFromMembers
798 * Make a new MultiXactId from the specified set of members
799 *
800 * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
801 * given TransactionIds as members. Returns the newly created MultiXactId.
802 *
803 * NB: the passed members[] array will be sorted in-place.
804 */
807{
808 MultiXactId multi;
809 MultiXactOffset offset;
811
812 debug_elog3(DEBUG2, "Create: %s",
813 mxid_to_string(InvalidMultiXactId, nmembers, members));
814
815 /*
816 * See if the same set of members already exists in our cache; if so, just
817 * re-use that MultiXactId. (Note: it might seem that looking in our
818 * cache is insufficient, and we ought to search disk to see if a
819 * duplicate definition already exists. But since we only ever create
820 * MultiXacts containing our own XID, in most cases any such MultiXacts
821 * were in fact created by us, and so will be in our cache. There are
822 * corner cases where someone else added us to a MultiXact without our
823 * knowledge, but it's not worth checking for.)
824 */
825 multi = mXactCacheGetBySet(nmembers, members);
826 if (MultiXactIdIsValid(multi))
827 {
828 debug_elog2(DEBUG2, "Create: in cache!");
829 return multi;
830 }
831
832 /* Verify that there is at most one update Xid among the given members. */
833 {
834 int i;
835 bool has_update = false;
836
837 for (i = 0; i < nmembers; i++)
838 {
839 if (ISUPDATE_from_mxstatus(members[i].status))
840 {
841 if (has_update)
842 elog(ERROR, "new multixact has more than one updating member: %s",
843 mxid_to_string(InvalidMultiXactId, nmembers, members));
844 has_update = true;
845 }
846 }
847 }
848
849 /* Load the injection point before entering the critical section */
850 INJECTION_POINT_LOAD("multixact-create-from-members");
851
852 /*
853 * Assign the MXID and offsets range to use, and make sure there is space
854 * in the OFFSETs and MEMBERs files. NB: this routine does
855 * START_CRIT_SECTION().
856 *
857 * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
858 * that we've called MultiXactIdSetOldestMember here. This is because
859 * this routine is used in some places to create new MultiXactIds of which
860 * the current backend is not a member, notably during freezing of multis
861 * in vacuum. During vacuum, in particular, it would be unacceptable to
862 * keep OldestMulti set, in case it runs for long.
863 */
864 multi = GetNewMultiXactId(nmembers, &offset);
865
866 INJECTION_POINT_CACHED("multixact-create-from-members", NULL);
867
868 /* Make an XLOG entry describing the new MXID. */
869 xlrec.mid = multi;
870 xlrec.moff = offset;
871 xlrec.nmembers = nmembers;
872
873 /*
874 * XXX Note: there's a lot of padding space in MultiXactMember. We could
875 * find a more compact representation of this Xlog record -- perhaps all
876 * the status flags in one XLogRecData, then all the xids in another one?
877 * Not clear that it's worth the trouble though.
878 */
881 XLogRegisterData(members, nmembers * sizeof(MultiXactMember));
882
883 (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
884
885 /* Now enter the information into the OFFSETs and MEMBERs logs */
886 RecordNewMultiXact(multi, offset, nmembers, members);
887
888 /* Done with critical section */
890
891 /* Store the new MultiXactId in the local cache, too */
892 mXactCachePut(multi, nmembers, members);
893
894 debug_elog2(DEBUG2, "Create: all done");
895
896 return multi;
897}
898
899/*
900 * RecordNewMultiXact
901 * Write info about a new multixact into the offsets and members files
902 *
903 * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
904 * use it.
905 */
906static void
908 int nmembers, MultiXactMember *members)
909{
910 int64 pageno;
911 int64 prev_pageno;
912 int entryno;
913 int slotno;
914 MultiXactOffset *offptr;
915 int i;
916 LWLock *lock;
917 LWLock *prevlock = NULL;
918
919 pageno = MultiXactIdToOffsetPage(multi);
920 entryno = MultiXactIdToOffsetEntry(multi);
921
924
925 /*
926 * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
927 * to complain about if there's any I/O error. This is kinda bogus, but
928 * since the errors will always give the full pathname, it should be clear
929 * enough that a MultiXactId is really involved. Perhaps someday we'll
930 * take the trouble to generalize the slru.c error reporting code.
931 */
932 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
933 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
934 offptr += entryno;
935
936 *offptr = offset;
937
938 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
939
940 /* Release MultiXactOffset SLRU lock. */
941 LWLockRelease(lock);
942
943 /*
944 * If anybody was waiting to know the offset of this multixact ID we just
945 * wrote, they can read it now, so wake them up.
946 */
948
949 prev_pageno = -1;
950
951 for (i = 0; i < nmembers; i++, offset++)
952 {
953 TransactionId *memberptr;
954 uint32 *flagsptr;
955 uint32 flagsval;
956 int bshift;
957 int flagsoff;
958 int memberoff;
959
960 Assert(members[i].status <= MultiXactStatusUpdate);
961
962 pageno = MXOffsetToMemberPage(offset);
963 memberoff = MXOffsetToMemberOffset(offset);
964 flagsoff = MXOffsetToFlagsOffset(offset);
965 bshift = MXOffsetToFlagsBitShift(offset);
966
967 if (pageno != prev_pageno)
968 {
969 /*
970 * The MultiXactMember SLRU page has changed, so check whether the new
971 * page falls into a different SLRU bank; if so, release the old bank's
972 * lock and acquire the lock on the new bank.
973 */
975 if (lock != prevlock)
976 {
977 if (prevlock != NULL)
978 LWLockRelease(prevlock);
979
981 prevlock = lock;
982 }
983 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
984 prev_pageno = pageno;
985 }
986
987 memberptr = (TransactionId *)
988 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
989
990 *memberptr = members[i].xid;
991
992 flagsptr = (uint32 *)
993 (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
994
995 flagsval = *flagsptr;
996 flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
997 flagsval |= (members[i].status << bshift);
998 *flagsptr = flagsval;
999
1000 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
1001 }
1002
1003 if (prevlock != NULL)
1004 LWLockRelease(prevlock);
1005}
1006
1007/*
1008 * GetNewMultiXactId
1009 * Get the next MultiXactId.
1010 *
1011 * Also, reserve the needed amount of space in the "members" area. The
1012 * starting offset of the reserved space is returned in *offset.
1013 *
1014 * This may generate XLOG records for expansion of the offsets and/or members
1015 * files. Unfortunately, we have to do that while holding MultiXactGenLock
1016 * to avoid race conditions --- the XLOG record for zeroing a page must appear
1017 * before any backend can possibly try to store data in that page!
1018 *
1019 * We start a critical section before advancing the shared counters. The
1020 * caller must end the critical section after writing SLRU data.
1021 */
1022static MultiXactId
1024{
1025 MultiXactId result;
1026 MultiXactOffset nextOffset;
1027
1028 debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
1029
1030 /* safety check, we should never get this far in a HS standby */
1031 if (RecoveryInProgress())
1032 elog(ERROR, "cannot assign MultiXactIds during recovery");
1033
1034 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1035
1036 /* Handle wraparound of the nextMXact counter */
1039
1040 /* Assign the MXID */
1041 result = MultiXactState->nextMXact;
1042
1043 /*----------
1044 * Check to see if it's safe to assign another MultiXactId. This protects
1045 * against catastrophic data loss due to multixact wraparound. The basic
1046 * rules are:
1047 *
1048 * If we're past multiVacLimit or the safe threshold for member storage
1049 * space, or we don't know what the safe threshold for member storage is,
1050 * start trying to force autovacuum cycles.
1051 * If we're past multiWarnLimit, start issuing warnings.
1052 * If we're past multiStopLimit, refuse to create new MultiXactIds.
1053 *
1054 * Note these are pretty much the same protections in GetNewTransactionId.
1055 *----------
1056 */
1058 {
1059 /*
1060 * For safety's sake, we release MultiXactGenLock while sending
1061 * signals, warnings, etc. This is not so much because we care about
1062 * preserving concurrency in this situation, as to avoid any
1063 * possibility of deadlock while doing get_database_name(). First,
1064 * copy all the shared values we'll need in this path.
1065 */
1066 MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
1067 MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
1068 MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
1069 Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
1070
1071 LWLockRelease(MultiXactGenLock);
1072
1073 if (IsUnderPostmaster &&
1074 !MultiXactIdPrecedes(result, multiStopLimit))
1075 {
1076 char *oldest_datname = get_database_name(oldest_datoid);
1077
1078 /*
1079 * Immediately kick autovacuum into action as we're already in
1080 * ERROR territory.
1081 */
1083
1084 /* complain even if that DB has disappeared */
1085 if (oldest_datname)
1086 ereport(ERROR,
1087 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1088 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1089 oldest_datname),
1090 errhint("Execute a database-wide VACUUM in that database.\n"
1091 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1092 else
1093 ereport(ERROR,
1094 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1095 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
1096 oldest_datoid),
1097 errhint("Execute a database-wide VACUUM in that database.\n"
1098 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1099 }
1100
1101 /*
1102 * To avoid swamping the postmaster with signals, we issue the autovac
1103 * request only once per 64K multis generated. This still gives
1104 * plenty of chances before we get into real trouble.
1105 */
1106 if (IsUnderPostmaster && (result % 65536) == 0)
1108
1109 if (!MultiXactIdPrecedes(result, multiWarnLimit))
1110 {
1111 char *oldest_datname = get_database_name(oldest_datoid);
1112
1113 /* complain even if that DB has disappeared */
1114 if (oldest_datname)
1116 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1117 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1118 multiWrapLimit - result,
1119 oldest_datname,
1120 multiWrapLimit - result),
1121 errhint("Execute a database-wide VACUUM in that database.\n"
1122 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1123 else
1125 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1126 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1127 multiWrapLimit - result,
1128 oldest_datoid,
1129 multiWrapLimit - result),
1130 errhint("Execute a database-wide VACUUM in that database.\n"
1131 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1132 }
1133
1134 /* Re-acquire lock and start over */
1135 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1136 result = MultiXactState->nextMXact;
1137 if (result < FirstMultiXactId)
1138 result = FirstMultiXactId;
1139 }
1140
1141 /* Make sure there is room for the MXID in the file. */
1142 ExtendMultiXactOffset(result);
1143
1144 /*
1145 * Reserve the members space, similarly to above. Also, be careful not to
1146 * return zero as the starting offset for any multixact. See
1147 * GetMultiXactIdMembers() for motivation.
1148 */
1149 nextOffset = MultiXactState->nextOffset;
1150 if (nextOffset == 0)
1151 {
1152 *offset = 1;
1153 nmembers++; /* allocate member slot 0 too */
1154 }
1155 else
1156 *offset = nextOffset;
1157
1158 /*----------
1159 * Protect against overrun of the members space as well, with the
1160 * following rules:
1161 *
1162 * If we're past offsetStopLimit, refuse to generate more multis.
1163 * If we're close to offsetStopLimit, emit a warning.
1164 *
1165 * Arbitrarily, we start emitting warnings when we're 20 segments or less
1166 * from offsetStopLimit.
1167 *
1168 * Note we haven't updated the shared state yet, so if we fail at this
1169 * point, the multixact ID we grabbed can still be used by the next guy.
1170 *
1171 * Note that there is no point in forcing autovacuum runs here: the
1172 * multixact freeze settings would have to be reduced for that to have any
1173 * effect.
1174 *----------
1175 */
1176#define OFFSET_WARN_SEGMENTS 20
1179 nmembers))
1180 {
1181 /* see comment in the corresponding offsets wraparound case */
1183
1184 ereport(ERROR,
1185 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1186 errmsg("multixact \"members\" limit exceeded"),
1187 errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1188 "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1189 MultiXactState->offsetStopLimit - nextOffset - 1,
1190 nmembers,
1191 MultiXactState->offsetStopLimit - nextOffset - 1),
1192 errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
1194 }
1195
1196 /*
1197 * Check whether we should kick autovacuum into action, to prevent members
1198 * wraparound. NB we use a much larger window to trigger autovacuum than
1199 * just the warning limit. The warning is just a measure of last resort -
1200 * this is in line with GetNewTransactionId's behaviour.
1201 */
1205 {
1206 /*
1207 * To avoid swamping the postmaster with signals, we issue the autovac
1208 * request only when crossing a segment boundary. With default
1209 * compilation settings that's roughly after 50k members. This still
1210 * gives plenty of chances before we get into real trouble.
1211 */
1212 if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1213 (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1215 }
1216
1219 nextOffset,
1222 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1223 errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1224 "database with OID %u must be vacuumed before %d more multixact members are used",
1225 MultiXactState->offsetStopLimit - nextOffset + nmembers,
1227 MultiXactState->offsetStopLimit - nextOffset + nmembers),
1228 errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
1229
1230 ExtendMultiXactMember(nextOffset, nmembers);
1231
1232 /*
1233 * Critical section from here until caller has written the data into the
1234 * just-reserved SLRU space; we don't want to error out with a partly
1235 * written MultiXact structure. (In particular, failing to write our
1236 * start offset after advancing nextMXact would effectively corrupt the
1237 * previous MultiXact.)
1238 */
1240
1241 /*
1242 * Advance counters. As in GetNewTransactionId(), this must not happen
1243 * until after file extension has succeeded!
1244 *
1245 * We don't care about MultiXactId wraparound here; it will be handled by
1246 * the next iteration. But note that nextMXact may be InvalidMultiXactId
1247 * or the first value on a segment-beginning page after this routine
1248 * exits, so anyone else looking at the variable must be prepared to deal
1249 * with either case. Similarly, nextOffset may be zero, but we won't use
1250 * that as the actual start offset of the next multixact.
1251 */
1253
1254 MultiXactState->nextOffset += nmembers;
1255
1256 LWLockRelease(MultiXactGenLock);
1257
1258 debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1259 return result;
1260}
1261
1262/*
1263 * GetMultiXactIdMembers
1264 * Return the set of MultiXactMembers that make up a MultiXactId
1265 *
1266 * Return value is the number of members found, or -1 if there are none,
1267 * and *members is set to a newly palloc'ed array of members. It's the
1268 * caller's responsibility to free it when done with it.
1269 *
1270 * from_pgupgrade must be passed as true if and only if the multixact
1271 * corresponds to a value from a tuple that was locked in a 9.2-or-older
1272 * installation and later pg_upgrade'd (that is, the infomask is
1273 * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1274 * can still be running, so we return -1 just like for an empty multixact
1275 * without any further checking. It would be wrong to try to resolve such a
1276 * multixact: either the multixact is within the current valid multixact
1277 * range, in which case the returned result would be bogus, or outside that
1278 * range, in which case an error would be raised.
1279 *
1280 * In all other cases, the passed multixact must be within the known valid
1281 * range, that is, greater than or equal to oldestMultiXactId, and less than
1282 * nextMXact. Otherwise, an error is raised.
1283 *
1284 * isLockOnly must be set to true if caller is certain that the given multi
1285 * is used only to lock tuples; can be false without loss of correctness,
1286 * but passing true means we can return quickly without checking for
1287 * old updates.
1288 */
1289int
1291 bool from_pgupgrade, bool isLockOnly)
1292{
1293 int64 pageno;
1294 int64 prev_pageno;
1295 int entryno;
1296 int slotno;
1297 MultiXactOffset *offptr;
1298 MultiXactOffset offset;
1299 int length;
1300 int truelength;
1301 MultiXactId oldestMXact;
1302 MultiXactId nextMXact;
1303 MultiXactId tmpMXact;
1304 MultiXactOffset nextOffset;
1305 MultiXactMember *ptr;
1306 LWLock *lock;
1307 bool slept = false;
1308
1309 debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1310
1311 if (!MultiXactIdIsValid(multi) || from_pgupgrade)
1312 {
1313 *members = NULL;
1314 return -1;
1315 }
1316
1317 /* See if the MultiXactId is in the local cache */
1318 length = mXactCacheGetById(multi, members);
1319 if (length >= 0)
1320 {
1321 debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1322 mxid_to_string(multi, length, *members));
1323 return length;
1324 }
1325
1326 /* Set our OldestVisibleMXactId[] entry if we didn't already */
1328
1329 /*
1330 * If we know the multi is used only for locking and not for updates, then
1331 * we can skip checking if the value is older than our oldest visible
1332 * multi. It cannot possibly still be running.
1333 */
1334 if (isLockOnly &&
1336 {
1337 debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1338 *members = NULL;
1339 return -1;
1340 }
1341
1342 /*
1343 * We check known limits on MultiXact before resorting to the SLRU area.
1344 *
1345 * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1346 * useful; it has already been removed, or will be removed shortly, by
1347 * truncation. If one is passed, an error is raised.
1348 *
1349 * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1350 * implies undetected ID wraparound has occurred. This raises a hard
1351 * error.
1352 *
1353 * Shared lock is enough here since we aren't modifying any global state.
1354 * Acquire it just long enough to grab the current counter values. We may
1355 * need both nextMXact and nextOffset; see below.
1356 */
1357 LWLockAcquire(MultiXactGenLock, LW_SHARED);
1358
1359 oldestMXact = MultiXactState->oldestMultiXactId;
1360 nextMXact = MultiXactState->nextMXact;
1361 nextOffset = MultiXactState->nextOffset;
1362
1363 LWLockRelease(MultiXactGenLock);
1364
1365 if (MultiXactIdPrecedes(multi, oldestMXact))
1366 ereport(ERROR,
1367 (errcode(ERRCODE_INTERNAL_ERROR),
1368 errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1369 multi)));
1370
1371 if (!MultiXactIdPrecedes(multi, nextMXact))
1372 ereport(ERROR,
1373 (errcode(ERRCODE_INTERNAL_ERROR),
1374 errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1375 multi)));
1376
1377 /*
1378 * Find out the offset at which we need to start reading MultiXactMembers
1379 * and the number of members in the multixact. We determine the latter as
1380 * the difference between this multixact's starting offset and the next
1381 * one's. However, there are some corner cases to worry about:
1382 *
1383 * 1. This multixact may be the latest one created, in which case there is
1384 * no next one to look at. In this case the nextOffset value we just
1385 * saved is the correct endpoint.
1386 *
1387 * 2. The next multixact may still be in process of being filled in: that
1388 * is, another process may have done GetNewMultiXactId but not yet written
1389 * the offset entry for that ID. In that scenario, it is guaranteed that
1390 * the offset entry for that multixact exists (because GetNewMultiXactId
1391 * won't release MultiXactGenLock until it does) but contains zero
1392 * (because we are careful to pre-zero offset pages). Because
1393 * GetNewMultiXactId will never return zero as the starting offset for a
1394 * multixact, when we read zero as the next multixact's offset, we know we
1395 * have this case. We handle this by sleeping on the condition variable
1396 * we have just for this; the process in charge will signal the CV as soon
1397 * as it has finished writing the multixact offset.
1398 *
1399 * 3. Because GetNewMultiXactId increments offset zero to offset one to
1400 * handle case #2, there is an ambiguity near the point of offset
1401 * wraparound. If we see next multixact's offset is one, is that our
1402 * multixact's actual endpoint, or did it end at zero with a subsequent
1403 * increment? We handle this using the knowledge that if the zero'th
1404 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1405 * transaction ID so it can't be a multixact member. Therefore, if we
1406 * read a zero from the members array, just ignore it.
1407 *
1408 * This is all pretty messy, but the mess occurs only in infrequent corner
1409 * cases, so it seems better than holding the MultiXactGenLock for a long
1410 * time on every multixact creation.
1411 */
1412retry:
1413 pageno = MultiXactIdToOffsetPage(multi);
1414 entryno = MultiXactIdToOffsetEntry(multi);
1415
1416 /* Acquire the bank lock for the page we need. */
1419
1420 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1421 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1422 offptr += entryno;
1423 offset = *offptr;
1424
1425 Assert(offset != 0);
1426
1427 /*
1428 * Use the same increment rule as GetNewMultiXactId(), that is, don't
1429 * handle wraparound explicitly until needed.
1430 */
1431 tmpMXact = multi + 1;
1432
1433 if (nextMXact == tmpMXact)
1434 {
1435 /* Corner case 1: there is no next multixact */
1436 length = nextOffset - offset;
1437 }
1438 else
1439 {
1440 MultiXactOffset nextMXOffset;
1441
1442 /* handle wraparound if needed */
1443 if (tmpMXact < FirstMultiXactId)
1444 tmpMXact = FirstMultiXactId;
1445
1446 prev_pageno = pageno;
1447
1448 pageno = MultiXactIdToOffsetPage(tmpMXact);
1449 entryno = MultiXactIdToOffsetEntry(tmpMXact);
1450
1451 if (pageno != prev_pageno)
1452 {
1453 LWLock *newlock;
1454
1455 /*
1456 * Since we're going to access a different SLRU page, if this page
1457 * falls under a different bank, release the old bank's lock and
1458 * acquire the lock of the new bank.
1459 */
1460 newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1461 if (newlock != lock)
1462 {
1463 LWLockRelease(lock);
1464 LWLockAcquire(newlock, LW_EXCLUSIVE);
1465 lock = newlock;
1466 }
1467 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1468 }
1469
1470 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1471 offptr += entryno;
1472 nextMXOffset = *offptr;
1473
1474 if (nextMXOffset == 0)
1475 {
1476 /* Corner case 2: next multixact is still being filled in */
1477 LWLockRelease(lock);
1479
1480 INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
1481
1483 WAIT_EVENT_MULTIXACT_CREATION);
1484 slept = true;
1485 goto retry;
1486 }
1487
1488 length = nextMXOffset - offset;
1489 }
1490
1491 LWLockRelease(lock);
1492 lock = NULL;
1493
1494 /*
1495 * If we slept above, clean up state; it's no longer needed.
1496 */
1497 if (slept)
1499
1500 ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1501
1502 truelength = 0;
1503 prev_pageno = -1;
1504 for (int i = 0; i < length; i++, offset++)
1505 {
1506 TransactionId *xactptr;
1507 uint32 *flagsptr;
1508 int flagsoff;
1509 int bshift;
1510 int memberoff;
1511
1512 pageno = MXOffsetToMemberPage(offset);
1513 memberoff = MXOffsetToMemberOffset(offset);
1514
1515 if (pageno != prev_pageno)
1516 {
1517 LWLock *newlock;
1518
1519 /*
1520 * Since we're going to access a different SLRU page, if this page
1521 * falls under a different bank, release the old bank's lock and
1522 * acquire the lock of the new bank.
1523 */
1524 newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1525 if (newlock != lock)
1526 {
1527 if (lock)
1528 LWLockRelease(lock);
1529 LWLockAcquire(newlock, LW_EXCLUSIVE);
1530 lock = newlock;
1531 }
1532
1533 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1534 prev_pageno = pageno;
1535 }
1536
1537 xactptr = (TransactionId *)
1538 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1539
1540 if (!TransactionIdIsValid(*xactptr))
1541 {
1542 /* Corner case 3: we must be looking at unused slot zero */
1543 Assert(offset == 0);
1544 continue;
1545 }
1546
1547 flagsoff = MXOffsetToFlagsOffset(offset);
1548 bshift = MXOffsetToFlagsBitShift(offset);
1549 flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1550
1551 ptr[truelength].xid = *xactptr;
1552 ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1553 truelength++;
1554 }
1555
1556 LWLockRelease(lock);
1557
1558 /* A multixid with zero members should not happen */
1559 Assert(truelength > 0);
1560
1561 /*
1562 * Copy the result into the local cache.
1563 */
1564 mXactCachePut(multi, truelength, ptr);
1565
1566 debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1567 mxid_to_string(multi, truelength, ptr));
1568 *members = ptr;
1569 return truelength;
1570}
1571
1572/*
1573 * mxactMemberComparator
1574 * qsort comparison function for MultiXactMember
1575 *
1576 * We can't use wraparound comparison for XIDs because that does not respect
1577 * the triangle inequality! Any old sort order will do.
1578 */
1579static int
1580mxactMemberComparator(const void *arg1, const void *arg2)
1581{
1582 MultiXactMember member1 = *(const MultiXactMember *) arg1;
1583 MultiXactMember member2 = *(const MultiXactMember *) arg2;
1584
1585 if (member1.xid > member2.xid)
1586 return 1;
1587 if (member1.xid < member2.xid)
1588 return -1;
1589 if (member1.status > member2.status)
1590 return 1;
1591 if (member1.status < member2.status)
1592 return -1;
1593 return 0;
1594}
1595
1596/*
1597 * mXactCacheGetBySet
1598 * returns a MultiXactId from the cache based on the set of
1599 * TransactionIds that compose it, or InvalidMultiXactId if
1600 * none matches.
1601 *
1602 * This is helpful, for example, if two transactions want to lock a huge
1603 * table. By using the cache, the second will use the same MultiXactId
1604 * for the majority of tuples, thus keeping MultiXactId usage low (saving
1605 * both I/O and wraparound issues).
1606 *
1607 * NB: the passed members array will be sorted in-place.
1608 */
1609static MultiXactId
1611{
1612 dlist_iter iter;
1613
1614 debug_elog3(DEBUG2, "CacheGet: looking for %s",
1615 mxid_to_string(InvalidMultiXactId, nmembers, members));
1616
1617 /* sort the array so comparison is easy */
1618 qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1619
1621 {
1623 iter.cur);
1624
1625 if (entry->nmembers != nmembers)
1626 continue;
1627
1628 /*
1629 * We assume the cache entries are sorted, and that the unused bits in
1630 * "status" are zeroed.
1631 */
1632 if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1633 {
1634 debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1636 return entry->multi;
1637 }
1638 }
1639
1640 debug_elog2(DEBUG2, "CacheGet: not found :-(");
1641 return InvalidMultiXactId;
1642}
1643
1644/*
1645 * mXactCacheGetById
1646 * returns the composing MultiXactMember set from the cache for a
1647 * given MultiXactId, if present.
1648 *
1649 * If successful, *members is set to the address of a palloc'd copy of the
1650 * MultiXactMember set. Return value is number of members, or -1 on failure.
1651 */
1652static int
1654{
1655 dlist_iter iter;
1656
1657 debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1658
1660 {
1662 iter.cur);
1663
1664 if (entry->multi == multi)
1665 {
1666 MultiXactMember *ptr;
1667 Size size;
1668
1669 size = sizeof(MultiXactMember) * entry->nmembers;
1670 ptr = (MultiXactMember *) palloc(size);
1671
1672 memcpy(ptr, entry->members, size);
1673
1674 debug_elog3(DEBUG2, "CacheGet: found %s",
1675 mxid_to_string(multi,
1676 entry->nmembers,
1677 entry->members));
1678
1679 /*
1680 * Note we modify the list while not using a modifiable iterator.
1681 * This is acceptable only because we exit the iteration
1682 * immediately afterwards.
1683 */
1685
1686 *members = ptr;
1687 return entry->nmembers;
1688 }
1689 }
1690
1691 debug_elog2(DEBUG2, "CacheGet: not found");
1692 return -1;
1693}
1694
1695/*
1696 * mXactCachePut
1697 * Add a new MultiXactId and its composing set into the local cache.
1698 */
1699static void
1700mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1701{
1702 mXactCacheEnt *entry;
1703
1704 debug_elog3(DEBUG2, "CachePut: storing %s",
1705 mxid_to_string(multi, nmembers, members));
1706
1707 if (MXactContext == NULL)
1708 {
1709 /* The cache only lives as long as the current transaction */
1710 debug_elog2(DEBUG2, "CachePut: initializing memory context");
1712 "MultiXact cache context",
1714 }
1715
1716 entry = (mXactCacheEnt *)
1718 offsetof(mXactCacheEnt, members) +
1719 nmembers * sizeof(MultiXactMember));
1720
1721 entry->multi = multi;
1722 entry->nmembers = nmembers;
1723 memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1724
1725 /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1726 qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1727
1728 dclist_push_head(&MXactCache, &entry->node);
1730 {
1731 dlist_node *node;
1732
1735
1736 entry = dclist_container(mXactCacheEnt, node, node);
1737 debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1738 entry->multi);
1739
1740 pfree(entry);
1741 }
1742}
1743
1744char *
1746{
1747 switch (status)
1748 {
1750 return "keysh";
1752 return "sh";
1754 return "fornokeyupd";
1756 return "forupd";
1758 return "nokeyupd";
1760 return "upd";
1761 default:
1762 elog(ERROR, "unrecognized multixact status %d", status);
1763 return "";
1764 }
1765}
1766
1767char *
1768mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1769{
1770 static char *str = NULL;
1772 int i;
1773
1774 if (str != NULL)
1775 pfree(str);
1776
1778
1779 appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1780 mxstatus_to_string(members[0].status));
1781
1782 for (i = 1; i < nmembers; i++)
1783 appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1784 mxstatus_to_string(members[i].status));
1785
1788 pfree(buf.data);
1789 return str;
1790}
1791
1792/*
1793 * AtEOXact_MultiXact
1794 * Handle transaction end for MultiXact
1795 *
1796 * This is called at top transaction commit or abort (we don't care which).
1797 */
1798void
1800{
1801 /*
1802 * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1803 * which should only be valid while within a transaction.
1804 *
1805 * We assume that storing a MultiXactId is atomic and so we need not take
1806 * MultiXactGenLock to do this.
1807 */
1810
1811 /*
1812 * Discard the local MultiXactId cache. Since MXactContext was created as
1813 * a child of TopTransactionContext, we needn't delete it explicitly.
1814 */
1815 MXactContext = NULL;
1817}
1818
1819/*
1820 * AtPrepare_MultiXact
1821 * Save multixact state at 2PC transaction prepare
1822 *
1823 * In this phase, we only store our OldestMemberMXactId value in the two-phase
1824 * state file.
1825 */
1826void
1828{
1830
1831 if (MultiXactIdIsValid(myOldestMember))
1833 &myOldestMember, sizeof(MultiXactId));
1834}
1835
1836/*
1837 * PostPrepare_MultiXact
1838 * Clean up after successful PREPARE TRANSACTION
1839 */
1840void
1842{
1843 MultiXactId myOldestMember;
1844
1845 /*
1846 * Transfer our OldestMemberMXactId value to the slot reserved for the
1847 * prepared transaction.
1848 */
1849 myOldestMember = OldestMemberMXactId[MyProcNumber];
1850 if (MultiXactIdIsValid(myOldestMember))
1851 {
1852 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1853
1854 /*
1855 * Even though storing MultiXactId is atomic, acquire lock to make
1856 * sure others see both changes, not just the reset of the slot of the
1857 * current backend. Using a volatile pointer might suffice, but this
1858 * isn't a hot spot.
1859 */
1860 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1861
1862 OldestMemberMXactId[dummyProcNumber] = myOldestMember;
1864
1865 LWLockRelease(MultiXactGenLock);
1866 }
1867
1868 /*
1869 * We don't need to transfer OldestVisibleMXactId value, because the
1870 * transaction is not going to be looking at any more multixacts once it's
1871 * prepared.
1872 *
1873 * We assume that storing a MultiXactId is atomic and so we need not take
1874 * MultiXactGenLock to do this.
1875 */
1877
1878 /*
1879 * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1880 */
1881 MXactContext = NULL;
1883}
1884
1885/*
1886 * multixact_twophase_recover
1887 * Recover the state of a prepared transaction at startup
1888 */
1889void
1891 void *recdata, uint32 len)
1892{
1893 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1894 MultiXactId oldestMember;
1895
1896 /*
1897 * Get the oldest member MultiXactId from the state file record, and set it
1898 * in the OldestMemberMXactId slot reserved for this prepared transaction.
1899 */
1900 Assert(len == sizeof(MultiXactId));
1901 oldestMember = *((MultiXactId *) recdata);
1902
1903 OldestMemberMXactId[dummyProcNumber] = oldestMember;
1904}
1905
1906/*
1907 * multixact_twophase_postcommit
1908 * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1909 */
1910void
1912 void *recdata, uint32 len)
1913{
1914 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true);
1915
1916 Assert(len == sizeof(MultiXactId));
1917
1918 OldestMemberMXactId[dummyProcNumber] = InvalidMultiXactId;
1919}
1920
1921/*
1922 * multixact_twophase_postabort
1923 * This is actually just the same as the COMMIT case.
1924 */
1925void
1927 void *recdata, uint32 len)
1928{
1929 multixact_twophase_postcommit(fxid, info, recdata, len);
1930}
1931
1932/*
1933 * Initialization of shared memory for MultiXact. We use two SLRU areas,
1934 * thus double memory. Also, reserve space for the shared MultiXactState
1935 * struct and the per-backend MultiXactId arrays (two of those, too).
1936 */
1937Size
1939{
1940 Size size;
1941
1942 /* We need 2*MaxOldestSlot perBackendXactIds[] entries */
1943#define SHARED_MULTIXACT_STATE_SIZE \
1944 add_size(offsetof(MultiXactStateData, perBackendXactIds), \
1945 mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1946
1950
1951 return size;
1952}
1953
1954void
1956{
     /* Set by ShmemInitStruct below; checked against IsUnderPostmaster. */
1957 bool found;
1958
1959 debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1960
1963
     /*
      * First SLRU area: named "multixact_offset", sized by
      * multixact_offset_buffers, with path string "pg_multixact/offsets".
      */
1965 "multixact_offset", multixact_offset_buffers, 0,
1966 "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
1967 LWTRANCHE_MULTIXACTOFFSET_SLRU,
1969 false);
     /*
      * Second SLRU area: named "multixact_member", sized by
      * multixact_member_buffers, with path string "pg_multixact/members".
      */
1972 "multixact_member", multixact_member_buffers, 0,
1973 "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
1974 LWTRANCHE_MULTIXACTMEMBER_SLRU,
1976 false);
1977 /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
1978
1979 /* Initialize our shared state struct */
1980 MultiXactState = ShmemInitStruct("Shared MultiXact State",
1982 &found);
     /*
      * When IsUnderPostmaster is false the struct is expected to be newly
      * created (found == false); otherwise it must already exist.
      */
1983 if (!IsUnderPostmaster)
1984 {
1985 Assert(!found);
1986
1987 /* Make sure we zero out the per-backend state */
1990 }
1991 else
1992 Assert(found);
1993
1994 /*
1995 * Set up array pointers.
1996 */
1999}
2000
2001/*
2002 * GUC check_hook for multixact_offset_buffers
2003 */
2004bool
2006{
     /* Validation is delegated: the GUC's name and proposed value go to check_slru_buffers. */
2007 return check_slru_buffers("multixact_offset_buffers", newval);
2008}
2009
2010/*
2011 * GUC check_hook for multixact_member_buffers
2012 */
2013bool
2015{
     /* Validation is delegated: the GUC's name and proposed value go to check_slru_buffers. */
2016 return check_slru_buffers("multixact_member_buffers", newval);
2017}
2018
2019/*
2020 * This func must be called ONCE on system install. It creates the initial
2021 * MultiXact segments. (The MultiXacts directories are assumed to have been
2022 * created by initdb, and MultiXactShmemInit must have been called already.)
2023 */
2024void
2026{
2027 /* Zero the initial pages and flush them to disk */
     /*
      * NOTE(review): the statements that do this (listing lines 2028-2029)
      * are missing from this copy; restore them from upstream.
      */
2030}
2031
2032/*
2033 * MaybeExtendOffsetSlru
2034 * Extend the offsets SLRU area, if necessary
2035 *
2036 * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
2037 * contain files that are shorter than necessary; this would occur if the old
2038 * installation had used multixacts beyond the first page (files cannot be
2039 * copied, because the on-disk representation is different). pg_upgrade would
2040 * update pg_control to set the next offset value to be at that position, so
2041 * that tuples marked as locked by such MultiXacts would be seen as visible
2042 * without having to consult multixact. However, trying to create and use a
2043 * new MultiXactId would result in an error because the page on which the new
2044 * value would reside does not exist. This routine is in charge of creating
2045 * such pages.
2046 */
2047static void
2049{
2050 int64 pageno;
2051 LWLock *lock;
2052
2055
2057
2059 {
     /* Buffer slot returned by SimpleLruZeroPage for the new page. */
2060 int slotno;
2061
2062 /*
2063 * Fortunately for us, SimpleLruWritePage is already prepared to deal
2064 * with creating a new segment file even if the page we're writing is
2065 * not the first in it, so this is enough.
2066 */
2067 slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2069 }
2070
     /* The release sits outside the braces above, so it runs on every path. */
2071 LWLockRelease(lock);
2072}
2073
2074/*
2075 * This must be called ONCE during postmaster or standalone-backend startup.
2076 *
2077 * StartupXLOG has already established nextMXact/nextOffset by calling
2078 * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
2079 * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
2080 * replayed WAL.
2081 */
2082void
2084{
     /* Scratch variable, reused for the offsets page and then the members page. */
2087 int64 pageno;
2088
2089 /*
2090 * Initialize offset's idea of the latest page number: the offsets page
     * that "multi" maps to.
2091 */
2092 pageno = MultiXactIdToOffsetPage(multi);
2093 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2094 pageno);
2095
2096 /*
2097 * Initialize member's idea of the latest page number: the members page
     * that "offset" maps to.
2098 */
2099 pageno = MXOffsetToMemberPage(offset);
2100 pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2101 pageno);
2102}
2103
2104/*
2105 * This must be called ONCE at the end of startup/recovery.
2106 */
2107void
2109{
2110 MultiXactId nextMXact;
2111 MultiXactOffset offset;
2112 MultiXactId oldestMXact;
2113 Oid oldestMXactDB;
2114 int64 pageno;
2115 int entryno;
2116 int flagsoff;
2117
     /* Read the four shared fields within one shared hold of MultiXactGenLock. */
2118 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2119 nextMXact = MultiXactState->nextMXact;
2120 offset = MultiXactState->nextOffset;
2121 oldestMXact = MultiXactState->oldestMultiXactId;
2122 oldestMXactDB = MultiXactState->oldestMultiXactDB;
2123 LWLockRelease(MultiXactGenLock);
2124
2125 /* Clean up offsets state */
2126
2127 /*
2128 * (Re-)Initialize our idea of the latest page number for offsets.
2129 */
2130 pageno = MultiXactIdToOffsetPage(nextMXact);
2131 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2132 pageno);
2133
2134 /*
2135 * Zero out the remainder of the current offsets page. See notes in
2136 * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2137 * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2138 * rule "write xlog before data," nextMXact successors may carry obsolete,
2139 * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2140 * operates normally.
2141 */
2142 entryno = MultiXactIdToOffsetEntry(nextMXact);
     /* Nothing is zeroed when nextMXact is the first entry of its page. */
2143 if (entryno != 0)
2144 {
2145 int slotno;
2146 MultiXactOffset *offptr;
2148
2150 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2151 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2152 offptr += entryno;
2153
     /* Clear from nextMXact's entry through the end of the BLCKSZ-byte page. */
2154 MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2155
2156 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2157 LWLockRelease(lock);
2158 }
2159
2160 /*
2161 * And the same for members.
2162 *
2163 * (Re-)Initialize our idea of the latest page number for members.
2164 */
2165 pageno = MXOffsetToMemberPage(offset);
2166 pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2167 pageno);
2168
2169 /*
2170 * Zero out the remainder of the current members page. See notes in
2171 * TrimCLOG() for motivation.
2172 */
2173 flagsoff = MXOffsetToFlagsOffset(offset);
     /* Nothing is zeroed when the flags offset for "offset" is zero. */
2174 if (flagsoff != 0)
2175 {
2176 int slotno;
2177 TransactionId *xidptr;
2178 int memberoff;
2180
2182 memberoff = MXOffsetToMemberOffset(offset);
2183 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2184 xidptr = (TransactionId *)
2185 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
2186
     /* Clear from byte offset memberoff through the end of the page. */
2187 MemSet(xidptr, 0, BLCKSZ - memberoff);
2188
2189 /*
2190 * Note: we don't need to zero out the flag bits in the remaining
2191 * members of the current group, because they are always reset before
2192 * writing.
2193 */
2194
2195 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2196 LWLockRelease(lock);
2197 }
2198
2199 /* signal that we're officially up */
2200 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2202 LWLockRelease(MultiXactGenLock);
2203
2204 /* Now compute how far away the next members wraparound is. */
     /* Uses the oldest-multi values read at the top; is_startup is true. */
2205 SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
2206}
2207
2208/*
2209 * Get the MultiXact data to save in a checkpoint record
2210 */
2211void
2213 MultiXactId *nextMulti,
2214 MultiXactOffset *nextMultiOffset,
2215 MultiXactId *oldestMulti,
2216 Oid *oldestMultiDB)
2217{
     /* All four outputs are filled within one shared hold of MultiXactGenLock. */
2218 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2219 *nextMulti = MultiXactState->nextMXact;
2220 *nextMultiOffset = MultiXactState->nextOffset;
2221 *oldestMulti = MultiXactState->oldestMultiXactId;
2222 *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2223 LWLockRelease(MultiXactGenLock);
2224
     /* Debug output of the values just captured, in the order they were read. */
2226 "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2227 *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2228}
2229
2230/*
2231 * Perform a checkpoint --- either during shutdown, or on-the-fly
2232 */
2233void
2235{
     /* Trace probes bracket the flush; both are invoked with true. */
2236 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2237
2238 /*
2239 * Write dirty MultiXact pages to disk. This may result in sync requests
2240 * queued for later handling by ProcessSyncRequests(), as part of the
2241 * checkpoint.
2242 */
2245
2246 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2247}
2248
2249/*
2250 * Set the next-to-be-assigned MultiXactId and offset
2251 *
2252 * This is used when we can determine the correct next ID/offset exactly
2253 * from a checkpoint record. Although this is only called during bootstrap
2254 * and XLog replay, we take the lock in case any hot-standby backends are
2255 * examining the values.
2256 */
2257void
2259 MultiXactOffset nextMultiOffset)
2260{
2261 debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2262 nextMulti, nextMultiOffset);
     /* Both counters are replaced together under one exclusive lock hold. */
2263 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2264 MultiXactState->nextMXact = nextMulti;
2265 MultiXactState->nextOffset = nextMultiOffset;
2266 LWLockRelease(MultiXactGenLock);
2267
2268 /*
2269 * During a binary upgrade, make sure that the offsets SLRU is large
2270 * enough to contain the next value that would be created.
2271 *
2272 * We need to do this pretty early during the first startup in binary
2273 * upgrade mode: before StartupMultiXact() in fact, because this routine
2274 * is called even before that by StartupXLOG(). And we can't do it
2275 * earlier than at this point, because during that first call of this
2276 * routine we determine the MultiXactState->nextMXact value that
2277 * MaybeExtendOffsetSlru needs.
2278 */
2279 if (IsBinaryUpgrade)
2281}
2282
2283/*
2284 * Determine the last safe MultiXactId to allocate given the currently oldest
2285 * datminmxid (ie, the oldest MultiXactId that might exist in any database
2286 * of our cluster), and the OID of the (or a) database with that value.
2287 *
2288 * is_startup is true when we are just starting the cluster, false when we
2289 * are updating state in a running cluster. This only affects log messages.
2290 */
2291void
2292SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
2293 bool is_startup)
2294{
2295 MultiXactId multiVacLimit;
2296 MultiXactId multiWarnLimit;
2297 MultiXactId multiStopLimit;
2298 MultiXactId multiWrapLimit;
     /* Copy of nextMXact taken while the limits are installed. */
2299 MultiXactId curMulti;
2300 bool needs_offset_vacuum;
2301
2302 Assert(MultiXactIdIsValid(oldest_datminmxid));
2303
2304 /*
2305 * We pretend that a wrap will happen halfway through the multixact ID
2306 * space, but that's not really true, because multixacts wrap differently
2307 * from transaction IDs. Note that, separately from any concern about
2308 * multixact IDs wrapping, we must ensure that multixact members do not
2309 * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
2310 */
2311 multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
     /* A result below FirstMultiXactId is bumped up by FirstMultiXactId. */
2312 if (multiWrapLimit < FirstMultiXactId)
2313 multiWrapLimit += FirstMultiXactId;
2314
2315 /*
2316 * We'll refuse to continue assigning MultiXactIds once we get within 3M
2317 * multi of data loss. See SetTransactionIdLimit.
2318 */
2319 multiStopLimit = multiWrapLimit - 3000000;
     /* Going backwards, a result below FirstMultiXactId is moved further down. */
2320 if (multiStopLimit < FirstMultiXactId)
2321 multiStopLimit -= FirstMultiXactId;
2322
2323 /*
2324 * We'll start complaining loudly when we get within 40M multis of data
2325 * loss. This is kind of arbitrary, but if you let your gas gauge get
2326 * down to 2% of full, would you be looking for the next gas station? We
2327 * need to be fairly liberal about this number because there are lots of
2328 * scenarios where most transactions are done by automatic clients that
2329 * won't pay attention to warnings. (No, we're not gonna make this
2330 * configurable. If you know enough to configure it, you know enough to
2331 * not get in this kind of trouble in the first place.)
2332 */
2333 multiWarnLimit = multiWrapLimit - 40000000;
2334 if (multiWarnLimit < FirstMultiXactId)
2335 multiWarnLimit -= FirstMultiXactId;
2336
2337 /*
2338 * We'll start trying to force autovacuums when oldest_datminmxid gets to
2339 * be more than autovacuum_multixact_freeze_max_age mxids old.
2340 *
2341 * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2342 * so that we don't have to worry about dealing with on-the-fly changes in
2343 * its value. See SetTransactionIdLimit.
2344 */
2345 multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2346 if (multiVacLimit < FirstMultiXactId)
2347 multiVacLimit += FirstMultiXactId;
2348
2349 /* Grab lock for just long enough to set the new limit values */
2350 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2351 MultiXactState->oldestMultiXactId = oldest_datminmxid;
2352 MultiXactState->oldestMultiXactDB = oldest_datoid;
2353 MultiXactState->multiVacLimit = multiVacLimit;
2354 MultiXactState->multiWarnLimit = multiWarnLimit;
2355 MultiXactState->multiStopLimit = multiStopLimit;
2356 MultiXactState->multiWrapLimit = multiWrapLimit;
     /* Read nextMXact under the same lock hold for the checks further down. */
2357 curMulti = MultiXactState->nextMXact;
2358 LWLockRelease(MultiXactGenLock);
2359
2360 /* Log the info */
2362 (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
2363 multiWrapLimit, oldest_datoid)));
2364
2365 /*
2366 * Computing the actual limits is only possible once the data directory is
2367 * in a consistent state. There's no need to compute the limits while
2368 * still replaying WAL - no decisions about new multis are made even
2369 * though multixact creations might be replayed. So we'll only do further
2370 * checks after TrimMultiXact() has been called.
2371 */
2373 return;
2374
2376
2377 /* Set limits for offset vacuum. */
2378 needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
2379
2380 /*
2381 * If past the autovacuum force point, immediately signal an autovac
2382 * request. The reason for this is that autovac only processes one
2383 * database per invocation. Once it's finished cleaning up the oldest
2384 * database, it'll call here, and we'll signal the postmaster to start
2385 * another iteration immediately if there are still any old databases.
2386 */
2387 if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
2388 needs_offset_vacuum) && IsUnderPostmaster)
2390
2391 /* Give an immediate warning if past the wrap warn point */
2392 if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2393 {
2394 char *oldest_datname;
2395
2396 /*
2397 * We can be called when not inside a transaction, for example during
2398 * StartupXLOG(). In such a case we cannot do database access, so we
2399 * must just report the oldest DB's OID.
2400 *
2401 * Note: it's also possible that get_database_name fails and returns
2402 * NULL, for example because the database just got dropped. We'll
2403 * still warn, even though the warning might now be unnecessary.
2404 */
2405 if (IsTransactionState())
2406 oldest_datname = get_database_name(oldest_datoid);
2407 else
2408 oldest_datname = NULL;
2409
     /*
      * Both branches report the same remaining count,
      * multiWrapLimit - curMulti; they differ only in naming the database
      * by name or by OID.  The count appears twice in each call:
      * presumably once to select the plural form and once for the %u.
      */
2410 if (oldest_datname)
2412 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2413 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2414 multiWrapLimit - curMulti,
2415 oldest_datname,
2416 multiWrapLimit - curMulti),
2417 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2418 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2419 else
2421 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2422 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2423 multiWrapLimit - curMulti,
2424 oldest_datoid,
2425 multiWrapLimit - curMulti),
2426 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2427 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2428 }
2429}
2430
2431/*
2432 * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2433 * and similarly nextOffset is at least minMultiOffset.
2434 *
2435 * This is used when we can determine minimum safe values from an XLog
2436 * record (either an on-line checkpoint or an mxact creation log entry).
2437 * Although this is only called during XLog replay, we take the lock in case
2438 * any hot-standby backends are examining the values.
2439 */
2440void
2442 MultiXactOffset minMultiOffset)
2443{
     /* Both possible updates happen within one exclusive hold of the lock. */
2444 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2446 {
2447 debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2448 MultiXactState->nextMXact = minMulti;
2449 }
2451 {
2452 debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2453 minMultiOffset);
2454 MultiXactState->nextOffset = minMultiOffset;
2455 }
2456 LWLockRelease(MultiXactGenLock);
2457}
2458
2459/*
2460 * Update our oldestMultiXactId value, but only if it's more recent than what
2461 * we had.
2462 *
2463 * This may only be called during WAL replay.
2464 */
2465void
2466MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2467{
2469
     /*
      * is_startup is false here; per SetMultiXactIdLimit's header comment
      * that flag only affects log messages.
      */
2471 SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
2472}
2473
2474/*
2475 * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2476 *
2477 * NB: this is called while holding MultiXactGenLock. We want it to be very
2478 * fast most of the time; even when it's not so fast, no actual I/O need
2479 * happen unless we're forced to write out a dirty log or xlog page to make
2480 * room in shared memory.
2481 */
2482static void
2484{
2485 int64 pageno;
2486 LWLock *lock;
2487
2488 /*
2489 * No work except at first MultiXactId of a page. But beware: just after
2490 * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
     *
     * So return early unless multi is entry 0 of its page or equals
     * FirstMultiXactId.
2491 */
2492 if (MultiXactIdToOffsetEntry(multi) != 0 &&
2493 multi != FirstMultiXactId)
2494 return;
2495
2496 pageno = MultiXactIdToOffsetPage(multi);
2498
2500
2501 /* Zero the page and make a WAL entry about it */
2504 pageno);
2505
2506 LWLockRelease(lock);
2507}
2508
2509/*
2510 * Make sure that MultiXactMember has room for the members of a newly-
2511 * allocated MultiXactId.
2512 *
2513 * Like the above routine, this is called while holding MultiXactGenLock;
2514 * same comments apply.
2515 */
2516static void
2518{
2519 /*
2520 * It's possible that the members span more than one page of the members
2521 * file, so we loop to ensure we consider each page. The coding is not
2522 * optimal if the members span several pages, but that seems unusual
2523 * enough to not worry much about.
2524 */
2525 while (nmembers > 0)
2526 {
2527 int flagsoff;
2528 int flagsbit;
2530
2531 /*
2532 * Only zero when at first entry of a page, i.e. when both the flags
     * byte offset and the flags bit shift computed for "offset" are zero.
2533 */
2534 flagsoff = MXOffsetToFlagsOffset(offset);
2535 flagsbit = MXOffsetToFlagsBitShift(offset);
2536 if (flagsoff == 0 && flagsbit == 0)
2537 {
2538 int64 pageno;
2539 LWLock *lock;
2540
2541 pageno = MXOffsetToMemberPage(offset);
2543
2545
2546 /* Zero the page and make a WAL entry about it */
2548 XLogSimpleInsertInt64(RM_MULTIXACT_ID,
2550
2551 LWLockRelease(lock);
2552 }
2553
2554 /*
2555 * Compute the number of items till end of current page. Careful: if
2556 * addition of unsigned ints wraps around, we're at the last page of
2557 * the last segment; since that page holds a different number of items
2558 * than other pages, we need to do it differently.
2559 */
2560 if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2561 {
2562 /*
2563 * This is the last page of the last segment; we can compute the
2564 * number of items left to allocate in it without modulo
2565 * arithmetic.
2566 */
2567 difference = MaxMultiXactOffset - offset + 1;
2568 }
2569 else
2571
2572 /*
2573 * Advance to next page, taking care to properly handle the wraparound
2574 * case. OK if nmembers goes negative.
     *
     * A non-positive nmembers ends the loop at the next condition check.
2575 */
2576 nmembers -= difference;
2577 offset += difference;
2578 }
2579}
2580
2581/*
2582 * GetOldestMultiXactId
2583 *
2584 * Return the oldest MultiXactId that's possibly still seen as live by
2585 * any running transaction. Older ones might still exist on disk, but they no
2586 * longer have any running member transaction.
2587 *
2588 * It's not safe to truncate MultiXact SLRU segments on the value returned by
2589 * this function; however, it can be set as the new relminmxid for any table
2590 * that VACUUM knows has no remaining MXIDs < the same value. It is only safe
2591 * to truncate SLRUs when no table can possibly still have a referencing MXID.
2592 */
2595{
2596 MultiXactId oldestMXact;
2597 MultiXactId nextMXact;
2598 int i;
2599
2600 /*
2601 * This is the oldest valid value among all the OldestMemberMXactId[] and
2602 * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2603 */
2604 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2605
2606 /*
2607 * We have to beware of the possibility that nextMXact is in the
2608 * wrapped-around state. We don't fix the counter itself here, but we
2609 * must be sure to use a valid value in our calculation.
2610 */
2611 nextMXact = MultiXactState->nextMXact;
2612 if (nextMXact < FirstMultiXactId)
2613 nextMXact = FirstMultiXactId;
2614
     /*
      * Start from nextMXact and lower the result to any valid per-slot value
      * that precedes it.  Both arrays are scanned over the same MaxOldestSlot
      * index range, entirely under the shared lock taken above.
      */
2615 oldestMXact = nextMXact;
2616 for (i = 0; i < MaxOldestSlot; i++)
2617 {
2618 MultiXactId thisoldest;
2619
2620 thisoldest = OldestMemberMXactId[i];
2621 if (MultiXactIdIsValid(thisoldest) &&
2622 MultiXactIdPrecedes(thisoldest, oldestMXact))
2623 oldestMXact = thisoldest;
2624 thisoldest = OldestVisibleMXactId[i];
2625 if (MultiXactIdIsValid(thisoldest) &&
2626 MultiXactIdPrecedes(thisoldest, oldestMXact))
2627 oldestMXact = thisoldest;
2628 }
2629
2630 LWLockRelease(MultiXactGenLock);
2631
2632 return oldestMXact;
2633}
2634
2635/*
2636 * Determine how aggressively we need to vacuum in order to prevent member
2637 * wraparound.
2638 *
2639 * To do so determine what's the oldest member offset and install the limit
2640 * info in MultiXactState, where it can be used to prevent overrun of old data
2641 * in the members SLRU area.
2642 *
2643 * The return value is true if emergency autovacuum is required and false
2644 * otherwise.
2645 */
2646static bool
2647SetOffsetVacuumLimit(bool is_startup)
2648{
2649 MultiXactId oldestMultiXactId;
2650 MultiXactId nextMXact;
2651 MultiXactOffset oldestOffset = 0; /* placate compiler */
     /* prev* hold the values found in shared memory on entry. */
2652 MultiXactOffset prevOldestOffset;
2653 MultiXactOffset nextOffset;
2654 bool oldestOffsetKnown = false;
2655 bool prevOldestOffsetKnown;
2656 MultiXactOffset offsetStopLimit = 0;
2657 MultiXactOffset prevOffsetStopLimit;
2658
2659 /*
2660 * NB: Have to prevent concurrent truncation, we might otherwise try to
2661 * lookup an oldestMulti that's concurrently getting truncated away.
2662 */
2663 LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2664
2665 /* Read relevant fields from shared memory. */
2666 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2667 oldestMultiXactId = MultiXactState->oldestMultiXactId;
2668 nextMXact = MultiXactState->nextMXact;
2669 nextOffset = MultiXactState->nextOffset;
2670 prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2671 prevOldestOffset = MultiXactState->oldestOffset;
2672 prevOffsetStopLimit = MultiXactState->offsetStopLimit;
2674 LWLockRelease(MultiXactGenLock);
2675
2676 /*
2677 * Determine the offset of the oldest multixact. Normally, we can read
2678 * the offset from the multixact itself, but there's an important special
2679 * case: if there are no multixacts in existence at all, oldestMXact
2680 * obviously can't point to one. It will instead point to the multixact
2681 * ID that will be assigned the next time one is needed.
2682 */
2683 if (oldestMultiXactId == nextMXact)
2684 {
2685 /*
2686 * When the next multixact gets created, it will be stored at the next
2687 * offset.
2688 */
2689 oldestOffset = nextOffset;
2690 oldestOffsetKnown = true;
2691 }
2692 else
2693 {
2694 /*
2695 * Figure out where the oldest existing multixact's offsets are
2696 * stored. Due to bugs in early releases of PostgreSQL 9.3.X and 9.4.X,
2697 * the supposedly-earliest multixact might not really exist. We are
2698 * careful not to fail in that case.
2699 */
2700 oldestOffsetKnown =
2701 find_multixact_start(oldestMultiXactId, &oldestOffset);
2702
2703 if (oldestOffsetKnown)
2705 (errmsg_internal("oldest MultiXactId member is at offset %u",
2706 oldestOffset)));
2707 else
2708 ereport(LOG,
2709 (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2710 oldestMultiXactId)));
2711 }
2712
     /* The lookup is done, so truncation no longer needs to be held off. */
2713 LWLockRelease(MultiXactTruncationLock);
2714
2715 /*
2716 * If we can, compute limits (and install them in MultiXactState) to prevent
2717 * overrun of old data in the members SLRU area. We can only do so if the
2718 * oldest offset is known though.
2719 */
2720 if (oldestOffsetKnown)
2721 {
2722 /* move back to start of the corresponding segment */
2723 offsetStopLimit = oldestOffset - (oldestOffset %
2725
2726 /* always leave one segment before the wraparound point */
2728
     /* Announce only the unknown-to-known transition, and not at startup. */
2729 if (!prevOldestOffsetKnown && !is_startup)
2730 ereport(LOG,
2731 (errmsg("MultiXact member wraparound protections are now enabled")));
2732
2734 (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
2735 offsetStopLimit, oldestMultiXactId)));
2736 }
2737 else if (prevOldestOffsetKnown)
2738 {
2739 /*
2740 * If we failed to get the oldest offset this time, but we have a
2741 * value from a previous pass through this function, use the old
2742 * values rather than automatically forcing an emergency autovacuum
2743 * cycle again.
2744 */
2745 oldestOffset = prevOldestOffset;
2746 oldestOffsetKnown = true;
2747 offsetStopLimit = prevOffsetStopLimit;
2748 }
2749
2750 /* Install the computed values */
2751 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2752 MultiXactState->oldestOffset = oldestOffset;
2753 MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2754 MultiXactState->offsetStopLimit = offsetStopLimit;
2755 LWLockRelease(MultiXactGenLock);
2756
2757 /*
2758 * Do we need an emergency autovacuum? If we're not sure, assume yes.
     *
     * Otherwise the answer is yes once the distance from oldestOffset to
     * nextOffset (as read at the top) exceeds
     * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2759 */
2760 return !oldestOffsetKnown ||
2761 (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2762}
2763
2764/*
2765 * Return whether adding "distance" to "start" would move past "boundary".
2766 *
2767 * We use this to determine whether the addition is "wrapping around" the
2768 * boundary point, hence the name. The reason we don't want to use the regular
2769 * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2770 * the 2^32-1 space here, allowing for more multixacts than would fit
2771 * otherwise.
2772 */
2773static bool
2775 uint32 distance)
2776{
     /* Position reached after adding distance to start, skipping offset 0. */
2777 MultiXactOffset finish;
2778
2779 /*
2780 * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2781 * if the addition wraps around the UINT_MAX boundary, skip that value.
     *
     * "finish < start" after the addition is how that wrap is detected.
2782 */
2783 finish = start + distance;
2784 if (finish < start)
2785 finish++;
2786
2787 /*-----------------------------------------------------------------------
2788 * When the boundary is numerically greater than the starting point, any
2789 * value numerically between the two is not wrapped:
2790 *
2791 * <----S----B---->
2792 * [---) = F wrapped past B (and UINT_MAX)
2793 * [---) = F not wrapped
2794 * [----] = F wrapped past B
2795 *
2796 * When the boundary is numerically less than the starting point (i.e. the
2797 * UINT_MAX wraparound occurs somewhere in between) then all values in
2798 * between are wrapped:
2799 *
2800 * <----B----S---->
2801 * [---) = F not wrapped past B (but wrapped past UINT_MAX)
2802 * [---) = F wrapped past B (and UINT_MAX)
2803 * [----] = F not wrapped
2804 *-----------------------------------------------------------------------
2805 */
     /*
      * start < boundary: wrapped if finish reached the boundary, or if it
      * ended up numerically below start.
      * Otherwise: wrapped only if finish is numerically below start and has
      * still reached the boundary.
      */
2806 if (start < boundary)
2807 return finish >= boundary || finish < start;
2808 else
2809 return finish >= boundary && finish < start;
2810}
2811
2812/*
2813 * Find the starting offset of the given MultiXactId.
2814 *
2815 * Returns false if the file containing the multi does not exist on disk.
2816 * Otherwise, returns true and sets *result to the starting member offset.
2817 *
2818 * This function does not prevent concurrent truncation, so if that's
2819 * required, the caller has to protect against that.
2820 */
2821static bool
2823{
2824 MultiXactOffset offset;
2825 int64 pageno;
2826 int entryno;
2827 int slotno;
2828 MultiXactOffset *offptr;
2829
2831
     /* Locate the offsets page holding "multi" and its entry index there. */
2832 pageno = MultiXactIdToOffsetPage(multi);
2833 entryno = MultiXactIdToOffsetEntry(multi);
2834
2835 /*
2836 * Write out dirty data, so PhysicalPageExists can work correctly.
2837 */
2840
     /* Failure path: *result is left untouched. */
2842 return false;
2843
2844 /* lock is acquired by SimpleLruReadPage_ReadOnly */
2845 slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2846 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2847 offptr += entryno;
     /* Copy the entry out of the shared page buffer into a local. */
2848 offset = *offptr;
2850
2851 *result = offset;
2852 return true;
2853}
2854
2855/*
2856 * GetMultiXactInfo
2857 *
2858 * Returns information about the current MultiXact state, as of:
2859 * multixacts: Number of MultiXacts (nextMultiXactId - oldestMultiXactId)
2860 * members: Number of member entries (nextOffset - oldestOffset)
2861 * oldestMultiXactId: Oldest MultiXact ID still in use
2862 * oldestOffset: Oldest offset still in use
2863 *
2864 * Returns false if unable to determine, the oldest offset being unknown.
2865 */
2866bool
2868 MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
2869{
2870 MultiXactOffset nextOffset;
2871 MultiXactId nextMultiXactId;
2872 bool oldestOffsetKnown;
2873
     /*
      * Read everything within one shared hold of MultiXactGenLock.  Two of
      * the outputs (*oldestMultiXactId, *oldestOffset) are written directly
      * here and overwritten below if the oldest offset is unknown.
      */
2874 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2875 nextOffset = MultiXactState->nextOffset;
2876 *oldestMultiXactId = MultiXactState->oldestMultiXactId;
2877 nextMultiXactId = MultiXactState->nextMXact;
2878 *oldestOffset = MultiXactState->oldestOffset;
2879 oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2880 LWLockRelease(MultiXactGenLock);
2881
     /* Failure: reset all four outputs to zero/invalid before returning. */
2882 if (!oldestOffsetKnown)
2883 {
2884 *members = 0;
2885 *multixacts = 0;
2886 *oldestMultiXactId = InvalidMultiXactId;
2887 *oldestOffset = 0;
2888 return false;
2889 }
2890
     /* Both counts are plain differences between the "next" and "oldest" values. */
2891 *members = nextOffset - *oldestOffset;
2892 *multixacts = nextMultiXactId - *oldestMultiXactId;
2893 return true;
2894}
2895
2896/*
2897 * Multixact members can be removed once the multixacts that refer to them
2898 * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2899 * vacuum_multixact_freeze_table_age work together to make sure we never have
2900 * too many multixacts; we hope that, at least under normal circumstances,
2901 * this will also be sufficient to keep us from using too many offsets.
2902 * However, if the average multixact has many members, we might exhaust the
2903 * members space while still using few enough members that these limits fail
2904 * to trigger relminmxid advancement by VACUUM. At that point, we'd have no
2905 * choice but to start failing multixact-creating operations with an error.
2906 *
2907 * To prevent that, if more than a threshold portion of the members space is
2908 * used, we effectively reduce autovacuum_multixact_freeze_max_age
2909 * to a value just less than the number of multixacts in use. We hope that
2910 * this will quickly trigger autovacuuming on the table or tables with the
2911 * oldest relminmxid, thus allowing datminmxid values to advance and removing
2912 * some members.
2913 *
2914 * As the fraction of the member space currently in use grows, we become
2915 * more aggressive in clamping this value. That not only causes autovacuum
2916 * to ramp up, but also makes any manual vacuums the user issues more
2917 * aggressive. This happens because vacuum_get_cutoffs() will clamp the
2918 * freeze table and the minimum freeze age cutoffs based on the effective
2919 * autovacuum_multixact_freeze_max_age this function returns. In the worst
2920 * case, we'll clamp the freeze_max_age to zero, and every vacuum of any
2921 * table will freeze every multixact.
2922 */
2923int
2925{
2926 MultiXactOffset members;
2927 uint32 multixacts;
     /* Number of multixacts we would like to see removed. */
2928 uint32 victim_multixacts;
2929 double fraction;
2930 int result;
     /* Filled in by GetMultiXactInfo but not otherwise used in the lines shown. */
2931 MultiXactId oldestMultiXactId;
2932 MultiXactOffset oldestOffset;
2933
2934 /* If we can't determine member space utilization, assume the worst. */
2935 if (!GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset))
2936 return 0;
2937
2938 /* If member space utilization is low, no special action is required. */
2939 if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
2941
2942 /*
2943 * Compute a target for relminmxid advancement. The number of multixacts
2944 * we try to eliminate from the system is based on how far we are past
2945 * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2946 */
2947 fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
     /* The floating-point product is converted to uint32 by the assignment. */
2949 victim_multixacts = multixacts * fraction;
2950
2951 /* fraction could be > 1.0, but lowest possible freeze age is zero */
2952 if (victim_multixacts > multixacts)
2953 return 0;
2954 result = multixacts - victim_multixacts;
2955
2956 /*
2957 * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
2958 * autovacuum less aggressive than it would otherwise be.
2959 */
2961}
2962
2963typedef struct mxtruncinfo
2964{
2967
2968/*
2969 * SlruScanDirectory callback
2970 * This callback determines the earliest existing page number.
2971 */
2972static bool
2974{
2975 mxtruncinfo *trunc = (mxtruncinfo *) data;
2976
2977 if (trunc->earliestExistingPage == -1 ||
2978 ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
2979 {
2980 trunc->earliestExistingPage = segpage;
2981 }
2982
2983 return false; /* keep going */
2984}
2985
2986
2987/*
2988 * Delete members segments [oldest, newOldest)
2989 *
2990 * The members SLRU can, in contrast to the offsets one, be filled to almost
2991 * the full range at once. This means SimpleLruTruncate() can't trivially be
2992 * used - instead the to-be-deleted range is computed using the offsets
2993 * SLRU. C.f. TruncateMultiXact().
2994 */
2995static void
2997{
2999 int64 startsegment = MXOffsetToMemberSegment(oldestOffset);
3000 int64 endsegment = MXOffsetToMemberSegment(newOldestOffset);
3001 int64 segment = startsegment;
3002
3003 /*
3004 * Delete all the segments but the last one. The last segment can still
3005 * contain, possibly partially, valid data.
3006 */
3007 while (segment != endsegment)
3008 {
3009 elog(DEBUG2, "truncating multixact members segment %" PRIx64,
3010 segment);
3012
3013 /* move to next segment, handling wraparound correctly */
3014 if (segment == maxsegment)
3015 segment = 0;
3016 else
3017 segment += 1;
3018 }
3019}
3020
3021/*
3022 * Delete offsets segments [oldest, newOldest)
3023 */
3024static void
3026{
3027 /*
3028 * We step back one multixact to avoid passing a cutoff page that hasn't
3029 * been created yet in the rare case that oldestMulti would be the first
3030 * item on a page and oldestMulti == nextMulti. In that case, if we
3031 * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
3032 * detection.
3033 */
3036}
3037
3038/*
3039 * Remove all MultiXactOffset and MultiXactMember segments before the oldest
3040 * ones still of interest.
3041 *
3042 * This is only called on a primary as part of vacuum (via
3043 * vac_truncate_clog()). During recovery truncation is done by replaying
3044 * truncation WAL records logged here.
3045 *
3046 * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
3047 * is one of the databases preventing newOldestMulti from increasing.
3048 */
3049void
3050TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
3051{
3052 MultiXactId oldestMulti;
3053 MultiXactId nextMulti;
3054 MultiXactOffset newOldestOffset;
3055 MultiXactOffset oldestOffset;
3056 MultiXactOffset nextOffset;
3057 mxtruncinfo trunc;
3058 MultiXactId earliest;
3059
3062
3063 /*
3064 * We can only allow one truncation to happen at once. Otherwise parts of
3065 * members might vanish while we're doing lookups or similar. There's no
3066 * need to have an interlock with creating new multis or such, since those
3067 * are constrained by the limits (which only grow, never shrink).
3068 */
3069 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3070
3071 LWLockAcquire(MultiXactGenLock, LW_SHARED);
3072 nextMulti = MultiXactState->nextMXact;
3073 nextOffset = MultiXactState->nextOffset;
3074 oldestMulti = MultiXactState->oldestMultiXactId;
3075 LWLockRelease(MultiXactGenLock);
3076 Assert(MultiXactIdIsValid(oldestMulti));
3077
3078 /*
3079 * Make sure to only attempt truncation if there's values to truncate
3080 * away. In normal processing values shouldn't go backwards, but there's
3081 * some corner cases (due to bugs) where that's possible.
3082 */
3083 if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
3084 {
3085 LWLockRelease(MultiXactTruncationLock);
3086 return;
3087 }
3088
3089 /*
3090 * Note we can't just plow ahead with the truncation; it's possible that
3091 * there are no segments to truncate, which is a problem because we are
3092 * going to attempt to read the offsets page to determine where to
3093 * truncate the members SLRU. So we first scan the directory to determine
3094 * the earliest offsets page number that we can read without error.
3095 *
3096 * When nextMXact is less than one segment away from multiWrapLimit,
3097 * SlruScanDirCbFindEarliest can find some early segment other than the
3098 * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
3099 * returns false, because not all pairs of entries have the same answer.)
3100 * That can also arise when an earlier truncation attempt failed unlink()
3101 * or returned early from this function. The only consequence is
3102 * returning early, which wastes space that we could have liberated.
3103 *
3104 * NB: It's also possible that the page that oldestMulti is on has already
3105 * been truncated away, and we crashed before updating oldestMulti.
3106 */
3107 trunc.earliestExistingPage = -1;
3110 if (earliest < FirstMultiXactId)
3111 earliest = FirstMultiXactId;
3112
3113 /* If there's nothing to remove, we can bail out early. */
3114 if (MultiXactIdPrecedes(oldestMulti, earliest))
3115 {
3116 LWLockRelease(MultiXactTruncationLock);
3117 return;
3118 }
3119
3120 /*
3121 * First, compute the safe truncation point for MultiXactMember. This is
3122 * the starting offset of the oldest multixact.
3123 *
3124 * Hopefully, find_multixact_start will always work here, because we've
3125 * already checked that it doesn't precede the earliest MultiXact on disk.
3126 * But if it fails, don't truncate anything, and log a message.
3127 */
3128 if (oldestMulti == nextMulti)
3129 {
3130 /* there are NO MultiXacts */
3131 oldestOffset = nextOffset;
3132 }
3133 else if (!find_multixact_start(oldestMulti, &oldestOffset))
3134 {
3135 ereport(LOG,
3136 (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3137 oldestMulti, earliest)));
3138 LWLockRelease(MultiXactTruncationLock);
3139 return;
3140 }
3141
3142 /*
3143 * Secondly compute up to where to truncate. Lookup the corresponding
3144 * member offset for newOldestMulti for that.
3145 */
3146 if (newOldestMulti == nextMulti)
3147 {
3148 /* there are NO MultiXacts */
3149 newOldestOffset = nextOffset;
3150 }
3151 else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
3152 {
3153 ereport(LOG,
3154 (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3155 newOldestMulti)));
3156 LWLockRelease(MultiXactTruncationLock);
3157 return;
3158 }
3159
3160 elog(DEBUG1, "performing multixact truncation: "
3161 "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
3162 "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
3163 oldestMulti, newOldestMulti,
3164 MultiXactIdToOffsetSegment(oldestMulti),
3165 MultiXactIdToOffsetSegment(newOldestMulti),
3166 oldestOffset, newOldestOffset,
3167 MXOffsetToMemberSegment(oldestOffset),
3168 MXOffsetToMemberSegment(newOldestOffset));
3169
3170 /*
3171 * Do truncation, and the WAL logging of the truncation, in a critical
3172 * section. That way offsets/members cannot get out of sync anymore, i.e.
3173 * once consistent the newOldestMulti will always exist in members, even
3174 * if we crashed in the wrong moment.
3175 */
3177
3178 /*
3179 * Prevent checkpoints from being scheduled concurrently. This is critical
3180 * because otherwise a truncation record might not be replayed after a
3181 * crash/basebackup, even though the state of the data directory would
3182 * require it.
3183 */
3186
3187 /* WAL log truncation */
3188 WriteMTruncateXlogRec(newOldestMultiDB,
3189 oldestMulti, newOldestMulti,
3190 oldestOffset, newOldestOffset);
3191
3192 /*
3193 * Update in-memory limits before performing the truncation, while inside
3194 * the critical section: Have to do it before truncation, to prevent
3195 * concurrent lookups of those values. Has to be inside the critical
3196 * section as otherwise a future call to this function would error out,
3197 * while looking up the oldest member in offsets, if our caller crashes
3198 * before updating the limits.
3199 */
3200 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
3201 MultiXactState->oldestMultiXactId = newOldestMulti;
3202 MultiXactState->oldestMultiXactDB = newOldestMultiDB;
3203 LWLockRelease(MultiXactGenLock);
3204
3205 /* First truncate members */
3206 PerformMembersTruncation(oldestOffset, newOldestOffset);
3207
3208 /* Then offsets */
3209 PerformOffsetsTruncation(oldestMulti, newOldestMulti);
3210
3211 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
3212
3214 LWLockRelease(MultiXactTruncationLock);
3215}
3216
3217/*
3218 * Decide whether a MultiXactOffset page number is "older" for truncation
3219 * purposes. Analogous to CLOGPagePrecedes().
3220 *
3221 * Offsetting the values is optional, because MultiXactIdPrecedes() has
3222 * translational symmetry.
3223 */
3224static bool
3226{
3227 MultiXactId multi1;
3228 MultiXactId multi2;
3229
3230 multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3231 multi1 += FirstMultiXactId + 1;
3232 multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3233 multi2 += FirstMultiXactId + 1;
3234
3235 return (MultiXactIdPrecedes(multi1, multi2) &&
3236 MultiXactIdPrecedes(multi1,
3237 multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
3238}
3239
3240/*
3241 * Decide whether a MultiXactMember page number is "older" for truncation
3242 * purposes. There is no "invalid offset number" so use the numbers verbatim.
3243 */
3244static bool
3246{
3247 MultiXactOffset offset1;
3248 MultiXactOffset offset2;
3249
3250 offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3251 offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3252
3253 return (MultiXactOffsetPrecedes(offset1, offset2) &&
3255 offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
3256}
3257
3258/*
3259 * Decide which of two MultiXactIds is earlier.
3260 *
3261 * XXX do we need to do something special for InvalidMultiXactId?
3262 * (Doesn't look like it.)
3263 */
3264bool
3266{
3267 int32 diff = (int32) (multi1 - multi2);
3268
3269 return (diff < 0);
3270}
3271
3272/*
3273 * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3274 *
3275 * XXX do we need to do something special for InvalidMultiXactId?
3276 * (Doesn't look like it.)
3277 */
3278bool
3280{
3281 int32 diff = (int32) (multi1 - multi2);
3282
3283 return (diff <= 0);
3284}
3285
3286
3287/*
3288 * Decide which of two offsets is earlier.
3289 */
3290static bool
3292{
3293 int32 diff = (int32) (offset1 - offset2);
3294
3295 return (diff < 0);
3296}
3297
3298/*
3299 * Write a TRUNCATE xlog record
3300 *
3301 * We must flush the xlog record to disk before returning --- see notes in
3302 * TruncateCLOG().
3303 */
3304static void
3306 MultiXactId startTruncOff, MultiXactId endTruncOff,
3307 MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3308{
3309 XLogRecPtr recptr;
3311
3312 xlrec.oldestMultiDB = oldestMultiDB;
3313
3314 xlrec.startTruncOff = startTruncOff;
3315 xlrec.endTruncOff = endTruncOff;
3316
3317 xlrec.startTruncMemb = startTruncMemb;
3318 xlrec.endTruncMemb = endTruncMemb;
3319
3322 recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
3323 XLogFlush(recptr);
3324}
3325
3326/*
3327 * MULTIXACT resource manager's routines
3328 */
3329void
3331{
3332 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
3333
3334 /* Backup blocks are not used in multixact records */
3336
3337 if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
3338 {
3339 int64 pageno;
3340
3341 memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3343 }
3344 else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
3345 {
3346 int64 pageno;
3347
3348 memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3350 }
3351 else if (info == XLOG_MULTIXACT_CREATE_ID)
3352 {
3353 xl_multixact_create *xlrec =
3355 TransactionId max_xid;
3356 int i;
3357
3358 /* Store the data back into the SLRU files */
3359 RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
3360 xlrec->members);
3361
3362 /* Make sure nextMXact/nextOffset are beyond what this record has */
3363 MultiXactAdvanceNextMXact(xlrec->mid + 1,
3364 xlrec->moff + xlrec->nmembers);
3365
3366 /*
3367 * Make sure nextXid is beyond any XID mentioned in the record. This
3368 * should be unnecessary, since any XID found here ought to have other
3369 * evidence in the XLOG, but let's be safe.
3370 */
3371 max_xid = XLogRecGetXid(record);
3372 for (i = 0; i < xlrec->nmembers; i++)
3373 {
3374 if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
3375 max_xid = xlrec->members[i].xid;
3376 }
3377
3379 }
3380 else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
3381 {
3383 int64 pageno;
3384
3385 memcpy(&xlrec, XLogRecGetData(record),
3387
3388 elog(DEBUG1, "replaying multixact truncation: "
3389 "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
3390 "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
3391 xlrec.startTruncOff, xlrec.endTruncOff,
3394 xlrec.startTruncMemb, xlrec.endTruncMemb,
3397
3398 /* should not be required, but more than cheap enough */
3399 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3400
3401 /*
3402 * Advance the horizon values, so they're current at the end of
3403 * recovery.
3404 */
3405 SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
3406
3408
3409 /*
3410 * During XLOG replay, latest_page_number isn't necessarily set up
3411 * yet; insert a suitable value to bypass the sanity test in
3412 * SimpleLruTruncate.
3413 */
3414 pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
3415 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
3416 pageno);
3418
3419 LWLockRelease(MultiXactTruncationLock);
3420 }
3421 else
3422 elog(PANIC, "multixact_redo: unknown op code %u", info);
3423}
3424
3425/*
3426 * Entrypoint for sync.c to sync offsets files.
3427 */
3428int
3429multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
3430{
3431 return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
3432}
3433
3434/*
3435 * Entrypoint for sync.c to sync members files.
3436 */
3437int
3438multixactmemberssyncfiletag(const FileTag *ftag, char *path)
3439{
3440 return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
3441}
static void pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:483
int autovacuum_multixact_freeze_max_age
Definition: autovacuum.c:130
static int32 next
Definition: blutils.c:224
#define Min(x, y)
Definition: c.h:1004
uint8_t uint8
Definition: c.h:537
int64_t int64
Definition: c.h:536
uint32 MultiXactOffset
Definition: c.h:670
TransactionId MultiXactId
Definition: c.h:668
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:471
int32_t int32
Definition: c.h:535
uint16_t uint16
Definition: c.h:538
uint32_t uint32
Definition: c.h:539
#define MemSet(start, val, len)
Definition: c.h:1020
uint32 TransactionId
Definition: c.h:658
size_t Size
Definition: c.h:611
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int errmsg_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1184
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1161
int errdetail_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1299
int errhint(const char *fmt,...)
Definition: elog.c:1321
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define LOG
Definition: elog.h:31
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
Datum difference(PG_FUNCTION_ARGS)
int multixact_offset_buffers
Definition: globals.c:163
bool IsBinaryUpgrade
Definition: globals.c:121
ProcNumber MyProcNumber
Definition: globals.c:90
bool IsUnderPostmaster
Definition: globals.c:120
int multixact_member_buffers
Definition: globals.c:162
#define newval
GucSource
Definition: guc.h:112
Assert(PointerIsAligned(start, uint64))
return str start
const char * str
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static void dclist_move_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:808
static dlist_node * dclist_tail_node(dclist_head *head)
Definition: ilist.h:920
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
#define DCLIST_STATIC_INIT(name)
Definition: ilist.h:282
static void dclist_push_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:693
static void dclist_init(dclist_head *head)
Definition: ilist.h:671
#define dclist_foreach(iter, lhead)
Definition: ilist.h:970
#define INJECTION_POINT(name, arg)
#define INJECTION_POINT_CACHED(name, arg)
#define INJECTION_POINT_LOAD(name)
int j
Definition: isn.c:78
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
char * get_database_name(Oid dbid)
Definition: lsyscache.c:1259
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1746
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
MemoryContext TopTransactionContext
Definition: mcxt.c:171
void pfree(void *pointer)
Definition: mcxt.c:1594
MemoryContext TopMemoryContext
Definition: mcxt.c:166
void * palloc(Size size)
Definition: mcxt.c:1365
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, MultiXactId endTruncOff, MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
Definition: multixact.c:3305
static MultiXactId PreviousMultiXactId(MultiXactId multi)
Definition: multixact.c:217
static SlruCtlData MultiXactOffsetCtlData
Definition: multixact.c:225
void MultiXactShmemInit(void)
Definition: multixact.c:1955
#define MULTIXACT_MEMBER_SAFE_THRESHOLD
Definition: multixact.c:212
static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2)
Definition: multixact.c:3245
static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
Definition: multixact.c:1023
static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
Definition: multixact.c:1653
MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
Definition: multixact.c:478
static int64 MXOffsetToMemberPage(MultiXactOffset offset)
Definition: multixact.c:169
#define MXACT_MEMBER_BITS_PER_XACT
Definition: multixact.c:139
static int64 MultiXactIdToOffsetSegment(MultiXactId multi)
Definition: multixact.c:121
static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
Definition: multixact.c:2517
void ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
Definition: multixact.c:782
static void PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
Definition: multixact.c:3025
#define MXACT_MEMBER_XACT_BITMASK
Definition: multixact.c:141
#define MULTIXACT_FLAGBYTES_PER_GROUP
Definition: multixact.c:144
bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3265
char * mxstatus_to_string(MultiXactStatus status)
Definition: multixact.c:1745
void multixact_redo(XLogReaderState *record)
Definition: multixact.c:3330
#define MULTIXACT_OFFSETS_PER_PAGE
Definition: multixact.c:106
void multixact_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1911
#define debug_elog5(a, b, c, d, e)
Definition: multixact.c:380
static void MultiXactIdSetOldestVisible(void)
Definition: multixact.c:721
int multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
Definition: multixact.c:3429
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result)
Definition: multixact.c:2822
void PostPrepare_MultiXact(FullTransactionId fxid)
Definition: multixact.c:1841
void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset)
Definition: multixact.c:2258
#define MultiXactMemberCtl
Definition: multixact.c:229
static bool SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: multixact.c:2973
void AtPrepare_MultiXact(void)
Definition: multixact.c:1827
static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance)
Definition: multixact.c:2774
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3279
void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
Definition: multixact.c:2466
static int MultiXactIdToOffsetEntry(MultiXactId multi)
Definition: multixact.c:115
static void mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
Definition: multixact.c:1700
static void MaybeExtendOffsetSlru(void)
Definition: multixact.c:2048
bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
Definition: multixact.c:590
void MultiXactIdSetOldestMember(void)
Definition: multixact.c:664
static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
Definition: multixact.c:2996
static MemoryContext MXactContext
Definition: multixact.c:368
#define SHARED_MULTIXACT_STATE_SIZE
static MultiXactId * OldestVisibleMXactId
Definition: multixact.c:338
struct mxtruncinfo mxtruncinfo
static int mxactMemberComparator(const void *arg1, const void *arg2)
Definition: multixact.c:1580
struct MultiXactStateData MultiXactStateData
static void ExtendMultiXactOffset(MultiXactId multi)
Definition: multixact.c:2483
Size MultiXactShmemSize(void)
Definition: multixact.c:1938
#define MULTIXACT_MEMBERGROUPS_PER_PAGE
Definition: multixact.c:150
#define MultiXactOffsetCtl
Definition: multixact.c:228
static int MXOffsetToMemberOffset(MultiXactOffset offset)
Definition: multixact.c:202
void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, MultiXactOffset *nextMultiOffset, MultiXactId *oldestMulti, Oid *oldestMultiDB)
Definition: multixact.c:2212
void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, bool is_startup)
Definition: multixact.c:2292
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nmembers, MultiXactMember *members)
Definition: multixact.c:907
int multixactmemberssyncfiletag(const FileTag *ftag, char *path)
Definition: multixact.c:3438
#define MAX_CACHE_ENTRIES
Definition: multixact.c:366
static int64 MultiXactIdToOffsetPage(MultiXactId multi)
Definition: multixact.c:109
MultiXactId GetOldestMultiXactId(void)
Definition: multixact.c:2594
void CheckPointMultiXact(void)
Definition: multixact.c:2234
#define MaxOldestSlot
Definition: multixact.c:333
MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
Definition: multixact.c:806
struct mXactCacheEnt mXactCacheEnt
static int64 MXOffsetToMemberSegment(MultiXactOffset offset)
Definition: multixact.c:175
static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members)
Definition: multixact.c:1610
static dclist_head MXactCache
Definition: multixact.c:367
void TrimMultiXact(void)
Definition: multixact.c:2108
#define debug_elog3(a, b, c)
Definition: multixact.c:378
char * mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
Definition: multixact.c:1768
#define MULTIXACT_MEMBERGROUP_SIZE
Definition: multixact.c:148
#define debug_elog4(a, b, c, d)
Definition: multixact.c:379
void multixact_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1926
static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2)
Definition: multixact.c:3225
static bool SetOffsetVacuumLimit(bool is_startup)
Definition: multixact.c:2647
static int MXOffsetToFlagsOffset(MultiXactOffset offset)
Definition: multixact.c:182
int MultiXactMemberFreezeThreshold(void)
Definition: multixact.c:2924
void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset)
Definition: multixact.c:2441
static MultiXactId * OldestMemberMXactId
Definition: multixact.c:337
#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE
Definition: multixact.c:164
static MultiXactStateData * MultiXactState
Definition: multixact.c:336
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP
Definition: multixact.c:145
#define OFFSET_WARN_SEGMENTS
MultiXactId ReadNextMultiXactId(void)
Definition: multixact.c:762
void BootStrapMultiXact(void)
Definition: multixact.c:2025
#define debug_elog6(a, b, c, d, e, f)
Definition: multixact.c:381
void multixact_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1890
#define MULTIXACT_MEMBERS_PER_PAGE
Definition: multixact.c:151
MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2)
Definition: multixact.c:425
void TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
Definition: multixact.c:3050
#define MULTIXACT_MEMBER_DANGER_THRESHOLD
Definition: multixact.c:213
static int MXOffsetToFlagsBitShift(MultiXactOffset offset)
Definition: multixact.c:192
bool check_multixact_offset_buffers(int *newval, void **extra, GucSource source)
Definition: multixact.c:2005
static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
Definition: multixact.c:3291
bool GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
Definition: multixact.c:2867
bool check_multixact_member_buffers(int *newval, void **extra, GucSource source)
Definition: multixact.c:2014
void AtEOXact_MultiXact(void)
Definition: multixact.c:1799
static SlruCtlData MultiXactMemberCtlData
Definition: multixact.c:226
#define debug_elog2(a, b)
Definition: multixact.c:377
void StartupMultiXact(void)
Definition: multixact.c:2083
int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly)
Definition: multixact.c:1290
#define MultiXactIdIsValid(multi)
Definition: multixact.h:29
#define XLOG_MULTIXACT_ZERO_MEM_PAGE
Definition: multixact.h:70
#define XLOG_MULTIXACT_ZERO_OFF_PAGE
Definition: multixact.h:69
#define FirstMultiXactId
Definition: multixact.h:26
MultiXactStatus
Definition: multixact.h:39
@ MultiXactStatusForShare
Definition: multixact.h:41
@ MultiXactStatusForNoKeyUpdate
Definition: multixact.h:42
@ MultiXactStatusNoKeyUpdate
Definition: multixact.h:45
@ MultiXactStatusUpdate
Definition: multixact.h:47
@ MultiXactStatusForUpdate
Definition: multixact.h:43
@ MultiXactStatusForKeyShare
Definition: multixact.h:40
#define ISUPDATE_from_mxstatus(status)
Definition: multixact.h:53
#define InvalidMultiXactId
Definition: multixact.h:25
#define XLOG_MULTIXACT_TRUNCATE_ID
Definition: multixact.h:72
#define SizeOfMultiXactCreate
Definition: multixact.h:82
#define SizeOfMultiXactTruncate
Definition: multixact.h:97
#define XLOG_MULTIXACT_CREATE_ID
Definition: multixact.h:71
#define MaxMultiXactOffset
Definition: multixact.h:31
#define MaxMultiXactId
Definition: multixact.h:27
struct MultiXactMember MultiXactMember
const void size_t len
const void * data
static char * filename
Definition: pg_dumpall.c:120
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:72
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:165
@ PMSIGNAL_START_AUTOVAC_LAUNCHER
Definition: pmsignal.h:39
#define qsort(a, b, c, d)
Definition: port.h:479
unsigned int Oid
Definition: postgres_ext.h:32
#define DELAY_CHKPT_START
Definition: proc.h:135
bool TransactionIdIsInProgress(TransactionId xid)
Definition: procarray.c:1402
int ProcNumber
Definition: procnumber.h:24
tree ctl
Definition: radixtree.h:1838
Size add_size(Size s1, Size s2)
Definition: shmem.c:493
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, const char *subdir, int buffer_tranche_id, int bank_tranche_id, SyncRequestHandler sync_handler, bool long_segment_names)
Definition: slru.c:252
int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
Definition: slru.c:630
void SimpleLruWritePage(SlruCtl ctl, int slotno)
Definition: slru.c:757
void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
Definition: slru.c:1347
bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
Definition: slru.c:771
void SlruDeleteSegment(SlruCtl ctl, int64 segno)
Definition: slru.c:1551
bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
Definition: slru.c:1816
int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, TransactionId xid)
Definition: slru.c:527
int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
Definition: slru.c:1856
int SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
Definition: slru.c:375
void SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno)
Definition: slru.c:444
void SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
Definition: slru.c:1433
Size SimpleLruShmemSize(int nslots, int nlsns)
Definition: slru.c:198
bool check_slru_buffers(const char *name, int *newval)
Definition: slru.c:355
static LWLock * SimpleLruGetBankLock(SlruCtl ctl, int64 pageno)
Definition: slru.h:175
#define SlruPagePrecedesUnitTests(ctl, per_page)
Definition: slru.h:200
#define SLRU_PAGES_PER_SEGMENT
Definition: slru.h:39
PGPROC * MyProc
Definition: proc.c:66
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:145
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:242
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97
Definition: sync.h:51
Definition: lwlock.h:42
TransactionId xid
Definition: multixact.h:59
MultiXactStatus status
Definition: multixact.h:60
MultiXactId multiWrapLimit
Definition: multixact.c:269
MultiXactId multiStopLimit
Definition: multixact.c:268
MultiXactId multiWarnLimit
Definition: multixact.c:267
MultiXactId multiVacLimit
Definition: multixact.c:266
MultiXactOffset offsetStopLimit
Definition: multixact.c:272
MultiXactOffset nextOffset
Definition: multixact.c:244
MultiXactId nextMXact
Definition: multixact.c:241
MultiXactId oldestMultiXactId
Definition: multixact.c:254
MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.c:327
MultiXactOffset oldestOffset
Definition: multixact.c:262
ConditionVariable nextoff_cv
Definition: multixact.c:278
int delayChkptFlags
Definition: proc.h:257
dlist_node * cur
Definition: ilist.h:179
MultiXactId multi
Definition: multixact.c:360
dlist_node node
Definition: multixact.c:362
MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.c:363
int64 earliestExistingPage
Definition: multixact.c:2965
MultiXactId mid
Definition: multixact.h:76
MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.h:79
MultiXactOffset moff
Definition: multixact.h:77
MultiXactId endTruncOff
Definition: multixact.h:90
MultiXactOffset startTruncMemb
Definition: multixact.h:93
MultiXactOffset endTruncMemb
Definition: multixact.h:94
MultiXactId startTruncOff
Definition: multixact.h:89
@ SYNC_HANDLER_MULTIXACT_MEMBER
Definition: sync.h:41
@ SYNC_HANDLER_MULTIXACT_OFFSET
Definition: sync.h:40
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:126
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
#define TransactionIdIsValid(xid)
Definition: transam.h:41
ProcNumber TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held)
Definition: twophase.c:908
void RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, const void *data, uint32 len)
Definition: twophase.c:1271
#define TWOPHASE_RM_MULTIXACT_ID
Definition: twophase_rmgr.h:29
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
bool IsTransactionState(void)
Definition: xact.c:387
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition: xact.c:941
bool RecoveryInProgress(void)
Definition: xlog.c:6383
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2780
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value)
Definition: xloginsert.c:537
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:364
void XLogBeginInsert(void)
Definition: xloginsert.c:149
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:412
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:417
bool InRecovery
Definition: xlogutils.c:50