Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 2095206

Browse files
committed
Adjust btree index build to not use shared buffers, thereby avoiding the
locking conflict against concurrent CHECKPOINT that was discussed a few weeks ago. Also, if not using WAL archiving (which is always true at the moment, but won't be if PITR makes it into this release), there's no need to WAL-log the index build process; it's sufficient to force-fsync the completed index before commit. This seems to gain about a factor of 2 in my tests, which is consistent with writing half as much data. I did not try it with WAL on a separate drive, though --- probably the gain would be a lot less in that scenario.
1 parent 4d0e47d commit 2095206

File tree

8 files changed

+304
-214
lines changed

8 files changed

+304
-214
lines changed

src/backend/access/nbtree/nbtpage.c

Lines changed: 32 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.75 2004/04/21 18:24:25 tgl Exp $
12+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.76 2004/06/02 17:28:17 tgl Exp $
1313
*
1414
* NOTES
1515
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -31,21 +31,21 @@
3131
/*
3232
* _bt_metapinit() -- Initialize the metadata page of a new btree.
3333
*
34-
* If markvalid is true, the index is immediately marked valid, else it
35-
* will be invalid until _bt_metaproot() is called.
34+
* Note: this is actually not used for standard btree index building;
35+
* nbtsort.c prefers not to make the metadata page valid until completion
36+
* of build.
3637
*
3738
* Note: there's no real need for any locking here. Since the transaction
3839
* creating the index hasn't committed yet, no one else can even see the index
3940
* much less be trying to use it. (In a REINDEX-in-place scenario, that's
4041
* not true, but we assume the caller holds sufficient locks on the index.)
4142
*/
4243
void
43-
_bt_metapinit(Relation rel, bool markvalid)
44+
_bt_metapinit(Relation rel)
4445
{
4546
Buffer buf;
4647
Page pg;
4748
BTMetaPageData *metad;
48-
BTPageOpaque op;
4949

5050
if (RelationGetNumberOfBlocks(rel) != 0)
5151
elog(ERROR, "cannot initialize non-empty btree index \"%s\"",
@@ -55,21 +55,11 @@ _bt_metapinit(Relation rel, bool markvalid)
5555
Assert(BufferGetBlockNumber(buf) == BTREE_METAPAGE);
5656
pg = BufferGetPage(buf);
5757

58-
/* NO ELOG(ERROR) from here till newmeta op is logged */
59-
START_CRIT_SECTION();
60-
61-
_bt_pageinit(pg, BufferGetPageSize(buf));
62-
58+
_bt_initmetapage(pg, P_NONE, 0);
6359
metad = BTPageGetMeta(pg);
64-
metad->btm_magic = markvalid ? BTREE_MAGIC : 0;
65-
metad->btm_version = BTREE_VERSION;
66-
metad->btm_root = P_NONE;
67-
metad->btm_level = 0;
68-
metad->btm_fastroot = P_NONE;
69-
metad->btm_fastlevel = 0;
7060

71-
op = (BTPageOpaque) PageGetSpecialPointer(pg);
72-
op->btpo_flags = BTP_META;
61+
/* NO ELOG(ERROR) from here till newmeta op is logged */
62+
START_CRIT_SECTION();
7363

7464
/* XLOG stuff */
7565
if (!rel->rd_istemp)
@@ -90,7 +80,7 @@ _bt_metapinit(Relation rel, bool markvalid)
9080
rdata[0].next = NULL;
9181

9282
recptr = XLogInsert(RM_BTREE_ID,
93-
markvalid ? XLOG_BTREE_NEWMETA : XLOG_BTREE_INVALIDMETA,
83+
XLOG_BTREE_NEWMETA,
9484
rdata);
9585

9686
PageSetLSN(pg, recptr);
@@ -102,6 +92,29 @@ _bt_metapinit(Relation rel, bool markvalid)
10292
WriteBuffer(buf);
10393
}
10494

95+
/*
96+
* _bt_initmetapage() -- Fill a page buffer with a correct metapage image
97+
*/
98+
void
99+
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
100+
{
101+
BTMetaPageData *metad;
102+
BTPageOpaque metaopaque;
103+
104+
_bt_pageinit(page, BLCKSZ);
105+
106+
metad = BTPageGetMeta(page);
107+
metad->btm_magic = BTREE_MAGIC;
108+
metad->btm_version = BTREE_VERSION;
109+
metad->btm_root = rootbknum;
110+
metad->btm_level = level;
111+
metad->btm_fastroot = rootbknum;
112+
metad->btm_fastlevel = level;
113+
114+
metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
115+
metaopaque->btpo_flags = BTP_META;
116+
}
117+
105118
/*
106119
* _bt_getroot() -- Get the root page of the btree.
107120
*
@@ -609,76 +622,6 @@ _bt_page_recyclable(Page page)
609622
return false;
610623
}
611624

612-
/*
613-
* _bt_metaproot() -- Change the root page of the btree.
614-
*
615-
* Lehman and Yao require that the root page move around in order to
616-
* guarantee deadlock-free short-term, fine-granularity locking. When
617-
* we split the root page, we record the new parent in the metadata page
618-
* for the relation. This routine does the work.
619-
*
620-
* No direct preconditions, but if you don't have the write lock on
621-
* at least the old root page when you call this, you're making a big
622-
* mistake. On exit, metapage data is correct and we no longer have
623-
* a pin or lock on the metapage.
624-
*
625-
* Actually this is not used for splitting on-the-fly anymore. It's only used
626-
* in nbtsort.c at the completion of btree building, where we know we have
627-
* sole access to the index anyway.
628-
*/
629-
void
630-
_bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
631-
{
632-
Buffer metabuf;
633-
Page metap;
634-
BTPageOpaque metaopaque;
635-
BTMetaPageData *metad;
636-
637-
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
638-
metap = BufferGetPage(metabuf);
639-
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
640-
Assert(metaopaque->btpo_flags & BTP_META);
641-
642-
/* NO ELOG(ERROR) from here till newmeta op is logged */
643-
START_CRIT_SECTION();
644-
645-
metad = BTPageGetMeta(metap);
646-
Assert(metad->btm_magic == BTREE_MAGIC || metad->btm_magic == 0);
647-
metad->btm_magic = BTREE_MAGIC; /* it's valid now for sure */
648-
metad->btm_root = rootbknum;
649-
metad->btm_level = level;
650-
metad->btm_fastroot = rootbknum;
651-
metad->btm_fastlevel = level;
652-
653-
/* XLOG stuff */
654-
if (!rel->rd_istemp)
655-
{
656-
xl_btree_newmeta xlrec;
657-
XLogRecPtr recptr;
658-
XLogRecData rdata[1];
659-
660-
xlrec.node = rel->rd_node;
661-
xlrec.meta.root = metad->btm_root;
662-
xlrec.meta.level = metad->btm_level;
663-
xlrec.meta.fastroot = metad->btm_fastroot;
664-
xlrec.meta.fastlevel = metad->btm_fastlevel;
665-
666-
rdata[0].buffer = InvalidBuffer;
667-
rdata[0].data = (char *) &xlrec;
668-
rdata[0].len = SizeOfBtreeNewmeta;
669-
rdata[0].next = NULL;
670-
671-
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata);
672-
673-
PageSetLSN(metap, recptr);
674-
PageSetSUI(metap, ThisStartUpID);
675-
}
676-
677-
END_CRIT_SECTION();
678-
679-
_bt_wrtbuf(rel, metabuf);
680-
}
681-
682625
/*
683626
* Delete item(s) from a btree page.
684627
*

src/backend/access/nbtree/nbtree.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Portions Copyright (c) 1994, Regents of the University of California
1313
*
1414
* IDENTIFICATION
15-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.116 2004/05/31 19:24:04 tgl Exp $
15+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.117 2004/06/02 17:28:17 tgl Exp $
1616
*
1717
*-------------------------------------------------------------------------
1818
*/
@@ -112,10 +112,6 @@ btbuild(PG_FUNCTION_ARGS)
112112
elog(ERROR, "index \"%s\" already contains data",
113113
RelationGetRelationName(index));
114114

115-
/* initialize the btree index metadata page */
116-
/* mark it valid right away only if using slow build */
117-
_bt_metapinit(index, !buildstate.usefast);
118-
119115
if (buildstate.usefast)
120116
{
121117
buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);
@@ -127,6 +123,11 @@ btbuild(PG_FUNCTION_ARGS)
127123
if (indexInfo->ii_Unique)
128124
buildstate.spool2 = _bt_spoolinit(index, false, true);
129125
}
126+
else
127+
{
128+
/* if using slow build, initialize the btree index metadata page */
129+
_bt_metapinit(index);
130+
}
130131

131132
/* do the heap scan */
132133
reltuples = IndexBuildHeapScan(heap, index, indexInfo,

0 commit comments

Comments
 (0)