Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 3bbd6af

Browse files
committed
Adjust btbulkdelete logic so that only one WAL record is issued while
deleting multiple index entries on a single index page. This makes for a very substantial reduction in the amount of WAL traffic during a large delete operation.
1 parent 13dadef commit 3bbd6af

File tree

4 files changed

+135
-107
lines changed

4 files changed

+135
-107
lines changed

src/backend/access/nbtree/nbtpage.c

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.61 2003/02/23 06:17:13 tgl Exp $
12+
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.62 2003/02/23 22:43:08 tgl Exp $
1313
*
1414
* NOTES
1515
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -618,26 +618,34 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
618618
}
619619

620620
/*
621-
* Delete an item from a btree page.
621+
* Delete item(s) from a btree page.
622622
*
623623
* This must only be used for deleting leaf items. Deleting an item on a
624624
* non-leaf page has to be done as part of an atomic action that includes
625625
* deleting the page it points to.
626626
*
627627
* This routine assumes that the caller has pinned and locked the buffer,
628-
* and will write the buffer afterwards.
628+
* and will write the buffer afterwards. Also, the given itemnos *must*
629+
* appear in increasing order in the array.
629630
*/
630631
void
631-
_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
632+
_bt_delitems(Relation rel, Buffer buf,
633+
OffsetNumber *itemnos, int nitems)
632634
{
633635
Page page = BufferGetPage(buf);
634-
OffsetNumber offno;
635-
636-
offno = ItemPointerGetOffsetNumber(tid);
636+
int i;
637637

638+
/* No elog(ERROR) until changes are logged */
638639
START_CRIT_SECTION();
639640

640-
PageIndexTupleDelete(page, offno);
641+
/*
642+
* Delete the items in reverse order so we don't have to think about
643+
* adjusting item numbers for previous deletions.
644+
*/
645+
for (i = nitems - 1; i >= 0; i--)
646+
{
647+
PageIndexTupleDelete(page, itemnos[i]);
648+
}
641649

642650
/* XLOG stuff */
643651
if (!rel->rd_istemp)
@@ -646,17 +654,30 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
646654
XLogRecPtr recptr;
647655
XLogRecData rdata[2];
648656

649-
xlrec.target.node = rel->rd_node;
650-
xlrec.target.tid = *tid;
657+
xlrec.node = rel->rd_node;
658+
xlrec.block = BufferGetBlockNumber(buf);
651659

652660
rdata[0].buffer = InvalidBuffer;
653661
rdata[0].data = (char *) &xlrec;
654662
rdata[0].len = SizeOfBtreeDelete;
655663
rdata[0].next = &(rdata[1]);
656664

665+
/*
666+
* The target-offsets array is not in the buffer, but pretend
667+
* that it is. When XLogInsert stores the whole buffer, the offsets
668+
* array need not be stored too.
669+
*/
657670
rdata[1].buffer = buf;
658-
rdata[1].data = NULL;
659-
rdata[1].len = 0;
671+
if (nitems > 0)
672+
{
673+
rdata[1].data = (char *) itemnos;
674+
rdata[1].len = nitems * sizeof(OffsetNumber);
675+
}
676+
else
677+
{
678+
rdata[1].data = NULL;
679+
rdata[1].len = 0;
680+
}
660681
rdata[1].next = NULL;
661682

662683
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);

src/backend/access/nbtree/nbtree.c

Lines changed: 72 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Portions Copyright (c) 1994, Regents of the University of California
1313
*
1414
* IDENTIFICATION
15-
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/23 06:17:13 tgl Exp $
15+
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/23 22:43:08 tgl Exp $
1616
*
1717
*-------------------------------------------------------------------------
1818
*/
@@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS)
572572
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
573573
void *callback_state = (void *) PG_GETARG_POINTER(2);
574574
IndexBulkDeleteResult *result;
575-
BlockNumber num_pages;
576575
double tuples_removed;
577576
double num_index_tuples;
578-
IndexScanDesc scan;
579-
BTScanOpaque so;
580-
ItemPointer current;
577+
OffsetNumber deletable[BLCKSZ / sizeof(OffsetNumber)];
578+
int ndeletable;
579+
Buffer buf;
580+
BlockNumber num_pages;
581581

582582
tuples_removed = 0;
583583
num_index_tuples = 0;
584584

585585
/*
586-
* We use a standard IndexScanDesc scan object, but to speed up the
587-
* loop, we skip most of the wrapper layers of index_getnext and
588-
* instead call _bt_step directly. This implies holding buffer lock
589-
* on a target page throughout the loop over the page's tuples.
590-
*
591-
* Whenever we step onto a new page, we have to trade in the read
592-
* lock acquired by _bt_first or _bt_step for an exclusive write lock
593-
* (fortunately, _bt_relbuf doesn't care which kind of lock it's
594-
* releasing when it comes time for _bt_step to release our lock).
586+
* The outer loop iterates over index leaf pages, the inner over items
587+
* on a leaf page. We issue just one _bt_delitems() call per page,
588+
* so as to minimize WAL traffic.
595589
*
596-
* Note that we exclusive-lock every leaf page, or at least every one
597-
* containing data items. It sounds attractive to only exclusive-lock
590+
* Note that we exclusive-lock every leaf page containing data items,
591+
* in sequence left to right. It sounds attractive to only exclusive-lock
598592
* those containing items we need to delete, but unfortunately that
599593
* is not safe: we could then pass a stopped indexscan, which could
600594
* in rare cases lead to deleting the item it needs to find when it
601595
* resumes. (See _bt_restscan --- this could only happen if an indexscan
602596
* stops on a deletable item and then a page split moves that item
603597
* into a page further to its right, which the indexscan will have no
604-
* pin on.)
598+
* pin on.) We can skip obtaining exclusive lock on empty pages
599+
* though, since no indexscan could be stopped on those.
605600
*/
606-
scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
607-
so = (BTScanOpaque) scan->opaque;
608-
current = &(scan->currentItemData);
609-
610-
/* Use _bt_first to get started, then _bt_step to remaining tuples */
611-
if (_bt_first(scan, ForwardScanDirection))
601+
buf = _bt_get_endpoint(rel, 0, false);
602+
if (BufferIsValid(buf)) /* check for empty index */
612603
{
613-
Buffer buf;
614-
BlockNumber lockedBlock = InvalidBlockNumber;
615-
616-
/* we have the buffer pinned and read-locked */
617-
buf = so->btso_curbuf;
618-
Assert(BufferIsValid(buf));
619-
620-
do
604+
for (;;)
621605
{
622606
Page page;
623-
BlockNumber blkno;
624-
OffsetNumber offnum;
625-
BTItem btitem;
626607
BTPageOpaque opaque;
627-
IndexTuple itup;
628-
ItemPointer htup;
608+
OffsetNumber offnum,
609+
minoff,
610+
maxoff;
611+
BlockNumber nextpage;
629612

630613
CHECK_FOR_INTERRUPTS();
631614

632-
/* current is the next index tuple */
615+
ndeletable = 0;
633616
page = BufferGetPage(buf);
634-
blkno = ItemPointerGetBlockNumber(current);
635-
636-
/*
637-
* Make sure we have a super-exclusive write lock on this page.
638-
*
639-
* We assume that only concurrent insertions, not deletions,
640-
* can occur while we're not holding the page lock (the
641-
* caller should hold a suitable relation lock to ensure
642-
* this). Therefore, no items can escape being scanned because
643-
* of this temporary lock release.
644-
*/
645-
if (blkno != lockedBlock)
617+
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
618+
minoff = P_FIRSTDATAKEY(opaque);
619+
maxoff = PageGetMaxOffsetNumber(page);
620+
/* We probably cannot see deleted pages, but skip 'em if so */
621+
if (minoff <= maxoff && !P_ISDELETED(opaque))
646622
{
623+
/*
624+
* Trade in the initial read lock for a super-exclusive
625+
* write lock on this page.
626+
*/
647627
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
648628
LockBufferForCleanup(buf);
649-
lockedBlock = blkno;
650629
/*
651-
* If the page was formerly rightmost but was split while we
652-
* didn't hold the lock, and ip_posid is pointing to item
653-
* 1, then ip_posid now points at the high key not a valid
654-
* data item. In this case we need to step forward.
630+
* Recompute minoff/maxoff, both of which could have changed
631+
* while we weren't holding the lock.
655632
*/
656-
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
657-
if (current->ip_posid < P_FIRSTDATAKEY(opaque))
658-
current->ip_posid = P_FIRSTDATAKEY(opaque);
659-
}
660-
661-
offnum = ItemPointerGetOffsetNumber(current);
662-
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
663-
itup = &btitem->bti_itup;
664-
htup = &(itup->t_tid);
665-
666-
if (callback(htup, callback_state))
667-
{
668-
/* Okay to delete the item from the page */
669-
_bt_itemdel(rel, buf, current);
670-
671-
/* Mark buffer dirty, but keep the lock and pin */
672-
WriteNoReleaseBuffer(buf);
673-
674-
tuples_removed += 1;
675-
633+
minoff = P_FIRSTDATAKEY(opaque);
634+
maxoff = PageGetMaxOffsetNumber(page);
676635
/*
677-
* We now need to back up the scan one item, so that the next
678-
* cycle will re-examine the same offnum on this page (which
679-
* now holds the next item).
636+
* Scan over all items to see which ones need deleted
637+
* according to the callback function.
680638
*/
681-
current->ip_posid--;
639+
for (offnum = minoff;
640+
offnum <= maxoff;
641+
offnum = OffsetNumberNext(offnum))
642+
{
643+
BTItem btitem;
644+
ItemPointer htup;
645+
646+
btitem = (BTItem) PageGetItem(page,
647+
PageGetItemId(page, offnum));
648+
htup = &(btitem->bti_itup.t_tid);
649+
if (callback(htup, callback_state))
650+
{
651+
deletable[ndeletable++] = offnum;
652+
tuples_removed += 1;
653+
}
654+
else
655+
num_index_tuples += 1;
656+
}
657+
}
658+
/*
659+
* If we need to delete anything, do it and write the buffer;
660+
* else just release the buffer.
661+
*/
662+
nextpage = opaque->btpo_next;
663+
if (ndeletable > 0)
664+
{
665+
_bt_delitems(rel, buf, deletable, ndeletable);
666+
_bt_wrtbuf(rel, buf);
682667
}
683668
else
684-
num_index_tuples += 1;
685-
} while (_bt_step(scan, &buf, ForwardScanDirection));
669+
{
670+
_bt_relbuf(rel, buf);
671+
}
672+
/* And advance to next page, if any */
673+
if (nextpage == P_NONE)
674+
break;
675+
buf = _bt_getbuf(rel, nextpage, BT_READ);
676+
}
686677
}
687678

688-
index_endscan(scan);
689-
690679
/* return statistics */
691680
num_pages = RelationGetNumberOfBlocks(rel);
692681

@@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
765754
}
766755
}
767756
else if ((opaque->btpo_flags & BTP_HALF_DEAD) ||
768-
P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
757+
P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
769758
{
770759
/* Empty, try to delete */
771760
int ndel;

src/backend/access/nbtree/nbtxlog.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.2 2003/02/23 06:17:13 tgl Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.3 2003/02/23 22:43:08 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -379,11 +379,10 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
379379
return;
380380

381381
xlrec = (xl_btree_delete *) XLogRecGetData(record);
382-
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
382+
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
383383
if (!RelationIsValid(reln))
384384
return;
385-
buffer = XLogReadBuffer(false, reln,
386-
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
385+
buffer = XLogReadBuffer(false, reln, xlrec->block);
387386
if (!BufferIsValid(buffer))
388387
elog(PANIC, "btree_delete_redo: block unfound");
389388
page = (Page) BufferGetPage(buffer);
@@ -396,7 +395,21 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
396395
return;
397396
}
398397

399-
PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
398+
if (record->xl_len > SizeOfBtreeDelete)
399+
{
400+
OffsetNumber *unused;
401+
OffsetNumber *unend;
402+
403+
unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
404+
unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
405+
406+
/* be careful to delete from back to front */
407+
while (unused < unend)
408+
{
409+
unend--;
410+
PageIndexTupleDelete(page, *unend);
411+
}
412+
}
400413

401414
PageSetLSN(page, lsn);
402415
PageSetSUI(page, ThisStartUpID);
@@ -853,8 +866,8 @@ btree_desc(char *buf, uint8 xl_info, char *rec)
853866
{
854867
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
855868

856-
strcat(buf, "delete: ");
857-
out_target(buf, &(xlrec->target));
869+
sprintf(buf + strlen(buf), "delete: node %u/%u; blk %u",
870+
xlrec->node.tblNode, xlrec->node.relNode, xlrec->block);
858871
break;
859872
}
860873
case XLOG_BTREE_DELETE_PAGE:

src/include/access/nbtree.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $Id: nbtree.h,v 1.66 2003/02/23 06:17:13 tgl Exp $
10+
* $Id: nbtree.h,v 1.67 2003/02/23 22:43:09 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -263,14 +263,18 @@ typedef struct xl_btree_split
263263
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
264264

265265
/*
266-
* This is what we need to know about delete of an individual leaf btitem
266+
* This is what we need to know about delete of individual leaf btitems.
267+
* The WAL record can represent deletion of any number of btitems on a
268+
* single index page.
267269
*/
268270
typedef struct xl_btree_delete
269271
{
270-
xl_btreetid target; /* deleted tuple id */
272+
RelFileNode node;
273+
BlockNumber block;
274+
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
271275
} xl_btree_delete;
272276

273-
#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
277+
#define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
274278

275279
/*
276280
* This is what we need to know about deletion of a btree page. The target
@@ -453,7 +457,8 @@ extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
453457
extern void _bt_pageinit(Page page, Size size);
454458
extern bool _bt_page_recyclable(Page page);
455459
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
456-
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
460+
extern void _bt_delitems(Relation rel, Buffer buf,
461+
OffsetNumber *itemnos, int nitems);
457462
extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
458463

459464
/*

0 commit comments

Comments
 (0)