Commit 44ca402

Partition the freelist for shared dynahash tables.
Without this, contention on the freelist can become a pretty serious problem on large servers.

Aleksander Alekseev, reviewed by Anastasia Lubennikova, Dilip Kumar, and me.
1 parent ea4b8bd commit 44ca402
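The technique, in isolation: instead of one freelist guarded by one spinlock, keep NUM_FREELISTS freelists, each with its own lock and entry count, and choose one by the entry's hash value. Below is a minimal standalone sketch of that idea, not the commit's code: pthread_mutex_t stands in for PostgreSQL's slock_t, and FreeList, freelists_init, freelist_get and freelist_put are illustrative names.

#include <pthread.h>
#include <stdint.h>

#define NUM_FREELISTS 32

typedef struct Element
{
    struct Element *link;           /* next free element */
} Element;

/*
 * One partition.  Keeping the mutex, the count, and the list head in a
 * single struct (rather than in three parallel arrays) limits false
 * sharing of cache lines between partitions.
 */
typedef struct
{
    pthread_mutex_t mutex;
    long            nentries;       /* elements handed out from here */
    Element        *head;           /* freelist head */
} FreeList;

static FreeList freelists[NUM_FREELISTS];

/* Counterpart of init_htab(): one mutex per partition. */
static void
freelists_init(void)
{
    for (int i = 0; i < NUM_FREELISTS; i++)
        pthread_mutex_init(&freelists[i].mutex, NULL);
}

/* Low-order bits of the hash value select the partition. */
#define FREELIST_IDX(hashcode) ((hashcode) % NUM_FREELISTS)

/* Take an element off the partition the hash value maps to. */
static Element *
freelist_get(uint32_t hashcode)
{
    FreeList   *fl = &freelists[FREELIST_IDX(hashcode)];
    Element    *e;

    pthread_mutex_lock(&fl->mutex);
    e = fl->head;
    if (e != NULL)
    {
        fl->head = e->link;
        fl->nentries++;             /* element is now in use */
    }
    pthread_mutex_unlock(&fl->mutex);
    return e;                       /* NULL: this partition is empty */
}

/* Return an element to the partition its hash value maps to. */
static void
freelist_put(uint32_t hashcode, Element *e)
{
    FreeList   *fl = &freelists[FREELIST_IDX(hashcode)];

    pthread_mutex_lock(&fl->mutex);
    e->link = fl->head;
    fl->head = e;
    fl->nentries--;
    pthread_mutex_unlock(&fl->mutex);
}

Two backends whose keys hash to different partitions now take different locks; with a single shared freelist they would serialize on the same spinlock no matter which keys they touch.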

1 file changed: +137 −47 lines

src/backend/utils/hash/dynahash.c

Lines changed: 137 additions & 47 deletions
@@ -15,7 +15,7 @@
  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition.  We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -111,13 +111,27 @@
 #define DEF_DIRSIZE            256
 #define DEF_FFACTOR            1    /* default fill factor */
 
+/* Number of freelists to be used for a partitioned hash table. */
+#define NUM_FREELISTS          32
 
 /* A hash bucket is a linked list of HASHELEMENTs */
 typedef HASHELEMENT *HASHBUCKET;
 
 /* A hash segment is an array of bucket headers */
 typedef HASHBUCKET *HASHSEGMENT;
 
+/*
+ * Using an array of FreeListData, instead of separate arrays of mutexes,
+ * nentries and freeLists, at least partially prevents sharing one cache
+ * line between different mutexes (see below).
+ */
+typedef struct
+{
+    slock_t     mutex;          /* spinlock */
+    long        nentries;       /* number of entries */
+    HASHELEMENT *freeList;      /* list of free elements */
+} FreeListData;
+
 /*
  * Header structure for a hash table --- contains all changeable info
  *
@@ -128,12 +142,15 @@ typedef HASHBUCKET *HASHSEGMENT;
  */
 struct HASHHDR
 {
-    /* In a partitioned table, take this lock to touch nentries or freeList */
-    slock_t     mutex;          /* unused if not partitioned table */
-
-    /* These fields change during entry addition/deletion */
-    long        nentries;       /* number of entries in hash table */
-    HASHELEMENT *freeList;      /* linked list of free elements */
+    /*
+     * The freelist can become a point of contention on high-concurrency
+     * hash tables, so we use an array of freelists, each with its own
+     * mutex and nentries count, instead of just a single one.
+     *
+     * If the hash table is not partitioned, only freeList[0] is used and
+     * the spinlocks are not used at all.
+     */
+    FreeListData freeList[NUM_FREELISTS];
 
     /* These fields can change, but not in a partitioned table */
     /* Also, dsize can't change in a shared table, even if unpartitioned */
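Grouping each mutex with its own nentries and freeList only reduces false sharing, it does not eliminate it: at roughly 24 bytes per FreeListData on a typical 64-bit build, two or three adjacent slots still fit in one 64-byte cache line. A stricter, purely hypothetical variant (not part of this commit) would pad each slot to a full line:

/*
 * Hypothetical, NOT in this commit: pad each freelist slot to an assumed
 * 64-byte cache line so no two partitions ever share one, trading memory
 * for isolation.  Requires sizeof(FreeListData) <= 64.
 */
typedef union
{
    FreeListData data;
    char         pad[64];
} PaddedFreeListData;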
@@ -166,6 +183,9 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
+#define FREELIST_IDX(hctl, hashcode) \
+    (IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
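A quick worked example of the new macro: with NUM_FREELISTS = 32, a hash value of 1000003 selects freelist 1000003 % 32 = 3. The index depends only on the entry's hash value, so the insertion and the eventual deletion of an entry, possibly performed by different backends, always touch the same freelist and the same nentries counter.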
@@ -219,10 +239,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int  choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -482,10 +502,40 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
     if ((flags & HASH_SHARED_MEM) ||
         nelem < hctl->nelem_alloc)
     {
-        if (!element_alloc(hashp, (int) nelem))
-            ereport(ERROR,
-                    (errcode(ERRCODE_OUT_OF_MEMORY),
-                     errmsg("out of memory")));
+        int         i,
+                    freelist_partitions,
+                    nelem_alloc,
+                    nelem_alloc_first;
+
+        /*
+         * If the hash table is partitioned, all freelists get an equal
+         * number of elements.  Otherwise only freeList[0] is used.
+         */
+        if (IS_PARTITIONED(hashp->hctl))
+            freelist_partitions = NUM_FREELISTS;
+        else
+            freelist_partitions = 1;
+
+        nelem_alloc = nelem / freelist_partitions;
+        if (nelem_alloc == 0)
+            nelem_alloc = 1;
+
+        /* Make sure all memory will be used */
+        if (nelem_alloc * freelist_partitions < nelem)
+            nelem_alloc_first =
+                nelem - nelem_alloc * (freelist_partitions - 1);
+        else
+            nelem_alloc_first = nelem_alloc;
+
+        for (i = 0; i < freelist_partitions; i++)
+        {
+            int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+            if (!element_alloc(hashp, temp, i))
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+        }
     }
 
     if (flags & HASH_FIXED_SIZE)
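To make the split concrete: for a partitioned table created with nelem = 1000, nelem_alloc = 1000 / 32 = 31. Since 31 * 32 = 992 < 1000, the first freelist is seeded with 1000 - 31 * 31 = 39 elements and the remaining 31 freelists with 31 each, so 39 + 961 = 1000 and integer rounding loses none of the requested elements.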
@@ -503,9 +553,6 @@ hdefault(HTAB *hashp)
 
     MemSet(hctl, 0, sizeof(HASHHDR));
 
-    hctl->nentries = 0;
-    hctl->freeList = NULL;
-
     hctl->dsize = DEF_DIRSIZE;
     hctl->nsegs = 0;
 
@@ -572,12 +619,14 @@ init_htab(HTAB *hashp, long nelem)
     HASHSEGMENT *segp;
     int         nbuckets;
     int         nsegs;
+    int         i;
 
     /*
      * initialize mutex if it's a partitioned table
      */
     if (IS_PARTITIONED(hctl))
-        SpinLockInit(&hctl->mutex);
+        for (i = 0; i < NUM_FREELISTS; i++)
+            SpinLockInit(&(hctl->freeList[i].mutex));
 
     /*
      * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +697,7 @@ init_htab(HTAB *hashp, long nelem)
             "HIGH MASK       ", hctl->high_mask,
             "LOW  MASK       ", hctl->low_mask,
             "NSEGS           ", hctl->nsegs,
-            "NENTRIES        ", hctl->nentries);
+            "NENTRIES        ", hash_get_num_entries(hctl));
 #endif
     return true;
 }
@@ -769,7 +818,7 @@ hash_stats(const char *where, HTAB *hashp)
             where, hashp->hctl->accesses, hashp->hctl->collisions);
 
     fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-            hashp->hctl->nentries, (long) hashp->hctl->keysize,
+            hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
             hashp->hctl->max_bucket, hashp->hctl->nsegs);
     fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
             where, hash_accesses, hash_collisions);
@@ -863,6 +912,7 @@ hash_search_with_hash_value(HTAB *hashp,
     HASHBUCKET  currBucket;
     HASHBUCKET *prevBucketPtr;
     HashCompareFunc match;
+    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
     hash_accesses++;
@@ -885,7 +935,7 @@ hash_search_with_hash_value(HTAB *hashp,
          * order of these tests is to try to check cheaper conditions first.
          */
         if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-            hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+            hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
             !has_seq_scans(hashp))
             (void) expand_table(hashp);
     }
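Reading a bare hctl->freeList[0].nentries is safe here because this branch is reached only when !IS_PARTITIONED(hctl) holds: an unpartitioned table keeps all of its entries on freeList[0], so that single counter is the exact total and no spinlock is needed.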
@@ -943,20 +993,20 @@ hash_search_with_hash_value(HTAB *hashp,
         {
             /* if partitioned, must lock to touch nentries and freeList */
             if (IS_PARTITIONED(hctl))
-                SpinLockAcquire(&hctl->mutex);
+                SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
 
-            Assert(hctl->nentries > 0);
-            hctl->nentries--;
+            Assert(hctl->freeList[freelist_idx].nentries > 0);
+            hctl->freeList[freelist_idx].nentries--;
 
             /* remove record from hash bucket's chain. */
             *prevBucketPtr = currBucket->link;
 
             /* add the record to the freelist for this table.  */
-            currBucket->link = hctl->freeList;
-            hctl->freeList = currBucket;
+            currBucket->link = hctl->freeList[freelist_idx].freeList;
+            hctl->freeList[freelist_idx].freeList = currBucket;
 
             if (IS_PARTITIONED(hctl))
-                SpinLockRelease(&hctl->mutex);
+                SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
             /*
              * better hope the caller is synchronizing access to this
@@ -982,7 +1032,7 @@ hash_search_with_hash_value(HTAB *hashp,
                 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                      hashp->tabname);
 
-            currBucket = get_hash_entry(hashp);
+            currBucket = get_hash_entry(hashp, freelist_idx);
             if (currBucket == NULL)
             {
                 /* out of memory */
@@ -1175,39 +1225,69 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int freelist_idx)
 {
-    HASHHDR *hctl = hashp->hctl;
+    HASHHDR    *hctl = hashp->hctl;
     HASHBUCKET  newElement;
+    int         borrow_from_idx;
 
     for (;;)
     {
         /* if partitioned, must lock to touch nentries and freeList */
         if (IS_PARTITIONED(hctl))
-            SpinLockAcquire(&hctl->mutex);
+            SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
         /* try to get an entry from the freelist */
-        newElement = hctl->freeList;
+        newElement = hctl->freeList[freelist_idx].freeList;
+
         if (newElement != NULL)
             break;
 
-        /* no free elements.  allocate another chunk of buckets */
         if (IS_PARTITIONED(hctl))
-            SpinLockRelease(&hctl->mutex);
+            SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-        if (!element_alloc(hashp, hctl->nelem_alloc))
+        /* no free elements.  allocate another chunk of buckets */
+        if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
         {
-            /* out of memory */
-            return NULL;
+            if (!IS_PARTITIONED(hctl))
+                return NULL;    /* out of memory */
+
+            /* try to borrow an element from another partition */
+            borrow_from_idx = freelist_idx;
+            for (;;)
+            {
+                borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
+                if (borrow_from_idx == freelist_idx)
+                    break;
+
+                SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
+                newElement = hctl->freeList[borrow_from_idx].freeList;
+
+                if (newElement != NULL)
+                {
+                    hctl->freeList[borrow_from_idx].freeList = newElement->link;
+                    SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+
+                    SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
+                    hctl->freeList[freelist_idx].nentries++;
+                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+                    break;
+                }
+
+                SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+            }
+
+            return newElement;
         }
     }
 
     /* remove entry from freelist, bump nentries */
-    hctl->freeList = newElement->link;
-    hctl->nentries++;
+    hctl->freeList[freelist_idx].freeList = newElement->link;
+    hctl->freeList[freelist_idx].nentries++;
 
     if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
     return newElement;
 }
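Note the accounting in the borrowing path: the element comes off another partition's freeList, but nentries is incremented on the requester's freelist. That stays consistent because freelist_idx is derived from the entry's hash value, so the eventual delete in hash_search_with_hash_value decrements the same counter, and hash_get_num_entries only ever reports the sum across all freelists.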
@@ -1218,11 +1298,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+    int         i;
+    long        sum = hashp->hctl->freeList[0].nentries;
+
     /*
      * We currently don't bother with the mutex; it's only sensible to call
      * this function if you've got lock on all partitions of the table.
      */
-    return hashp->hctl->nentries;
+
+    if (!IS_PARTITIONED(hashp->hctl))
+        return sum;
+
+    for (i = 1; i < NUM_FREELISTS; i++)
+        sum += hashp->hctl->freeList[i].nentries;
+
+    return sum;
 }
 
 /*
@@ -1527,12 +1617,12 @@ seg_alloc(HTAB *hashp)
 }
 
 /*
- * allocate some new elements and link them into the free list
+ * allocate some new elements and link them into the indicated free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 {
-    HASHHDR *hctl = hashp->hctl;
+    HASHHDR    *hctl = hashp->hctl;
     Size        elementSize;
     HASHELEMENT *firstElement;
     HASHELEMENT *tmpElement;
@@ -1563,14 +1653,14 @@ element_alloc(HTAB *hashp, int nelem)
 
     /* if partitioned, must lock to touch freeList */
     if (IS_PARTITIONED(hctl))
-        SpinLockAcquire(&hctl->mutex);
+        SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
     /* freelist could be nonempty if two backends did this concurrently */
-    firstElement->link = hctl->freeList;
-    hctl->freeList = prevElement;
+    firstElement->link = hctl->freeList[freelist_idx].freeList;
+    hctl->freeList[freelist_idx].freeList = prevElement;
 
     if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
     return true;
 }
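The "freelist could be nonempty" comment covers a benign race: two backends can both find the same freelist empty and both call element_alloc() for the same freelist_idx. Each links its entire chunk onto the list head under the spinlock, so both chunks survive and the freelist simply grows by two chunks.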
