Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 2ac3ef7

Browse files
committed
Fix tuple routing in cases where tuple descriptors don't match.
The previous coding failed to work correctly when we have a multi-level partitioned hierarchy where tables at successive levels have different attribute numbers for the partition key attributes. To fix, have each PartitionDispatch object store a standalone TupleTableSlot initialized with the TupleDesc of the corresponding partitioned table, along with a TupleConversionMap to map tuples from the its parent's rowtype to own rowtype. After tuple routing chooses a leaf partition, we must use the leaf partition's tuple descriptor, not the root table's. To that end, a dedicated TupleTableSlot for tuple routing is now allocated in EState. Amit Langote
1 parent 12bd7dd commit 2ac3ef7

File tree

7 files changed

+190
-15
lines changed

7 files changed

+190
-15
lines changed

src/backend/catalog/partition.c

Lines changed: 61 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
923923
return generate_partition_qual(rel, recurse);
924924
}
925925

926-
/* Turn an array of OIDs with N elements into a list */
927-
#define OID_ARRAY_TO_LIST(arr, N, list) \
926+
/*
927+
* Append OIDs of rel's partitions to the list 'partoids' and for each OID,
928+
* append pointer rel to the list 'parents'.
929+
*/
930+
#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
928931
do\
929932
{\
930933
int i;\
931-
for (i = 0; i < (N); i++)\
932-
(list) = lappend_oid((list), (arr)[i]);\
934+
for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
935+
{\
936+
(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
937+
(parents) = lappend((parents), (rel));\
938+
}\
933939
} while(0)
934940

935941
/*
@@ -944,11 +950,13 @@ PartitionDispatch *
944950
RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
945951
int *num_parted, List **leaf_part_oids)
946952
{
947-
PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
948953
PartitionDispatchData **pd;
949954
List *all_parts = NIL,
950-
*parted_rels;
951-
ListCell *lc;
955+
*all_parents = NIL,
956+
*parted_rels,
957+
*parted_rel_parents;
958+
ListCell *lc1,
959+
*lc2;
952960
int i,
953961
k,
954962
offset;
@@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
965973
*/
966974
*num_parted = 1;
967975
parted_rels = list_make1(rel);
968-
OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
969-
foreach(lc, all_parts)
976+
/* Root partitioned table has no parent, so NULL for parent */
977+
parted_rel_parents = list_make1(NULL);
978+
APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
979+
forboth(lc1, all_parts, lc2, all_parents)
970980
{
971-
Relation partrel = heap_open(lfirst_oid(lc), lockmode);
981+
Relation partrel = heap_open(lfirst_oid(lc1), lockmode);
982+
Relation parent = lfirst(lc2);
972983
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
973984

974985
/*
@@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
979990
{
980991
(*num_parted)++;
981992
parted_rels = lappend(parted_rels, partrel);
982-
OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
993+
parted_rel_parents = lappend(parted_rel_parents, parent);
994+
APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
983995
}
984996
else
985997
heap_close(partrel, NoLock);
@@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
10041016
sizeof(PartitionDispatchData *));
10051017
*leaf_part_oids = NIL;
10061018
i = k = offset = 0;
1007-
foreach(lc, parted_rels)
1019+
forboth(lc1, parted_rels, lc2, parted_rel_parents)
10081020
{
1009-
Relation partrel = lfirst(lc);
1021+
Relation partrel = lfirst(lc1);
1022+
Relation parent = lfirst(lc2);
10101023
PartitionKey partkey = RelationGetPartitionKey(partrel);
1024+
TupleDesc tupdesc = RelationGetDescr(partrel);
10111025
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
10121026
int j,
10131027
m;
@@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
10171031
pd[i]->key = partkey;
10181032
pd[i]->keystate = NIL;
10191033
pd[i]->partdesc = partdesc;
1034+
if (parent != NULL)
1035+
{
1036+
/*
1037+
* For every partitioned table other than root, we must store
1038+
* a tuple table slot initialized with its tuple descriptor and
1039+
* a tuple conversion map to convert a tuple from its parent's
1040+
* rowtype to its own. That is to make sure that we are looking
1041+
* at the correct row using the correct tuple descriptor when
1042+
* computing its partition key for tuple routing.
1043+
*/
1044+
pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
1045+
pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
1046+
tupdesc,
1047+
gettext_noop("could not convert row type"));
1048+
}
1049+
else
1050+
{
1051+
/* Not required for the root partitioned table */
1052+
pd[i]->tupslot = NULL;
1053+
pd[i]->tupmap = NULL;
1054+
}
10201055
pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
10211056

10221057
/*
@@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
16101645
{
16111646
PartitionKey key = parent->key;
16121647
PartitionDesc partdesc = parent->partdesc;
1648+
TupleTableSlot *myslot = parent->tupslot;
1649+
TupleConversionMap *map = parent->tupmap;
16131650

16141651
/* Quick exit */
16151652
if (partdesc->nparts == 0)
@@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
16181655
return -1;
16191656
}
16201657

1658+
if (myslot != NULL)
1659+
{
1660+
HeapTuple tuple = ExecFetchSlotTuple(slot);
1661+
1662+
ExecClearTuple(myslot);
1663+
Assert(map != NULL);
1664+
tuple = do_convert_tuple(tuple, map);
1665+
ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
1666+
slot = myslot;
1667+
}
1668+
16211669
/* Extract partition key from tuple */
16221670
FormPartitionKeyDatum(parent, slot, estate, values, isnull);
16231671

src/backend/commands/copy.c

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,6 +2435,15 @@ CopyFrom(CopyState cstate)
24352435
/* Triggers might need a slot as well */
24362436
estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
24372437

2438+
/*
2439+
* Initialize a dedicated slot to manipulate tuples of any given
2440+
* partition's rowtype.
2441+
*/
2442+
if (cstate->partition_dispatch_info)
2443+
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
2444+
else
2445+
estate->es_partition_tuple_slot = NULL;
2446+
24382447
/*
24392448
* It's more efficient to prepare a bunch of tuples for insertion, and
24402449
* insert them in one heap_multi_insert() call, than call heap_insert()
@@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
24842493

24852494
for (;;)
24862495
{
2487-
TupleTableSlot *slot;
2496+
TupleTableSlot *slot,
2497+
*oldslot = NULL;
24882498
bool skip_tuple;
24892499
Oid loaded_oid = InvalidOid;
24902500

@@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
25712581
map = cstate->partition_tupconv_maps[leaf_part_index];
25722582
if (map)
25732583
{
2584+
Relation partrel = resultRelInfo->ri_RelationDesc;
2585+
25742586
tuple = do_convert_tuple(tuple, map);
2587+
2588+
/*
2589+
* We must use the partition's tuple descriptor from this
2590+
* point on. Use a dedicated slot from this point on until
2591+
* we're finished dealing with the partition.
2592+
*/
2593+
oldslot = slot;
2594+
slot = estate->es_partition_tuple_slot;
2595+
Assert(slot != NULL);
2596+
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
25752597
ExecStoreTuple(tuple, slot, InvalidBuffer, true);
25762598
}
25772599

@@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
26672689
{
26682690
resultRelInfo = saved_resultRelInfo;
26692691
estate->es_result_relation_info = resultRelInfo;
2692+
2693+
/* Switch back to the slot corresponding to the root table */
2694+
Assert(oldslot != NULL);
2695+
slot = oldslot;
26702696
}
26712697
}
26722698
}
@@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
27142740
* Remember cstate->partition_dispatch_info[0] corresponds to the root
27152741
* partitioned table, which we must not try to close, because it is
27162742
* the main target table of COPY that will be closed eventually by
2717-
* DoCopy().
2743+
* DoCopy(). Also, tupslot is NULL for the root partitioned table.
27182744
*/
27192745
for (i = 1; i < cstate->num_dispatch; i++)
27202746
{
27212747
PartitionDispatch pd = cstate->partition_dispatch_info[i];
27222748

27232749
heap_close(pd->reldesc, NoLock);
2750+
ExecDropSingleTupleTableSlot(pd->tupslot);
27242751
}
27252752
for (i = 0; i < cstate->num_partitions; i++)
27262753
{

src/backend/executor/nodeModifyTable.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
262262
Relation resultRelationDesc;
263263
Oid newId;
264264
List *recheckIndexes = NIL;
265+
TupleTableSlot *oldslot = NULL;
265266

266267
/*
267268
* get the heap tuple out of the tuple table slot, making sure we have a
@@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
318319
map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
319320
if (map)
320321
{
322+
Relation partrel = resultRelInfo->ri_RelationDesc;
323+
321324
tuple = do_convert_tuple(tuple, map);
325+
326+
/*
327+
* We must use the partition's tuple descriptor from this
328+
* point on, until we're finished dealing with the partition.
329+
* Use the dedicated slot for that.
330+
*/
331+
oldslot = slot;
332+
slot = estate->es_partition_tuple_slot;
333+
Assert(slot != NULL);
334+
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
322335
ExecStoreTuple(tuple, slot, InvalidBuffer, true);
323336
}
324337
}
@@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
566579
{
567580
resultRelInfo = saved_resultRelInfo;
568581
estate->es_result_relation_info = resultRelInfo;
582+
583+
/* Switch back to the slot corresponding to the root table */
584+
Assert(oldslot != NULL);
585+
slot = oldslot;
569586
}
570587

571588
/*
@@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
17341751
mtstate->mt_partitions = partitions;
17351752
mtstate->mt_num_partitions = num_partitions;
17361753
mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
1754+
1755+
/*
1756+
* Initialize a dedicated slot to manipulate tuples of any given
1757+
* partition's rowtype.
1758+
*/
1759+
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
17371760
}
1761+
else
1762+
estate->es_partition_tuple_slot = NULL;
17381763

17391764
/*
17401765
* Initialize any WITH CHECK OPTION constraints if needed.
@@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
20582083
* Remember node->mt_partition_dispatch_info[0] corresponds to the root
20592084
* partitioned table, which we must not try to close, because it is the
20602085
* main target table of the query that will be closed by ExecEndPlan().
2086+
* Also, tupslot is NULL for the root partitioned table.
20612087
*/
20622088
for (i = 1; i < node->mt_num_dispatch; i++)
20632089
{
20642090
PartitionDispatch pd = node->mt_partition_dispatch_info[i];
20652091

20662092
heap_close(pd->reldesc, NoLock);
2093+
ExecDropSingleTupleTableSlot(pd->tupslot);
20672094
}
20682095
for (i = 0; i < node->mt_num_partitions; i++)
20692096
{

src/include/catalog/partition.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
4747
* key Partition key information of the table
4848
* keystate Execution state required for expressions in the partition key
4949
* partdesc Partition descriptor of the table
50+
* tupslot A standalone TupleTableSlot initialized with this table's tuple
51+
* descriptor
52+
* tupmap TupleConversionMap to convert from the parent's rowtype to
53+
* this table's rowtype (when extracting the partition key of a
54+
* tuple just before routing it through this table)
5055
* indexes Array with partdesc->nparts members (for details on what
5156
* individual members represent, see how they are set in
5257
* RelationGetPartitionDispatchInfo())
@@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
5863
PartitionKey key;
5964
List *keystate; /* list of ExprState */
6065
PartitionDesc partdesc;
66+
TupleTableSlot *tupslot;
67+
TupleConversionMap *tupmap;
6168
int *indexes;
6269
} PartitionDispatchData;
6370

src/include/nodes/execnodes.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,9 @@ typedef struct EState
384384
TupleTableSlot *es_trig_oldtup_slot; /* for TriggerEnabled */
385385
TupleTableSlot *es_trig_newtup_slot; /* for TriggerEnabled */
386386

387+
/* Slot used to manipulate a tuple after it is routed to a partition */
388+
TupleTableSlot *es_partition_tuple_slot;
389+
387390
/* Parameter info: */
388391
ParamListInfo es_param_list_info; /* values of external params */
389392
ParamExecData *es_param_exec_vals; /* values of internal params */

src/test/regress/expected/insert.out

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,3 +300,40 @@ drop cascades to table part_null
300300
drop cascades to table part_ee_ff
301301
drop cascades to table part_ee_ff1
302302
drop cascades to table part_ee_ff2
303+
-- more tests for certain multi-level partitioning scenarios
304+
create table p (a int, b int) partition by range (a, b);
305+
create table p1 (b int, a int not null) partition by range (b);
306+
create table p11 (like p1);
307+
alter table p11 drop a;
308+
alter table p11 add a int;
309+
alter table p11 drop a;
310+
alter table p11 add a int not null;
311+
-- attnum for key attribute 'a' is different in p, p1, and p11
312+
select attrelid::regclass, attname, attnum
313+
from pg_attribute
314+
where attname = 'a'
315+
and (attrelid = 'p'::regclass
316+
or attrelid = 'p1'::regclass
317+
or attrelid = 'p11'::regclass);
318+
attrelid | attname | attnum
319+
----------+---------+--------
320+
p | a | 1
321+
p1 | a | 2
322+
p11 | a | 4
323+
(3 rows)
324+
325+
alter table p1 attach partition p11 for values from (2) to (5);
326+
alter table p attach partition p1 for values from (1, 2) to (1, 10);
327+
-- check that "(1, 2)" is correctly routed to p11.
328+
insert into p values (1, 2);
329+
select tableoid::regclass, * from p;
330+
tableoid | a | b
331+
----------+---+---
332+
p11 | 1 | 2
333+
(1 row)
334+
335+
-- cleanup
336+
drop table p cascade;
337+
NOTICE: drop cascades to 2 other objects
338+
DETAIL: drop cascades to table p1
339+
drop cascades to table p11

src/test/regress/sql/insert.sql

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
170170
-- cleanup
171171
drop table range_parted cascade;
172172
drop table list_parted cascade;
173+
174+
-- more tests for certain multi-level partitioning scenarios
175+
create table p (a int, b int) partition by range (a, b);
176+
create table p1 (b int, a int not null) partition by range (b);
177+
create table p11 (like p1);
178+
alter table p11 drop a;
179+
alter table p11 add a int;
180+
alter table p11 drop a;
181+
alter table p11 add a int not null;
182+
-- attnum for key attribute 'a' is different in p, p1, and p11
183+
select attrelid::regclass, attname, attnum
184+
from pg_attribute
185+
where attname = 'a'
186+
and (attrelid = 'p'::regclass
187+
or attrelid = 'p1'::regclass
188+
or attrelid = 'p11'::regclass);
189+
190+
alter table p1 attach partition p11 for values from (2) to (5);
191+
alter table p attach partition p1 for values from (1, 2) to (1, 10);
192+
193+
-- check that "(1, 2)" is correctly routed to p11.
194+
insert into p values (1, 2);
195+
select tableoid::regclass, * from p;
196+
197+
-- cleanup
198+
drop table p cascade;

0 commit comments

Comments
 (0)