diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 48e3185b227d..de7227a60403 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -3701,33 +3701,30 @@ select count(t1.c3) from ft2 t1 left join ft2 t2 on (t1.c1 = random() * t2.c2); -- Subquery in FROM clause having aggregate explain (verbose, costs off) select count(*), x.b from ft1, (select c2 a, sum(c1) b from ft1 group by c2) x where ft1.c2 = x.a group by x.b order by 1, 2; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------- Sort Output: (count(*)), (sum(ft1_1.c1)) Sort Key: (count(*)), (sum(ft1_1.c1)) - -> Finalize GroupAggregate + -> Finalize IndexAggregate Output: count(*), (sum(ft1_1.c1)) Group Key: (sum(ft1_1.c1)) - -> Sort + -> Hash Join Output: (sum(ft1_1.c1)), (PARTIAL count(*)) - Sort Key: (sum(ft1_1.c1)) - -> Hash Join - Output: (sum(ft1_1.c1)), (PARTIAL count(*)) - Hash Cond: (ft1_1.c2 = ft1.c2) - -> Foreign Scan - Output: ft1_1.c2, (sum(ft1_1.c1)) - Relations: Aggregate on (public.ft1 ft1_1) - Remote SQL: SELECT c2, sum("C 1") FROM "S 1"."T 1" GROUP BY 1 - -> Hash - Output: ft1.c2, (PARTIAL count(*)) - -> Partial HashAggregate - Output: ft1.c2, PARTIAL count(*) - Group Key: ft1.c2 - -> Foreign Scan on public.ft1 - Output: ft1.c2 - Remote SQL: SELECT c2 FROM "S 1"."T 1" -(24 rows) + Hash Cond: (ft1_1.c2 = ft1.c2) + -> Foreign Scan + Output: ft1_1.c2, (sum(ft1_1.c1)) + Relations: Aggregate on (public.ft1 ft1_1) + Remote SQL: SELECT c2, sum("C 1") FROM "S 1"."T 1" GROUP BY 1 + -> Hash + Output: ft1.c2, (PARTIAL count(*)) + -> Partial HashAggregate + Output: ft1.c2, PARTIAL count(*) + Group Key: ft1.c2 + -> Foreign Scan on public.ft1 + Output: ft1.c2 + Remote SQL: SELECT c2 FROM "S 1"."T 1" +(21 rows) select count(*), x.b from ft1, 
(select c2 a, sum(c1) b from ft1 group by c2) x where ft1.c2 = x.a group by x.b order by 1, 2; count | b diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 5a6390631eba..9e16c547b068 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -134,7 +134,7 @@ static void show_recursive_union_info(RecursiveUnionState *rstate, ExplainState *es); static void show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es); -static void show_hashagg_info(AggState *aggstate, ExplainState *es); +static void show_agg_spill_info(AggState *aggstate, ExplainState *es); static void show_indexsearches_info(PlanState *planstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); @@ -1556,6 +1556,10 @@ ExplainNode(PlanState *planstate, List *ancestors, pname = "MixedAggregate"; strategy = "Mixed"; break; + case AGG_INDEX: + pname = "IndexAggregate"; + strategy = "Indexed"; + break; default: pname = "Aggregate ???"; strategy = "???"; @@ -2200,7 +2204,7 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_Agg: show_agg_keys(castNode(AggState, planstate), ancestors, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); - show_hashagg_info((AggState *) planstate, es); + show_agg_spill_info((AggState *) planstate, es); if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); @@ -2631,6 +2635,24 @@ show_agg_keys(AggState *astate, List *ancestors, if (plan->groupingSets) show_grouping_sets(outerPlanState(astate), plan, ancestors, es); + else if (plan->aggstrategy == AGG_INDEX) + { + Sort *sort = astate->index_sort; + + /* + * Index Agg reorders GROUP BY keys to match ORDER BY + * so they must be the same, but we should show other + * useful information about used ordering, such as direction. 
+ */ + Assert(sort != NULL); + show_sort_group_keys(outerPlanState(astate), "Group Key", + plan->numCols, 0, + sort->sortColIdx, + sort->sortOperators, + sort->collations, + sort->nullsFirst, + ancestors, es); + } else show_sort_group_keys(outerPlanState(astate), "Group Key", plan->numCols, 0, plan->grpColIdx, @@ -3735,47 +3757,67 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) } /* - * Show information on hash aggregate memory usage and batches. + * Show information on hash or index aggregate memory usage and batches. */ static void -show_hashagg_info(AggState *aggstate, ExplainState *es) +show_agg_spill_info(AggState *aggstate, ExplainState *es) { Agg *agg = (Agg *) aggstate->ss.ps.plan; - int64 memPeakKb = BYTES_TO_KILOBYTES(aggstate->hash_mem_peak); + int64 memPeakKb = BYTES_TO_KILOBYTES(aggstate->spill_mem_peak); if (agg->aggstrategy != AGG_HASHED && - agg->aggstrategy != AGG_MIXED) + agg->aggstrategy != AGG_MIXED && + agg->aggstrategy != AGG_INDEX) return; if (es->format != EXPLAIN_FORMAT_TEXT) { if (es->costs) ExplainPropertyInteger("Planned Partitions", NULL, - aggstate->hash_planned_partitions, es); + aggstate->spill_planned_partitions, es); /* * During parallel query the leader may have not helped out. We * detect this by checking how much memory it used. If we find it * didn't do any work then we don't show its properties. 
*/ - if (es->analyze && aggstate->hash_mem_peak > 0) + if (es->analyze && aggstate->spill_mem_peak > 0) { ExplainPropertyInteger("HashAgg Batches", NULL, - aggstate->hash_batches_used, es); + aggstate->spill_batches_used, es); ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); ExplainPropertyInteger("Disk Usage", "kB", - aggstate->hash_disk_used, es); + aggstate->spill_disk_used, es); + } + + if ( es->analyze + && aggstate->aggstrategy == AGG_INDEX + && aggstate->mergestate != NULL) + { + TuplesortInstrumentation stats; + const char *mergeMethod; + const char *spaceType; + int64 spaceUsed; + + tuplesort_get_stats(aggstate->mergestate, &stats); + mergeMethod = tuplesort_method_name(stats.sortMethod); + spaceType = tuplesort_space_type_name(stats.spaceType); + spaceUsed = stats.spaceUsed; + + ExplainPropertyText("Merge Method", mergeMethod, es); + ExplainPropertyInteger("Merge Space Used", "kB", spaceUsed, es); + ExplainPropertyText("Merge Space Type", spaceType, es); } } else { bool gotone = false; - if (es->costs && aggstate->hash_planned_partitions > 0) + if (es->costs && aggstate->spill_planned_partitions > 0) { ExplainIndentText(es); appendStringInfo(es->str, "Planned Partitions: %d", - aggstate->hash_planned_partitions); + aggstate->spill_planned_partitions); gotone = true; } @@ -3784,7 +3826,7 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) * detect this by checking how much memory it used. If we find it * didn't do any work then we don't show its properties. 
*/ - if (es->analyze && aggstate->hash_mem_peak > 0) + if (es->analyze && aggstate->spill_mem_peak > 0) { if (!gotone) ExplainIndentText(es); @@ -3792,17 +3834,44 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) appendStringInfoSpaces(es->str, 2); appendStringInfo(es->str, "Batches: %d Memory Usage: " INT64_FORMAT "kB", - aggstate->hash_batches_used, memPeakKb); + aggstate->spill_batches_used, memPeakKb); gotone = true; /* Only display disk usage if we spilled to disk */ - if (aggstate->hash_batches_used > 1) + if (aggstate->spill_batches_used > 1) { appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT "kB", - aggstate->hash_disk_used); + aggstate->spill_disk_used); } } + /* For index aggregate show stats for final merging */ + if ( es->analyze + && aggstate->aggstrategy == AGG_INDEX + && aggstate->mergestate != NULL) + { + TuplesortInstrumentation stats; + const char *mergeMethod; + const char *spaceType; + int64 spaceUsed; + + tuplesort_get_stats(aggstate->mergestate, &stats); + mergeMethod = tuplesort_method_name(stats.sortMethod); + spaceType = tuplesort_space_type_name(stats.spaceType); + spaceUsed = stats.spaceUsed; + + /* + * If we are here that means that previous check (for mem peak) was + * successfull (can not directly go to merge without any in-memory + * operations). Do not check other state and just start a new line. 
+ */ + appendStringInfoChar(es->str, '\n'); + ExplainIndentText(es); + appendStringInfo(es->str, "Merge Method: %s %s: " INT64_FORMAT "kB", + mergeMethod, spaceType, spaceUsed); + gotone = true; + } + if (gotone) appendStringInfoChar(es->str, '\n'); } diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c index c35744b105e5..117d7ba31d0b 100644 --- a/src/backend/executor/execExpr.c +++ b/src/backend/executor/execExpr.c @@ -94,7 +94,7 @@ static void ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest, static void ExecBuildAggTransCall(ExprState *state, AggState *aggstate, ExprEvalStep *scratch, FunctionCallInfo fcinfo, AggStatePerTrans pertrans, - int transno, int setno, int setoff, bool ishash, + int transno, int setno, int setoff, int strategy, bool nullcheck); static void ExecInitJsonExpr(JsonExpr *jsexpr, ExprState *state, Datum *resv, bool *resnull, @@ -3667,7 +3667,7 @@ ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest, */ ExprState * ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase, - bool doSort, bool doHash, bool nullcheck) + int groupStrategy, bool nullcheck) { ExprState *state = makeNode(ExprState); PlanState *parent = &aggstate->ss.ps; @@ -3925,7 +3925,7 @@ ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase, * grouping set). Do so for both sort and hash based computations, as * applicable. 
*/ - if (doSort) + if (groupStrategy & GROUPING_STRATEGY_SORT) { int processGroupingSets = Max(phase->numsets, 1); int setoff = 0; @@ -3933,13 +3933,13 @@ ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase, for (int setno = 0; setno < processGroupingSets; setno++) { ExecBuildAggTransCall(state, aggstate, &scratch, trans_fcinfo, - pertrans, transno, setno, setoff, false, - nullcheck); + pertrans, transno, setno, setoff, + GROUPING_STRATEGY_SORT, nullcheck); setoff++; } } - if (doHash) + if (groupStrategy & GROUPING_STRATEGY_HASH) { int numHashes = aggstate->num_hashes; int setoff; @@ -3953,12 +3953,19 @@ ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase, for (int setno = 0; setno < numHashes; setno++) { ExecBuildAggTransCall(state, aggstate, &scratch, trans_fcinfo, - pertrans, transno, setno, setoff, true, - nullcheck); + pertrans, transno, setno, setoff, + GROUPING_STRATEGY_HASH, nullcheck); setoff++; } } + if (groupStrategy & GROUPING_STRATEGY_INDEX) + { + ExecBuildAggTransCall(state, aggstate, &scratch, trans_fcinfo, + pertrans, transno, 0, 0, + GROUPING_STRATEGY_INDEX, nullcheck); + } + /* adjust early bail out jump target(s) */ foreach(bail, adjust_bailout) { @@ -4011,16 +4018,18 @@ static void ExecBuildAggTransCall(ExprState *state, AggState *aggstate, ExprEvalStep *scratch, FunctionCallInfo fcinfo, AggStatePerTrans pertrans, - int transno, int setno, int setoff, bool ishash, + int transno, int setno, int setoff, int strategy, bool nullcheck) { ExprContext *aggcontext; int adjust_jumpnull = -1; - if (ishash) + if (strategy & GROUPING_STRATEGY_HASH) aggcontext = aggstate->hashcontext; - else + else if (strategy & GROUPING_STRATEGY_SORT) aggcontext = aggstate->aggcontexts[setno]; + else + aggcontext = aggstate->indexcontext; /* add check for NULL pointer? 
*/ if (nullcheck) diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index 8eb4c25e1cb0..c83a3f2223d3 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -622,3 +622,646 @@ TupleHashTableMatch(struct tuplehash_hash *tb, MinimalTuple tuple1, MinimalTuple econtext->ecxt_outertuple = slot1; return !ExecQualAndReset(hashtable->cur_eq_func, econtext); } + +/***************************************************************************** + * Utility routines for all-in-memory btree index + * + * These routines build btree index for grouping tuples together (eg, for + * index aggregation). There is one entry for each not-distinct set of tuples + * presented. + *****************************************************************************/ + +/* + * Representation of searched entry in tuple index. This have + * separate representation to avoid necessary memory allocations + * to create MinimalTuple for TupleIndexEntry. + */ +typedef struct TupleIndexSearchEntryData +{ + TupleTableSlot *slot; /* search TupleTableSlot */ + Datum key1; /* first searched key data */ + bool isnull1; /* first searched key is null */ +} TupleIndexSearchEntryData; + +typedef TupleIndexSearchEntryData *TupleIndexSearchEntry; + +/* + * compare_index_tuple_tiebreak + * Perform full comparison of tuples without key abbreviation. + * + * Invoked if first key (possibly abbreviated) can not decide comparison, so + * we have to compare all keys. 
+ */ +static inline int +compare_index_tuple_tiebreak(TupleIndex index, TupleIndexEntry left, + TupleIndexSearchEntry right) +{ + HeapTupleData ltup; + SortSupport sortKey = index->sortKeys; + TupleDesc tupDesc = index->tupDesc; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + int cmp; + + ltup.t_len = left->tuple->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) left->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = index->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = slot_getattr(right->slot, attno, &isnull2); + + cmp = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (cmp != 0) + return cmp; + } + + sortKey++; + for (int nkey = 1; nkey < index->nkeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = slot_getattr(right->slot, attno, &isnull2); + + cmp = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (cmp != 0) + return cmp; + } + + return 0; +} + +/* + * compare_index_tuple + * Compare pair of tuples during index lookup + * + * The comparison honors key abbreviation. + */ +static int +compare_index_tuple(TupleIndex index, + TupleIndexEntry left, + TupleIndexSearchEntry right) +{ + SortSupport sortKey = &index->sortKeys[0]; + int cmp = 0; + + cmp = ApplySortComparator(left->key1, left->isnull1, + right->key1, right->isnull1, + sortKey); + if (cmp != 0) + return cmp; + + return compare_index_tuple_tiebreak(index, left, right); +} + +/* + * tuple_index_node_bsearch + * Perform binary search in the index node. + * + * On return, if 'found' is set to 'true', then exact match found and returned + * index is an index in tuples array. 
Otherwise the value handled differently: + * - for internal nodes this is an index in 'pointers' array which to follow + * - for leaf nodes this is an index to which new entry must be inserted. + */ +static int +tuple_index_node_bsearch(TupleIndex index, TupleIndexNode node, + TupleIndexSearchEntry search, bool *found) +{ + int low; + int high; + + low = 0; + high = node->ntuples; + *found = false; + + while (low < high) + { + OffsetNumber mid = (low + high) / 2; + TupleIndexEntry mid_entry = node->tuples[mid]; + int cmp; + + cmp = compare_index_tuple(index, mid_entry, search); + if (cmp == 0) + { + *found = true; + return mid; + } + + if (cmp < 0) + low = mid + 1; + else + high = mid; + } + + return low; +} + +static inline TupleIndexNode +IndexLeafNodeGetNext(TupleIndexNode node) +{ + return node->pointers[0]; +} + +static inline void +IndexLeafNodeSetNext(TupleIndexNode node, TupleIndexNode next) +{ + node->pointers[0] = next; +} + +#define SizeofTupleIndexInternalNode \ + (offsetof(TupleIndexNodeData, pointers) \ + + (TUPLE_INDEX_NODE_MAX_ENTRIES + 1) * sizeof(TupleIndexNode)) + +#define SizeofTupleIndexLeafNode \ + offsetof(TupleIndexNodeData, pointers) + sizeof(TupleIndexNode) + +static inline TupleIndexNode +AllocLeafIndexNode(TupleIndex index, TupleIndexNode next) +{ + TupleIndexNode leaf; + leaf = MemoryContextAllocZero(index->nodecxt, SizeofTupleIndexLeafNode); + IndexLeafNodeSetNext(leaf, next); + return leaf; +} + +static inline TupleIndexNode +AllocInternalIndexNode(TupleIndex index) +{ + return MemoryContextAllocZero(index->nodecxt, SizeofTupleIndexInternalNode); +} + +/* + * tuple_index_node_insert_at + * Insert new tuple in the node at specified index + * + * This function is inserted when new tuple must be inserted in the node (both + * leaf and internal). For internal nodes 'pointer' must be also specified. + * + * Node must have free space available. It's up to caller to check if node + * is full and needs splitting. 
For split use 'tuple_index_perform_insert_split'. + */ +static inline void +tuple_index_node_insert_at(TupleIndexNode node, bool is_leaf, int idx, + TupleIndexEntry entry, TupleIndexNode pointer) +{ + int move_count; + + Assert(node->ntuples < TUPLE_INDEX_NODE_MAX_ENTRIES); + Assert(0 <= idx && idx <= node->ntuples); + move_count = node->ntuples - idx; + + if (move_count > 0) + memmove(&node->tuples[idx + 1], &node->tuples[idx], + move_count * sizeof(TupleIndexEntry)); + + node->tuples[idx] = entry; + + if (!is_leaf) + { + Assert(pointer != NULL); + + if (move_count > 0) + memmove(&node->pointers[idx + 2], &node->pointers[idx + 1], + move_count * sizeof(TupleIndexNode)); + node->pointers[idx + 1] = pointer; + } + + node->ntuples++; +} + +/* + * Insert tuple to full node with page split. + * + * 'split_node_out' - new page containing nodes on right side + * 'split_tuple_out' - tuple, which sent to the parent node as new separator key + */ +static void +tuple_index_insert_split(TupleIndex index, TupleIndexNode node, bool is_leaf, + int insert_pos, TupleIndexNode *split_node_out, + TupleIndexEntry *split_entry_out) +{ + TupleIndexNode split; + int split_tuple_idx; + + Assert(node->ntuples == TUPLE_INDEX_NODE_MAX_ENTRIES); + + if (is_leaf) + { + /* + * Max amount of tuples is kept odd, so we need to decide at + * which index to perform page split. We know that split occurred + * during insert, so left less entries to the page at which + * insertion must occur. 
+ */ + if (TUPLE_INDEX_NODE_MAX_ENTRIES / 2 < insert_pos) + split_tuple_idx = TUPLE_INDEX_NODE_MAX_ENTRIES / 2 + 1; + else + split_tuple_idx = TUPLE_INDEX_NODE_MAX_ENTRIES / 2; + + split = AllocLeafIndexNode(index, IndexLeafNodeGetNext(node)); + split->ntuples = node->ntuples - split_tuple_idx; + node->ntuples = split_tuple_idx; + memcpy(&split->tuples[0], &node->tuples[node->ntuples], + sizeof(TupleIndexEntry) * split->ntuples); + IndexLeafNodeSetNext(node, split); + } + else + { + /* + * After split on internal node split tuple will be removed. + * Max amount of tuples is odd, so division by 2 will handle it. + */ + split_tuple_idx = TUPLE_INDEX_NODE_MAX_ENTRIES / 2; + split = AllocInternalIndexNode(index); + split->ntuples = split_tuple_idx; + node->ntuples = split_tuple_idx; + memcpy(&split->tuples[0], &node->tuples[split_tuple_idx + 1], + sizeof(TupleIndexEntry) * split->ntuples); + memcpy(&split->pointers[0], &node->pointers[split_tuple_idx + 1], + sizeof(TupleIndexNode) * (split->ntuples + 1)); + } + + *split_node_out = split; + *split_entry_out = node->tuples[split_tuple_idx]; +} + +static inline Datum +mintup_getattr(MinimalTuple tup, TupleDesc tupdesc, AttrNumber attnum, bool *isnull) +{ + HeapTupleData htup; + + htup.t_len = tup->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tup - MINIMAL_TUPLE_OFFSET); + + return heap_getattr(&htup, attnum, tupdesc, isnull); +} + +static TupleIndexEntry +tuple_index_node_lookup(TupleIndex index, + TupleIndexNode node, int level, + TupleIndexSearchEntry search, bool *is_new, + TupleIndexNode *split_node_out, + TupleIndexEntry *split_entry_out) +{ + TupleIndexEntry entry; + int idx; + bool found; + bool is_leaf; + + TupleIndexNode insert_pointer; + TupleIndexEntry insert_entry; + bool need_insert; + + Assert(level >= 0); + + idx = tuple_index_node_bsearch(index, node, search, &found); + if (found) + { + /* + * Both internal and leaf nodes store pointers to elements, so we can + * safely return 
exact match found at each level. + */ + if (is_new) + *is_new = false; + return node->tuples[idx]; + } + + is_leaf = level == 0; + if (is_leaf) + { + MemoryContext oldcxt; + + if (is_new == NULL) + return NULL; + + oldcxt = MemoryContextSwitchTo(index->tuplecxt); + + entry = palloc(sizeof(TupleIndexEntryData)); + entry->tuple = ExecCopySlotMinimalTupleExtra(search->slot, index->additionalsize); + + MemoryContextSwitchTo(oldcxt); + + /* + * key1 in search tuple stored in TableTupleSlot which have it's own + * lifetime, so we must not copy it. + * + * But if key abbreviation is in use than we should copy it from search + * tuple: this is safe (pass-by-value) and extra recalculation can + * spoil statistics calculation. + */ + if (index->sortKeys->abbrev_converter) + { + entry->isnull1 = search->isnull1; + entry->key1 = search->key1; + } + else + { + SortSupport sortKey = &index->sortKeys[0]; + entry->key1 = mintup_getattr(entry->tuple, index->tupDesc, + sortKey->ssup_attno, &entry->isnull1); + } + + index->ntuples++; + + *is_new = true; + need_insert = true; + insert_pointer = NULL; + insert_entry = entry; + } + else + { + TupleIndexNode child_split_node = NULL; + TupleIndexEntry child_split_entry; + + entry = tuple_index_node_lookup(index, node->pointers[idx], level - 1, + search, is_new, + &child_split_node, &child_split_entry); + if (entry == NULL) + return NULL; + + if (child_split_node != NULL) + { + need_insert = true; + insert_pointer = child_split_node; + insert_entry = child_split_entry; + } + else + need_insert = false; + } + + if (need_insert) + { + Assert(insert_entry != NULL); + + if (node->ntuples == TUPLE_INDEX_NODE_MAX_ENTRIES) + { + TupleIndexNode split_node; + TupleIndexEntry split_entry; + + tuple_index_insert_split(index, node, is_leaf, idx, + &split_node, &split_entry); + + /* adjust insertion index if tuple is inserted to the splitted page */ + if (node->ntuples < idx) + { + /* keep split tuple for leaf nodes and remove for internal */ + if 
(is_leaf) + idx -= node->ntuples; + else + idx -= node->ntuples + 1; + + node = split_node; + } + + *split_node_out = split_node; + *split_entry_out = split_entry; + } + + Assert(idx >= 0); + tuple_index_node_insert_at(node, is_leaf, idx, insert_entry, insert_pointer); + } + + return entry; +} + +static void +remove_index_abbreviations(TupleIndex index) +{ + TupleIndexIteratorData iter; + TupleIndexEntry entry; + SortSupport sortKey = &index->sortKeys[0]; + + sortKey->comparator = sortKey->abbrev_full_comparator; + sortKey->abbrev_converter = NULL; + sortKey->abbrev_abort = NULL; + sortKey->abbrev_full_comparator = NULL; + + /* now traverse all index entries and convert all existing keys */ + InitTupleIndexIterator(index, &iter); + while ((entry = TupleIndexIteratorNext(&iter)) != NULL) + entry->key1 = mintup_getattr(entry->tuple, index->tupDesc, + sortKey->ssup_attno, &entry->isnull1); +} + +static inline void +prepare_search_index_tuple(TupleIndex index, TupleTableSlot *slot, + TupleIndexSearchEntry entry) +{ + SortSupport sortKey; + + sortKey = &index->sortKeys[0]; + + entry->slot = slot; + entry->key1 = slot_getattr(slot, sortKey->ssup_attno, &entry->isnull1); + + /* NULL can not be abbreviated */ + if (entry->isnull1) + return; + + /* abbreviation is not used */ + if (!sortKey->abbrev_converter) + return; + + /* check if abbreviation should be removed */ + if (index->abbrevNext <= index->ntuples) + { + index->abbrevNext *= 2; + + if (sortKey->abbrev_abort(index->ntuples, sortKey)) + { + remove_index_abbreviations(index); + return; + } + } + + entry->key1 = sortKey->abbrev_converter(entry->key1, sortKey); +} + +TupleIndexEntry +TupleIndexLookup(TupleIndex index, TupleTableSlot *searchslot, bool *is_new) +{ + TupleIndexEntry entry; + TupleIndexSearchEntryData search_entry; + TupleIndexNode split_node = NULL; + TupleIndexEntry split_entry; + TupleIndexNode new_root; + + prepare_search_index_tuple(index, searchslot, &search_entry); + + entry = 
tuple_index_node_lookup(index, index->root, index->height, + &search_entry, is_new, &split_node, &split_entry); + + if (entry == NULL) + return NULL; + + if (split_node == NULL) + return entry; + + /* root split */ + new_root = AllocInternalIndexNode(index); + new_root->ntuples = 1; + new_root->tuples[0] = split_entry; + new_root->pointers[0] = index->root; + new_root->pointers[1] = split_node; + index->root = new_root; + index->height++; + + return entry; +} + +void +InitTupleIndexIterator(TupleIndex index, TupleIndexIterator iter) +{ + TupleIndexNode min_node; + int level; + + /* iterate to the left-most node */ + min_node = index->root; + level = index->height; + while (level-- > 0) + min_node = min_node->pointers[0]; + + iter->cur_leaf = min_node; + iter->cur_idx = 0; +} + +TupleIndexEntry +TupleIndexIteratorNext(TupleIndexIterator iter) +{ + TupleIndexNode leaf = iter->cur_leaf; + TupleIndexEntry tuple; + + if (leaf == NULL) + return NULL; + + /* this also handles single empty root node case */ + if (leaf->ntuples <= iter->cur_idx) + { + leaf = iter->cur_leaf = IndexLeafNodeGetNext(leaf); + if (leaf == NULL) + return NULL; + iter->cur_idx = 0; + } + + tuple = leaf->tuples[iter->cur_idx]; + iter->cur_idx++; + return tuple; +} + +/* + * Construct an empty TupleIndex + * + * inputDesc: tuple descriptor for input tuples + * nkeys: number of columns to be compared (length of next 4 arrays) + * attNums: attribute numbers used for grouping in sort order + * sortOperators: Oids of sort operator families used for comparisons + * sortCollations: collations used for comparisons + * nullsFirstFlags: strategy for handling NULL values + * additionalsize: size of data that may be stored along with the index entry + * used for storing per-trans information during aggregation + * metacxt: memory context for TupleIndex itself + * tuplecxt: memory context for storing MinimalTuples + * nodecxt: memory context for storing index nodes + */ +TupleIndex +BuildTupleIndex(TupleDesc 
inputDesc, + int nkeys, + AttrNumber *attNums, + Oid *sortOperators, + Oid *sortCollations, + bool *nullsFirstFlags, + Size additionalsize, + MemoryContext metacxt, + MemoryContext tuplecxt, + MemoryContext nodecxt) +{ + TupleIndex index; + MemoryContext oldcxt; + + Assert(nkeys > 0); + + additionalsize = MAXALIGN(additionalsize); + + oldcxt = MemoryContextSwitchTo(metacxt); + + index = (TupleIndex) palloc(sizeof(TupleIndexData)); + index->tuplecxt = tuplecxt; + index->nodecxt = nodecxt; + index->additionalsize = additionalsize; + index->tupDesc = CreateTupleDescCopy(inputDesc); + index->root = AllocLeafIndexNode(index, NULL); + index->ntuples = 0; + index->height = 0; + + index->nkeys = nkeys; + index->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (int i = 0; i < nkeys; ++i) + { + SortSupport sortKey = &index->sortKeys[i]; + + Assert(AttributeNumberIsValid(attNums[i])); + Assert(OidIsValid(sortOperators[i])); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* abbreviation applies only for the first key */ + sortKey->abbreviate = i == 0; + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* Update abbreviation information */ + if (index->sortKeys[0].abbrev_converter != NULL) + { + index->abbrevUsed = true; + index->abbrevNext = 10; + index->abbrevSortOp = sortOperators[0]; + } + else + index->abbrevUsed = false; + + MemoryContextSwitchTo(oldcxt); + return index; +} + +/* + * Resets contents of the index to be empty, preserving all the non-content + * state. + */ +void +ResetTupleIndex(TupleIndex index) +{ + SortSupport ssup; + + /* by this time indexcxt must be reset by the caller */ + index->root = AllocLeafIndexNode(index, NULL); + index->height = 0; + index->ntuples = 0; + + if (!index->abbrevUsed) + return; + + /* + * If key abbreviation is used then we must reset it's state. 
+ * All fields in SortSupport are already setup, but we should clean + * some fields to make it look just if we setup this for the first time. + */ + ssup = &index->sortKeys[0]; + ssup->comparator = NULL; + PrepareSortSupportFromOrderingOp(index->abbrevSortOp, ssup); +} + diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index a18556f62ecc..1284c928c50d 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -364,7 +364,7 @@ typedef struct FindColsContext Bitmapset *unaggregated; /* other column references */ } FindColsContext; -static void select_current_set(AggState *aggstate, int setno, bool is_hash); +static void select_current_set(AggState *aggstate, int setno, int strategy); static void initialize_phase(AggState *aggstate, int newphase); static TupleTableSlot *fetch_input_tuple(AggState *aggstate); static void initialize_aggregates(AggState *aggstate, @@ -403,8 +403,8 @@ static void find_cols(AggState *aggstate, Bitmapset **aggregated, static bool find_cols_walker(Node *node, FindColsContext *context); static void build_hash_tables(AggState *aggstate); static void build_hash_table(AggState *aggstate, int setno, double nbuckets); -static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, - bool nullcheck); +static void agg_recompile_expressions(AggState *aggstate, bool minslot, + bool nullcheck); static void hash_create_memory(AggState *aggstate); static double hash_choose_num_buckets(double hashentrysize, double ngroups, Size memory); @@ -431,13 +431,13 @@ static HashAggBatch *hashagg_batch_new(LogicalTape *input_tape, int setno, int64 input_tuples, double input_card, int used_bits); static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp); -static void hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, - int used_bits, double input_groups, - double hashentrysize); -static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, - TupleTableSlot 
*inputslot, uint32 hash); -static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, - int setno); +static void agg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, + int used_bits, double input_groups, + double hashentrysize); +static Size agg_spill_tuple(AggState *aggstate, HashAggSpill *spill, + TupleTableSlot *inputslot, uint32 hash); +static void agg_spill_finish(AggState *aggstate, HashAggSpill *spill, + int setno); static Datum GetAggInitVal(Datum textInitVal, Oid transtype); static void build_pertrans_for_aggref(AggStatePerTrans pertrans, AggState *aggstate, EState *estate, @@ -446,21 +446,27 @@ static void build_pertrans_for_aggref(AggStatePerTrans pertrans, Oid aggdeserialfn, Datum initValue, bool initValueIsNull, Oid *inputTypes, int numArguments); - +static void agg_fill_index(AggState *state); +static TupleTableSlot *agg_retrieve_index(AggState *state); +static void lookup_index_entries(AggState *state); +static void indexagg_finish_initial_spills(AggState *aggstate); +static void index_agg_enter_spill_mode(AggState *aggstate); /* * Select the current grouping set; affects current_set and * curaggcontext. */ static void -select_current_set(AggState *aggstate, int setno, bool is_hash) +select_current_set(AggState *aggstate, int setno, int strategy) { /* * When changing this, also adapt ExecAggPlainTransByVal() and * ExecAggPlainTransByRef(). 
*/ - if (is_hash) + if (strategy == GROUPING_STRATEGY_HASH) aggstate->curaggcontext = aggstate->hashcontext; + else if (strategy == GROUPING_STRATEGY_INDEX) + aggstate->curaggcontext = aggstate->indexcontext; else aggstate->curaggcontext = aggstate->aggcontexts[setno]; @@ -680,7 +686,7 @@ initialize_aggregates(AggState *aggstate, { AggStatePerGroup pergroup = pergroups[setno]; - select_current_set(aggstate, setno, false); + select_current_set(aggstate, setno, GROUPING_STRATEGY_SORT); for (transno = 0; transno < numTrans; transno++) { @@ -1478,7 +1484,7 @@ build_hash_tables(AggState *aggstate) continue; } - memory = aggstate->hash_mem_limit / aggstate->num_hashes; + memory = aggstate->spill_mem_limit / aggstate->num_hashes; /* choose reasonable number of buckets per hashtable */ nbuckets = hash_choose_num_buckets(aggstate->hashentrysize, @@ -1496,7 +1502,7 @@ build_hash_tables(AggState *aggstate) build_hash_table(aggstate, setno, nbuckets); } - aggstate->hash_ngroups_current = 0; + aggstate->spill_ngroups_current = 0; } /* @@ -1728,7 +1734,7 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace) } /* - * hashagg_recompile_expressions() + * agg_recompile_expressions() * * Identifies the right phase, compiles the right expression given the * arguments, and then sets phase->evalfunc to that expression. @@ -1746,34 +1752,47 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace) * expressions in the AggStatePerPhase, and reuse when appropriate. */ static void -hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck) +agg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck) { AggStatePerPhase phase; int i = minslot ? 1 : 0; int j = nullcheck ? 
1 : 0; Assert(aggstate->aggstrategy == AGG_HASHED || - aggstate->aggstrategy == AGG_MIXED); + aggstate->aggstrategy == AGG_MIXED || + aggstate->aggstrategy == AGG_INDEX); - if (aggstate->aggstrategy == AGG_HASHED) - phase = &aggstate->phases[0]; - else /* AGG_MIXED */ + if (aggstate->aggstrategy == AGG_MIXED) phase = &aggstate->phases[1]; + else /* AGG_HASHED or AGG_INDEX */ + phase = &aggstate->phases[0]; if (phase->evaltrans_cache[i][j] == NULL) { const TupleTableSlotOps *outerops = aggstate->ss.ps.outerops; bool outerfixed = aggstate->ss.ps.outeropsfixed; - bool dohash = true; - bool dosort = false; + int strategy = 0; - /* - * If minslot is true, that means we are processing a spilled batch - * (inside agg_refill_hash_table()), and we must not advance the - * sorted grouping sets. - */ - if (aggstate->aggstrategy == AGG_MIXED && !minslot) - dosort = true; + switch (aggstate->aggstrategy) + { + case AGG_MIXED: + /* + * If minslot is true, that means we are processing a spilled batch + * (inside agg_refill_hash_table()), and we must not advance the + * sorted grouping sets. + */ + if (!minslot) + strategy |= GROUPING_STRATEGY_SORT; + /* FALLTHROUGH */ + case AGG_HASHED: + strategy |= GROUPING_STRATEGY_HASH; + break; + case AGG_INDEX: + strategy |= GROUPING_STRATEGY_INDEX; + break; + default: + Assert(false); + } /* temporarily change the outerops while compiling the expression */ if (minslot) @@ -1783,8 +1802,7 @@ hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck) } phase->evaltrans_cache[i][j] = ExecBuildAggTrans(aggstate, phase, - dosort, dohash, - nullcheck); + strategy, nullcheck); /* change back */ aggstate->ss.ps.outerops = outerops; @@ -1803,9 +1821,9 @@ hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck) * substantially larger than the initial value. 
*/ void -hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits, - Size *mem_limit, uint64 *ngroups_limit, - int *num_partitions) +agg_set_limits(double hashentrysize, double input_groups, int used_bits, + Size *mem_limit, uint64 *ngroups_limit, + int *num_partitions) { int npartitions; Size partition_mem; @@ -1853,6 +1871,18 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits, *ngroups_limit = 1; } +static inline bool +agg_spill_required(AggState *aggstate, Size total_mem) +{ + /* + * Don't spill unless there's at least one group in the hash table so we + * can be sure to make progress even in edge cases. + */ + return aggstate->spill_ngroups_current > 0 && + (total_mem > aggstate->spill_mem_limit || + aggstate->spill_ngroups_current > aggstate->spill_ngroups_limit); +} + /* * hash_agg_check_limits * @@ -1863,7 +1893,6 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits, static void hash_agg_check_limits(AggState *aggstate) { - uint64 ngroups = aggstate->hash_ngroups_current; Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt, @@ -1874,7 +1903,7 @@ hash_agg_check_limits(AggState *aggstate) bool do_spill = false; #ifdef USE_INJECTION_POINTS - if (ngroups >= 1000) + if (aggstate->spill_ngroups_current >= 1000) { if (IS_INJECTION_POINT_ATTACHED("hash-aggregate-spill-1000")) { @@ -1888,9 +1917,7 @@ hash_agg_check_limits(AggState *aggstate) * Don't spill unless there's at least one group in the hash table so we * can be sure to make progress even in edge cases. 
*/ - if (aggstate->hash_ngroups_current > 0 && - (total_mem > aggstate->hash_mem_limit || - ngroups > aggstate->hash_ngroups_limit)) + if (agg_spill_required(aggstate, total_mem)) { do_spill = true; } @@ -1899,68 +1926,150 @@ hash_agg_check_limits(AggState *aggstate) hash_agg_enter_spill_mode(aggstate); } +static void +index_agg_check_limits(AggState *aggstate) +{ + Size meta_mem = MemoryContextMemAllocated(aggstate->index_metacxt, + true); + Size node_mem = MemoryContextMemAllocated(aggstate->index_nodecxt, + true); + Size entry_mem = MemoryContextMemAllocated(aggstate->index_entrycxt, + true); + Size tval_mem = MemoryContextMemAllocated(aggstate->indexcontext->ecxt_per_tuple_memory, + true); + Size total_mem = meta_mem + node_mem + entry_mem + tval_mem; + bool do_spill = false; + +#ifdef USE_INJECTION_POINTS + if (aggstate->spill_ngroups_current >= 1000) + { + if (IS_INJECTION_POINT_ATTACHED("index-aggregate-spill-1000")) + { + do_spill = true; + INJECTION_POINT_CACHED("index-aggregate-spill-1000", NULL); + } + } +#endif + + if (agg_spill_required(aggstate, total_mem)) + { + do_spill = true; + } + + if (do_spill) + index_agg_enter_spill_mode(aggstate); +} + /* * Enter "spill mode", meaning that no new groups are added to any of the hash * tables. Tuples that would create a new group are instead spilled, and * processed later. 
*/ -static void -hash_agg_enter_spill_mode(AggState *aggstate) +static inline void +agg_enter_spill_mode(AggState *aggstate, bool ishash) { - INJECTION_POINT("hash-aggregate-enter-spill-mode", NULL); - aggstate->hash_spill_mode = true; - hashagg_recompile_expressions(aggstate, aggstate->table_filled, true); - - if (!aggstate->hash_ever_spilled) + if (ishash) { - Assert(aggstate->hash_tapeset == NULL); - Assert(aggstate->hash_spills == NULL); - - aggstate->hash_ever_spilled = true; - - aggstate->hash_tapeset = LogicalTapeSetCreate(true, NULL, -1); + INJECTION_POINT("hash-aggregate-enter-spill-mode", NULL); + aggstate->spill_mode = true; + agg_recompile_expressions(aggstate, aggstate->table_filled, true); + } + else + { + INJECTION_POINT("index-aggregate-enter-spill-mode", NULL); + aggstate->spill_mode = true; + agg_recompile_expressions(aggstate, aggstate->index_filled, true); + } + + if (!aggstate->spill_ever_happened) + { + Assert(aggstate->spill_tapeset == NULL); + Assert(aggstate->spills == NULL); - aggstate->hash_spills = palloc_array(HashAggSpill, aggstate->num_hashes); + aggstate->spill_ever_happened = true; + aggstate->spill_tapeset = LogicalTapeSetCreate(true, NULL, -1); - for (int setno = 0; setno < aggstate->num_hashes; setno++) + if (ishash) { - AggStatePerHash perhash = &aggstate->perhash[setno]; - HashAggSpill *spill = &aggstate->hash_spills[setno]; - - hashagg_spill_init(spill, aggstate->hash_tapeset, 0, + aggstate->spills = palloc_array(HashAggSpill, aggstate->num_hashes); + + for (int setno = 0; setno < aggstate->num_hashes; setno++) + { + AggStatePerHash perhash = &aggstate->perhash[setno]; + HashAggSpill *spill = &aggstate->spills[setno]; + + agg_spill_init(spill, aggstate->spill_tapeset, 0, perhash->aggnode->numGroups, aggstate->hashentrysize); + } + } + else + { + aggstate->spills = palloc(sizeof(HashAggSpill)); + agg_spill_init(aggstate->spills, aggstate->spill_tapeset, 0, + aggstate->perindex->aggnode->numGroups, + aggstate->hashentrysize); } 
} } +static void +hash_agg_enter_spill_mode(AggState *aggstate) +{ + agg_enter_spill_mode(aggstate, true); +} + +static void +index_agg_enter_spill_mode(AggState *aggstate) +{ + agg_enter_spill_mode(aggstate, false); +} + /* * Update metrics after filling the hash table. * * If reading from the outer plan, from_tape should be false; if reading from * another tape, from_tape should be true. */ -static void -hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) +static inline void +agg_update_spill_metrics(AggState *aggstate, bool from_tape, int npartitions, bool ishash) { Size meta_mem; Size entry_mem; - Size hashkey_mem; + Size key_mem; Size buffer_mem; Size total_mem; if (aggstate->aggstrategy != AGG_MIXED && - aggstate->aggstrategy != AGG_HASHED) + aggstate->aggstrategy != AGG_HASHED && + aggstate->aggstrategy != AGG_INDEX) return; - /* memory for the hash table itself */ - meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - - /* memory for hash entries */ - entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt, true); - - /* memory for byref transition states */ - hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); + if (ishash) + { + /* memory for the hash table itself */ + meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); + + /* memory for hash entries */ + entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt, true); + + /* memory for byref transition states */ + key_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); + } + else + { + /* memory for the index itself */ + meta_mem = MemoryContextMemAllocated(aggstate->index_metacxt, true); + + /* memory for the index nodes */ + meta_mem += MemoryContextMemAllocated(aggstate->index_nodecxt, true); + + /* memory for index entries */ + entry_mem = MemoryContextMemAllocated(aggstate->index_entrycxt, true); + + /* memory for byref transition states */ + key_mem = 
MemoryContextMemAllocated(aggstate->indexcontext->ecxt_per_tuple_memory, true); + } /* memory for read/write tape buffers, if spilled */ buffer_mem = npartitions * HASHAGG_WRITE_BUFFER_SIZE; @@ -1968,28 +2077,49 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) buffer_mem += HASHAGG_READ_BUFFER_SIZE; /* update peak mem */ - total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem; - if (total_mem > aggstate->hash_mem_peak) - aggstate->hash_mem_peak = total_mem; + total_mem = meta_mem + entry_mem + key_mem + buffer_mem; + if (total_mem > aggstate->spill_mem_peak) + aggstate->spill_mem_peak = total_mem; /* update disk usage */ - if (aggstate->hash_tapeset != NULL) + if (aggstate->spill_tapeset != NULL) { - uint64 disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeset) * (BLCKSZ / 1024); + uint64 disk_used = LogicalTapeSetBlocks(aggstate->spill_tapeset) * (BLCKSZ / 1024); - if (aggstate->hash_disk_used < disk_used) - aggstate->hash_disk_used = disk_used; + if (aggstate->spill_disk_used < disk_used) + aggstate->spill_disk_used = disk_used; } /* update hashentrysize estimate based on contents */ - if (aggstate->hash_ngroups_current > 0) + if (aggstate->spill_ngroups_current > 0) { - aggstate->hashentrysize = - TupleHashEntrySize() + - (hashkey_mem / (double) aggstate->hash_ngroups_current); + if (ishash) + { + aggstate->hashentrysize = + TupleHashEntrySize() + + (key_mem / (double) aggstate->spill_ngroups_current); + } + else + { + /* index stores MinimalTuples directly without any wrapper */ + aggstate->hashentrysize = + (key_mem / (double) aggstate->spill_ngroups_current); + } } } +static void +hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) +{ + agg_update_spill_metrics(aggstate, from_tape, npartitions, true); +} + +static void +index_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) +{ + agg_update_spill_metrics(aggstate, from_tape, npartitions, false); +} + /* * Create memory 
contexts used for hash aggregation. */ @@ -2048,6 +2178,33 @@ hash_create_memory(AggState *aggstate) } +/* + * Create memory contexts used for index aggregation. + */ +static void +index_create_memory(AggState *aggstate) +{ + Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE; + + aggstate->indexcontext = CreateWorkExprContext(aggstate->ss.ps.state); + + aggstate->index_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, + "IndexAgg meta context", + ALLOCSET_DEFAULT_SIZES); + aggstate->index_nodecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt, + "IndexAgg node context", + ALLOCSET_SMALL_SIZES); + + maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16); + maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE); + maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE); + aggstate->index_entrycxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, + "IndexAgg table context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + maxBlockSize); +} + /* * Choose a reasonable number of buckets for the initial hash table size. */ @@ -2141,7 +2298,7 @@ initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable, AggStatePerGroup pergroup; int transno; - aggstate->hash_ngroups_current++; + aggstate->spill_ngroups_current++; hash_agg_check_limits(aggstate); /* no need to allocate or initialize per-group state */ @@ -2196,9 +2353,9 @@ lookup_hash_entries(AggState *aggstate) bool *p_isnew; /* if hash table already spilled, don't create new entries */ - p_isnew = aggstate->hash_spill_mode ? NULL : &isnew; + p_isnew = aggstate->spill_mode ? 
NULL : &isnew; - select_current_set(aggstate, setno, true); + select_current_set(aggstate, setno, GROUPING_STRATEGY_HASH); prepare_hash_slot(perhash, outerslot, hashslot); @@ -2214,15 +2371,15 @@ lookup_hash_entries(AggState *aggstate) } else { - HashAggSpill *spill = &aggstate->hash_spills[setno]; + HashAggSpill *spill = &aggstate->spills[setno]; TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple; if (spill->partitions == NULL) - hashagg_spill_init(spill, aggstate->hash_tapeset, 0, - perhash->aggnode->numGroups, - aggstate->hashentrysize); + agg_spill_init(spill, aggstate->spill_tapeset, 0, + perhash->aggnode->numGroups, + aggstate->hashentrysize); - hashagg_spill_tuple(aggstate, spill, slot, hash); + agg_spill_tuple(aggstate, spill, slot, hash); pergroup[setno] = NULL; } } @@ -2265,6 +2422,12 @@ ExecAgg(PlanState *pstate) case AGG_SORTED: result = agg_retrieve_direct(node); break; + case AGG_INDEX: + if (!node->index_filled) + agg_fill_index(node); + + result = agg_retrieve_index(node); + break; } if (!TupIsNull(result)) @@ -2381,7 +2544,7 @@ agg_retrieve_direct(AggState *aggstate) aggstate->table_filled = true; ResetTupleHashIterator(aggstate->perhash[0].hashtable, &aggstate->perhash[0].hashiter); - select_current_set(aggstate, 0, true); + select_current_set(aggstate, 0, GROUPING_STRATEGY_HASH); return agg_retrieve_hash_table(aggstate); } else @@ -2601,7 +2764,7 @@ agg_retrieve_direct(AggState *aggstate) prepare_projection_slot(aggstate, econtext->ecxt_outertuple, currentSet); - select_current_set(aggstate, currentSet, false); + select_current_set(aggstate, currentSet, GROUPING_STRATEGY_SORT); finalize_aggregates(aggstate, peragg, @@ -2683,19 +2846,19 @@ agg_refill_hash_table(AggState *aggstate) HashAggBatch *batch; AggStatePerHash perhash; HashAggSpill spill; - LogicalTapeSet *tapeset = aggstate->hash_tapeset; + LogicalTapeSet *tapeset = aggstate->spill_tapeset; bool spill_initialized = false; - if (aggstate->hash_batches == NIL) + if 
(aggstate->spill_batches == NIL) return false; /* hash_batches is a stack, with the top item at the end of the list */ - batch = llast(aggstate->hash_batches); - aggstate->hash_batches = list_delete_last(aggstate->hash_batches); + batch = llast(aggstate->spill_batches); + aggstate->spill_batches = list_delete_last(aggstate->spill_batches); - hash_agg_set_limits(aggstate->hashentrysize, batch->input_card, - batch->used_bits, &aggstate->hash_mem_limit, - &aggstate->hash_ngroups_limit, NULL); + agg_set_limits(aggstate->hashentrysize, batch->input_card, + batch->used_bits, &aggstate->spill_mem_limit, + &aggstate->spill_ngroups_limit, NULL); /* * Each batch only processes one grouping set; set the rest to NULL so @@ -2712,7 +2875,7 @@ agg_refill_hash_table(AggState *aggstate) for (int setno = 0; setno < aggstate->num_hashes; setno++) ResetTupleHashTable(aggstate->perhash[setno].hashtable); - aggstate->hash_ngroups_current = 0; + aggstate->spill_ngroups_current = 0; /* * In AGG_MIXED mode, hash aggregation happens in phase 1 and the output @@ -2726,7 +2889,7 @@ agg_refill_hash_table(AggState *aggstate) aggstate->phase = &aggstate->phases[aggstate->current_phase]; } - select_current_set(aggstate, batch->setno, true); + select_current_set(aggstate, batch->setno, GROUPING_STRATEGY_HASH); perhash = &aggstate->perhash[aggstate->current_set]; @@ -2737,19 +2900,19 @@ agg_refill_hash_table(AggState *aggstate) * We still need the NULL check, because we are only processing one * grouping set at a time and the rest will be NULL. 
*/ - hashagg_recompile_expressions(aggstate, true, true); + agg_recompile_expressions(aggstate, true, true); INJECTION_POINT("hash-aggregate-process-batch", NULL); for (;;) { - TupleTableSlot *spillslot = aggstate->hash_spill_rslot; + TupleTableSlot *spillslot = aggstate->spill_rslot; TupleTableSlot *hashslot = perhash->hashslot; TupleHashTable hashtable = perhash->hashtable; TupleHashEntry entry; MinimalTuple tuple; uint32 hash; bool isnew = false; - bool *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew; + bool *p_isnew = aggstate->spill_mode ? NULL : &isnew; CHECK_FOR_INTERRUPTS(); @@ -2782,11 +2945,11 @@ agg_refill_hash_table(AggState *aggstate) * that we don't assign tapes that will never be used. */ spill_initialized = true; - hashagg_spill_init(&spill, tapeset, batch->used_bits, - batch->input_card, aggstate->hashentrysize); + agg_spill_init(&spill, tapeset, batch->used_bits, + batch->input_card, aggstate->hashentrysize); } /* no memory for a new group, spill */ - hashagg_spill_tuple(aggstate, &spill, spillslot, hash); + agg_spill_tuple(aggstate, &spill, spillslot, hash); aggstate->hash_pergroup[batch->setno] = NULL; } @@ -2806,16 +2969,16 @@ agg_refill_hash_table(AggState *aggstate) if (spill_initialized) { - hashagg_spill_finish(aggstate, &spill, batch->setno); + agg_spill_finish(aggstate, &spill, batch->setno); hash_agg_update_metrics(aggstate, true, spill.npartitions); } else hash_agg_update_metrics(aggstate, true, 0); - aggstate->hash_spill_mode = false; + aggstate->spill_mode = false; /* prepare to walk the first hash table */ - select_current_set(aggstate, batch->setno, true); + select_current_set(aggstate, batch->setno, GROUPING_STRATEGY_HASH); ResetTupleHashIterator(aggstate->perhash[batch->setno].hashtable, &aggstate->perhash[batch->setno].hashiter); @@ -2975,14 +3138,14 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) } /* - * hashagg_spill_init + * agg_spill_init * * Called after we determined that spilling is necessary. 
Chooses the number * of partitions to create, and initializes them. */ static void -hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, int used_bits, - double input_groups, double hashentrysize) +agg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, int used_bits, + double input_groups, double hashentrysize) { int npartitions; int partition_bits; @@ -3018,14 +3181,13 @@ hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, int used_bits, } /* - * hashagg_spill_tuple + * agg_spill_tuple * - * No room for new groups in the hash table. Save for later in the appropriate - * partition. + * No room for new groups in memory. Save for later in the appropriate partition. */ static Size -hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, - TupleTableSlot *inputslot, uint32 hash) +agg_spill_tuple(AggState *aggstate, HashAggSpill *spill, + TupleTableSlot *inputslot, uint32 hash) { TupleTableSlot *spillslot; int partition; @@ -3039,7 +3201,7 @@ hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, /* spill only attributes that we actually need */ if (!aggstate->all_cols_needed) { - spillslot = aggstate->hash_spill_wslot; + spillslot = aggstate->spill_wslot; slot_getsomeattrs(inputslot, aggstate->max_colno_needed); ExecClearTuple(spillslot); for (int i = 0; i < spillslot->tts_tupleDescriptor->natts; i++) @@ -3167,14 +3329,14 @@ hashagg_finish_initial_spills(AggState *aggstate) int setno; int total_npartitions = 0; - if (aggstate->hash_spills != NULL) + if (aggstate->spills != NULL) { for (setno = 0; setno < aggstate->num_hashes; setno++) { - HashAggSpill *spill = &aggstate->hash_spills[setno]; + HashAggSpill *spill = &aggstate->spills[setno]; total_npartitions += spill->npartitions; - hashagg_spill_finish(aggstate, spill, setno); + agg_spill_finish(aggstate, spill, setno); } /* @@ -3182,21 +3344,21 @@ hashagg_finish_initial_spills(AggState *aggstate) * processing batches of spilled tuples. 
The initial spill structures * are no longer needed. */ - pfree(aggstate->hash_spills); - aggstate->hash_spills = NULL; + pfree(aggstate->spills); + aggstate->spills = NULL; } hash_agg_update_metrics(aggstate, false, total_npartitions); - aggstate->hash_spill_mode = false; + aggstate->spill_mode = false; } /* - * hashagg_spill_finish + * agg_spill_finish * * Transform spill partitions into new batches. */ static void -hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno) +agg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno) { int i; int used_bits = 32 - spill->shift; @@ -3223,8 +3385,8 @@ hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno) new_batch = hashagg_batch_new(tape, setno, spill->ntuples[i], cardinality, used_bits); - aggstate->hash_batches = lappend(aggstate->hash_batches, new_batch); - aggstate->hash_batches_used++; + aggstate->spill_batches = lappend(aggstate->spill_batches, new_batch); + aggstate->spill_batches_used++; } pfree(spill->ntuples); @@ -3239,33 +3401,668 @@ static void hashagg_reset_spill_state(AggState *aggstate) { /* free spills from initial pass */ - if (aggstate->hash_spills != NULL) + if (aggstate->spills != NULL) { int setno; for (setno = 0; setno < aggstate->num_hashes; setno++) { - HashAggSpill *spill = &aggstate->hash_spills[setno]; + HashAggSpill *spill = &aggstate->spills[setno]; pfree(spill->ntuples); pfree(spill->partitions); } - pfree(aggstate->hash_spills); - aggstate->hash_spills = NULL; + pfree(aggstate->spills); + aggstate->spills = NULL; + } + + /* free batches */ + list_free_deep(aggstate->spill_batches); + aggstate->spill_batches = NIL; + + /* close tape set */ + if (aggstate->spill_tapeset != NULL) + { + LogicalTapeSetClose(aggstate->spill_tapeset); + aggstate->spill_tapeset = NULL; + } +} +static void +agg_fill_index(AggState *aggstate) +{ + AggStatePerIndex perindex = aggstate->perindex; + ExprContext *tmpcontext = aggstate->tmpcontext; + + /* + * Process each 
outer-plan tuple, and then fetch the next one, until we + * exhaust the outer plan. + */ + for (;;) + { + TupleTableSlot *outerslot; + + outerslot = fetch_input_tuple(aggstate); + if (TupIsNull(outerslot)) + break; + + /* set up for lookup_index_entries and advance_aggregates */ + tmpcontext->ecxt_outertuple = outerslot; + + /* insert input tuple to index possibly spilling index to disk */ + lookup_index_entries(aggstate); + + /* Advance the aggregates (or combine functions) */ + advance_aggregates(aggstate); + + /* + * Reset per-input-tuple context after each tuple, but note that the + * hash lookups do this too + */ + ResetExprContext(aggstate->tmpcontext); + } + + /* + * Mark that index filled here, so during after recompilation + * expr will expect MinimalTuple instead of outer plan's one type. + */ + aggstate->index_filled = true; + + indexagg_finish_initial_spills(aggstate); + + /* + * This is useful only when there is no spill occurred and projecting + * occurs in memory, but still initialize it. + */ + select_current_set(aggstate, 0, GROUPING_STRATEGY_INDEX); + InitTupleIndexIterator(perindex->index, &perindex->iter); +} + +/* + * Extract the attributes that make up the grouping key into the + * indexslot. This is necessary to perform comparison in index. 
+ */ +static void +prepare_index_slot(AggStatePerIndex perindex, + TupleTableSlot *inputslot, + TupleTableSlot *indexslot) +{ + slot_getsomeattrs(inputslot, perindex->largestGrpColIdx); + ExecClearTuple(indexslot); + + for (int i = 0; i < perindex->numCols; ++i) + { + int varNumber = perindex->idxKeyColIdxInput[i] - 1; + indexslot->tts_values[i] = inputslot->tts_values[varNumber]; + indexslot->tts_isnull[i] = inputslot->tts_isnull[varNumber]; + } + ExecStoreVirtualTuple(indexslot); +} + +static void +indexagg_reset_spill_state(AggState *aggstate) +{ + /* free spills from initial pass */ + if (aggstate->spills != NULL) + { + HashAggSpill *spill = &aggstate->spills[0]; + pfree(spill->ntuples); + pfree(spill->partitions); + pfree(aggstate->spills); + aggstate->spills = NULL; } /* free batches */ - list_free_deep(aggstate->hash_batches); - aggstate->hash_batches = NIL; + list_free_deep(aggstate->spill_batches); + aggstate->spill_batches = NIL; /* close tape set */ - if (aggstate->hash_tapeset != NULL) + if (aggstate->spill_tapeset != NULL) + { + LogicalTapeSetClose(aggstate->spill_tapeset); + aggstate->spill_tapeset = NULL; + } +} + +/* + * Initialize a freshly-created MinimalTuple in index + */ +static void +initialize_index_entry(AggState *aggstate, TupleIndex index, TupleIndexEntry entry) +{ + AggStatePerGroup pergroup; + + aggstate->spill_ngroups_current++; + index_agg_check_limits(aggstate); + + /* no need to allocate or initialize per-group state */ + if (aggstate->numtrans == 0) + return; + + pergroup = (AggStatePerGroup) TupleIndexEntryGetAdditional(index, entry); + + /* + * Initialize aggregates for new tuple group, indexagg_lookup_entries() + * already has selected the relevant grouping set. 
+ */ + for (int transno = 0; transno < aggstate->numtrans; ++transno) + { + AggStatePerTrans pertrans = &aggstate->pertrans[transno]; + AggStatePerGroup pergroupstate = &pergroup[transno]; + + initialize_aggregate(aggstate, pertrans, pergroupstate); + } +} + +/* + * Create new sorted run from current in-memory stored index. + */ +static void +indexagg_save_index_run(AggState *aggstate) +{ + AggStatePerIndex perindex = aggstate->perindex; + ExprContext *econtext; + TupleIndexIteratorData iter; + AggStatePerAgg peragg; + TupleTableSlot *firstSlot; + TupleIndexEntry entry; + TupleTableSlot *indexslot; + AggStatePerGroup pergroup; + + econtext = aggstate->ss.ps.ps_ExprContext; + firstSlot = aggstate->ss.ss_ScanTupleSlot; + peragg = aggstate->peragg; + indexslot = perindex->indexslot; + + InitTupleIndexIterator(perindex->index, &iter); + + tuplemerge_start_run(aggstate->mergestate); + + while ((entry = TupleIndexIteratorNext(&iter)) != NULL) { - LogicalTapeSetClose(aggstate->hash_tapeset); - aggstate->hash_tapeset = NULL; + MinimalTuple tuple = TupleIndexEntryGetMinimalTuple(entry); + TupleTableSlot *output; + + ResetExprContext(econtext); + ExecStoreMinimalTuple(tuple, indexslot, false); + slot_getallattrs(indexslot); + + ExecClearTuple(firstSlot); + memset(firstSlot->tts_isnull, true, + firstSlot->tts_tupleDescriptor->natts * sizeof(bool)); + + for (int i = 0; i < perindex->numCols; i++) + { + int varNumber = perindex->idxKeyColIdxInput[i] - 1; + + firstSlot->tts_values[varNumber] = indexslot->tts_values[i]; + firstSlot->tts_isnull[varNumber] = indexslot->tts_isnull[i]; + } + ExecStoreVirtualTuple(firstSlot); + + pergroup = (AggStatePerGroup) TupleIndexEntryGetAdditional(perindex->index, entry); + + econtext->ecxt_outertuple = firstSlot; + prepare_projection_slot(aggstate, + econtext->ecxt_outertuple, + aggstate->current_set); + finalize_aggregates(aggstate, peragg, pergroup); + output = project_aggregates(aggstate); + if (output) + 
tuplemerge_puttupleslot(aggstate->mergestate, output); } + + tuplemerge_end_run(aggstate->mergestate); } +/* + * Fill in index with tuples in given batch. + */ +static void +indexagg_refill_batch(AggState *aggstate, HashAggBatch *batch) +{ + AggStatePerIndex perindex = aggstate->perindex; + TupleTableSlot *spillslot = aggstate->spill_rslot; + TupleTableSlot *indexslot = perindex->indexslot; + TupleIndex index = perindex->index; + LogicalTapeSet *tapeset = aggstate->spill_tapeset; + HashAggSpill spill; + bool spill_initialized = false; + + agg_set_limits(aggstate->hashentrysize, batch->input_card, batch->used_bits, + &aggstate->spill_mem_limit, &aggstate->spill_ngroups_limit, NULL); + + ReScanExprContext(aggstate->indexcontext); + + MemoryContextReset(aggstate->index_entrycxt); + MemoryContextReset(aggstate->index_nodecxt); + ResetTupleIndex(perindex->index); + + aggstate->spill_ngroups_current = 0; + + select_current_set(aggstate, batch->setno, GROUPING_STRATEGY_INDEX); + + agg_recompile_expressions(aggstate, true, true); + + for (;;) + { + MinimalTuple tuple; + TupleIndexEntry entry; + bool isnew = false; + bool *p_isnew; + uint32 hash; + + CHECK_FOR_INTERRUPTS(); + + tuple = hashagg_batch_read(batch, &hash); + if (tuple == NULL) + break; + + ExecStoreMinimalTuple(tuple, spillslot, true); + aggstate->tmpcontext->ecxt_outertuple = spillslot; + + prepare_index_slot(perindex, spillslot, indexslot); + + p_isnew = aggstate->spill_mode ? 
NULL : &isnew; + entry = TupleIndexLookup(index, indexslot, p_isnew); + + if (entry != NULL) + { + if (isnew) + initialize_index_entry(aggstate, index, entry); + + aggstate->all_pergroups[batch->setno] = TupleIndexEntryGetAdditional(index, entry); + advance_aggregates(aggstate); + } + else + { + if (!spill_initialized) + { + spill_initialized = true; + agg_spill_init(&spill, tapeset, batch->used_bits, + batch->input_card, aggstate->hashentrysize); + } + + agg_spill_tuple(aggstate, &spill, spillslot, hash); + aggstate->all_pergroups[batch->setno] = NULL; + } + + ResetExprContext(aggstate->tmpcontext); + } + + LogicalTapeClose(batch->input_tape); + + if (spill_initialized) + { + agg_spill_finish(aggstate, &spill, 0); + index_agg_update_metrics(aggstate, true, spill.npartitions); + } + else + index_agg_update_metrics(aggstate, true, 0); + + aggstate->spill_mode = false; + select_current_set(aggstate, batch->setno, GROUPING_STRATEGY_INDEX); + + pfree(batch); +} + +static void +indexagg_finish_initial_spills(AggState *aggstate) +{ + HashAggSpill *spill; + AggStatePerIndex perindex; + Sort *sort; + + if (!aggstate->spill_ever_happened) + return; + + Assert(aggstate->spills != NULL); + + spill = aggstate->spills; + agg_spill_finish(aggstate, aggstate->spills, 0); + + index_agg_update_metrics(aggstate, false, spill->npartitions); + aggstate->spill_mode = false; + + pfree(aggstate->spills); + aggstate->spills = NULL; + + perindex = aggstate->perindex; + sort = aggstate->index_sort; + aggstate->mergestate = tuplemerge_begin_heap(aggstate->ss.ps.ps_ResultTupleDesc, + perindex->numKeyCols, + perindex->idxKeyColIdxTL, + sort->sortOperators, + sort->collations, + sort->nullsFirst, + work_mem, NULL); + /* + * Some data was spilled. Index aggregate requires output to be sorted, + * so now we must process all remaining spilled data and produce sorted + * runs for external merge. The first saved run is current opened index. 
+ */ + indexagg_save_index_run(aggstate); + + while (aggstate->spill_batches != NIL) + { + HashAggBatch *batch = llast(aggstate->spill_batches); + aggstate->spill_batches = list_delete_last(aggstate->spill_batches); + + indexagg_refill_batch(aggstate, batch); + indexagg_save_index_run(aggstate); + } + + tuplemerge_performmerge(aggstate->mergestate); +} + +static uint32 +index_calculate_input_slot_hash(AggState *aggstate, + TupleTableSlot *inputslot) +{ + AggStatePerIndex perindex = aggstate->perindex; + MemoryContext oldcxt; + uint32 hash; + bool isnull; + + oldcxt = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + + perindex->exprcontext->ecxt_innertuple = inputslot; + hash = DatumGetUInt32(ExecEvalExpr(perindex->indexhashexpr, + perindex->exprcontext, + &isnull)); + + MemoryContextSwitchTo(oldcxt); + + return hash; +} + +/* + * indexagg_lookup_entries + * + * Insert input tuples to in-memory index. + */ +static void +lookup_index_entries(AggState *aggstate) +{ + int numGroupingSets = Max(aggstate->maxsets, 1); + AggStatePerGroup *pergroup = aggstate->all_pergroups; + TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple; + + for (int setno = 0; setno < numGroupingSets; ++setno) + { + AggStatePerIndex perindex = &aggstate->perindex[setno]; + TupleIndex index = perindex->index; + TupleTableSlot *indexslot = perindex->indexslot; + TupleIndexEntry entry; + bool isnew = false; + bool *p_isnew; + + p_isnew = aggstate->spill_mode ? 
NULL : &isnew; + select_current_set(aggstate, setno, GROUPING_STRATEGY_INDEX); + + prepare_index_slot(perindex, outerslot, indexslot); + + /* Lookup entry in btree */ + entry = TupleIndexLookup(perindex->index, indexslot, p_isnew); + + /* For now everything is stored in memory - no disk spills */ + if (entry != NULL) + { + /* Initialize it's trans state if just created */ + if (isnew) + initialize_index_entry(aggstate, index, entry); + + pergroup[setno] = TupleIndexEntryGetAdditional(index, entry); + } + else + { + HashAggSpill *spill = &aggstate->spills[setno]; + uint32 hash; + + if (spill->partitions == NULL) + { + agg_spill_init(spill, aggstate->spill_tapeset, 0, + perindex->aggnode->numGroups, + aggstate->hashentrysize); + } + + hash = index_calculate_input_slot_hash(aggstate, indexslot); + agg_spill_tuple(aggstate, spill, outerslot, hash); + pergroup[setno] = NULL; + } + } +} + +static TupleTableSlot * +agg_retrieve_index_in_memory(AggState *aggstate) +{ + ExprContext *econtext; + TupleTableSlot *firstSlot; + AggStatePerIndex perindex; + AggStatePerAgg peragg; + AggStatePerGroup pergroup; + TupleTableSlot *result; + + econtext = aggstate->ss.ps.ps_ExprContext; + firstSlot = aggstate->ss.ss_ScanTupleSlot; + peragg = aggstate->peragg; + perindex = &aggstate->perindex[aggstate->current_set]; + + for (;;) + { + TupleIndexEntry entry; + TupleTableSlot *indexslot = perindex->indexslot; + + CHECK_FOR_INTERRUPTS(); + + entry = TupleIndexIteratorNext(&perindex->iter); + if (entry == NULL) + return NULL; + + ResetExprContext(econtext); + ExecStoreMinimalTuple(TupleIndexEntryGetMinimalTuple(entry), indexslot, false); + slot_getallattrs(indexslot); + + ExecClearTuple(firstSlot); + memset(firstSlot->tts_isnull, true, + firstSlot->tts_tupleDescriptor->natts * sizeof(bool)); + + for (int i = 0; i < perindex->numCols; i++) + { + int varNumber = perindex->idxKeyColIdxInput[i] - 1; + + firstSlot->tts_values[varNumber] = indexslot->tts_values[i]; + 
firstSlot->tts_isnull[varNumber] = indexslot->tts_isnull[i]; + } + ExecStoreVirtualTuple(firstSlot); + + pergroup = (AggStatePerGroup) TupleIndexEntryGetAdditional(perindex->index, entry); + + econtext->ecxt_outertuple = firstSlot; + prepare_projection_slot(aggstate, + econtext->ecxt_outertuple, + aggstate->current_set); + finalize_aggregates(aggstate, peragg, pergroup); + result = project_aggregates(aggstate); + if (result) + return result; + } + + /* no more groups */ + return NULL; +} + +static TupleTableSlot * +agg_retrieve_index_merge(AggState *aggstate) +{ + AggStatePerIndex perindex = aggstate->perindex; + TupleTableSlot *slot = perindex->mergeslot; + TupleTableSlot *resultslot = aggstate->ss.ps.ps_ResultTupleSlot; + + ExecClearTuple(slot); + + if (!tuplesort_gettupleslot(aggstate->mergestate, true, true, slot, NULL)) + return NULL; + + slot_getallattrs(slot); + ExecClearTuple(resultslot); + + for (int i = 0; i < resultslot->tts_tupleDescriptor->natts; ++i) + { + resultslot->tts_values[i] = slot->tts_values[i]; + resultslot->tts_isnull[i] = slot->tts_isnull[i]; + } + ExecStoreVirtualTuple(resultslot); + + return resultslot; +} + +static TupleTableSlot * +agg_retrieve_index(AggState *aggstate) +{ + if (aggstate->spill_ever_happened) + return agg_retrieve_index_merge(aggstate); + else + return agg_retrieve_index_in_memory(aggstate); +} + +static void +build_index(AggState *aggstate) +{ + AggStatePerIndex perindex = aggstate->perindex; + MemoryContext metacxt = aggstate->index_metacxt; + MemoryContext entrycxt = aggstate->index_entrycxt; + MemoryContext nodecxt = aggstate->index_nodecxt; + MemoryContext oldcxt; + Size additionalsize; + Oid *eqfuncoids; + Sort *sort; + + Assert(aggstate->aggstrategy == AGG_INDEX); + + additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData); + sort = aggstate->index_sort; + + /* inmem index */ + perindex->index = BuildTupleIndex(perindex->indexslot->tts_tupleDescriptor, + perindex->numKeyCols, + 
perindex->idxKeyColIdxIndex, + sort->sortOperators, + sort->collations, + sort->nullsFirst, + additionalsize, + metacxt, + entrycxt, + nodecxt); + + /* disk spill logic */ + oldcxt = MemoryContextSwitchTo(metacxt); + execTuplesHashPrepare(perindex->numKeyCols, perindex->aggnode->grpOperators, + &eqfuncoids, &perindex->hashfunctions); + perindex->indexhashexpr = + ExecBuildHash32FromAttrs(perindex->indexslot->tts_tupleDescriptor, + perindex->indexslot->tts_ops, + perindex->hashfunctions, + perindex->aggnode->grpCollations, + perindex->numKeyCols, + perindex->idxKeyColIdxIndex, + &aggstate->ss.ps, + 0); + perindex->exprcontext = CreateStandaloneExprContext(); + MemoryContextSwitchTo(oldcxt); +} + +static void +find_index_columns(AggState *aggstate) +{ + Bitmapset *base_colnos; + Bitmapset *aggregated_colnos; + TupleDesc scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + List *outerTlist = outerPlanState(aggstate)->plan->targetlist; + EState *estate = aggstate->ss.ps.state; + AggStatePerIndex perindex; + Bitmapset *colnos; + AttrNumber *sortColIdx; + List *indexTlist = NIL; + TupleDesc indexDesc; + int maxCols; + int i; + + find_cols(aggstate, &aggregated_colnos, &base_colnos); + aggstate->colnos_needed = bms_union(base_colnos, aggregated_colnos); + aggstate->max_colno_needed = 0; + aggstate->all_cols_needed = true; + + for (i = 0; i < scanDesc->natts; i++) + { + int colno = i + 1; + + if (bms_is_member(colno, aggstate->colnos_needed)) + aggstate->max_colno_needed = colno; + else + aggstate->all_cols_needed = false; + } + + perindex = aggstate->perindex; + colnos = bms_copy(base_colnos); + + if (aggstate->phases[0].grouped_cols) + { + Bitmapset *grouped_cols = aggstate->phases[0].grouped_cols[0]; + ListCell *lc; + foreach(lc, aggstate->all_grouped_cols) + { + int attnum = lfirst_int(lc); + if (!bms_is_member(attnum, grouped_cols)) + colnos = bms_del_member(colnos, attnum); + } + } + + maxCols = bms_num_members(colnos) + perindex->numKeyCols; + + 
perindex->idxKeyColIdxInput = palloc(maxCols * sizeof(AttrNumber));
+    perindex->idxKeyColIdxIndex = palloc(perindex->numKeyCols * sizeof(AttrNumber));
+
+    /* Add all the sorting/grouping columns to colnos */
+    sortColIdx = aggstate->index_sort->sortColIdx;
+    for (i = 0; i < perindex->numKeyCols; i++)
+        colnos = bms_add_member(colnos, sortColIdx[i]);
+
+    for (i = 0; i < perindex->numKeyCols; i++)
+    {
+        perindex->idxKeyColIdxInput[i] = sortColIdx[i];
+        perindex->idxKeyColIdxIndex[i] = i + 1;
+
+        perindex->numCols++;
+        /* delete already mapped columns */
+        colnos = bms_del_member(colnos, sortColIdx[i]);
+    }
+
+    /* and the remaining columns */
+    i = -1;
+    while ((i = bms_next_member(colnos, i)) >= 0)
+    {
+        perindex->idxKeyColIdxInput[perindex->numCols] = i;
+        perindex->numCols++;
+    }
+
+    /* build tuple descriptor for the index */
+    perindex->largestGrpColIdx = 0;
+    for (i = 0; i < perindex->numCols; i++)
+    {
+        int varNumber = perindex->idxKeyColIdxInput[i] - 1;
+
+        indexTlist = lappend(indexTlist, list_nth(outerTlist, varNumber));
+        perindex->largestGrpColIdx = Max(varNumber + 1, perindex->largestGrpColIdx);
+    }
+
+    indexDesc = ExecTypeFromTL(indexTlist);
+    perindex->indexslot = ExecAllocTableSlot(&estate->es_tupleTable, indexDesc,
+                                             &TTSOpsMinimalTuple);
+    list_free(indexTlist);
+    bms_free(colnos);
+
+    bms_free(base_colnos);
+}
 
 /* -----------------
  * ExecInitAgg
@@ -3297,10 +4094,12 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
     int numGroupingSets = 1;
     int numPhases;
     int numHashes;
+    int numIndexes;
     int i = 0;
     int j = 0;
     bool use_hashing = (node->aggstrategy == AGG_HASHED ||
                         node->aggstrategy == AGG_MIXED);
+    bool use_index = (node->aggstrategy == AGG_INDEX);
 
     /* check for unsupported flags */
     Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -3337,6 +4136,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
      */
     numPhases = (use_hashing ? 1 : 2);
    numHashes = (use_hashing ? 1 : 0);
+    numIndexes = (use_index ? 
1 : 0); /* * Calculate the maximum number of grouping sets in any phase; this @@ -3356,7 +4156,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) /* * additional AGG_HASHED aggs become part of phase 0, but all - * others add an extra phase. + * others add an extra phase. AGG_INDEX does not support grouping + * sets, so else branch must be AGG_SORTED or AGG_MIXED. */ if (agg->aggstrategy != AGG_HASHED) ++numPhases; @@ -3395,6 +4196,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) if (use_hashing) hash_create_memory(aggstate); + else if (use_index) + index_create_memory(aggstate); ExecAssignExprContext(estate, &aggstate->ss.ps); @@ -3501,6 +4304,13 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) aggstate->phases[0].gset_lengths = palloc_array(int, numHashes); aggstate->phases[0].grouped_cols = palloc_array(Bitmapset *, numHashes); } + else if (numIndexes) + { + aggstate->perindex = palloc0(sizeof(AggStatePerIndexData) * numIndexes); + aggstate->phases[0].numsets = 0; + aggstate->phases[0].gset_lengths = palloc(numIndexes * sizeof(int)); + aggstate->phases[0].grouped_cols = palloc(numIndexes * sizeof(Bitmapset *)); + } phase = 0; for (phaseidx = 0; phaseidx <= list_length(node->chain); ++phaseidx) @@ -3513,6 +4323,18 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) aggnode = list_nth_node(Agg, node->chain, phaseidx - 1); sortnode = castNode(Sort, outerPlan(aggnode)); } + else if (use_index) + { + Assert(list_length(node->chain) == 1); + + aggnode = node; + sortnode = castNode(Sort, linitial(node->chain)); + /* + * list contains single element, so we must adjust loop variable, + * so it will be single iteration at all. 
+ */ + phaseidx++; + } else { aggnode = node; @@ -3549,6 +4371,35 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) all_grouped_cols = bms_add_members(all_grouped_cols, cols); continue; } + else if (aggnode->aggstrategy == AGG_INDEX) + { + AggStatePerPhase phasedata = &aggstate->phases[0]; + AggStatePerIndex perindex; + Bitmapset *cols; + + Assert(phase == 0); + Assert(sortnode); + + i = phasedata->numsets++; + + /* phase 0 always points to the "real" Agg in the index case */ + phasedata->aggnode = node; + phasedata->aggstrategy = node->aggstrategy; + phasedata->sortnode = sortnode; + + perindex = &aggstate->perindex[i]; + perindex->aggnode = aggnode; + aggstate->index_sort = sortnode; + + phasedata->gset_lengths[i] = perindex->numKeyCols = aggnode->numCols; + + cols = NULL; + for (j = 0; j < aggnode->numCols; ++j) + cols = bms_add_member(cols, aggnode->grpColIdx[j]); + + phasedata->grouped_cols[i] = cols; + all_grouped_cols = bms_add_members(all_grouped_cols, cols); + } else { AggStatePerPhase phasedata = &aggstate->phases[++phase]; @@ -3666,7 +4517,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) aggstate->all_pergroups = palloc0_array(AggStatePerGroup, numGroupingSets + numHashes); pergroups = aggstate->all_pergroups; - if (node->aggstrategy != AGG_HASHED) + if (node->aggstrategy != AGG_HASHED && node->aggstrategy != AGG_INDEX) { for (i = 0; i < numGroupingSets; i++) { @@ -3680,18 +4531,15 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) /* * Hashing can only appear in the initial phase. 
*/ - if (use_hashing) + if (use_hashing || use_index) { Plan *outerplan = outerPlan(node); double totalGroups = 0; - aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc, - &TTSOpsMinimalTuple); - aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc, - &TTSOpsVirtual); - - /* this is an array of pointers, not structures */ - aggstate->hash_pergroup = pergroups; + aggstate->spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + aggstate->spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsVirtual); aggstate->hashentrysize = hash_agg_entry_size(aggstate->numtrans, outerplan->plan_width, @@ -3706,20 +4554,115 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) for (int k = 0; k < aggstate->num_hashes; k++) totalGroups += aggstate->perhash[k].aggnode->numGroups; - hash_agg_set_limits(aggstate->hashentrysize, totalGroups, 0, - &aggstate->hash_mem_limit, - &aggstate->hash_ngroups_limit, - &aggstate->hash_planned_partitions); - find_hash_columns(aggstate); + agg_set_limits(aggstate->hashentrysize, totalGroups, 0, + &aggstate->spill_mem_limit, + &aggstate->spill_ngroups_limit, + &aggstate->spill_planned_partitions); + + if (use_hashing) + { + /* this is an array of pointers, not structures */ + aggstate->hash_pergroup = pergroups; + + find_hash_columns(aggstate); + + /* Skip massive memory allocation if we are just doing EXPLAIN */ + if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + build_hash_tables(aggstate); + aggstate->table_filled = false; + } + else + { + find_index_columns(aggstate); - /* Skip massive memory allocation if we are just doing EXPLAIN */ - if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) - build_hash_tables(aggstate); + if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + build_index(aggstate); + aggstate->index_filled = false; + } - aggstate->table_filled = false; /* Initialize this to 1, meaning nothing spilled, yet */ - aggstate->hash_batches_used = 1; + aggstate->spill_batches_used = 1; + } + + /* + * For 
index merge disk spill may be required and we perform external
+     * merge for this purpose. But stored tuples are already projected, so
+     * they have a different TupleDesc than used in-memory (inputDesc, indexDesc).
+     */
+    if (use_index)
+    {
+        AggStatePerIndex perindex = aggstate->perindex;
+        ListCell   *lc;
+        List       *targetlist = aggstate->ss.ps.plan->targetlist;
+        AttrNumber *attr_mapping_tl =
+            palloc0(sizeof(AttrNumber) * list_length(targetlist));
+        AttrNumber *keyColIdxResult;
+
+        /*
+         * Build grouping column attribute mapping and store it in
+         * attr_mapping_tl. If there is no such mapping (projected), then
+         * InvalidAttrNumber is set; otherwise, the index of the indexDesc
+         * column storing this attribute.
+         */
+        foreach (lc, targetlist)
+        {
+            TargetEntry *te = (TargetEntry *)lfirst(lc);
+            Var        *group_var;
+
+            /* All grouping expressions in targetlist stored as OUTER Vars */
+            if (!IsA(te->expr, Var))
+                continue;
+
+            group_var = (Var *)te->expr;
+            if (group_var->varno != OUTER_VAR)
+                continue;
+
+            attr_mapping_tl[foreach_current_index(lc)] = group_var->varattno;
+        }
+
+        /* The mapping is built; now create the reverse mapping */
+        keyColIdxResult = palloc0(sizeof(AttrNumber) * list_length(outerPlan(node)->targetlist));
+        for (i = 0; i < list_length(targetlist); ++i)
+        {
+            AttrNumber  outer_attno = attr_mapping_tl[i];
+            AttrNumber  existingIdx;
+
+            if (!AttributeNumberIsValid(outer_attno))
+                continue;
+
+            existingIdx = keyColIdxResult[outer_attno - 1];
+
+            /* attnumbers can be duplicated, so use the first ones */
+            if (AttributeNumberIsValid(existingIdx) && existingIdx <= outer_attno)
+                continue;
+
+            /*
+             * A column can be referenced in the query, but the planner can
+             * decide to remove it from grouping.
+ */ + if (!bms_is_member(outer_attno, all_grouped_cols)) + continue; + + keyColIdxResult[outer_attno - 1] = i + 1; + } + + perindex->idxKeyColIdxTL = palloc(sizeof(AttrNumber) * perindex->numKeyCols); + for (i = 0; i < perindex->numKeyCols; ++i) + { + AttrNumber attno = keyColIdxResult[perindex->idxKeyColIdxInput[i] - 1]; + if (!AttributeNumberIsValid(attno)) + elog(ERROR, "could not locate group by attributes in targetlist for index mapping"); + + perindex->idxKeyColIdxTL[i] = attno; + } + + pfree(attr_mapping_tl); + pfree(keyColIdxResult); + + perindex->mergeslot = ExecInitExtraTupleSlot(estate, + aggstate->ss.ps.ps_ResultTupleDesc, + &TTSOpsMinimalTuple); } /* @@ -3732,13 +4675,19 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) { aggstate->current_phase = 0; initialize_phase(aggstate, 0); - select_current_set(aggstate, 0, true); + select_current_set(aggstate, 0, GROUPING_STRATEGY_HASH); + } + else if (node->aggstrategy == AGG_INDEX) + { + aggstate->current_phase = 0; + initialize_phase(aggstate, 0); + select_current_set(aggstate, 0, GROUPING_STRATEGY_INDEX); } else { aggstate->current_phase = 1; initialize_phase(aggstate, 1); - select_current_set(aggstate, 0, false); + select_current_set(aggstate, 0, GROUPING_STRATEGY_SORT); } /* @@ -4066,8 +5015,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) for (phaseidx = 0; phaseidx < aggstate->numphases; phaseidx++) { AggStatePerPhase phase = &aggstate->phases[phaseidx]; - bool dohash = false; - bool dosort = false; + int strategy; /* phase 0 doesn't necessarily exist */ if (!phase->aggnode) @@ -4079,8 +5027,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) * Phase one, and only phase one, in a mixed agg performs both * sorting and aggregation. 
*/ - dohash = true; - dosort = true; + strategy = GROUPING_STRATEGY_HASH | GROUPING_STRATEGY_SORT; } else if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 0) { @@ -4094,19 +5041,24 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) else if (phase->aggstrategy == AGG_PLAIN || phase->aggstrategy == AGG_SORTED) { - dohash = false; - dosort = true; + strategy = GROUPING_STRATEGY_SORT; } else if (phase->aggstrategy == AGG_HASHED) { - dohash = true; - dosort = false; + strategy = GROUPING_STRATEGY_HASH; + } + else if (phase->aggstrategy == AGG_INDEX) + { + strategy = GROUPING_STRATEGY_INDEX; } else + { Assert(false); + /* keep compiler quiet */ + strategy = 0; + } - phase->evaltrans = ExecBuildAggTrans(aggstate, phase, dosort, dohash, - false); + phase->evaltrans = ExecBuildAggTrans(aggstate, phase, strategy, false); /* cache compiled expression for outer slot without NULL check */ phase->evaltrans_cache[0][0] = phase->evaltrans; @@ -4409,9 +5361,9 @@ ExecEndAgg(AggState *node) Assert(ParallelWorkerNumber <= node->shared_info->num_workers); si = &node->shared_info->sinstrument[ParallelWorkerNumber]; - si->hash_batches_used = node->hash_batches_used; - si->hash_disk_used = node->hash_disk_used; - si->hash_mem_peak = node->hash_mem_peak; + si->hash_batches_used = node->spill_batches_used; + si->hash_disk_used = node->spill_disk_used; + si->hash_mem_peak = node->spill_mem_peak; } /* Make sure we have closed any open tuplesorts */ @@ -4421,7 +5373,10 @@ ExecEndAgg(AggState *node) if (node->sort_out) tuplesort_end(node->sort_out); - hashagg_reset_spill_state(node); + if (node->aggstrategy == AGG_INDEX) + indexagg_reset_spill_state(node); + else + hashagg_reset_spill_state(node); /* Release hash tables too */ if (node->hash_metacxt != NULL) @@ -4434,6 +5389,26 @@ ExecEndAgg(AggState *node) MemoryContextDelete(node->hash_tuplescxt); node->hash_tuplescxt = NULL; } + if (node->index_metacxt != NULL) + { + MemoryContextDelete(node->index_metacxt); + node->index_metacxt = 
NULL; + } + if (node->index_entrycxt != NULL) + { + MemoryContextDelete(node->index_entrycxt); + node->index_entrycxt = NULL; + } + if (node->index_nodecxt != NULL) + { + MemoryContextDelete(node->index_nodecxt); + node->index_nodecxt = NULL; + } + if (node->mergestate) + { + tuplesort_end(node->mergestate); + node->mergestate = NULL; + } for (transno = 0; transno < node->numtrans; transno++) { @@ -4451,6 +5426,8 @@ ExecEndAgg(AggState *node) ReScanExprContext(node->aggcontexts[setno]); if (node->hashcontext) ReScanExprContext(node->hashcontext); + if (node->indexcontext) + ReScanExprContext(node->indexcontext); outerPlan = outerPlanState(node); ExecEndNode(outerPlan); @@ -4486,12 +5463,27 @@ ExecReScanAgg(AggState *node) * we can just rescan the existing hash table; no need to build it * again. */ - if (outerPlan->chgParam == NULL && !node->hash_ever_spilled && + if (outerPlan->chgParam == NULL && !node->spill_ever_happened && !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams)) { ResetTupleHashIterator(node->perhash[0].hashtable, &node->perhash[0].hashiter); - select_current_set(node, 0, true); + select_current_set(node, 0, GROUPING_STRATEGY_HASH); + return; + } + } + + if (node->aggstrategy == AGG_INDEX) + { + if (!node->index_filled) + return; + + if (outerPlan->chgParam == NULL && !node->spill_ever_happened && + !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams)) + { + AggStatePerIndex perindex = node->perindex; + ResetTupleIndexIterator(perindex->index, &perindex->iter); + select_current_set(node, 0, GROUPING_STRATEGY_INDEX); return; } } @@ -4545,9 +5537,9 @@ ExecReScanAgg(AggState *node) { hashagg_reset_spill_state(node); - node->hash_ever_spilled = false; - node->hash_spill_mode = false; - node->hash_ngroups_current = 0; + node->spill_ever_happened = false; + node->spill_mode = false; + node->spill_ngroups_current = 0; ReScanExprContext(node->hashcontext); /* Rebuild empty hash table(s) */ @@ -4555,10 +5547,33 @@ ExecReScanAgg(AggState *node) 
node->table_filled = false; /* iterator will be reset when the table is filled */ - hashagg_recompile_expressions(node, false, false); + agg_recompile_expressions(node, false, false); } - if (node->aggstrategy != AGG_HASHED) + if (node->aggstrategy == AGG_INDEX) + { + indexagg_reset_spill_state(node); + + node->spill_ever_happened = false; + node->spill_mode = false; + node->spill_ngroups_current = 0; + + ReScanExprContext(node->indexcontext); + MemoryContextReset(node->index_entrycxt); + MemoryContextReset(node->index_nodecxt); + + build_index(node); + node->index_filled = false; + + agg_recompile_expressions(node, false, false); + + if (node->mergestate) + { + tuplesort_end(node->mergestate); + node->mergestate = NULL; + } + } + else if (node->aggstrategy != AGG_HASHED) { /* * Reset the per-group state (in particular, mark transvalues null) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 4c43fd0b19b2..5fcac30af84f 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3446,6 +3446,7 @@ generate_grouped_paths(PlannerInfo *root, RelOptInfo *grouped_rel, AggClauseCosts agg_costs; bool can_hash; bool can_sort; + bool can_index; Path *cheapest_total_path = NULL; Path *cheapest_partial_path = NULL; double dNumGroups = 0; @@ -3498,6 +3499,12 @@ generate_grouped_paths(PlannerInfo *root, RelOptInfo *grouped_rel, can_hash = (agg_info->group_clauses != NIL && grouping_is_hashable(agg_info->group_clauses)); + /* + * Determine whether we should consider index-based implementations of + * grouping. + */ + can_index = can_sort && can_hash; + /* * Consider whether we should generate partially aggregated non-partial * paths. We can only do this if we have a non-partial path. 
@@ -3615,6 +3622,7 @@ generate_grouped_paths(PlannerInfo *root, RelOptInfo *grouped_rel, AGGSPLIT_INITIAL_SERIAL, agg_info->group_clauses, NIL, + NIL, &agg_costs, dNumGroups); @@ -3691,6 +3699,7 @@ generate_grouped_paths(PlannerInfo *root, RelOptInfo *grouped_rel, AGGSPLIT_INITIAL_SERIAL, agg_info->group_clauses, NIL, + NIL, &agg_costs, dNumPartialGroups); @@ -3727,6 +3736,7 @@ generate_grouped_paths(PlannerInfo *root, RelOptInfo *grouped_rel, AGGSPLIT_INITIAL_SERIAL, agg_info->group_clauses, NIL, + NIL, &agg_costs, dNumGroups); @@ -3762,6 +3772,72 @@ generate_grouped_paths(PlannerInfo *root, RelOptInfo *grouped_rel, AGGSPLIT_INITIAL_SERIAL, agg_info->group_clauses, NIL, + NIL, + &agg_costs, + dNumPartialGroups); + + add_partial_path(grouped_rel, path); + } + + if (can_index && cheapest_total_path != NULL) + { + Path *path; + + /* + * Since the path originates from a non-grouped relation that is + * not aware of eager aggregation, we must ensure that it provides + * the correct input for partial aggregation. + */ + path = (Path *) create_projection_path(root, + grouped_rel, + cheapest_total_path, + agg_info->agg_input); + /* + * qual is NIL because the HAVING clause cannot be evaluated until the + * final value of the aggregate is known. + */ + path = (Path *) create_agg_path(root, + grouped_rel, + path, + agg_info->target, + AGG_INDEX, + AGGSPLIT_INITIAL_SERIAL, + agg_info->group_clauses, + NIL, + group_pathkeys, + &agg_costs, + dNumGroups); + + add_path(grouped_rel, path); + } + + if (can_index && cheapest_partial_path != NULL) + { + Path *path; + + /* + * Since the path originates from a non-grouped relation that is not + * aware of eager aggregation, we must ensure that it provides the + * correct input for partial aggregation. + */ + path = (Path *) create_projection_path(root, + grouped_rel, + cheapest_partial_path, + agg_info->agg_input); + + /* + * qual is NIL because the HAVING clause cannot be evaluated until the + * final value of the aggregate is known. 
+ */ + path = (Path *) create_agg_path(root, + grouped_rel, + path, + agg_info->target, + AGG_INDEX, + AGGSPLIT_INITIAL_SERIAL, + agg_info->group_clauses, + NIL, + group_pathkeys, &agg_costs, dNumPartialGroups); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index a39cc793b4d8..a966fb761138 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -150,6 +150,7 @@ bool enable_tidscan = true; bool enable_sort = true; bool enable_incremental_sort = true; bool enable_hashagg = true; +bool enable_indexagg = true; bool enable_nestloop = true; bool enable_material = true; bool enable_memoize = true; @@ -1848,6 +1849,32 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) rterm->pathtarget->width); } +/* + * cost_tuplemerge + * Determines and returns the cost of external merge used in tuplesort. + */ +static void +cost_tuplemerge(double availMem, double input_bytes, double ntuples, + Cost comparison_cost, Cost *cost) +{ + double npages = ceil(input_bytes / BLCKSZ); + double nruns = input_bytes / availMem; + double mergeorder = tuplesort_merge_order(availMem); + double log_runs; + double npageaccesses; + + /* Compute logM(r) as log(r) / log(M) */ + if (nruns > mergeorder) + log_runs = ceil(log(nruns) / log(mergeorder)); + else + log_runs = 1.0; + + npageaccesses = 2.0 * npages * log_runs; + + /* Assume 3/4ths of accesses are sequential, 1/4th are not */ + *cost += npageaccesses * (seq_page_cost * 0.75 + random_page_cost * 0.25); +} + /* * cost_tuplesort * Determines and returns the cost of sorting a relation using tuplesort, @@ -1922,11 +1949,6 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost, /* * We'll have to use a disk-based sort of all the tuples */ - double npages = ceil(input_bytes / BLCKSZ); - double nruns = input_bytes / sort_mem_bytes; - double mergeorder = tuplesort_merge_order(sort_mem_bytes); - double log_runs; - double npageaccesses; /* * CPU costs @@ -1936,16 
+1958,8 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
     *startup_cost = comparison_cost * tuples * LOG2(tuples);
 
     /* Disk costs */
-
-    /* Compute logM(r) as log(r) / log(M) */
-    if (nruns > mergeorder)
-        log_runs = ceil(log(nruns) / log(mergeorder));
-    else
-        log_runs = 1.0;
-    npageaccesses = 2.0 * npages * log_runs;
-    /* Assume 3/4ths of accesses are sequential, 1/4th are not */
-    *startup_cost += npageaccesses *
-        (seq_page_cost * 0.75 + random_page_cost * 0.25);
+    cost_tuplemerge(sort_mem_bytes, input_bytes, tuples, comparison_cost,
+                    startup_cost);
     }
     else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
     {
@@ -2770,7 +2784,7 @@ cost_agg(Path *path, PlannerInfo *root,
         total_cost += cpu_tuple_cost * numGroups;
         output_tuples = numGroups;
     }
-    else
+    else if (aggstrategy == AGG_HASHED)
     {
         /* must be AGG_HASHED */
         startup_cost = input_total_cost;
@@ -2788,6 +2802,50 @@ cost_agg(Path *path, PlannerInfo *root,
         total_cost += cpu_tuple_cost * numGroups;
         output_tuples = numGroups;
     }
+    else
+    {
+        /* must be AGG_INDEX */
+        startup_cost = input_total_cost;
+        if (!enable_indexagg)
+            ++disabled_nodes;
+
+        /* these match AGG_HASHED */
+        startup_cost += aggcosts->transCost.startup;
+        startup_cost += aggcosts->transCost.per_tuple * input_tuples;
+        startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
+        startup_cost += aggcosts->finalCost.startup;
+
+        /* cost of btree top-down traversal */
+        startup_cost += LOG2(numGroups) /* number of comparisons */
+            * (2.0 * cpu_operator_cost) /* comparison cost */
+            * input_tuples;
+
+        total_cost = startup_cost;
+        total_cost += aggcosts->finalCost.per_tuple * numGroups;
+        total_cost += cpu_tuple_cost * numGroups;
+        output_tuples = numGroups;
+    }
+
+    /*
+     * If there are quals (HAVING quals), account for their cost and
+     * selectivity. Process it before disk spill logic, because output
+     * cardinality is required for AGG_INDEX.
+ */ + if (quals) + { + QualCost qual_cost; + + cost_qual_eval(&qual_cost, quals, root); + startup_cost += qual_cost.startup; + total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple; + + output_tuples = clamp_row_est(output_tuples * + clauselist_selectivity(root, + quals, + 0, + JOIN_INNER, + NULL)); + } /* * Add the disk costs of hash aggregation that spills to disk. @@ -2802,7 +2860,7 @@ cost_agg(Path *path, PlannerInfo *root, * Accrue writes (spilled tuples) to startup_cost and to total_cost; * accrue reads only to total_cost. */ - if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED) + if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED || aggstrategy == AGG_INDEX) { double pages; double pages_written = 0.0; @@ -2814,6 +2872,7 @@ cost_agg(Path *path, PlannerInfo *root, uint64 ngroups_limit; int num_partitions; int depth; + bool canspill; /* * Estimate number of batches based on the computed limits. If less @@ -2823,8 +2882,9 @@ cost_agg(Path *path, PlannerInfo *root, hashentrysize = hash_agg_entry_size(list_length(root->aggtransinfos), input_width, aggcosts->transitionSpace); - hash_agg_set_limits(hashentrysize, numGroups, 0, &mem_limit, - &ngroups_limit, &num_partitions); + agg_set_limits(hashentrysize, numGroups, 0, &mem_limit, + &ngroups_limit, &num_partitions); + canspill = num_partitions != 0; nbatches = Max((numGroups * hashentrysize) / mem_limit, numGroups / ngroups_limit); @@ -2861,26 +2921,27 @@ cost_agg(Path *path, PlannerInfo *root, spill_cost = depth * input_tuples * 2.0 * cpu_tuple_cost; startup_cost += spill_cost; total_cost += spill_cost; - } - - /* - * If there are quals (HAVING quals), account for their cost and - * selectivity. 
- */ - if (quals) - { - QualCost qual_cost; - cost_qual_eval(&qual_cost, quals, root); - startup_cost += qual_cost.startup; - total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple; - - output_tuples = clamp_row_est(output_tuples * - clauselist_selectivity(root, - quals, - 0, - JOIN_INNER, - NULL)); + /* + * IndexAgg requires final external merge stage, but only if spill + * can occur, otherwise everything processed in memory. + */ + if (aggstrategy == AGG_INDEX && canspill) + { + double output_bytes; + Cost comparison_cost; + Cost merge_cost = 0; + + /* size of all projected tuples */ + output_bytes = path->pathtarget->width * output_tuples; + /* default comparison cost */ + comparison_cost = 2.0 * cpu_operator_cost; + + cost_tuplemerge(work_mem, output_bytes, output_tuples, + comparison_cost, &merge_cost); + startup_cost += merge_cost; + total_cost += merge_cost; + } } path->rows = output_tuples; diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index bc417f938401..de9bb1ef30b7 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -2158,6 +2158,8 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path) Plan *subplan; List *tlist; List *quals; + List *chain; + AttrNumber *grpColIdx; /* * Agg can project, so no need to be terribly picky about child tlist, but @@ -2169,17 +2171,24 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path) quals = order_qual_clauses(root, best_path->qual); + grpColIdx = extract_grouping_cols(best_path->groupClause, subplan->targetlist); + + /* For index aggregation we should consider the desired sorting order. 
*/ + if (best_path->aggstrategy == AGG_INDEX) + chain = list_make1(make_sort_from_groupcols(best_path->groupClause, grpColIdx, subplan)); + else + chain = NIL; + plan = make_agg(tlist, quals, best_path->aggstrategy, best_path->aggsplit, list_length(best_path->groupClause), - extract_grouping_cols(best_path->groupClause, - subplan->targetlist), + grpColIdx, extract_grouping_ops(best_path->groupClause), extract_grouping_collations(best_path->groupClause, subplan->targetlist), NIL, - NIL, + chain, best_path->numGroups, best_path->transitionSpace, subplan); diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 1268ea92b6f0..43c6f99b68cf 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3878,6 +3878,22 @@ create_grouping_paths(PlannerInfo *root, (gd ? gd->any_hashable : grouping_is_hashable(root->processed_groupClause)))) flags |= GROUPING_CAN_USE_HASH; + /* + * Determine whether we should consider index-based implementation of + * grouping. + * + * This is more restrictive since it not only must be sortable (for + * purposes of Btree), but also must be hashable, so we can effectively + * spill tuples and later process each batch. + */ + if ( gd == NULL + && root->numOrderedAggs == 0 + && parse->groupClause != NIL + && parse->groupingSets == NIL + && grouping_is_sortable(root->processed_groupClause) + && grouping_is_hashable(root->processed_groupClause)) + flags |= GROUPING_CAN_USE_INDEX; + /* * Determine whether partial aggregation is possible. 
*/ @@ -5016,6 +5032,7 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_SIMPLE, root->processed_distinctClause, NIL, + NIL, NULL, numDistinctRows)); } @@ -5224,6 +5241,7 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_SIMPLE, root->processed_distinctClause, NIL, + NIL, NULL, numDistinctRows)); } @@ -7109,6 +7127,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, ListCell *lc; bool can_hash = (extra->flags & GROUPING_CAN_USE_HASH) != 0; bool can_sort = (extra->flags & GROUPING_CAN_USE_SORT) != 0; + bool can_index = (extra->flags & GROUPING_CAN_USE_INDEX) != 0; List *havingQual = (List *) extra->havingQual; AggClauseCosts *agg_final_costs = &extra->agg_final_costs; double dNumGroups = 0; @@ -7194,6 +7213,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_SIMPLE, info->clauses, havingQual, + NIL, agg_costs, dNumGroups)); } @@ -7265,6 +7285,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_FINAL_DESERIAL, info->clauses, havingQual, + NIL, agg_final_costs, dNumFinalGroups)); else @@ -7306,6 +7327,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_SIMPLE, root->processed_groupClause, havingQual, + NIL, agg_costs, dNumGroups)); } @@ -7325,6 +7347,47 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_FINAL_DESERIAL, root->processed_groupClause, havingQual, + NIL, + agg_final_costs, + dNumFinalGroups)); + } + } + + if (can_index) + { + List *pathkeys = make_pathkeys_for_sortclauses(root, + root->processed_groupClause, + root->processed_tlist); + + add_path(grouped_rel, (Path *) + create_agg_path(root, + grouped_rel, + cheapest_path, + grouped_rel->reltarget, + AGG_INDEX, + AGGSPLIT_SIMPLE, + root->processed_groupClause, + havingQual, + pathkeys, + agg_costs, + dNumGroups)); + + /* + * Instead of operating directly on the input relation, we can + * consider 
finalizing a partially aggregated path. + */ + if (partially_grouped_rel != NULL) + { + add_path(grouped_rel, (Path *) + create_agg_path(root, + grouped_rel, + cheapest_partially_grouped_path, + grouped_rel->reltarget, + AGG_INDEX, + AGGSPLIT_FINAL_DESERIAL, + root->processed_groupClause, + havingQual, + pathkeys, agg_final_costs, dNumFinalGroups)); } @@ -7376,6 +7439,7 @@ create_partial_grouping_paths(PlannerInfo *root, ListCell *lc; bool can_hash = (extra->flags & GROUPING_CAN_USE_HASH) != 0; bool can_sort = (extra->flags & GROUPING_CAN_USE_SORT) != 0; + bool can_index = (extra->flags & GROUPING_CAN_USE_INDEX) != 0; /* * Check whether any partially aggregated paths have been generated @@ -7527,6 +7591,7 @@ create_partial_grouping_paths(PlannerInfo *root, AGGSPLIT_INITIAL_SERIAL, info->clauses, NIL, + NIL, agg_partial_costs, dNumPartialGroups)); else @@ -7585,6 +7650,7 @@ create_partial_grouping_paths(PlannerInfo *root, AGGSPLIT_INITIAL_SERIAL, info->clauses, NIL, + NIL, agg_partial_costs, dNumPartialPartialGroups)); else @@ -7616,6 +7682,7 @@ create_partial_grouping_paths(PlannerInfo *root, AGGSPLIT_INITIAL_SERIAL, root->processed_groupClause, NIL, + NIL, agg_partial_costs, dNumPartialGroups)); } @@ -7634,6 +7701,62 @@ create_partial_grouping_paths(PlannerInfo *root, AGGSPLIT_INITIAL_SERIAL, root->processed_groupClause, NIL, + NIL, + agg_partial_costs, + dNumPartialPartialGroups)); + } + + /* + * Add a partially-grouped IndexAgg Path where possible + */ + if (can_index && cheapest_total_path != NULL) + { + List *pathkeys; + + /* This should have been checked previously */ + Assert(parse->hasAggs || parse->groupClause); + + pathkeys = make_pathkeys_for_sortclauses(root, + root->processed_groupClause, + root->processed_tlist); + + add_path(partially_grouped_rel, (Path *) + create_agg_path(root, + partially_grouped_rel, + cheapest_total_path, + partially_grouped_rel->reltarget, + AGG_INDEX, + AGGSPLIT_INITIAL_SERIAL, + root->processed_groupClause, + NIL, + pathkeys, 
+ agg_partial_costs, + dNumPartialGroups)); + } + + /* + * Now add a partially-grouped IndexAgg partial Path where possible + */ + if (can_index && cheapest_partial_path != NULL) + { + List *pathkeys; + + /* This should have been checked previously */ + Assert(parse->hasAggs || parse->groupClause); + + pathkeys = make_pathkeys_for_sortclauses(root, + root->processed_groupClause, + root->processed_tlist); + add_partial_path(partially_grouped_rel, (Path *) + create_agg_path(root, + partially_grouped_rel, + cheapest_partial_path, + partially_grouped_rel->reltarget, + AGG_INDEX, + AGGSPLIT_INITIAL_SERIAL, + root->processed_groupClause, + NIL, + pathkeys, agg_partial_costs, dNumPartialPartialGroups)); } @@ -8795,6 +8918,7 @@ create_final_unique_paths(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_SIMPLE, groupClause, NIL, + NIL, NULL, unique_rel->rows); @@ -8937,6 +9061,7 @@ create_partial_unique_paths(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_SIMPLE, groupClause, NIL, + NIL, NULL, partial_unique_rel->rows); diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index a01b02f3a7b6..de6a1558044a 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -949,6 +949,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, AGGSPLIT_SIMPLE, groupList, NIL, + NIL, NULL, dNumGroups); add_path(result_rel, path); @@ -965,6 +966,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, AGGSPLIT_SIMPLE, groupList, NIL, + NIL, NULL, dNumGroups); add_path(result_rel, path); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index b6be4ddbd01b..646762be43b7 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2988,6 +2988,7 @@ create_unique_path(PlannerInfo *root, * 'aggsplit' is the Agg node's aggregate-splitting mode * 'groupClause' is a list of SortGroupClause's representing the 
grouping * 'qual' is the HAVING quals if any + * 'pathkeys' for AGG_INDEX must be a list of PathKey used by this agg node * 'aggcosts' contains cost info about the aggregate functions to be computed * 'numGroups' is the estimated number of groups (1 if not grouping) */ @@ -3000,6 +3001,7 @@ create_agg_path(PlannerInfo *root, AggSplit aggsplit, List *groupClause, List *qual, + List *pathkeys, const AggClauseCosts *aggcosts, double numGroups) { @@ -3030,6 +3032,21 @@ create_agg_path(PlannerInfo *root, else pathnode->path.pathkeys = subpath->pathkeys; /* preserves order */ } + else if (aggstrategy == AGG_INDEX) + { + /* + * For IndexAgg we also must know used ordering just like for GroupAgg, + * but for the latter this information is passed by child node, i.e. + * Sort. But here we can not use make_pathkeys_for_sortclauses, because + * in case of partial aggregates the node will contain different target + * list and sortgroupref indexes, so this function will not find required + * entries. So caller must build pathkeys for us. + * + * NOTE: pathkeys CAN be NIL, i.e. if planner decided that all values + * are same constant. 
+ */ + pathnode->path.pathkeys = pathkeys; + } else pathnode->path.pathkeys = NIL; /* output is unordered */ diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index ac0c7c36c561..76fad99bbf2e 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -877,6 +877,13 @@ boot_val => 'true', }, +{ name => 'enable_indexagg', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of index aggregation plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_indexagg', + boot_val => 'true', +}, + { name => 'enable_indexonlyscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', short_desc => 'Enables the planner\'s use of index-only-scan plans.', flags => 'GUC_EXPLAIN', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index dc9e2255f8a7..307b9ee660db 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -410,6 +410,7 @@ #enable_hashagg = on #enable_hashjoin = on #enable_incremental_sort = on +#enable_indexagg = on #enable_indexscan = on #enable_indexonlyscan = on #enable_material = on diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 88ae529e8431..fc3497077780 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -1900,6 +1900,7 @@ static void inittapestate(Tuplesortstate *state, int maxTapes) { int64 tapeSpace; + Size memtuplesSize; /* * Decrease availMem to reflect the space needed for tape buffers; but @@ -1912,7 +1913,16 @@ inittapestate(Tuplesortstate *state, int maxTapes) */ tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; - if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + /* + * In merge state during initial run creation we do not use in-memory + * tuples array 
and write to tapes directly. + */ + if (state->memtuples != NULL) + memtuplesSize = GetMemoryChunkSpace(state->memtuples); + else + memtuplesSize = 0; + + if (tapeSpace + memtuplesSize < state->allowedMem) USEMEM(state, tapeSpace); /* @@ -2031,11 +2041,14 @@ mergeruns(Tuplesortstate *state) /* * We no longer need a large memtuples array. (We will allocate a smaller - * one for the heap later.) + * one for the heap later.) Note that in merge state this array can be NULL. */ - FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); - pfree(state->memtuples); - state->memtuples = NULL; + if (state->memtuples) + { + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + } /* * Initialize the slab allocator. We need one slab slot per input tape, @@ -3157,3 +3170,189 @@ ssup_datum_int32_cmp(Datum x, Datum y, SortSupport ssup) else return 0; } + +/* + * tuplemerge_begin_common + * + * Create new Tuplesortstate for performing merge only. This is used when + * we know, that input is sorted, but stored in multiple tapes, so only + * have to perform merge. + * + * Unlike tuplesort_begin_common it does not accept sortopt, because none + * of current options are supported by merge (random access and bounded sort). + */ +Tuplesortstate * +tuplemerge_begin_common(int workMem, SortCoordinate coordinate) +{ + Tuplesortstate *state; + MemoryContext maincontext; + MemoryContext sortcontext; + MemoryContext oldcontext; + + /* + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. + */ + maincontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleMerge main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create a working memory context for one sort operation. The content of + * this context is deleted by tuplesort_reset. 
+ */ + sortcontext = AllocSetContextCreate(maincontext, + "TupleMerge merge", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the Tuplesortstate within the per-sortstate context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(maincontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + + if (trace_sort) + pg_rusage_init(&state->ru_start); + + state->base.sortopt = TUPLESORT_NONE; + state->base.tuples = true; + state->abbrevNext = 10; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->base.sortcontext = sortcontext; + state->base.maincontext = maincontext; + + /* + * After all of the other non-parallel-related state, we setup all of the + * state needed for each batch. + */ + + /* + * Merging does not accept RANDOMACCESS, so the only possible context is Bump, + * which saves some cycles. + */ + state->base.tuplecontext = BumpContextCreate(state->base.sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + state->status = TSS_BUILDRUNS; + state->bounded = false; + state->boundUsed = false; + state->availMem = state->allowedMem; + + /* + * When performing merge we do not need the in-memory array for sorting. + * memtuples is intentionally left NULL (it is never allocated); the + * related bookkeeping fields are still initialized so that code inspecting + * the state sees consistent values. + */ + state->memtuples = NULL; + state->memtupcount = 0; + state->memtupsize = INITIAL_MEMTUPSIZE; + state->growmemtuples = true; + state->slabAllocatorUsed = false; + + /* + * Tape variables (inputTapes, outputTapes, etc.) will be initialized by + * inittapes(), if needed. 
+ */ + state->result_tape = NULL; /* flag that result tape has not been formed */ + state->tapeset = NULL; + + inittapes(state, true); + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +void +tuplemerge_start_run(Tuplesortstate *state) +{ + if (state->memtupcount == 0) + return; + + selectnewtape(state); + state->memtupcount = 0; +} + +void +tuplemerge_performmerge(Tuplesortstate *state) +{ + if (state->memtupcount == 0) + { + /* + * We have started a new run, but no tuples were written. mergeruns + * expects that each run has at least 1 tuple, otherwise it + * will fail to even fill the initial merge heap. 
+ */ + state->nOutputRuns--; + } + else + state->memtupcount = 0; + + mergeruns(state); + + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; +} + +void +tuplemerge_puttuple_common(Tuplesortstate *state, SortTuple *tuple, Size tuplen) +{ + MemoryContext oldcxt = MemoryContextSwitchTo(state->base.sortcontext); + + Assert(state->destTape); + WRITETUP(state, state->destTape, tuple); + + MemoryContextSwitchTo(oldcxt); + + state->memtupcount++; +} + +void +tuplemerge_end_run(Tuplesortstate *state) +{ + if (state->memtupcount != 0) + { + markrunend(state->destTape); + } +} diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index a1f5c19ee976..96cc66900fa9 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -2070,3 +2070,108 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup, if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? 
*/ LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); } + +Tuplesortstate * +tuplemerge_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate) +{ + Tuplesortstate *state = tuplemerge_begin_common(workMem, coordinate); + TuplesortPublic *base = TuplesortstateGetPublic(state); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(base->maincontext); + + Assert(nkeys > 0); + + if (trace_sort) + elog(LOG, + "begin tuple merge: nkeys = %d, workMem = %d", nkeys, workMem); + + base->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + false, + PARALLEL_SORT(coordinate)); + + base->removeabbrev = removeabbrev_heap; + base->comparetup = comparetup_heap; + base->comparetup_tiebreak = comparetup_heap_tiebreak; + base->writetup = writetup_heap; + base->readtup = readtup_heap; + base->haveDatum1 = true; + base->arg = tupDesc; /* assume we need not copy tupDesc */ + + /* Prepare SortSupport data for each column */ + base->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = base->sortKeys + i; + + Assert(attNums[i] != 0); + Assert(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0 && base->haveDatum1); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. 
+ */ + if (nkeys == 1 && !base->sortKeys->abbrev_converter) + base->onlyKey = base->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +void +tuplemerge_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + TuplesortPublic *base = TuplesortstateGetPublic(state); + MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext); + TupleDesc tupDesc = (TupleDesc) base->arg; + SortTuple stup; + MinimalTuple tuple; + HeapTupleData htup; + Size tuplen; + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup.tuple = tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup.datum1 = heap_getattr(&htup, + base->sortKeys[0].ssup_attno, + tupDesc, + &stup.isnull1); + + /* GetMemoryChunkSpace is not supported for bump contexts */ + if (TupleSortUseBumpTupleCxt(base->sortopt)) + tuplen = MAXALIGN(tuple->t_len); + else + tuplen = GetMemoryChunkSpace(tuple); + + tuplemerge_puttuple_common(state, &stup, tuplen); + + MemoryContextSwitchTo(oldcontext); +} + diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 7cd6a49309f0..57e53d94a174 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -198,6 +198,71 @@ TupleHashEntryGetAdditional(TupleHashTable hashtable, TupleHashEntry entry) } #endif +extern TupleIndex BuildTupleIndex(TupleDesc inputDesc, + int nkeys, + AttrNumber *attNums, + Oid *sortOperators, + Oid *sortCollations, + bool *nullsFirstFlags, + Size additionalsize, + MemoryContext metacxt, + MemoryContext tablecxt, + MemoryContext nodecxt); +extern TupleIndexEntry TupleIndexLookup(TupleIndex index, TupleTableSlot *search, + bool *is_new); +extern void ResetTupleIndex(TupleIndex index); + +/* + * Start iteration over tuples in index. Supports only ascending direction. 
+ * During iterations no modifications are allowed, because it can break iterator. + */ +extern void InitTupleIndexIterator(TupleIndex index, TupleIndexIterator iter); +extern TupleIndexEntry TupleIndexIteratorNext(TupleIndexIterator iter); +static inline void +ResetTupleIndexIterator(TupleIndex index, TupleIndexIterator iter) +{ + InitTupleIndexIterator(index, iter); +} + +#ifndef FRONTEND + +/* + * Return size of the index entry. Useful for estimating memory usage. + */ +static inline size_t +TupleIndexEntrySize(void) +{ + return sizeof(TupleIndexEntryData); +} + +/* + * Get a pointer to the additional space allocated for this entry. The + * memory will be maxaligned and zeroed. + * + * The amount of space available is the additionalsize requested in the call + * to BuildTupleIndex(). If additionalsize was specified as zero, return + * NULL. + */ +static inline void * +TupleIndexEntryGetAdditional(TupleIndex index, TupleIndexEntry entry) +{ +if (index->additionalsize > 0) + return (char *) (entry->tuple) - index->additionalsize; +else + return NULL; +} + +/* + * Return tuple from index entry + */ +static inline MinimalTuple +TupleIndexEntryGetMinimalTuple(TupleIndexEntry entry) +{ + return entry->tuple; +} + +#endif + /* * prototypes from functions in execJunk.c */ @@ -328,8 +393,16 @@ extern ExprState *ExecInitExprWithParams(Expr *node, ParamListInfo ext_params); extern ExprState *ExecInitQual(List *qual, PlanState *parent); extern ExprState *ExecInitCheck(List *qual, PlanState *parent); extern List *ExecInitExprList(List *nodes, PlanState *parent); + +/* + * Which strategy to use for aggregation/grouping + */ +#define GROUPING_STRATEGY_SORT 1 +#define GROUPING_STRATEGY_HASH (1 << 1) +#define GROUPING_STRATEGY_INDEX (1 << 2) + extern ExprState *ExecBuildAggTrans(AggState *aggstate, struct AggStatePerPhaseData *phase, - bool doSort, bool doHash, bool nullcheck); + int groupStrategy, bool nullcheck); extern ExprState *ExecBuildHash32FromAttrs(TupleDesc desc, const 
TupleTableSlotOps *ops, FmgrInfo *hashfunctions, diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h index df52fa1e1afc..a8deeb4d1b95 100644 --- a/src/include/executor/nodeAgg.h +++ b/src/include/executor/nodeAgg.h @@ -321,6 +321,33 @@ typedef struct AggStatePerHashData Agg *aggnode; /* original Agg node, for numGroups etc. */ } AggStatePerHashData; +/* + * AggStatePerIndexData - per-index state + * + * Logic is the same as for AggStatePerHashData - one of these for each + * grouping set. + */ +typedef struct AggStatePerIndexData +{ + TupleIndex index; /* current in-memory index data */ + MemoryContext metacxt; /* memory context containing TupleIndex */ + MemoryContext tempctx; /* short-lived context */ + TupleTableSlot *indexslot; /* slot for loading index */ + int numCols; /* total number of columns in index tuple */ + int numKeyCols; /* number of key columns in index tuple */ + int largestGrpColIdx; /* largest col required for comparison */ + AttrNumber *idxKeyColIdxInput; /* key column indices in input slot */ + AttrNumber *idxKeyColIdxIndex; /* key column indices in index tuples */ + TupleIndexIteratorData iter; /* iterator state for index */ + Agg *aggnode; /* original Agg node, for numGroups etc. 
*/ + + /* state used only for spill mode */ + AttrNumber *idxKeyColIdxTL; /* key column indices in target list */ + FmgrInfo *hashfunctions; /* tuple hashing function */ + ExprState *indexhashexpr; /* ExprState for hashing index datatype(s) */ + ExprContext *exprcontext; /* expression context */ + TupleTableSlot *mergeslot; /* slot for loading tuple during merge */ +} AggStatePerIndexData; extern AggState *ExecInitAgg(Agg *node, EState *estate, int eflags); extern void ExecEndAgg(AggState *node); @@ -328,9 +355,9 @@ extern void ExecReScanAgg(AggState *node); extern Size hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace); -extern void hash_agg_set_limits(double hashentrysize, double input_groups, - int used_bits, Size *mem_limit, - uint64 *ngroups_limit, int *num_partitions); +extern void agg_set_limits(double hashentrysize, double input_groups, + int used_bits, Size *mem_limit, + uint64 *ngroups_limit, int *num_partitions); /* parallel instrumentation support */ extern void ExecAggEstimate(AggState *node, ParallelContext *pcxt); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 3968429f9919..13ae5ea68545 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -900,7 +900,91 @@ typedef tuplehash_iterator TupleHashIterator; #define ScanTupleHashTable(htable, iter) \ tuplehash_iterate(htable->hashtab, iter) +/* --------------------------------------------------------------- + * Tuple Btree index + * + * All-in-memory tuple Btree index used for grouping and aggregating. + * --------------------------------------------------------------- + */ + +/* + * Representation of tuple in index. It stores both tuple and + * first key information. If key abbreviation is used, then this + * first key stores abbreviated key. 
+ */ +typedef struct TupleIndexEntryData +{ + MinimalTuple tuple; /* actual stored tuple */ + Datum key1; /* value of first key */ + bool isnull1; /* first key is null */ +} TupleIndexEntryData; + +typedef TupleIndexEntryData *TupleIndexEntry; + +/* + * Btree node of tuple index. Common for both internal and leaf nodes. + */ +typedef struct TupleIndexNodeData +{ + /* number of tuples in the node */ + int ntuples; + +/* + * Maximum number of tuples stored in a tuple index node. + * + * NOTE: use 2^n - 1 count, so that all tuples will fully utilize cache lines + * (except the first because of 'ntuples' padding) + */ +#define TUPLE_INDEX_NODE_MAX_ENTRIES 63 + /* + * array of tuples for this page. + * + * for internal node these are separator keys. + * for leaf nodes actual tuples. + */ + TupleIndexEntry tuples[TUPLE_INDEX_NODE_MAX_ENTRIES]; + + /* + * for internal nodes this is an array with size + * TUPLE_INDEX_NODE_MAX_ENTRIES + 1 - pointers to nodes below. + * + * for leaf nodes this is an array of 1 element - pointer to sibling + * node required for iteration + */ + struct TupleIndexNodeData *pointers[FLEXIBLE_ARRAY_MEMBER]; +} TupleIndexNodeData; + +typedef TupleIndexNodeData *TupleIndexNode; + +typedef struct TupleIndexData +{ + TupleDesc tupDesc; /* descriptor for stored tuples */ + TupleIndexNode root; /* root of the tree */ + int height; /* current tree height */ + int ntuples; /* number of tuples in index */ + int nkeys; /* number of keys in tuple */ + SortSupport sortKeys; /* support functions for key comparison */ + MemoryContext tuplecxt; /* memory context containing tuples */ + MemoryContext nodecxt; /* memory context containing index nodes */ + Size additionalsize; /* size of additional data for tuple */ + int abbrevNext; /* next time we should check abbreviation + * optimization efficiency */ + bool abbrevUsed; /* true if key abbreviation optimization + * was ever used */ + Oid abbrevSortOp; /* sort operator for first key */ +} TupleIndexData; + +typedef 
struct TupleIndexData *TupleIndex; + +typedef struct TupleIndexIteratorData +{ + TupleIndexNode cur_leaf; /* current leaf node */ + OffsetNumber cur_idx; /* index of tuple to return next */ +} TupleIndexIteratorData; + +typedef TupleIndexIteratorData *TupleIndexIterator; + /* ---------------------------------------------------------------- * Expression State Nodes * @@ -2529,6 +2613,7 @@ typedef struct AggStatePerTransData *AggStatePerTrans; typedef struct AggStatePerGroupData *AggStatePerGroup; typedef struct AggStatePerPhaseData *AggStatePerPhase; typedef struct AggStatePerHashData *AggStatePerHash; +typedef struct AggStatePerIndexData *AggStatePerIndex; typedef struct AggState { @@ -2544,17 +2629,18 @@ typedef struct AggState AggStatePerAgg peragg; /* per-Aggref information */ AggStatePerTrans pertrans; /* per-Trans state information */ ExprContext *hashcontext; /* econtexts for long-lived data (hashtable) */ + ExprContext *indexcontext; /* econtexts for long-lived data (index) */ ExprContext **aggcontexts; /* econtexts for long-lived data (per GS) */ ExprContext *tmpcontext; /* econtext for input expressions */ -#define FIELDNO_AGGSTATE_CURAGGCONTEXT 14 +#define FIELDNO_AGGSTATE_CURAGGCONTEXT 15 ExprContext *curaggcontext; /* currently active aggcontext */ AggStatePerAgg curperagg; /* currently active aggregate, if any */ -#define FIELDNO_AGGSTATE_CURPERTRANS 16 +#define FIELDNO_AGGSTATE_CURPERTRANS 17 AggStatePerTrans curpertrans; /* currently active trans state, if any */ bool input_done; /* indicates end of input */ bool agg_done; /* indicates completion of Agg scan */ int projected_set; /* The last projected grouping set */ -#define FIELDNO_AGGSTATE_CURRENT_SET 20 +#define FIELDNO_AGGSTATE_CURRENT_SET 21 int current_set; /* The current grouping set being evaluated */ Bitmapset *grouped_cols; /* grouped cols in current projection */ List *all_grouped_cols; /* list of all grouped cols in DESC order */ @@ -2576,32 +2662,43 @@ typedef struct AggState int 
num_hashes; MemoryContext hash_metacxt; /* memory for hash table bucket array */ MemoryContext hash_tuplescxt; /* memory for hash table tuples */ - struct LogicalTapeSet *hash_tapeset; /* tape set for hash spill tapes */ - struct HashAggSpill *hash_spills; /* HashAggSpill for each grouping set, - * exists only during first pass */ - TupleTableSlot *hash_spill_rslot; /* for reading spill files */ - TupleTableSlot *hash_spill_wslot; /* for writing spill files */ - List *hash_batches; /* hash batches remaining to be processed */ - bool hash_ever_spilled; /* ever spilled during this execution? */ - bool hash_spill_mode; /* we hit a limit during the current batch - * and we must not create new groups */ - Size hash_mem_limit; /* limit before spilling hash table */ - uint64 hash_ngroups_limit; /* limit before spilling hash table */ - int hash_planned_partitions; /* number of partitions planned - * for first pass */ - double hashentrysize; /* estimate revised during execution */ - Size hash_mem_peak; /* peak hash table memory usage */ - uint64 hash_ngroups_current; /* number of groups currently in - * memory in all hash tables */ - uint64 hash_disk_used; /* kB of disk space used */ - int hash_batches_used; /* batches used during entire execution */ - AggStatePerHash perhash; /* array of per-hashtable data */ AggStatePerGroup *hash_pergroup; /* grouping set indexed array of * per-group pointers */ + /* Fields used for managing spill mode in hash and index aggs */ + struct LogicalTapeSet *spill_tapeset; /* tape set for hash spill tapes */ + struct HashAggSpill *spills; /* HashAggSpill for each grouping set, + * exists only during first pass */ + TupleTableSlot *spill_rslot; /* for reading spill files */ + TupleTableSlot *spill_wslot; /* for writing spill files */ + List *spill_batches; /* hash batches remaining to be processed */ + + bool spill_ever_happened; /* ever spilled during this execution? 
*/ + bool spill_mode; /* we hit a limit during the current batch + * and we must not create new groups */ + Size spill_mem_limit; /* limit before spilling hash table or index */ + uint64 spill_ngroups_limit; /* limit before spilling hash table or index */ + int spill_planned_partitions; /* number of partitions planned + * for first pass */ + double hashentrysize; /* estimate revised during execution */ + Size spill_mem_peak; /* peak memory usage of hash table or index */ + uint64 spill_ngroups_current; /* number of groups currently in + * memory in all hash tables */ + uint64 spill_disk_used; /* kB of disk space used */ + int spill_batches_used; /* batches used during entire execution */ + + /* these fields are used in AGG_INDEXED mode: */ + AggStatePerIndex perindex; /* pointer to per-index state data */ + bool index_filled; /* index filled yet? */ + MemoryContext index_metacxt; /* memory for index structure */ + MemoryContext index_nodecxt; /* memory for index nodes */ + MemoryContext index_entrycxt; /* memory for index entries */ + Sort *index_sort; /* ordering information for index */ + Tuplesortstate *mergestate; /* state for merging projected tuples if + * spill occurred */ /* support for evaluation of agg input expressions: */ -#define FIELDNO_AGGSTATE_ALL_PERGROUPS 54 +#define FIELDNO_AGGSTATE_ALL_PERGROUPS 62 AggStatePerGroup *all_pergroups; /* array of first ->pergroups, than * ->hash_pergroup */ SharedAggInfo *shared_info; /* one entry per worker */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index fb3957e75e5f..b0e2d781c01d 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -365,6 +365,7 @@ typedef enum AggStrategy AGG_SORTED, /* grouped agg, input must be sorted */ AGG_HASHED, /* grouped agg, use internal hashtable */ AGG_MIXED, /* grouped agg, hash and sort both used */ + AGG_INDEX, /* grouped agg, build index for input */ } AggStrategy; /* diff --git a/src/include/nodes/pathnodes.h 
b/src/include/nodes/pathnodes.h index b5ff456ef7fa..f09f793a7174 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -3518,7 +3518,8 @@ typedef struct JoinPathExtraData */ #define GROUPING_CAN_USE_SORT 0x0001 #define GROUPING_CAN_USE_HASH 0x0002 -#define GROUPING_CAN_PARTIAL_AGG 0x0004 +#define GROUPING_CAN_USE_INDEX 0x0004 +#define GROUPING_CAN_PARTIAL_AGG 0x0008 /* * What kind of partitionwise aggregation is in use? diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index c4393a943211..b19dacf5de48 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1219,7 +1219,7 @@ typedef struct Agg /* grouping sets to use */ List *groupingSets; - /* chained Agg/Sort nodes */ + /* chained Agg/Sort nodes, for AGG_INDEX contains single Sort node */ List *chain; } Agg; diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index b523bcda8f3d..5d03b5971bdf 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -57,6 +57,7 @@ extern PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; extern PGDLLIMPORT bool enable_incremental_sort; extern PGDLLIMPORT bool enable_hashagg; +extern PGDLLIMPORT bool enable_indexagg; extern PGDLLIMPORT bool enable_nestloop; extern PGDLLIMPORT bool enable_material; extern PGDLLIMPORT bool enable_memoize; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 6b010f0b1a5a..a2aad4ecba78 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -235,6 +235,7 @@ extern AggPath *create_agg_path(PlannerInfo *root, AggSplit aggsplit, List *groupClause, List *qual, + List *pathkeys, const AggClauseCosts *aggcosts, double numGroups); extern GroupingSetsPath *create_groupingsets_path(PlannerInfo *root, diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 63a7cc13a31c..c0019ba425a3 100644 --- a/src/include/utils/tuplesort.h +++ 
b/src/include/utils/tuplesort.h @@ -475,6 +475,21 @@ extern GinTuple *tuplesort_getgintuple(Tuplesortstate *state, Size *len, bool forward); extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward, bool copy, Datum *val, bool *isNull, Datum *abbrev); - +/* +* Special state for merge mode. +*/ +extern Tuplesortstate *tuplemerge_begin_common(int workMem, + SortCoordinate coordinate); +extern Tuplesortstate *tuplemerge_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate); +extern void tuplemerge_start_run(Tuplesortstate *state); +extern void tuplemerge_end_run(Tuplesortstate *state); +extern void tuplemerge_puttuple_common(Tuplesortstate *state, SortTuple *tuple, + Size tuplen); +extern void tuplemerge_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot); +extern void tuplemerge_performmerge(Tuplesortstate *state); #endif /* TUPLESORT_H */ diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index cae8e7bca313..afe01f5da854 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1533,7 +1533,7 @@ explain (costs off) select * from t1 group by a,b,c,d; explain (costs off) select * from only t1 group by a,b,c,d; QUERY PLAN ---------------------- - HashAggregate + IndexAggregate Group Key: a, b -> Seq Scan on t1 (3 rows) @@ -3270,6 +3270,7 @@ FROM generate_series(1, 100) AS i; CREATE INDEX btg_x_y_idx ON btg(x, y); ANALYZE btg; SET enable_hashagg = off; +SET enable_indexagg = off; SET enable_seqscan = off; -- Utilize the ordering of index scan to avoid a Sort operation EXPLAIN (COSTS OFF) @@ -3707,10 +3708,242 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*) ba | 0 | 1 (2 rows) + +-- +-- Index Aggregation tests +-- +set enable_hashagg = false; +set enable_sort = false; +set enable_indexagg = true; +set enable_indexscan = 
false; +-- require ordered output +EXPLAIN (COSTS OFF, VERBOSE) +SELECT unique1, SUM(two) FROM tenk1 +GROUP BY 1 +ORDER BY 1 +LIMIT 10; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit + Output: unique1, (sum(two)) + -> IndexAggregate + Output: unique1, sum(two) + Group Key: tenk1.unique1 + -> Seq Scan on public.tenk1 + Output: unique1, unique2, two, four, ten, twenty, hundred, thousand, twothousand, fivethous, tenthous, odd, even, stringu1, stringu2, string4 +(7 rows) + +SELECT unique1, SUM(two) FROM tenk1 +GROUP BY 1 +ORDER BY 1 +LIMIT 10; + unique1 | sum +---------+----- + 0 | 0 + 1 | 1 + 2 | 0 + 3 | 1 + 4 | 0 + 5 | 1 + 6 | 0 + 7 | 1 + 8 | 0 + 9 | 1 +(10 rows) + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT even, sum(two) FROM tenk1 +GROUP BY 1 +ORDER BY 1 +LIMIT 10; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit + Output: even, (sum(two)) + -> IndexAggregate + Output: even, sum(two) + Group Key: tenk1.even + -> Seq Scan on public.tenk1 + Output: unique1, unique2, two, four, ten, twenty, hundred, thousand, twothousand, fivethous, tenthous, odd, even, stringu1, stringu2, string4 +(7 rows) + +SELECT even, sum(two) FROM tenk1 +GROUP BY 1 +ORDER BY 1 +LIMIT 10; + even | sum +------+----- + 1 | 0 + 3 | 100 + 5 | 0 + 7 | 100 + 9 | 0 + 11 | 100 + 13 | 0 + 15 | 100 + 17 | 0 + 19 | 100 +(10 rows) + +-- multiple grouping columns +EXPLAIN (COSTS OFF, VERBOSE) +SELECT even, odd, sum(unique1) FROM tenk1 +GROUP BY 1, 2 +ORDER BY 1, 2 +LIMIT 10; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit + Output: even, odd, (sum(unique1)) + -> IndexAggregate + Output: even, odd, sum(unique1) + Group Key: 
tenk1.even, tenk1.odd + -> Seq Scan on public.tenk1 + Output: unique1, unique2, two, four, ten, twenty, hundred, thousand, twothousand, fivethous, tenthous, odd, even, stringu1, stringu2, string4 +(7 rows) + +SELECT even, odd, sum(unique1) FROM tenk1 +GROUP BY 1, 2 +ORDER BY 1, 2 +LIMIT 10; + even | odd | sum +------+-----+-------- + 1 | 0 | 495000 + 3 | 2 | 495100 + 5 | 4 | 495200 + 7 | 6 | 495300 + 9 | 8 | 495400 + 11 | 10 | 495500 + 13 | 12 | 495600 + 15 | 14 | 495700 + 17 | 16 | 495800 + 19 | 18 | 495900 +(10 rows) + +-- mixing columns between group by and order by +begin; +create temp table tmp(x int, y int); +insert into tmp values (1, 8), (2, 7), (3, 6), (4, 5); +EXPLAIN (COSTS OFF, VERBOSE) +SELECT x, y, sum(x) FROM tmp +GROUP BY 1, 2 +ORDER BY 1, 2; + QUERY PLAN +------------------------------- + IndexAggregate + Output: x, y, sum(x) + Group Key: tmp.x, tmp.y + -> Seq Scan on pg_temp.tmp + Output: x, y +(5 rows) + +SELECT x, y, sum(x) FROM tmp +GROUP BY 1, 2 +ORDER BY 1, 2; + x | y | sum +---+---+----- + 1 | 8 | 1 + 2 | 7 | 2 + 3 | 6 | 3 + 4 | 5 | 4 +(4 rows) + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT x, y, sum(x) FROM tmp +GROUP BY 1, 2 +ORDER BY 2, 1; + QUERY PLAN +------------------------------- + IndexAggregate + Output: x, y, sum(x) + Group Key: tmp.y, tmp.x + -> Seq Scan on pg_temp.tmp + Output: x, y +(5 rows) + +SELECT x, y, sum(x) FROM tmp +GROUP BY 1, 2 +ORDER BY 2, 1; + x | y | sum +---+---+----- + 4 | 5 | 4 + 3 | 6 | 3 + 2 | 7 | 2 + 1 | 8 | 1 +(4 rows) + +-- +-- Index Aggregation Spill tests +-- +set enable_indexagg = true; +set enable_sort=false; +set enable_hashagg = false; +set work_mem='64kB'; +select unique1, count(*), sum(twothousand) from tenk1 +group by unique1 +having sum(fivethous) > 4975 +order by sum(twothousand); + unique1 | count | sum +---------+-------+------ + 4976 | 1 | 976 + 4977 | 1 | 977 + 4978 | 1 | 978 + 4979 | 1 | 979 + 4980 | 1 | 980 + 4981 | 1 | 981 + 4982 | 1 | 982 + 4983 | 1 | 983 + 4984 | 1 | 984 + 4985 | 1 | 985 + 4986 
| 1 | 986 + 4987 | 1 | 987 + 4988 | 1 | 988 + 4989 | 1 | 989 + 4990 | 1 | 990 + 4991 | 1 | 991 + 4992 | 1 | 992 + 4993 | 1 | 993 + 4994 | 1 | 994 + 4995 | 1 | 995 + 4996 | 1 | 996 + 4997 | 1 | 997 + 4998 | 1 | 998 + 4999 | 1 | 999 + 9976 | 1 | 1976 + 9977 | 1 | 1977 + 9978 | 1 | 1978 + 9979 | 1 | 1979 + 9980 | 1 | 1980 + 9981 | 1 | 1981 + 9982 | 1 | 1982 + 9983 | 1 | 1983 + 9984 | 1 | 1984 + 9985 | 1 | 1985 + 9986 | 1 | 1986 + 9987 | 1 | 1987 + 9988 | 1 | 1988 + 9989 | 1 | 1989 + 9990 | 1 | 1990 + 9991 | 1 | 1991 + 9992 | 1 | 1992 + 9993 | 1 | 1993 + 9994 | 1 | 1994 + 9995 | 1 | 1995 + 9996 | 1 | 1996 + 9997 | 1 | 1997 + 9998 | 1 | 1998 + 9999 | 1 | 1999 +(48 rows) + +set work_mem to default; +set enable_sort to default; +set enable_hashagg to default; +set enable_indexagg to default; -- -- Hash Aggregation Spill tests -- set enable_sort=false; +set enable_indexagg = false; set work_mem='64kB'; select unique1, count(*), sum(twothousand) from tenk1 group by unique1 @@ -3783,6 +4016,7 @@ select g from generate_series(0, 19999) g; analyze agg_data_20k; -- Produce results with sorting. 
set enable_hashagg = false; +set enable_indexagg = false; set jit_above_cost = 0; explain (costs off) select g%10000 as c1, sum(g::numeric) as c2, count(*) as c3 @@ -3852,31 +4086,74 @@ select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3 from agg_data_2k group by g/2; set enable_sort = true; set work_mem to default; +-- Produce results with index aggregation +set enable_sort = false; +set enable_hashagg = false; +set enable_indexagg = true; +set jit_above_cost = 0; +explain (costs off) +select g%10000 as c1, sum(g::numeric) as c2, count(*) as c3 + from agg_data_20k group by g%10000; + QUERY PLAN +-------------------------------- + IndexAggregate + Group Key: (g % 10000) + -> Seq Scan on agg_data_20k +(3 rows) + +create table agg_index_1 as +select g%10000 as c1, sum(g::numeric) as c2, count(*) as c3 + from agg_data_20k group by g%10000; +create table agg_index_2 as +select * from + (values (100), (300), (500)) as r(a), + lateral ( + select (g/2)::numeric as c1, + array_agg(g::numeric) as c2, + count(*) as c3 + from agg_data_2k + where g < r.a + group by g/2) as s; +set jit_above_cost to default; +create table agg_index_3 as +select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3 + from agg_data_2k group by g/2; +create table agg_index_4 as +select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3 + from agg_data_2k group by g/2; -- Compare group aggregation results to hash aggregation results (select * from agg_hash_1 except select * from agg_group_1) union all -(select * from agg_group_1 except select * from agg_hash_1); +(select * from agg_group_1 except select * from agg_hash_1) + union all +(select * from agg_index_1 except select * from agg_group_1); c1 | c2 | c3 ----+----+---- (0 rows) (select * from agg_hash_2 except select * from agg_group_2) union all -(select * from agg_group_2 except select * from agg_hash_2); +(select * from agg_group_2 except select * from agg_hash_2) + union all +(select * from agg_index_2 
except select * from agg_group_2); a | c1 | c2 | c3 ---+----+----+---- (0 rows) (select * from agg_hash_3 except select * from agg_group_3) union all -(select * from agg_group_3 except select * from agg_hash_3); +(select * from agg_group_3 except select * from agg_hash_3) + union all +(select * from agg_index_3 except select * from agg_group_3); c1 | c2 | c3 ----+----+---- (0 rows) (select * from agg_hash_4 except select * from agg_group_4) union all -(select * from agg_group_4 except select * from agg_hash_4); +(select * from agg_group_4 except select * from agg_hash_4) + union all +(select * from agg_index_4 except select * from agg_group_4); c1 | c2 | c3 ----+----+---- (0 rows) @@ -3889,3 +4166,7 @@ drop table agg_hash_1; drop table agg_hash_2; drop table agg_hash_3; drop table agg_hash_4; +drop table agg_index_1; +drop table agg_index_2; +drop table agg_index_3; +drop table agg_index_4; diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 8023014fe637..c62e312175c5 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -2395,8 +2395,8 @@ SELECT upper(c collate case_insensitive), count(c) FROM pagg_tab3 GROUP BY c col -------------------------------------------------------------- Sort Sort Key: (upper(pagg_tab3.c)) COLLATE case_insensitive - -> Finalize HashAggregate - Group Key: pagg_tab3.c + -> Finalize IndexAggregate + Group Key: pagg_tab3.c COLLATE case_insensitive -> Append -> Partial HashAggregate Group Key: pagg_tab3.c @@ -2613,20 +2613,20 @@ INSERT INTO pagg_tab6 (b, c) SELECT substr('cdCD', (i % 4) + 1 , 1), substr('cdC ANALYZE pagg_tab6; EXPLAIN (COSTS OFF) SELECT t1.c, count(t2.c) FROM pagg_tab5 t1 JOIN pagg_tab6 t2 ON t1.c = t2.c AND t1.c = t2.b GROUP BY 1 ORDER BY t1.c COLLATE "C"; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------- Sort 
Sort Key: t1.c COLLATE "C" -> Append - -> HashAggregate - Group Key: t1.c + -> IndexAggregate + Group Key: t1.c COLLATE case_insensitive -> Nested Loop Join Filter: (t1.c = t2.c) -> Seq Scan on pagg_tab6_p1 t2 Filter: (c = b) -> Seq Scan on pagg_tab5_p1 t1 - -> HashAggregate - Group Key: t1_1.c + -> IndexAggregate + Group Key: t1_1.c COLLATE case_insensitive -> Nested Loop Join Filter: (t1_1.c = t2_1.c) -> Seq Scan on pagg_tab6_p2 t2_1 diff --git a/src/test/regress/expected/eager_aggregate.out b/src/test/regress/expected/eager_aggregate.out index 5ac966186f7c..0d4468fa6863 100644 --- a/src/test/regress/expected/eager_aggregate.out +++ b/src/test/regress/expected/eager_aggregate.out @@ -21,27 +21,24 @@ SELECT t1.a, avg(t2.c) FROM eager_agg_t1 t1 JOIN eager_agg_t2 t2 ON t1.b = t2.b GROUP BY t1.a ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------- - Finalize GroupAggregate + QUERY PLAN +------------------------------------------------------------ + Finalize IndexAggregate Output: t1.a, avg(t2.c) Group Key: t1.a - -> Sort + -> Hash Join Output: t1.a, (PARTIAL avg(t2.c)) - Sort Key: t1.a - -> Hash Join - Output: t1.a, (PARTIAL avg(t2.c)) - Hash Cond: (t1.b = t2.b) - -> Seq Scan on public.eager_agg_t1 t1 - Output: t1.a, t1.b, t1.c - -> Hash - Output: t2.b, (PARTIAL avg(t2.c)) - -> Partial HashAggregate - Output: t2.b, PARTIAL avg(t2.c) - Group Key: t2.b - -> Seq Scan on public.eager_agg_t2 t2 - Output: t2.a, t2.b, t2.c -(18 rows) + Hash Cond: (t1.b = t2.b) + -> Seq Scan on public.eager_agg_t1 t1 + Output: t1.a, t1.b, t1.c + -> Hash + Output: t2.b, (PARTIAL avg(t2.c)) + -> Partial HashAggregate + Output: t2.b, PARTIAL avg(t2.c) + Group Key: t2.b + -> Seq Scan on public.eager_agg_t2 t2 + Output: t2.a, t2.b, t2.c +(15 rows) SELECT t1.a, avg(t2.c) FROM eager_agg_t1 t1 @@ -62,6 +59,7 @@ GROUP BY t1.a ORDER BY t1.a; -- Produce results with sorting aggregation SET enable_hashagg TO off; +SET enable_indexagg TO off; EXPLAIN (VERBOSE, 
COSTS OFF) SELECT t1.a, avg(t2.c) FROM eager_agg_t1 t1 @@ -110,6 +108,53 @@ GROUP BY t1.a ORDER BY t1.a; (9 rows) RESET enable_hashagg; +RESET enable_indexagg; +-- Produce results with index aggregation +SET enable_hashagg TO off; +SET enable_sort TO off; +EXPLAIN (VERBOSE, COSTS OFF) +SELECT t1.a, avg(t2.c) + FROM eager_agg_t1 t1 + JOIN eager_agg_t2 t2 ON t1.b = t2.b +GROUP BY t1.a ORDER BY t1.a; + QUERY PLAN +------------------------------------------------------------ + Finalize IndexAggregate + Output: t1.a, avg(t2.c) + Group Key: t1.a + -> Hash Join + Output: t1.a, (PARTIAL avg(t2.c)) + Hash Cond: (t1.b = t2.b) + -> Seq Scan on public.eager_agg_t1 t1 + Output: t1.a, t1.b, t1.c + -> Hash + Output: t2.b, (PARTIAL avg(t2.c)) + -> Partial IndexAggregate + Output: t2.b, PARTIAL avg(t2.c) + Group Key: t2.b + -> Seq Scan on public.eager_agg_t2 t2 + Output: t2.a, t2.b, t2.c +(15 rows) + +SELECT t1.a, avg(t2.c) + FROM eager_agg_t1 t1 + JOIN eager_agg_t2 t2 ON t1.b = t2.b +GROUP BY t1.a ORDER BY t1.a; + a | avg +---+----- + 1 | 496 + 2 | 497 + 3 | 498 + 4 | 499 + 5 | 500 + 6 | 501 + 7 | 502 + 8 | 503 + 9 | 504 +(9 rows) + +RESET enable_hashagg; +RESET enable_sort; -- -- Test eager aggregation over join rel -- @@ -121,34 +166,31 @@ SELECT t1.a, avg(t2.c + t3.c) JOIN eager_agg_t2 t2 ON t1.b = t2.b JOIN eager_agg_t3 t3 ON t2.a = t3.a GROUP BY t1.a ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------- - Finalize GroupAggregate + QUERY PLAN +------------------------------------------------------------------------ + Finalize IndexAggregate Output: t1.a, avg((t2.c + t3.c)) Group Key: t1.a - -> Sort + -> Hash Join Output: t1.a, (PARTIAL avg((t2.c + t3.c))) - Sort Key: t1.a - -> Hash Join - Output: t1.a, (PARTIAL avg((t2.c + t3.c))) - Hash Cond: (t1.b = t2.b) - -> Seq Scan on public.eager_agg_t1 t1 - Output: t1.a, t1.b, t1.c - -> Hash - Output: t2.b, (PARTIAL avg((t2.c + t3.c))) - -> Partial HashAggregate - Output: t2.b, 
PARTIAL avg((t2.c + t3.c)) - Group Key: t2.b - -> Hash Join - Output: t2.c, t2.b, t3.c - Hash Cond: (t3.a = t2.a) - -> Seq Scan on public.eager_agg_t3 t3 - Output: t3.a, t3.b, t3.c - -> Hash + Hash Cond: (t1.b = t2.b) + -> Seq Scan on public.eager_agg_t1 t1 + Output: t1.a, t1.b, t1.c + -> Hash + Output: t2.b, (PARTIAL avg((t2.c + t3.c))) + -> Partial HashAggregate + Output: t2.b, PARTIAL avg((t2.c + t3.c)) + Group Key: t2.b + -> Hash Join + Output: t2.c, t2.b, t3.c + Hash Cond: (t3.a = t2.a) + -> Seq Scan on public.eager_agg_t3 t3 + Output: t3.a, t3.b, t3.c + -> Hash + Output: t2.c, t2.b, t2.a + -> Seq Scan on public.eager_agg_t2 t2 Output: t2.c, t2.b, t2.a - -> Seq Scan on public.eager_agg_t2 t2 - Output: t2.c, t2.b, t2.a -(25 rows) +(22 rows) SELECT t1.a, avg(t2.c + t3.c) FROM eager_agg_t1 t1 @@ -170,6 +212,7 @@ GROUP BY t1.a ORDER BY t1.a; -- Produce results with sorting aggregation SET enable_hashagg TO off; +SET enable_indexagg TO off; EXPLAIN (VERBOSE, COSTS OFF) SELECT t1.a, avg(t2.c + t3.c) FROM eager_agg_t1 t1 @@ -227,6 +270,62 @@ GROUP BY t1.a ORDER BY t1.a; (9 rows) RESET enable_hashagg; +RESET enable_indexagg; +-- Produce results with index aggregation +SET enable_hashagg TO off; +SET enable_sort TO off; +EXPLAIN (VERBOSE, COSTS OFF) +SELECT t1.a, avg(t2.c + t3.c) + FROM eager_agg_t1 t1 + JOIN eager_agg_t2 t2 ON t1.b = t2.b + JOIN eager_agg_t3 t3 ON t2.a = t3.a +GROUP BY t1.a ORDER BY t1.a; + QUERY PLAN +------------------------------------------------------------------------ + Finalize IndexAggregate + Output: t1.a, avg((t2.c + t3.c)) + Group Key: t1.a + -> Hash Join + Output: t1.a, (PARTIAL avg((t2.c + t3.c))) + Hash Cond: (t1.b = t2.b) + -> Seq Scan on public.eager_agg_t1 t1 + Output: t1.a, t1.b, t1.c + -> Hash + Output: t2.b, (PARTIAL avg((t2.c + t3.c))) + -> Partial IndexAggregate + Output: t2.b, PARTIAL avg((t2.c + t3.c)) + Group Key: t2.b + -> Hash Join + Output: t2.c, t2.b, t3.c + Hash Cond: (t3.a = t2.a) + -> Seq Scan on public.eager_agg_t3 t3 
+ Output: t3.a, t3.b, t3.c + -> Hash + Output: t2.c, t2.b, t2.a + -> Seq Scan on public.eager_agg_t2 t2 + Output: t2.c, t2.b, t2.a +(22 rows) + +SELECT t1.a, avg(t2.c + t3.c) + FROM eager_agg_t1 t1 + JOIN eager_agg_t2 t2 ON t1.b = t2.b + JOIN eager_agg_t3 t3 ON t2.a = t3.a +GROUP BY t1.a ORDER BY t1.a; + a | avg +---+----- + 1 | 497 + 2 | 499 + 3 | 501 + 4 | 503 + 5 | 505 + 6 | 507 + 7 | 509 + 8 | 511 + 9 | 513 +(9 rows) + +RESET enable_hashagg; +RESET enable_sort; -- -- Test that eager aggregation works for outer join -- @@ -236,27 +335,24 @@ SELECT t1.a, avg(t2.c) FROM eager_agg_t1 t1 RIGHT JOIN eager_agg_t2 t2 ON t1.b = t2.b GROUP BY t1.a ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------- - Finalize GroupAggregate + QUERY PLAN +------------------------------------------------------------ + Finalize IndexAggregate Output: t1.a, avg(t2.c) Group Key: t1.a - -> Sort + -> Hash Right Join Output: t1.a, (PARTIAL avg(t2.c)) - Sort Key: t1.a - -> Hash Right Join - Output: t1.a, (PARTIAL avg(t2.c)) - Hash Cond: (t1.b = t2.b) - -> Seq Scan on public.eager_agg_t1 t1 - Output: t1.a, t1.b, t1.c - -> Hash - Output: t2.b, (PARTIAL avg(t2.c)) - -> Partial HashAggregate - Output: t2.b, PARTIAL avg(t2.c) - Group Key: t2.b - -> Seq Scan on public.eager_agg_t2 t2 - Output: t2.a, t2.b, t2.c -(18 rows) + Hash Cond: (t1.b = t2.b) + -> Seq Scan on public.eager_agg_t1 t1 + Output: t1.a, t1.b, t1.c + -> Hash + Output: t2.b, (PARTIAL avg(t2.c)) + -> Partial HashAggregate + Output: t2.b, PARTIAL avg(t2.c) + Group Key: t2.b + -> Seq Scan on public.eager_agg_t2 t2 + Output: t2.a, t2.b, t2.c +(15 rows) SELECT t1.a, avg(t2.c) FROM eager_agg_t1 t1 @@ -331,30 +427,27 @@ SELECT t1.a, avg(t2.c) FROM eager_agg_t1 t1 JOIN eager_agg_t2 t2 ON t1.b = t2.b GROUP BY t1.a ORDER BY t1.a; - QUERY PLAN ---------------------------------------------------------------------------------- - Finalize GroupAggregate + QUERY PLAN 
+--------------------------------------------------------------------------- + Finalize IndexAggregate Output: t1.a, avg(t2.c) Group Key: t1.a - -> Gather Merge + -> Gather Output: t1.a, (PARTIAL avg(t2.c)) Workers Planned: 2 - -> Sort + -> Parallel Hash Join Output: t1.a, (PARTIAL avg(t2.c)) - Sort Key: t1.a - -> Parallel Hash Join - Output: t1.a, (PARTIAL avg(t2.c)) - Hash Cond: (t1.b = t2.b) - -> Parallel Seq Scan on public.eager_agg_t1 t1 - Output: t1.a, t1.b, t1.c - -> Parallel Hash - Output: t2.b, (PARTIAL avg(t2.c)) - -> Partial HashAggregate - Output: t2.b, PARTIAL avg(t2.c) - Group Key: t2.b - -> Parallel Seq Scan on public.eager_agg_t2 t2 - Output: t2.a, t2.b, t2.c -(21 rows) + Hash Cond: (t1.b = t2.b) + -> Parallel Seq Scan on public.eager_agg_t1 t1 + Output: t1.a, t1.b, t1.c + -> Parallel Hash + Output: t2.b, (PARTIAL avg(t2.c)) + -> Partial HashAggregate + Output: t2.b, PARTIAL avg(t2.c) + Group Key: t2.b + -> Parallel Seq Scan on public.eager_agg_t2 t2 + Output: t2.a, t2.b, t2.c +(18 rows) SELECT t1.a, avg(t2.c) FROM eager_agg_t1 t1 @@ -387,27 +480,24 @@ SELECT t1.a, avg(t2.c) FROM eager_agg_t1 t1 JOIN eager_agg_t2 t2 ON t1.b = t2.b GROUP BY t1.a ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------- - Finalize GroupAggregate + QUERY PLAN +------------------------------------------------------------ + Finalize IndexAggregate Output: t1.a, avg(t2.c) Group Key: t1.a - -> Sort + -> Hash Join Output: t1.a, (PARTIAL avg(t2.c)) - Sort Key: t1.a - -> Hash Join - Output: t1.a, (PARTIAL avg(t2.c)) - Hash Cond: (t1.b = t2.b) - -> Seq Scan on public.eager_agg_t1 t1 - Output: t1.a, t1.b, t1.c - -> Hash - Output: t2.b, (PARTIAL avg(t2.c)) - -> Partial HashAggregate - Output: t2.b, PARTIAL avg(t2.c) - Group Key: t2.b - -> Seq Scan on public.eager_agg_t2 t2 - Output: t2.a, t2.b, t2.c -(18 rows) + Hash Cond: (t1.b = t2.b) + -> Seq Scan on public.eager_agg_t1 t1 + Output: t1.a, t1.b, t1.c + -> Hash + Output: t2.b, (PARTIAL 
avg(t2.c)) + -> Partial HashAggregate + Output: t2.b, PARTIAL avg(t2.c) + Group Key: t2.b + -> Seq Scan on public.eager_agg_t2 t2 + Output: t2.a, t2.b, t2.c +(15 rows) SELECT t1.a, avg(t2.c) FROM eager_agg_t1 t1 @@ -696,79 +786,77 @@ SELECT t1.x, sum(t2.y + t3.y) JOIN eager_agg_tab1 t2 ON t1.x = t2.x JOIN eager_agg_tab1 t3 ON t2.x = t3.x GROUP BY t1.x ORDER BY t1.x; - QUERY PLAN -------------------------------------------------------------------------------------------- - Sort - Output: t1.x, (sum((t2.y + t3.y))) + QUERY PLAN +------------------------------------------------------------------------------------- + Merge Append Sort Key: t1.x - -> Append - -> Finalize HashAggregate - Output: t1.x, sum((t2.y + t3.y)) - Group Key: t1.x - -> Hash Join - Output: t1.x, (PARTIAL sum((t2.y + t3.y))) - Hash Cond: (t1.x = t2.x) - -> Seq Scan on public.eager_agg_tab1_p1 t1 - Output: t1.x - -> Hash - Output: t2.x, t3.x, (PARTIAL sum((t2.y + t3.y))) - -> Partial HashAggregate - Output: t2.x, t3.x, PARTIAL sum((t2.y + t3.y)) - Group Key: t2.x - -> Hash Join - Output: t2.y, t2.x, t3.y, t3.x - Hash Cond: (t2.x = t3.x) - -> Seq Scan on public.eager_agg_tab1_p1 t2 - Output: t2.y, t2.x - -> Hash + -> Finalize IndexAggregate + Output: t1.x, sum((t2.y + t3.y)) + Group Key: t1.x + -> Hash Join + Output: t1.x, (PARTIAL sum((t2.y + t3.y))) + Hash Cond: (t1.x = t2.x) + -> Seq Scan on public.eager_agg_tab1_p1 t1 + Output: t1.x + -> Hash + Output: t2.x, t3.x, (PARTIAL sum((t2.y + t3.y))) + -> Partial HashAggregate + Output: t2.x, t3.x, PARTIAL sum((t2.y + t3.y)) + Group Key: t2.x + -> Hash Join + Output: t2.y, t2.x, t3.y, t3.x + Hash Cond: (t2.x = t3.x) + -> Seq Scan on public.eager_agg_tab1_p1 t2 + Output: t2.y, t2.x + -> Hash + Output: t3.y, t3.x + -> Seq Scan on public.eager_agg_tab1_p1 t3 Output: t3.y, t3.x - -> Seq Scan on public.eager_agg_tab1_p1 t3 - Output: t3.y, t3.x - -> Finalize HashAggregate - Output: t1_1.x, sum((t2_1.y + t3_1.y)) - Group Key: t1_1.x - -> Hash Join - Output: 
t1_1.x, (PARTIAL sum((t2_1.y + t3_1.y))) - Hash Cond: (t1_1.x = t2_1.x) - -> Seq Scan on public.eager_agg_tab1_p2 t1_1 - Output: t1_1.x - -> Hash - Output: t2_1.x, t3_1.x, (PARTIAL sum((t2_1.y + t3_1.y))) - -> Partial HashAggregate - Output: t2_1.x, t3_1.x, PARTIAL sum((t2_1.y + t3_1.y)) - Group Key: t2_1.x - -> Hash Join - Output: t2_1.y, t2_1.x, t3_1.y, t3_1.x - Hash Cond: (t2_1.x = t3_1.x) - -> Seq Scan on public.eager_agg_tab1_p2 t2_1 - Output: t2_1.y, t2_1.x - -> Hash + -> Finalize IndexAggregate + Output: t1_1.x, sum((t2_1.y + t3_1.y)) + Group Key: t1_1.x + -> Hash Join + Output: t1_1.x, (PARTIAL sum((t2_1.y + t3_1.y))) + Hash Cond: (t1_1.x = t2_1.x) + -> Seq Scan on public.eager_agg_tab1_p2 t1_1 + Output: t1_1.x + -> Hash + Output: t2_1.x, t3_1.x, (PARTIAL sum((t2_1.y + t3_1.y))) + -> Partial HashAggregate + Output: t2_1.x, t3_1.x, PARTIAL sum((t2_1.y + t3_1.y)) + Group Key: t2_1.x + -> Hash Join + Output: t2_1.y, t2_1.x, t3_1.y, t3_1.x + Hash Cond: (t2_1.x = t3_1.x) + -> Seq Scan on public.eager_agg_tab1_p2 t2_1 + Output: t2_1.y, t2_1.x + -> Hash + Output: t3_1.y, t3_1.x + -> Seq Scan on public.eager_agg_tab1_p2 t3_1 Output: t3_1.y, t3_1.x - -> Seq Scan on public.eager_agg_tab1_p2 t3_1 - Output: t3_1.y, t3_1.x - -> Finalize HashAggregate - Output: t1_2.x, sum((t2_2.y + t3_2.y)) - Group Key: t1_2.x - -> Hash Join - Output: t1_2.x, (PARTIAL sum((t2_2.y + t3_2.y))) - Hash Cond: (t1_2.x = t2_2.x) - -> Seq Scan on public.eager_agg_tab1_p3 t1_2 - Output: t1_2.x - -> Hash - Output: t2_2.x, t3_2.x, (PARTIAL sum((t2_2.y + t3_2.y))) - -> Partial HashAggregate - Output: t2_2.x, t3_2.x, PARTIAL sum((t2_2.y + t3_2.y)) - Group Key: t2_2.x - -> Hash Join - Output: t2_2.y, t2_2.x, t3_2.y, t3_2.x - Hash Cond: (t2_2.x = t3_2.x) - -> Seq Scan on public.eager_agg_tab1_p3 t2_2 - Output: t2_2.y, t2_2.x - -> Hash + -> Finalize IndexAggregate + Output: t1_2.x, sum((t2_2.y + t3_2.y)) + Group Key: t1_2.x + -> Hash Join + Output: t1_2.x, (PARTIAL sum((t2_2.y + t3_2.y))) + Hash Cond: 
(t1_2.x = t2_2.x) + -> Seq Scan on public.eager_agg_tab1_p3 t1_2 + Output: t1_2.x + -> Hash + Output: t2_2.x, t3_2.x, (PARTIAL sum((t2_2.y + t3_2.y))) + -> Partial HashAggregate + Output: t2_2.x, t3_2.x, PARTIAL sum((t2_2.y + t3_2.y)) + Group Key: t2_2.x + -> Hash Join + Output: t2_2.y, t2_2.x, t3_2.y, t3_2.x + Hash Cond: (t2_2.x = t3_2.x) + -> Seq Scan on public.eager_agg_tab1_p3 t2_2 + Output: t2_2.y, t2_2.x + -> Hash + Output: t3_2.y, t3_2.x + -> Seq Scan on public.eager_agg_tab1_p3 t3_2 Output: t3_2.y, t3_2.x - -> Seq Scan on public.eager_agg_tab1_p3 t3_2 - Output: t3_2.y, t3_2.x -(70 rows) +(68 rows) SELECT t1.x, sum(t2.y + t3.y) FROM eager_agg_tab1 t1 @@ -803,97 +891,46 @@ SELECT t3.y, sum(t2.y + t3.y) JOIN eager_agg_tab1 t2 ON t1.x = t2.x JOIN eager_agg_tab1 t3 ON t2.x = t3.x GROUP BY t3.y ORDER BY t3.y; - QUERY PLAN -------------------------------------------------------------------------------------------- - Finalize GroupAggregate + QUERY PLAN +------------------------------------------------------------------------------------- + Finalize IndexAggregate Output: t3.y, sum((t2.y + t3.y)) Group Key: t3.y - -> Sort + -> Hash Join Output: t3.y, (PARTIAL sum((t2.y + t3.y))) - Sort Key: t3.y + Hash Cond: (t1.x = t2.x) -> Append - -> Hash Join - Output: t3.y, (PARTIAL sum((t2.y + t3.y))) - Hash Cond: (t2.x = t1.x) - -> Partial GroupAggregate - Output: t2.x, t3.y, t3.x, PARTIAL sum((t2.y + t3.y)) - Group Key: t2.x, t3.y, t3.x - -> Incremental Sort - Output: t2.y, t2.x, t3.y, t3.x - Sort Key: t2.x, t3.y - Presorted Key: t2.x - -> Merge Join - Output: t2.y, t2.x, t3.y, t3.x - Merge Cond: (t2.x = t3.x) - -> Sort - Output: t2.y, t2.x - Sort Key: t2.x - -> Seq Scan on public.eager_agg_tab1_p1 t2 - Output: t2.y, t2.x - -> Sort - Output: t3.y, t3.x - Sort Key: t3.x - -> Seq Scan on public.eager_agg_tab1_p1 t3 - Output: t3.y, t3.x - -> Hash - Output: t1.x - -> Seq Scan on public.eager_agg_tab1_p1 t1 - Output: t1.x - -> Hash Join - Output: t3_1.y, (PARTIAL sum((t2_1.y + 
t3_1.y))) - Hash Cond: (t2_1.x = t1_1.x) - -> Partial GroupAggregate - Output: t2_1.x, t3_1.y, t3_1.x, PARTIAL sum((t2_1.y + t3_1.y)) - Group Key: t2_1.x, t3_1.y, t3_1.x - -> Incremental Sort - Output: t2_1.y, t2_1.x, t3_1.y, t3_1.x - Sort Key: t2_1.x, t3_1.y - Presorted Key: t2_1.x - -> Merge Join - Output: t2_1.y, t2_1.x, t3_1.y, t3_1.x - Merge Cond: (t2_1.x = t3_1.x) - -> Sort - Output: t2_1.y, t2_1.x - Sort Key: t2_1.x - -> Seq Scan on public.eager_agg_tab1_p2 t2_1 - Output: t2_1.y, t2_1.x - -> Sort + -> Seq Scan on public.eager_agg_tab1_p1 t1_1 + Output: t1_1.x + -> Seq Scan on public.eager_agg_tab1_p2 t1_2 + Output: t1_2.x + -> Seq Scan on public.eager_agg_tab1_p3 t1_3 + Output: t1_3.x + -> Hash + Output: t2.x, t3.y, t3.x, (PARTIAL sum((t2.y + t3.y))) + -> Partial IndexAggregate + Output: t2.x, t3.y, t3.x, PARTIAL sum((t2.y + t3.y)) + Group Key: t2.x, t3.y, t3.x + -> Hash Join + Output: t2.y, t2.x, t3.y, t3.x + Hash Cond: (t2.x = t3.x) + -> Append + -> Seq Scan on public.eager_agg_tab1_p1 t2_1 + Output: t2_1.y, t2_1.x + -> Seq Scan on public.eager_agg_tab1_p2 t2_2 + Output: t2_2.y, t2_2.x + -> Seq Scan on public.eager_agg_tab1_p3 t2_3 + Output: t2_3.y, t2_3.x + -> Hash + Output: t3.y, t3.x + -> Append + -> Seq Scan on public.eager_agg_tab1_p1 t3_1 Output: t3_1.y, t3_1.x - Sort Key: t3_1.x - -> Seq Scan on public.eager_agg_tab1_p2 t3_1 - Output: t3_1.y, t3_1.x - -> Hash - Output: t1_1.x - -> Seq Scan on public.eager_agg_tab1_p2 t1_1 - Output: t1_1.x - -> Hash Join - Output: t3_2.y, (PARTIAL sum((t2_2.y + t3_2.y))) - Hash Cond: (t2_2.x = t1_2.x) - -> Partial GroupAggregate - Output: t2_2.x, t3_2.y, t3_2.x, PARTIAL sum((t2_2.y + t3_2.y)) - Group Key: t2_2.x, t3_2.y, t3_2.x - -> Incremental Sort - Output: t2_2.y, t2_2.x, t3_2.y, t3_2.x - Sort Key: t2_2.x, t3_2.y - Presorted Key: t2_2.x - -> Merge Join - Output: t2_2.y, t2_2.x, t3_2.y, t3_2.x - Merge Cond: (t2_2.x = t3_2.x) - -> Sort - Output: t2_2.y, t2_2.x - Sort Key: t2_2.x - -> Seq Scan on 
public.eager_agg_tab1_p3 t2_2 - Output: t2_2.y, t2_2.x - -> Sort + -> Seq Scan on public.eager_agg_tab1_p2 t3_2 Output: t3_2.y, t3_2.x - Sort Key: t3_2.x - -> Seq Scan on public.eager_agg_tab1_p3 t3_2 - Output: t3_2.y, t3_2.x - -> Hash - Output: t1_2.x - -> Seq Scan on public.eager_agg_tab1_p3 t1_2 - Output: t1_2.x -(88 rows) + -> Seq Scan on public.eager_agg_tab1_p3 t3_3 + Output: t3_3.y, t3_3.x +(37 rows) SELECT t3.y, sum(t2.y + t3.y) FROM eager_agg_tab1 t1 diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index edde9e99893a..a9a53e4bac7a 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2830,6 +2830,7 @@ select count(*) from set enable_hashjoin = 0; set enable_nestloop = 0; set enable_hashagg = 0; +set enable_indexagg = 0; -- -- Check that we use the pathkeys from a prefix of the group by / order by -- clause for the join pathkeys when that prefix covers all join quals. We @@ -2857,6 +2858,7 @@ order by x.thousand desc, x.twothousand; -> Seq Scan on tenk1 x (13 rows) +reset enable_indexagg; reset enable_hashagg; reset enable_nestloop; reset enable_hashjoin; @@ -9534,23 +9536,20 @@ inner join (select distinct id from j3) j3 on j1.id = j3.id; explain (verbose, costs off) select * from j1 inner join (select id from j3 group by id) j3 on j1.id = j3.id; - QUERY PLAN ------------------------------------------ + QUERY PLAN +----------------------------------- Nested Loop Output: j1.id, j3.id Inner Unique: true Join Filter: (j1.id = j3.id) - -> Group + -> IndexAggregate Output: j3.id Group Key: j3.id - -> Sort + -> Seq Scan on public.j3 Output: j3.id - Sort Key: j3.id - -> Seq Scan on public.j3 - Output: j3.id -> Seq Scan on public.j1 Output: j1.id -(14 rows) +(11 rows) drop table j1; drop table j2; @@ -9867,16 +9866,14 @@ EXPLAIN (COSTS OFF) SELECT 1 FROM group_tbl t1 LEFT JOIN (SELECT a c1, COALESCE(a, a) c2 FROM group_tbl t2) s ON TRUE GROUP BY s.c1, s.c2; - QUERY PLAN 
------------------------------------------------- - Group + QUERY PLAN +------------------------------------------- + IndexAggregate Group Key: t2.a, (COALESCE(t2.a, t2.a)) - -> Sort - Sort Key: t2.a, (COALESCE(t2.a, t2.a)) - -> Nested Loop Left Join - -> Seq Scan on group_tbl t1 - -> Seq Scan on group_tbl t2 -(7 rows) + -> Nested Loop Left Join + -> Seq Scan on group_tbl t1 + -> Seq Scan on group_tbl t2 +(5 rows) DROP TABLE group_tbl; -- diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out index c30304b99c79..fce941ae1f04 100644 --- a/src/test/regress/expected/partition_aggregate.out +++ b/src/test/regress/expected/partition_aggregate.out @@ -150,7 +150,7 @@ EXPLAIN (COSTS OFF) SELECT c, sum(a) FROM pagg_tab WHERE 1 = 2 GROUP BY c; QUERY PLAN ------------------------------------ - HashAggregate + IndexAggregate Group Key: c -> Result Replaces: Scan on pagg_tab @@ -177,8 +177,9 @@ SELECT c, sum(a) FROM pagg_tab WHERE c = 'x' GROUP BY c; ---+----- (0 rows) --- Test GroupAggregate paths by disabling hash aggregates. +-- Test GroupAggregate paths by disabling hash and index aggregates. SET enable_hashagg TO false; +SET enable_indexagg TO false; -- When GROUP BY clause matches full aggregation is performed for each partition. EXPLAIN (COSTS OFF) SELECT c, sum(a), avg(b), count(*) FROM pagg_tab GROUP BY 1 HAVING avg(d) < 15 ORDER BY 1, 2, 3; @@ -370,6 +371,150 @@ SELECT count(*) FROM pagg_tab GROUP BY c ORDER BY c LIMIT 1; 250 (1 row) +RESET enable_hashagg; +RESET enable_indexagg; +-- Test IndexAggregate paths by disabling hash and group aggregates. +SET enable_sort TO false; +SET enable_hashagg TO false; +-- When GROUP BY clause matches full aggregation is performed for each partition. 
+EXPLAIN (COSTS OFF) +SELECT c, sum(a), avg(b), count(*) FROM pagg_tab GROUP BY 1 HAVING avg(d) < 15 ORDER BY 1, 2, 3; + QUERY PLAN +-------------------------------------------------------------- + Sort + Disabled: true + Sort Key: pagg_tab.c, (sum(pagg_tab.a)), (avg(pagg_tab.b)) + -> Append + -> IndexAggregate + Group Key: pagg_tab.c + Filter: (avg(pagg_tab.d) < '15'::numeric) + -> Seq Scan on pagg_tab_p1 pagg_tab + -> IndexAggregate + Group Key: pagg_tab_1.c + Filter: (avg(pagg_tab_1.d) < '15'::numeric) + -> Seq Scan on pagg_tab_p2 pagg_tab_1 + -> IndexAggregate + Group Key: pagg_tab_2.c + Filter: (avg(pagg_tab_2.d) < '15'::numeric) + -> Seq Scan on pagg_tab_p3 pagg_tab_2 +(16 rows) + +SELECT c, sum(a), avg(b), count(*) FROM pagg_tab GROUP BY 1 HAVING avg(d) < 15 ORDER BY 1, 2, 3; + c | sum | avg | count +------+------+---------------------+------- + 0000 | 2000 | 12.0000000000000000 | 250 + 0001 | 2250 | 13.0000000000000000 | 250 + 0002 | 2500 | 14.0000000000000000 | 250 + 0006 | 2500 | 12.0000000000000000 | 250 + 0007 | 2750 | 13.0000000000000000 | 250 + 0008 | 2000 | 14.0000000000000000 | 250 +(6 rows) + +-- When GROUP BY clause does not match; top finalize node is required +EXPLAIN (COSTS OFF) +SELECT a, sum(b), avg(b), count(*) FROM pagg_tab GROUP BY 1 HAVING avg(d) < 15 ORDER BY 1, 2, 3; + QUERY PLAN +-------------------------------------------------------------- + Sort + Disabled: true + Sort Key: pagg_tab.a, (sum(pagg_tab.b)), (avg(pagg_tab.b)) + -> Finalize GroupAggregate + Group Key: pagg_tab.a + Filter: (avg(pagg_tab.d) < '15'::numeric) + -> Merge Append + Sort Key: pagg_tab.a + -> Partial IndexAggregate + Group Key: pagg_tab.a + -> Seq Scan on pagg_tab_p1 pagg_tab + -> Partial IndexAggregate + Group Key: pagg_tab_1.a + -> Seq Scan on pagg_tab_p2 pagg_tab_1 + -> Partial IndexAggregate + Group Key: pagg_tab_2.a + -> Seq Scan on pagg_tab_p3 pagg_tab_2 +(17 rows) + +SELECT a, sum(b), avg(b), count(*) FROM pagg_tab GROUP BY 1 HAVING avg(d) < 15 ORDER BY 1, 
2, 3; + a | sum | avg | count +----+------+---------------------+------- + 0 | 1500 | 10.0000000000000000 | 150 + 1 | 1650 | 11.0000000000000000 | 150 + 2 | 1800 | 12.0000000000000000 | 150 + 3 | 1950 | 13.0000000000000000 | 150 + 4 | 2100 | 14.0000000000000000 | 150 + 10 | 1500 | 10.0000000000000000 | 150 + 11 | 1650 | 11.0000000000000000 | 150 + 12 | 1800 | 12.0000000000000000 | 150 + 13 | 1950 | 13.0000000000000000 | 150 + 14 | 2100 | 14.0000000000000000 | 150 +(10 rows) + +-- Test partitionwise grouping without any aggregates +EXPLAIN (COSTS OFF) +SELECT c FROM pagg_tab GROUP BY c ORDER BY 1; + QUERY PLAN +------------------------------------------------ + Merge Append + Sort Key: pagg_tab.c + -> IndexAggregate + Group Key: pagg_tab.c + -> Seq Scan on pagg_tab_p1 pagg_tab + -> IndexAggregate + Group Key: pagg_tab_1.c + -> Seq Scan on pagg_tab_p2 pagg_tab_1 + -> IndexAggregate + Group Key: pagg_tab_2.c + -> Seq Scan on pagg_tab_p3 pagg_tab_2 +(11 rows) + +SELECT c FROM pagg_tab GROUP BY c ORDER BY 1; + c +------ + 0000 + 0001 + 0002 + 0003 + 0004 + 0005 + 0006 + 0007 + 0008 + 0009 + 0010 + 0011 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT a FROM pagg_tab WHERE a < 3 GROUP BY a ORDER BY 1; + QUERY PLAN +------------------------------------------------------ + Group + Group Key: pagg_tab.a + -> Merge Append + Sort Key: pagg_tab.a + -> Partial IndexAggregate + Group Key: pagg_tab.a + -> Seq Scan on pagg_tab_p1 pagg_tab + Filter: (a < 3) + -> Partial IndexAggregate + Group Key: pagg_tab_1.a + -> Seq Scan on pagg_tab_p2 pagg_tab_1 + Filter: (a < 3) + -> Partial IndexAggregate + Group Key: pagg_tab_2.a + -> Seq Scan on pagg_tab_p3 pagg_tab_2 + Filter: (a < 3) +(16 rows) + +SELECT a FROM pagg_tab WHERE a < 3 GROUP BY a ORDER BY 1; + a +--- + 0 + 1 + 2 +(3 rows) + +RESET enable_sort; RESET enable_hashagg; -- ROLLUP, partitionwise aggregation does not apply EXPLAIN (COSTS OFF) @@ -554,6 +699,7 @@ SELECT t2.y, sum(t1.y), count(*) FROM pagg_tab1 t1, pagg_tab2 t2 WHERE t1.x = 
t2 -- When GROUP BY clause does not match; partial aggregation is performed for each partition. -- Also test GroupAggregate paths by disabling hash aggregates. SET enable_hashagg TO false; +SET enable_indexagg TO false; EXPLAIN (COSTS OFF) SELECT t1.y, sum(t1.x), count(*) FROM pagg_tab1 t1, pagg_tab2 t2 WHERE t1.x = t2.y GROUP BY t1.y HAVING avg(t1.x) > 10 ORDER BY 1, 2, 3; QUERY PLAN @@ -606,41 +752,40 @@ SELECT t1.y, sum(t1.x), count(*) FROM pagg_tab1 t1, pagg_tab2 t2 WHERE t1.x = t2 (6 rows) RESET enable_hashagg; +RESET enable_indexagg; -- Check with LEFT/RIGHT/FULL OUTER JOINs which produces NULL values for -- aggregation -- LEFT JOIN, should produce partial partitionwise aggregation plan as -- GROUP BY is on nullable column EXPLAIN (COSTS OFF) SELECT b.y, sum(a.y) FROM pagg_tab1 a LEFT JOIN pagg_tab2 b ON a.x = b.y GROUP BY b.y ORDER BY 1 NULLS LAST; - QUERY PLAN ------------------------------------------------------------------- - Finalize GroupAggregate + QUERY PLAN +------------------------------------------------------------ + Finalize IndexAggregate Group Key: b.y - -> Sort - Sort Key: b.y - -> Append - -> Partial HashAggregate - Group Key: b.y - -> Hash Left Join - Hash Cond: (a.x = b.y) - -> Seq Scan on pagg_tab1_p1 a - -> Hash - -> Seq Scan on pagg_tab2_p1 b - -> Partial HashAggregate - Group Key: b_1.y - -> Hash Left Join - Hash Cond: (a_1.x = b_1.y) - -> Seq Scan on pagg_tab1_p2 a_1 - -> Hash - -> Seq Scan on pagg_tab2_p2 b_1 - -> Partial HashAggregate - Group Key: b_2.y - -> Hash Right Join - Hash Cond: (b_2.y = a_2.x) - -> Seq Scan on pagg_tab2_p3 b_2 - -> Hash - -> Seq Scan on pagg_tab1_p3 a_2 -(26 rows) + -> Append + -> Partial HashAggregate + Group Key: b.y + -> Hash Left Join + Hash Cond: (a.x = b.y) + -> Seq Scan on pagg_tab1_p1 a + -> Hash + -> Seq Scan on pagg_tab2_p1 b + -> Partial HashAggregate + Group Key: b_1.y + -> Hash Left Join + Hash Cond: (a_1.x = b_1.y) + -> Seq Scan on pagg_tab1_p2 a_1 + -> Hash + -> Seq Scan on pagg_tab2_p2 b_1 + 
-> Partial HashAggregate + Group Key: b_2.y + -> Hash Right Join + Hash Cond: (b_2.y = a_2.x) + -> Seq Scan on pagg_tab2_p3 b_2 + -> Hash + -> Seq Scan on pagg_tab1_p3 a_2 +(24 rows) SELECT b.y, sum(a.y) FROM pagg_tab1 a LEFT JOIN pagg_tab2 b ON a.x = b.y GROUP BY b.y ORDER BY 1 NULLS LAST; y | sum @@ -704,35 +849,33 @@ SELECT b.y, sum(a.y) FROM pagg_tab1 a RIGHT JOIN pagg_tab2 b ON a.x = b.y GROUP -- GROUP BY is on nullable column EXPLAIN (COSTS OFF) SELECT a.x, sum(b.x) FROM pagg_tab1 a FULL OUTER JOIN pagg_tab2 b ON a.x = b.y GROUP BY a.x ORDER BY 1 NULLS LAST; - QUERY PLAN ------------------------------------------------------------------- - Finalize GroupAggregate + QUERY PLAN +------------------------------------------------------------ + Finalize IndexAggregate Group Key: a.x - -> Sort - Sort Key: a.x - -> Append - -> Partial HashAggregate - Group Key: a.x - -> Hash Full Join - Hash Cond: (a.x = b.y) - -> Seq Scan on pagg_tab1_p1 a - -> Hash - -> Seq Scan on pagg_tab2_p1 b - -> Partial HashAggregate - Group Key: a_1.x - -> Hash Full Join - Hash Cond: (a_1.x = b_1.y) - -> Seq Scan on pagg_tab1_p2 a_1 - -> Hash - -> Seq Scan on pagg_tab2_p2 b_1 - -> Partial HashAggregate - Group Key: a_2.x - -> Hash Full Join - Hash Cond: (b_2.y = a_2.x) - -> Seq Scan on pagg_tab2_p3 b_2 - -> Hash - -> Seq Scan on pagg_tab1_p3 a_2 -(26 rows) + -> Append + -> Partial HashAggregate + Group Key: a.x + -> Hash Full Join + Hash Cond: (a.x = b.y) + -> Seq Scan on pagg_tab1_p1 a + -> Hash + -> Seq Scan on pagg_tab2_p1 b + -> Partial HashAggregate + Group Key: a_1.x + -> Hash Full Join + Hash Cond: (a_1.x = b_1.y) + -> Seq Scan on pagg_tab1_p2 a_1 + -> Hash + -> Seq Scan on pagg_tab2_p2 b_1 + -> Partial HashAggregate + Group Key: a_2.x + -> Hash Full Join + Hash Cond: (b_2.y = a_2.x) + -> Seq Scan on pagg_tab2_p3 b_2 + -> Hash + -> Seq Scan on pagg_tab1_p3 a_2 +(24 rows) SELECT a.x, sum(b.x) FROM pagg_tab1 a FULL OUTER JOIN pagg_tab2 b ON a.x = b.y GROUP BY a.x ORDER BY 1 NULLS LAST; 
x | sum @@ -839,16 +982,14 @@ SELECT a.x, b.y, count(*) FROM (SELECT * FROM pagg_tab1 WHERE x < 20) a FULL JOI -- Empty join relation because of empty outer side, no partitionwise agg plan EXPLAIN (COSTS OFF) SELECT a.x, a.y, count(*) FROM (SELECT * FROM pagg_tab1 WHERE x = 1 AND x = 2) a LEFT JOIN pagg_tab2 b ON a.x = b.y GROUP BY a.x, a.y ORDER BY 1, 2; - QUERY PLAN ----------------------------------------------- - GroupAggregate + QUERY PLAN +---------------------------------------- + IndexAggregate Group Key: pagg_tab1.y - -> Sort - Sort Key: pagg_tab1.y - -> Result - Replaces: Join on b, pagg_tab1 - One-Time Filter: false -(7 rows) + -> Result + Replaces: Join on b, pagg_tab1 + One-Time Filter: false +(5 rows) SELECT a.x, a.y, count(*) FROM (SELECT * FROM pagg_tab1 WHERE x = 1 AND x = 2) a LEFT JOIN pagg_tab2 b ON a.x = b.y GROUP BY a.x, a.y ORDER BY 1, 2; x | y | count @@ -869,7 +1010,7 @@ SELECT a, sum(b), avg(c), count(*) FROM pagg_tab_m GROUP BY a HAVING avg(c) < 22 -------------------------------------------------------------------- Sort Sort Key: pagg_tab_m.a, (sum(pagg_tab_m.b)), (avg(pagg_tab_m.c)) - -> Finalize HashAggregate + -> Finalize IndexAggregate Group Key: pagg_tab_m.a Filter: (avg(pagg_tab_m.c) < '22'::numeric) -> Append @@ -1067,8 +1208,8 @@ RESET parallel_setup_cost; -- PARTITION KEY, thus we will have a partial aggregation for them. 
EXPLAIN (COSTS OFF) SELECT a, sum(b), count(*) FROM pagg_tab_ml GROUP BY a HAVING avg(b) < 3 ORDER BY 1, 2, 3; - QUERY PLAN ---------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------- Sort Sort Key: pagg_tab_ml.a, (sum(pagg_tab_ml.b)), (count(*)) -> Append @@ -1076,31 +1217,27 @@ SELECT a, sum(b), count(*) FROM pagg_tab_ml GROUP BY a HAVING avg(b) < 3 ORDER B Group Key: pagg_tab_ml.a Filter: (avg(pagg_tab_ml.b) < '3'::numeric) -> Seq Scan on pagg_tab_ml_p1 pagg_tab_ml - -> Finalize GroupAggregate + -> Finalize IndexAggregate Group Key: pagg_tab_ml_2.a Filter: (avg(pagg_tab_ml_2.b) < '3'::numeric) - -> Sort - Sort Key: pagg_tab_ml_2.a - -> Append - -> Partial HashAggregate - Group Key: pagg_tab_ml_2.a - -> Seq Scan on pagg_tab_ml_p2_s1 pagg_tab_ml_2 - -> Partial HashAggregate - Group Key: pagg_tab_ml_3.a - -> Seq Scan on pagg_tab_ml_p2_s2 pagg_tab_ml_3 - -> Finalize GroupAggregate + -> Append + -> Partial HashAggregate + Group Key: pagg_tab_ml_2.a + -> Seq Scan on pagg_tab_ml_p2_s1 pagg_tab_ml_2 + -> Partial HashAggregate + Group Key: pagg_tab_ml_3.a + -> Seq Scan on pagg_tab_ml_p2_s2 pagg_tab_ml_3 + -> Finalize IndexAggregate Group Key: pagg_tab_ml_5.a Filter: (avg(pagg_tab_ml_5.b) < '3'::numeric) - -> Sort - Sort Key: pagg_tab_ml_5.a - -> Append - -> Partial HashAggregate - Group Key: pagg_tab_ml_5.a - -> Seq Scan on pagg_tab_ml_p3_s1 pagg_tab_ml_5 - -> Partial HashAggregate - Group Key: pagg_tab_ml_6.a - -> Seq Scan on pagg_tab_ml_p3_s2 pagg_tab_ml_6 -(31 rows) + -> Append + -> Partial HashAggregate + Group Key: pagg_tab_ml_5.a + -> Seq Scan on pagg_tab_ml_p3_s1 pagg_tab_ml_5 + -> Partial HashAggregate + Group Key: pagg_tab_ml_6.a + -> Seq Scan on pagg_tab_ml_p3_s2 pagg_tab_ml_6 +(27 rows) SELECT a, sum(b), count(*) FROM pagg_tab_ml GROUP BY a HAVING avg(b) < 3 ORDER BY 1, 2, 3; a | sum | count @@ -1120,31 +1257,29 @@ SELECT a, sum(b), count(*) FROM 
pagg_tab_ml GROUP BY a HAVING avg(b) < 3 ORDER B -- PARTITION KEY EXPLAIN (COSTS OFF) SELECT b, sum(a), count(*) FROM pagg_tab_ml GROUP BY b ORDER BY 1, 2, 3; - QUERY PLAN ---------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Sort Sort Key: pagg_tab_ml.b, (sum(pagg_tab_ml.a)), (count(*)) - -> Finalize GroupAggregate + -> Finalize IndexAggregate Group Key: pagg_tab_ml.b - -> Sort - Sort Key: pagg_tab_ml.b - -> Append - -> Partial HashAggregate - Group Key: pagg_tab_ml.b - -> Seq Scan on pagg_tab_ml_p1 pagg_tab_ml - -> Partial HashAggregate - Group Key: pagg_tab_ml_1.b - -> Seq Scan on pagg_tab_ml_p2_s1 pagg_tab_ml_1 - -> Partial HashAggregate - Group Key: pagg_tab_ml_2.b - -> Seq Scan on pagg_tab_ml_p2_s2 pagg_tab_ml_2 - -> Partial HashAggregate - Group Key: pagg_tab_ml_3.b - -> Seq Scan on pagg_tab_ml_p3_s1 pagg_tab_ml_3 - -> Partial HashAggregate - Group Key: pagg_tab_ml_4.b - -> Seq Scan on pagg_tab_ml_p3_s2 pagg_tab_ml_4 -(22 rows) + -> Append + -> Partial HashAggregate + Group Key: pagg_tab_ml.b + -> Seq Scan on pagg_tab_ml_p1 pagg_tab_ml + -> Partial HashAggregate + Group Key: pagg_tab_ml_1.b + -> Seq Scan on pagg_tab_ml_p2_s1 pagg_tab_ml_1 + -> Partial HashAggregate + Group Key: pagg_tab_ml_2.b + -> Seq Scan on pagg_tab_ml_p2_s2 pagg_tab_ml_2 + -> Partial HashAggregate + Group Key: pagg_tab_ml_3.b + -> Seq Scan on pagg_tab_ml_p3_s1 pagg_tab_ml_3 + -> Partial HashAggregate + Group Key: pagg_tab_ml_4.b + -> Seq Scan on pagg_tab_ml_p3_s2 pagg_tab_ml_4 +(20 rows) SELECT b, sum(a), count(*) FROM pagg_tab_ml GROUP BY b HAVING avg(a) < 15 ORDER BY 1, 2, 3; b | sum | count diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out index 933921d1860b..0318863bf1f1 100644 --- a/src/test/regress/expected/select_parallel.out +++ b/src/test/regress/expected/select_parallel.out @@ -706,18 +706,16 @@ alter table tenk2 
reset (parallel_workers); set enable_hashagg = false; explain (costs off) select count(*) from tenk1 group by twenty; - QUERY PLAN ----------------------------------------------------- + QUERY PLAN +---------------------------------------------- Finalize GroupAggregate Group Key: twenty -> Gather Merge Workers Planned: 4 - -> Partial GroupAggregate + -> Partial IndexAggregate Group Key: twenty - -> Sort - Sort Key: twenty - -> Parallel Seq Scan on tenk1 -(9 rows) + -> Parallel Seq Scan on tenk1 +(7 rows) select count(*) from tenk1 group by twenty; count @@ -772,19 +770,17 @@ drop function sp_simple_func(integer); -- test handling of SRFs in targetlist (bug in 10.0) explain (costs off) select count(*), generate_series(1,2) from tenk1 group by twenty; - QUERY PLAN ----------------------------------------------------------- + QUERY PLAN +---------------------------------------------------- ProjectSet -> Finalize GroupAggregate Group Key: twenty -> Gather Merge Workers Planned: 4 - -> Partial GroupAggregate + -> Partial IndexAggregate Group Key: twenty - -> Sort - Sort Key: twenty - -> Parallel Seq Scan on tenk1 -(10 rows) + -> Parallel Seq Scan on tenk1 +(8 rows) select count(*), generate_series(1,2) from tenk1 group by twenty; count | generate_series @@ -833,6 +829,7 @@ select count(*), generate_series(1,2) from tenk1 group by twenty; -- test gather merge with parallel leader participation disabled set parallel_leader_participation = off; +set enable_indexagg = off; explain (costs off) select count(*) from tenk1 group by twenty; QUERY PLAN @@ -876,6 +873,7 @@ select count(*) from tenk1 group by twenty; reset parallel_leader_participation; --test rescan behavior of gather merge set enable_material = false; +set enable_indexagg = false; explain (costs off) select * from (select string4, count(unique2) @@ -917,6 +915,7 @@ select * from (12 rows) reset enable_material; +reset enable_indexagg; reset enable_hashagg; -- check parallelized int8 aggregate (bug #14897) explain 
(costs off) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 0411db832f13..d32bec316d36 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -157,6 +157,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashagg | on enable_hashjoin | on enable_incremental_sort | on + enable_indexagg | on enable_indexonlyscan | on enable_indexscan | on enable_material | on @@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(25 rows) +(26 rows) -- There are always wait event descriptions for various types. InjectionPoint -- may be present or absent, depending on history since last postmaster start. diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 850f5a5787f5..f72eb3671128 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -1392,6 +1392,7 @@ CREATE INDEX btg_x_y_idx ON btg(x, y); ANALYZE btg; SET enable_hashagg = off; +SET enable_indexagg = off; SET enable_seqscan = off; -- Utilize the ordering of index scan to avoid a Sort operation @@ -1623,12 +1624,100 @@ select v||'a', case v||'a' when 'aa' then 1 else 0 end, count(*) select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*) from unnest(array['a','b']) u(v) group by v||'a' order by 1; + +-- +-- Index Aggregation tests +-- + +set enable_hashagg = false; +set enable_sort = false; +set enable_indexagg = true; +set enable_indexscan = false; + +-- require ordered output +EXPLAIN (COSTS OFF, VERBOSE) +SELECT unique1, SUM(two) FROM tenk1 +GROUP BY 1 +ORDER BY 1 +LIMIT 10; + +SELECT unique1, SUM(two) FROM tenk1 +GROUP BY 1 +ORDER BY 1 +LIMIT 10; + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT even, sum(two) FROM tenk1 +GROUP BY 1 +ORDER BY 1 +LIMIT 10; + +SELECT even, sum(two) FROM tenk1 +GROUP BY 1 +ORDER BY 1 +LIMIT 10; + +-- multiple 
grouping columns +EXPLAIN (COSTS OFF, VERBOSE) +SELECT even, odd, sum(unique1) FROM tenk1 +GROUP BY 1, 2 +ORDER BY 1, 2 +LIMIT 10; + +SELECT even, odd, sum(unique1) FROM tenk1 +GROUP BY 1, 2 +ORDER BY 1, 2 +LIMIT 10; + +-- mixing columns between group by and order by +begin; + +create temp table tmp(x int, y int); +insert into tmp values (1, 8), (2, 7), (3, 6), (4, 5); + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT x, y, sum(x) FROM tmp +GROUP BY 1, 2 +ORDER BY 1, 2; + +SELECT x, y, sum(x) FROM tmp +GROUP BY 1, 2 +ORDER BY 1, 2; + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT x, y, sum(x) FROM tmp +GROUP BY 1, 2 +ORDER BY 2, 1; + +SELECT x, y, sum(x) FROM tmp +GROUP BY 1, 2 +ORDER BY 2, 1; + +-- +-- Index Aggregation Spill tests +-- + +set enable_indexagg = true; +set enable_sort=false; +set enable_hashagg = false; +set work_mem='64kB'; + +select unique1, count(*), sum(twothousand) from tenk1 +group by unique1 +having sum(fivethous) > 4975 +order by sum(twothousand); + +set work_mem to default; +set enable_sort to default; +set enable_hashagg to default; +set enable_indexagg to default; -- -- Hash Aggregation Spill tests -- set enable_sort=false; +set enable_indexagg = false; set work_mem='64kB'; select unique1, count(*), sum(twothousand) from tenk1 @@ -1657,6 +1746,7 @@ analyze agg_data_20k; -- Produce results with sorting. 
set enable_hashagg = false; +set enable_indexagg = false; set jit_above_cost = 0; @@ -1728,23 +1818,68 @@ select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3 set enable_sort = true; set work_mem to default; +-- Produce results with index aggregation + +set enable_sort = false; +set enable_hashagg = false; +set enable_indexagg = true; + +set jit_above_cost = 0; + +explain (costs off) +select g%10000 as c1, sum(g::numeric) as c2, count(*) as c3 + from agg_data_20k group by g%10000; + +create table agg_index_1 as +select g%10000 as c1, sum(g::numeric) as c2, count(*) as c3 + from agg_data_20k group by g%10000; + +create table agg_index_2 as +select * from + (values (100), (300), (500)) as r(a), + lateral ( + select (g/2)::numeric as c1, + array_agg(g::numeric) as c2, + count(*) as c3 + from agg_data_2k + where g < r.a + group by g/2) as s; + +set jit_above_cost to default; + +create table agg_index_3 as +select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3 + from agg_data_2k group by g/2; + +create table agg_index_4 as +select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3 + from agg_data_2k group by g/2; + -- Compare group aggregation results to hash aggregation results (select * from agg_hash_1 except select * from agg_group_1) union all -(select * from agg_group_1 except select * from agg_hash_1); +(select * from agg_group_1 except select * from agg_hash_1) + union all +(select * from agg_index_1 except select * from agg_group_1); (select * from agg_hash_2 except select * from agg_group_2) union all -(select * from agg_group_2 except select * from agg_hash_2); +(select * from agg_group_2 except select * from agg_hash_2) + union all +(select * from agg_index_2 except select * from agg_group_2); (select * from agg_hash_3 except select * from agg_group_3) union all -(select * from agg_group_3 except select * from agg_hash_3); +(select * from agg_group_3 except select * from agg_hash_3) + union all +(select * from 
agg_index_3 except select * from agg_group_3); (select * from agg_hash_4 except select * from agg_group_4) union all -(select * from agg_group_4 except select * from agg_hash_4); +(select * from agg_group_4 except select * from agg_hash_4) + union all +(select * from agg_index_4 except select * from agg_group_4); drop table agg_group_1; drop table agg_group_2; @@ -1754,3 +1889,7 @@ drop table agg_hash_1; drop table agg_hash_2; drop table agg_hash_3; drop table agg_hash_4; +drop table agg_index_1; +drop table agg_index_2; +drop table agg_index_3; +drop table agg_index_4; diff --git a/src/test/regress/sql/eager_aggregate.sql b/src/test/regress/sql/eager_aggregate.sql index abe6d6ae09f1..f9f4b5dcebd8 100644 --- a/src/test/regress/sql/eager_aggregate.sql +++ b/src/test/regress/sql/eager_aggregate.sql @@ -35,6 +35,7 @@ GROUP BY t1.a ORDER BY t1.a; -- Produce results with sorting aggregation SET enable_hashagg TO off; +SET enable_indexagg TO off; EXPLAIN (VERBOSE, COSTS OFF) SELECT t1.a, avg(t2.c) @@ -48,6 +49,25 @@ SELECT t1.a, avg(t2.c) GROUP BY t1.a ORDER BY t1.a; RESET enable_hashagg; +RESET enable_indexagg; + +-- Produce results with index aggregation +SET enable_hashagg TO off; +SET enable_sort TO off; + +EXPLAIN (VERBOSE, COSTS OFF) +SELECT t1.a, avg(t2.c) + FROM eager_agg_t1 t1 + JOIN eager_agg_t2 t2 ON t1.b = t2.b +GROUP BY t1.a ORDER BY t1.a; + +SELECT t1.a, avg(t2.c) + FROM eager_agg_t1 t1 + JOIN eager_agg_t2 t2 ON t1.b = t2.b +GROUP BY t1.a ORDER BY t1.a; + +RESET enable_hashagg; +RESET enable_sort; -- @@ -71,6 +91,7 @@ GROUP BY t1.a ORDER BY t1.a; -- Produce results with sorting aggregation SET enable_hashagg TO off; +SET enable_indexagg TO off; EXPLAIN (VERBOSE, COSTS OFF) SELECT t1.a, avg(t2.c + t3.c) @@ -86,7 +107,27 @@ SELECT t1.a, avg(t2.c + t3.c) GROUP BY t1.a ORDER BY t1.a; RESET enable_hashagg; +RESET enable_indexagg; +-- Produce results with index aggregation +SET enable_hashagg TO off; +SET enable_sort TO off; + +EXPLAIN (VERBOSE, COSTS OFF) 
+SELECT t1.a, avg(t2.c + t3.c) + FROM eager_agg_t1 t1 + JOIN eager_agg_t2 t2 ON t1.b = t2.b + JOIN eager_agg_t3 t3 ON t2.a = t3.a +GROUP BY t1.a ORDER BY t1.a; + +SELECT t1.a, avg(t2.c + t3.c) + FROM eager_agg_t1 t1 + JOIN eager_agg_t2 t2 ON t1.b = t2.b + JOIN eager_agg_t3 t3 ON t2.a = t3.a +GROUP BY t1.a ORDER BY t1.a; + +RESET enable_hashagg; +RESET enable_sort; -- -- Test that eager aggregation works for outer join diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 7ec84f3b1436..98b3dfcc3ccf 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -605,6 +605,7 @@ select count(*) from set enable_hashjoin = 0; set enable_nestloop = 0; set enable_hashagg = 0; +set enable_indexagg = 0; -- -- Check that we use the pathkeys from a prefix of the group by / order by @@ -617,6 +618,7 @@ from tenk1 x inner join tenk1 y on x.thousand = y.thousand group by x.thousand, x.twothousand order by x.thousand desc, x.twothousand; +reset enable_indexagg; reset enable_hashagg; reset enable_nestloop; reset enable_hashjoin; diff --git a/src/test/regress/sql/partition_aggregate.sql b/src/test/regress/sql/partition_aggregate.sql index 7c725e2663a5..570aac38fc54 100644 --- a/src/test/regress/sql/partition_aggregate.sql +++ b/src/test/regress/sql/partition_aggregate.sql @@ -55,8 +55,9 @@ EXPLAIN (COSTS OFF) SELECT c, sum(a) FROM pagg_tab WHERE c = 'x' GROUP BY c; SELECT c, sum(a) FROM pagg_tab WHERE c = 'x' GROUP BY c; --- Test GroupAggregate paths by disabling hash aggregates. +-- Test GroupAggregate paths by disabling hash and index aggregates. SET enable_hashagg TO false; +SET enable_indexagg TO false; -- When GROUP BY clause matches full aggregation is performed for each partition. 
EXPLAIN (COSTS OFF) @@ -81,6 +82,32 @@ EXPLAIN (COSTS OFF) SELECT count(*) FROM pagg_tab GROUP BY c ORDER BY c LIMIT 1; SELECT count(*) FROM pagg_tab GROUP BY c ORDER BY c LIMIT 1; +RESET enable_hashagg; +RESET enable_indexagg; + +-- Test IndexAggregate paths by disabling hash and group aggregates. +SET enable_sort TO false; +SET enable_hashagg TO false; + +-- When GROUP BY clause matches full aggregation is performed for each partition. +EXPLAIN (COSTS OFF) +SELECT c, sum(a), avg(b), count(*) FROM pagg_tab GROUP BY 1 HAVING avg(d) < 15 ORDER BY 1, 2, 3; +SELECT c, sum(a), avg(b), count(*) FROM pagg_tab GROUP BY 1 HAVING avg(d) < 15 ORDER BY 1, 2, 3; + +-- When GROUP BY clause does not match; top finalize node is required +EXPLAIN (COSTS OFF) +SELECT a, sum(b), avg(b), count(*) FROM pagg_tab GROUP BY 1 HAVING avg(d) < 15 ORDER BY 1, 2, 3; +SELECT a, sum(b), avg(b), count(*) FROM pagg_tab GROUP BY 1 HAVING avg(d) < 15 ORDER BY 1, 2, 3; + +-- Test partitionwise grouping without any aggregates +EXPLAIN (COSTS OFF) +SELECT c FROM pagg_tab GROUP BY c ORDER BY 1; +SELECT c FROM pagg_tab GROUP BY c ORDER BY 1; +EXPLAIN (COSTS OFF) +SELECT a FROM pagg_tab WHERE a < 3 GROUP BY a ORDER BY 1; +SELECT a FROM pagg_tab WHERE a < 3 GROUP BY a ORDER BY 1; + +RESET enable_sort; RESET enable_hashagg; -- ROLLUP, partitionwise aggregation does not apply @@ -135,10 +162,12 @@ SELECT t2.y, sum(t1.y), count(*) FROM pagg_tab1 t1, pagg_tab2 t2 WHERE t1.x = t2 -- When GROUP BY clause does not match; partial aggregation is performed for each partition. -- Also test GroupAggregate paths by disabling hash aggregates. 
SET enable_hashagg TO false; +SET enable_indexagg TO false; EXPLAIN (COSTS OFF) SELECT t1.y, sum(t1.x), count(*) FROM pagg_tab1 t1, pagg_tab2 t2 WHERE t1.x = t2.y GROUP BY t1.y HAVING avg(t1.x) > 10 ORDER BY 1, 2, 3; SELECT t1.y, sum(t1.x), count(*) FROM pagg_tab1 t1, pagg_tab2 t2 WHERE t1.x = t2.y GROUP BY t1.y HAVING avg(t1.x) > 10 ORDER BY 1, 2, 3; RESET enable_hashagg; +RESET enable_indexagg; -- Check with LEFT/RIGHT/FULL OUTER JOINs which produces NULL values for -- aggregation diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 71a75bc86ea7..5f398219166d 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -318,6 +318,7 @@ select count(*), generate_series(1,2) from tenk1 group by twenty; -- test gather merge with parallel leader participation disabled set parallel_leader_participation = off; +set enable_indexagg = off; explain (costs off) select count(*) from tenk1 group by twenty; @@ -328,6 +329,7 @@ reset parallel_leader_participation; --test rescan behavior of gather merge set enable_material = false; +set enable_indexagg = false; explain (costs off) select * from @@ -341,6 +343,7 @@ select * from right join (values (1),(2),(3)) v(x) on true; reset enable_material; +reset enable_indexagg; reset enable_hashagg;