Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 9bb3428

Browse files
committed
Rewrite the planner's handling of materialized plan types so that there is
an explicit model of rescan costs being different from first-time costs. The costing of Material nodes in particular now has some visible relationship to the actual runtime behavior, where before it was essentially fantasy. This also fixes up a couple of places where different materialized plan types were treated differently for no very good reason (probably just oversights). A couple of the regression tests are affected, because the planner now chooses to put the other relation on the inside of a nestloop-with-materialize. So far as I can see both changes are sane, and the planner is now more consistently following the expectation that it should prefer to materialize the smaller of two relations. Per a recent discussion with Robert Haas.
1 parent 5f1b32d commit 9bb3428

File tree

12 files changed

+1034
-921
lines changed

12 files changed

+1034
-921
lines changed

src/backend/executor/execAmi.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
9-
* $PostgreSQL: pgsql/src/backend/executor/execAmi.c,v 1.103 2009/01/01 17:23:41 momjian Exp $
9+
* $PostgreSQL: pgsql/src/backend/executor/execAmi.c,v 1.104 2009/09/12 22:12:03 tgl Exp $
1010
*
1111
*-------------------------------------------------------------------------
1212
*/
@@ -496,3 +496,30 @@ IndexSupportsBackwardScan(Oid indexid)
496496

497497
return result;
498498
}
499+
500+
/*
501+
* ExecMaterializesOutput - does a plan type materialize its output?
502+
*
503+
* Returns true if the plan node type is one that automatically materializes
504+
* its output (typically by keeping it in a tuplestore). For such plans,
505+
* a rescan without any parameter change will have zero startup cost and
506+
* very low per-tuple cost.
507+
*/
508+
bool
509+
ExecMaterializesOutput(NodeTag plantype)
510+
{
511+
switch (plantype)
512+
{
513+
case T_Material:
514+
case T_FunctionScan:
515+
case T_CteScan:
516+
case T_WorkTableScan:
517+
case T_Sort:
518+
return true;
519+
520+
default:
521+
break;
522+
}
523+
524+
return false;
525+
}

src/backend/optimizer/path/costsize.c

Lines changed: 158 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
* Portions Copyright (c) 1994, Regents of the University of California
5555
*
5656
* IDENTIFICATION
57-
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.210 2009/07/11 04:09:33 tgl Exp $
57+
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.211 2009/09/12 22:12:03 tgl Exp $
5858
*
5959
*-------------------------------------------------------------------------
6060
*/
@@ -63,6 +63,7 @@
6363

6464
#include <math.h>
6565

66+
#include "executor/executor.h"
6667
#include "executor/nodeHash.h"
6768
#include "miscadmin.h"
6869
#include "nodes/nodeFuncs.h"
@@ -119,6 +120,8 @@ typedef struct
119120
static MergeScanSelCache *cached_scansel(PlannerInfo *root,
120121
RestrictInfo *rinfo,
121122
PathKey *pathkey);
123+
static void cost_rescan(PlannerInfo *root, Path *path,
124+
Cost *rescan_startup_cost, Cost *rescan_total_cost);
122125
static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
123126
static bool adjust_semi_join(PlannerInfo *root, JoinPath *path,
124127
SpecialJoinInfo *sjinfo,
@@ -895,15 +898,26 @@ cost_functionscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
895898
rte = planner_rt_fetch(baserel->relid, root);
896899
Assert(rte->rtekind == RTE_FUNCTION);
897900

898-
/* Estimate costs of executing the function expression */
901+
/*
902+
* Estimate costs of executing the function expression.
903+
*
904+
* Currently, nodeFunctionscan.c always executes the function to
905+
* completion before returning any rows, and caches the results in a
906+
* tuplestore. So the function eval cost is all startup cost, and
907+
* per-row costs are minimal.
908+
*
909+
* XXX in principle we ought to charge tuplestore spill costs if the
910+
* number of rows is large. However, given how phony our rowcount
911+
* estimates for functions tend to be, there's not a lot of point
912+
* in that refinement right now.
913+
*/
899914
cost_qual_eval_node(&exprcost, rte->funcexpr, root);
900915

901-
startup_cost += exprcost.startup;
902-
cpu_per_tuple = exprcost.per_tuple;
916+
startup_cost += exprcost.startup + exprcost.per_tuple;
903917

904918
/* Add scanning CPU costs */
905919
startup_cost += baserel->baserestrictcost.startup;
906-
cpu_per_tuple += cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
920+
cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
907921
run_cost += cpu_per_tuple * baserel->tuples;
908922

909923
path->startup_cost = startup_cost;
@@ -1176,41 +1190,44 @@ sort_exceeds_work_mem(Sort *sort)
11761190
*
11771191
* If the total volume of data to materialize exceeds work_mem, we will need
11781192
* to write it to disk, so the cost is much higher in that case.
1193+
*
1194+
* Note that here we are estimating the costs for the first scan of the
1195+
* relation, so the materialization is all overhead --- any savings will
1196+
* occur only on rescan, which is estimated in cost_rescan.
11791197
*/
11801198
void
11811199
cost_material(Path *path,
1182-
Cost input_cost, double tuples, int width)
1200+
Cost input_startup_cost, Cost input_total_cost,
1201+
double tuples, int width)
11831202
{
1184-
Cost startup_cost = input_cost;
1185-
Cost run_cost = 0;
1203+
Cost startup_cost = input_startup_cost;
1204+
Cost run_cost = input_total_cost - input_startup_cost;
11861205
double nbytes = relation_byte_size(tuples, width);
11871206
long work_mem_bytes = work_mem * 1024L;
11881207

1189-
/* disk costs */
1208+
/*
1209+
* Whether spilling or not, charge 2x cpu_tuple_cost per tuple to reflect
1210+
* bookkeeping overhead. (This rate must be more than cpu_tuple_cost;
1211+
* if it is exactly the same then there will be a cost tie between
1212+
* nestloop with A outer, materialized B inner and nestloop with B outer,
1213+
* materialized A inner. The extra cost ensures we'll prefer
1214+
* materializing the smaller rel.)
1215+
*/
1216+
run_cost += 2 * cpu_tuple_cost * tuples;
1217+
1218+
/*
1219+
* If we will spill to disk, charge at the rate of seq_page_cost per page.
1220+
* This cost is assumed to be evenly spread through the plan run phase,
1221+
* which isn't exactly accurate but our cost model doesn't allow for
1222+
* nonuniform costs within the run phase.
1223+
*/
11901224
if (nbytes > work_mem_bytes)
11911225
{
11921226
double npages = ceil(nbytes / BLCKSZ);
11931227

1194-
/* We'll write during startup and read during retrieval */
1195-
startup_cost += seq_page_cost * npages;
11961228
run_cost += seq_page_cost * npages;
11971229
}
11981230

1199-
/*
1200-
* Charge a very small amount per inserted tuple, to reflect bookkeeping
1201-
* costs. We use cpu_tuple_cost/10 for this. This is needed to break the
1202-
* tie that would otherwise exist between nestloop with A outer,
1203-
* materialized B inner and nestloop with B outer, materialized A inner.
1204-
* The extra cost ensures we'll prefer materializing the smaller rel.
1205-
*/
1206-
startup_cost += cpu_tuple_cost * 0.1 * tuples;
1207-
1208-
/*
1209-
* Also charge a small amount per extracted tuple. We use cpu_tuple_cost
1210-
* so that it doesn't appear worthwhile to materialize a bare seqscan.
1211-
*/
1212-
run_cost += cpu_tuple_cost * tuples;
1213-
12141231
path->startup_cost = startup_cost;
12151232
path->total_cost = startup_cost + run_cost;
12161233
}
@@ -1400,7 +1417,10 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
14001417
Path *inner_path = path->innerjoinpath;
14011418
Cost startup_cost = 0;
14021419
Cost run_cost = 0;
1420+
Cost inner_rescan_start_cost;
1421+
Cost inner_rescan_total_cost;
14031422
Cost inner_run_cost;
1423+
Cost inner_rescan_run_cost;
14041424
Cost cpu_per_tuple;
14051425
QualCost restrict_qual_cost;
14061426
double outer_path_rows = PATH_ROWS(outer_path);
@@ -1413,32 +1433,26 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
14131433
if (!enable_nestloop)
14141434
startup_cost += disable_cost;
14151435

1436+
/* estimate costs to rescan the inner relation */
1437+
cost_rescan(root, inner_path,
1438+
&inner_rescan_start_cost,
1439+
&inner_rescan_total_cost);
1440+
14161441
/* cost of source data */
14171442

14181443
/*
14191444
* NOTE: clearly, we must pay both outer and inner paths' startup_cost
14201445
* before we can start returning tuples, so the join's startup cost is
1421-
* their sum. What's not so clear is whether the inner path's
1422-
* startup_cost must be paid again on each rescan of the inner path. This
1423-
* is not true if the inner path is materialized or is a hashjoin, but
1424-
* probably is true otherwise.
1446+
* their sum. We'll also pay the inner path's rescan startup cost
1447+
* multiple times.
14251448
*/
14261449
startup_cost += outer_path->startup_cost + inner_path->startup_cost;
14271450
run_cost += outer_path->total_cost - outer_path->startup_cost;
1428-
if (IsA(inner_path, MaterialPath) ||
1429-
IsA(inner_path, HashPath))
1430-
{
1431-
/* charge only run cost for each iteration of inner path */
1432-
}
1433-
else
1434-
{
1435-
/*
1436-
* charge startup cost for each iteration of inner path, except we
1437-
* already charged the first startup_cost in our own startup
1438-
*/
1439-
run_cost += (outer_path_rows - 1) * inner_path->startup_cost;
1440-
}
1451+
if (outer_path_rows > 1)
1452+
run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
1453+
14411454
inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
1455+
inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
14421456

14431457
if (adjust_semi_join(root, path, sjinfo,
14441458
&outer_match_frac,
@@ -1458,12 +1472,22 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
14581472
* that fraction. (If we used a larger fuzz factor, we'd have to
14591473
* clamp inner_scan_frac to at most 1.0; but since match_count is at
14601474
* least 1, no such clamp is needed now.)
1475+
*
1476+
* A complicating factor is that rescans may be cheaper than first
1477+
* scans. If we never scan all the way to the end of the inner rel,
1478+
* it might be (depending on the plan type) that we'd never pay the
1479+
* whole inner first-scan run cost. However it is difficult to
1480+
* estimate whether that will happen, so be conservative and always
1481+
* charge the whole first-scan cost once.
14611482
*/
1483+
run_cost += inner_run_cost;
1484+
14621485
outer_matched_rows = rint(outer_path_rows * outer_match_frac);
14631486
inner_scan_frac = 2.0 / (match_count + 1.0);
14641487

1465-
/* Add inner run cost for outer tuples having matches */
1466-
run_cost += outer_matched_rows * inner_run_cost * inner_scan_frac;
1488+
/* Add inner run cost for additional outer tuples having matches */
1489+
if (outer_matched_rows > 1)
1490+
run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
14671491

14681492
/* Compute number of tuples processed (not number emitted!) */
14691493
ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
@@ -1479,21 +1503,26 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
14791503
if (indexed_join_quals)
14801504
{
14811505
run_cost += (outer_path_rows - outer_matched_rows) *
1482-
inner_run_cost / inner_path_rows;
1483-
/* We won't be evaluating any quals at all for these rows */
1506+
inner_rescan_run_cost / inner_path_rows;
1507+
/*
1508+
* We won't be evaluating any quals at all for these rows,
1509+
* so don't add them to ntuples.
1510+
*/
14841511
}
14851512
else
14861513
{
14871514
run_cost += (outer_path_rows - outer_matched_rows) *
1488-
inner_run_cost;
1515+
inner_rescan_run_cost;
14891516
ntuples += (outer_path_rows - outer_matched_rows) *
14901517
inner_path_rows;
14911518
}
14921519
}
14931520
else
14941521
{
14951522
/* Normal case; we'll scan whole input rel for each outer row */
1496-
run_cost += outer_path_rows * inner_run_cost;
1523+
run_cost += inner_run_cost;
1524+
if (outer_path_rows > 1)
1525+
run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
14971526

14981527
/* Compute number of tuples processed (not number emitted!) */
14991528
ntuples = outer_path_rows * inner_path_rows;
@@ -2190,13 +2219,13 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
21902219

21912220
/*
21922221
* Also account for subplan's startup cost. If the subplan is
2193-
* uncorrelated or undirect correlated, AND its topmost node is a Sort
2194-
* or Material node, assume that we'll only need to pay its startup
2195-
* cost once; otherwise assume we pay the startup cost every time.
2222+
* uncorrelated or undirect correlated, AND its topmost node is one
2223+
* that materializes its output, assume that we'll only need to pay
2224+
* its startup cost once; otherwise assume we pay the startup cost
2225+
* every time.
21962226
*/
21972227
if (subplan->parParam == NIL &&
2198-
(IsA(plan, Sort) ||
2199-
IsA(plan, Material)))
2228+
ExecMaterializesOutput(nodeTag(plan)))
22002229
sp_cost.startup += plan->startup_cost;
22012230
else
22022231
sp_cost.per_tuple += plan->startup_cost;
@@ -2207,6 +2236,81 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
22072236
}
22082237

22092238

2239+
/*
2240+
* cost_rescan
2241+
* Given a finished Path, estimate the costs of rescanning it after
2242+
* having done so the first time. For some Path types a rescan is
2243+
* cheaper than an original scan (if no parameters change), and this
2244+
* function embodies knowledge about that. The default is to return
2245+
* the same costs stored in the Path. (Note that the cost estimates
2246+
* actually stored in Paths are always for first scans.)
2247+
*
2248+
* This function is not currently intended to model effects such as rescans
2249+
* being cheaper due to disk block caching; what we are concerned with is
2250+
* plan types wherein the executor caches results explicitly, or doesn't
2251+
* redo startup calculations, etc.
2252+
*/
2253+
static void
2254+
cost_rescan(PlannerInfo *root, Path *path,
2255+
Cost *rescan_startup_cost, /* output parameters */
2256+
Cost *rescan_total_cost)
2257+
{
2258+
switch (path->pathtype)
2259+
{
2260+
case T_FunctionScan:
2261+
/*
2262+
* Currently, nodeFunctionscan.c always executes the function
2263+
* to completion before returning any rows, and caches the
2264+
* results in a tuplestore. So the function eval cost is
2265+
* all startup cost and isn't paid over again on rescans.
2266+
* However, all run costs will be paid over again.
2267+
*/
2268+
*rescan_startup_cost = 0;
2269+
*rescan_total_cost = path->total_cost - path->startup_cost;
2270+
break;
2271+
case T_HashJoin:
2272+
/*
2273+
* Assume that all of the startup cost represents hash table
2274+
* building, which we won't have to do over.
2275+
*/
2276+
*rescan_startup_cost = 0;
2277+
*rescan_total_cost = path->total_cost - path->startup_cost;
2278+
break;
2279+
case T_Material:
2280+
case T_CteScan:
2281+
case T_WorkTableScan:
2282+
case T_Sort:
2283+
{
2284+
/*
2285+
* These plan types materialize their final result in a
2286+
* tuplestore or tuplesort object. So the rescan cost is only
2287+
* cpu_tuple_cost per tuple, unless the result is large enough
2288+
* to spill to disk.
2289+
*/
2290+
Cost run_cost = cpu_tuple_cost * path->parent->rows;
2291+
double nbytes = relation_byte_size(path->parent->rows,
2292+
path->parent->width);
2293+
long work_mem_bytes = work_mem * 1024L;
2294+
2295+
if (nbytes > work_mem_bytes)
2296+
{
2297+
/* It will spill, so account for re-read cost */
2298+
double npages = ceil(nbytes / BLCKSZ);
2299+
2300+
run_cost += seq_page_cost * npages;
2301+
}
2302+
*rescan_startup_cost = 0;
2303+
*rescan_total_cost = run_cost;
2304+
}
2305+
break;
2306+
default:
2307+
*rescan_startup_cost = path->startup_cost;
2308+
*rescan_total_cost = path->total_cost;
2309+
break;
2310+
}
2311+
}
2312+
2313+
22102314
/*
22112315
* cost_qual_eval
22122316
* Estimate the CPU costs of evaluating a WHERE clause.

0 commit comments

Comments (0)