Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 8ad51b5

Browse files
committed
Sample postgres_fdw tables remotely during ANALYZE
When collecting ANALYZE sample on foreign tables, postgres_fdw fetched all rows and performed the sampling locally. For large tables this means transferring and immediately discarding large amounts of data. This commit allows the sampling to be performed on the remote server, transferring only the much smaller sample. The sampling is performed using the built-in TABLESAMPLE methods (system, bernoulli) or random() function, depending on the remote server version. Remote sampling can be enabled by analyze_sampling on the foreign server and/or foreign table, with supported values 'off', 'auto', 'system', 'bernoulli' and 'random'. The default value is 'auto' which uses either 'bernoulli' (TABLESAMPLE method) or 'random' (for remote servers without TABLESAMPLE support).
1 parent 02699bc commit 8ad51b5

File tree

7 files changed

+397
-9
lines changed

7 files changed

+397
-9
lines changed

contrib/postgres_fdw/deparse.c

+70-2
Original file line numberDiff line numberDiff line change
@@ -2367,14 +2367,57 @@ deparseAnalyzeSizeSql(StringInfo buf, Relation rel)
23672367
appendStringInfo(buf, "::pg_catalog.regclass) / %d", BLCKSZ);
23682368
}
23692369

2370+
/*
2371+
* Construct SELECT statement to acquire the number of rows of a relation.
2372+
*
2373+
* Note: we just return the remote server's reltuples value, which might
2374+
* be off a good deal, but it doesn't seem worth working harder. See
2375+
* comments in postgresAcquireSampleRowsFunc.
2376+
*/
2377+
void
2378+
deparseAnalyzeTuplesSql(StringInfo buf, Relation rel)
2379+
{
2380+
StringInfoData relname;
2381+
2382+
/* We'll need the remote relation name as a literal. */
2383+
initStringInfo(&relname);
2384+
deparseRelation(&relname, rel);
2385+
2386+
appendStringInfoString(buf, "SELECT reltuples FROM pg_catalog.pg_class WHERE oid = ");
2387+
deparseStringLiteral(buf, relname.data);
2388+
appendStringInfoString(buf, "::pg_catalog.regclass");
2389+
}
2390+
23702391
/*
23712392
* Construct SELECT statement to acquire sample rows of given relation.
23722393
*
23732394
* SELECT command is appended to buf, and list of columns retrieved
23742395
* is returned to *retrieved_attrs.
2396+
*
2397+
* We only support sampling methods we can decide based on server version.
2398+
* Allowing custom TSM modules (like tsm_system_rows) might be useful, but it
2399+
* would require detecting which extensions are installed, to allow automatic
2400+
* fall-back. Moreover, the methods may use different parameters like number
2401+
* of rows (and not sampling rate). So we leave this for future improvements.
2402+
*
2403+
* Using random() to sample rows on the remote server has the advantage that
2404+
* this works on all PostgreSQL versions (unlike TABLESAMPLE), and that it
2405+
* does the sampling on the remote side (without transferring everything and
2406+
* then discarding most rows).
2407+
*
2408+
* The disadvantage is that we still have to read all rows and evaluate the
2409+
* random(), while TABLESAMPLE (at least with the "system" method) may skip.
2410+
* It's not that different from the "bernoulli" method, though.
2411+
*
2412+
* We could also do "ORDER BY random() LIMIT x", which would always pick
2413+
* the expected number of rows, but it requires sorting so it may be much
2414+
* more expensive (particularly on large tables, which is what what the
2415+
* remote sampling is meant to improve).
23752416
*/
23762417
void
2377-
deparseAnalyzeSql(StringInfo buf, Relation rel, List **retrieved_attrs)
2418+
deparseAnalyzeSql(StringInfo buf, Relation rel,
2419+
PgFdwSamplingMethod sample_method, double sample_frac,
2420+
List **retrieved_attrs)
23782421
{
23792422
Oid relid = RelationGetRelid(rel);
23802423
TupleDesc tupdesc = RelationGetDescr(rel);
@@ -2422,10 +2465,35 @@ deparseAnalyzeSql(StringInfo buf, Relation rel, List **retrieved_attrs)
24222465
appendStringInfoString(buf, "NULL");
24232466

24242467
/*
2425-
* Construct FROM clause
2468+
* Construct FROM clause, and perhaps WHERE clause too, depending on the
2469+
* selected sampling method.
24262470
*/
24272471
appendStringInfoString(buf, " FROM ");
24282472
deparseRelation(buf, rel);
2473+
2474+
switch (sample_method)
2475+
{
2476+
case ANALYZE_SAMPLE_OFF:
2477+
/* nothing to do here */
2478+
break;
2479+
2480+
case ANALYZE_SAMPLE_RANDOM:
2481+
appendStringInfo(buf, " WHERE pg_catalog.random() < %f", sample_frac);
2482+
break;
2483+
2484+
case ANALYZE_SAMPLE_SYSTEM:
2485+
appendStringInfo(buf, " TABLESAMPLE SYSTEM(%f)", (100.0 * sample_frac));
2486+
break;
2487+
2488+
case ANALYZE_SAMPLE_BERNOULLI:
2489+
appendStringInfo(buf, " TABLESAMPLE BERNOULLI(%f)", (100.0 * sample_frac));
2490+
break;
2491+
2492+
case ANALYZE_SAMPLE_AUTO:
2493+
/* should have been resolved into actual method */
2494+
elog(ERROR, "unexpected sampling method");
2495+
break;
2496+
}
24292497
}
24302498

24312499
/*

contrib/postgres_fdw/expected/postgres_fdw.out

+25
Original file line numberDiff line numberDiff line change
@@ -11779,3 +11779,28 @@ SELECT * FROM prem2;
1177911779

1178011780
ALTER SERVER loopback OPTIONS (DROP parallel_commit);
1178111781
ALTER SERVER loopback2 OPTIONS (DROP parallel_commit);
11782+
-- ===================================================================
11783+
-- test for ANALYZE sampling
11784+
-- ===================================================================
11785+
CREATE TABLE analyze_table (id int, a text, b bigint);
11786+
CREATE FOREIGN TABLE analyze_ftable (id int, a text, b bigint)
11787+
SERVER loopback OPTIONS (table_name 'analyze_rtable1');
11788+
INSERT INTO analyze_table (SELECT x FROM generate_series(1,1000) x);
11789+
ANALYZE analyze_table;
11790+
SET default_statistics_target = 10;
11791+
ANALYZE analyze_table;
11792+
ALTER SERVER loopback OPTIONS (analyze_sampling 'invalid');
11793+
ERROR: invalid value for string option "analyze_sampling": invalid
11794+
ALTER SERVER loopback OPTIONS (analyze_sampling 'auto');
11795+
ANALYZE analyze_table;
11796+
ALTER SERVER loopback OPTIONS (SET analyze_sampling 'system');
11797+
ANALYZE analyze_table;
11798+
ALTER SERVER loopback OPTIONS (SET analyze_sampling 'bernoulli');
11799+
ANALYZE analyze_table;
11800+
ALTER SERVER loopback OPTIONS (SET analyze_sampling 'random');
11801+
ANALYZE analyze_table;
11802+
ALTER SERVER loopback OPTIONS (SET analyze_sampling 'off');
11803+
ANALYZE analyze_table;
11804+
-- cleanup
11805+
DROP FOREIGN TABLE analyze_ftable;
11806+
DROP TABLE analyze_table;

contrib/postgres_fdw/option.c

+21
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,23 @@ postgres_fdw_validator(PG_FUNCTION_ARGS)
210210
errmsg("sslcert and sslkey are superuser-only"),
211211
errhint("User mappings with the sslcert or sslkey options set may only be created or modified by the superuser.")));
212212
}
213+
else if (strcmp(def->defname, "analyze_sampling") == 0)
214+
{
215+
char *value;
216+
217+
value = defGetString(def);
218+
219+
/* we recognize off/auto/random/system/bernoulli */
220+
if (strcmp(value, "off") != 0 &&
221+
strcmp(value, "auto") != 0 &&
222+
strcmp(value, "random") != 0 &&
223+
strcmp(value, "system") != 0 &&
224+
strcmp(value, "bernoulli") != 0)
225+
ereport(ERROR,
226+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
227+
errmsg("invalid value for string option \"%s\": %s",
228+
def->defname, value)));
229+
}
213230
}
214231

215232
PG_RETURN_VOID();
@@ -257,6 +274,10 @@ InitPgFdwOptions(void)
257274
{"keep_connections", ForeignServerRelationId, false},
258275
{"password_required", UserMappingRelationId, false},
259276

277+
/* sampling is available on both server and table */
278+
{"analyze_sampling", ForeignServerRelationId, false},
279+
{"analyze_sampling", ForeignTableRelationId, false},
280+
260281
/*
261282
* sslcert and sslkey are in fact libpq options, but we repeat them
262283
* here to allow them to appear in both foreign server context (when

0 commit comments

Comments
 (0)