diff --git a/README.md b/README.md index f4bd9bd..fef370e 100644 --- a/README.md +++ b/README.md @@ -27,19 +27,27 @@ When `pg_wait_sampling` is enabled, it collects two kinds of statistics. recent samples depending on history size (configurable). Assuming there is a client who periodically read this history and dump it somewhere, user can have continuous history. - * Waits profile. It's implemented as in-memory hash table where count - of samples are accumulated per each process and each wait event - (and each query with `pg_stat_statements`). This hash - table can be reset by user request. Assuming there is a client who - periodically dumps profile and resets it, user can have statistics of - intensivity of wait events among time. - -In combination with `pg_stat_statements` this extension can also provide -per query statistics. + * Waits profile. It's implemented as bounded in-memory hash table where counts + of samples are accumulated per triple of process pid, wait event and query id + (when its computing is enabled on PG server, on versions below 14 this + requires `pg_stat_statements` extension). The least used entries are evicted + when overflow of hash table is encountered. Hash table also can be reset by + user request. Assuming there is a client who periodically dumps profile and + computes differential counters from adjacent dumps, user can have statistics + of intensity of wait events over time. + +Starting from PG14 this extension might activate computing of query id on server +side to enable per query id statistics. The older PG versions require to install +`pg_stat_statements` extension for this purpose. `pg_wait_sampling` launches special background worker for gathering the statistics above. +The profile statistics as well as history items are not persisted to disk so +server restart resets all already accumulated data. 
This is not crucial for +profile counters because we are primarily interested in differential values, not +absolute values of these counters. + Availability ------------ @@ -125,24 +133,28 @@ in-memory hash table. The work of wait event statistics collector worker is controlled by following GUCs. -| Parameter name | Data type | Description | Default value | -| ----------------------------------- | --------- | ------------------------------------------- | ------------: | -| pg_wait_sampling.history_size | int4 | Size of history in-memory ring buffer | 5000 | -| pg_wait_sampling.history_period | int4 | Period for history sampling in milliseconds | 10 | -| pg_wait_sampling.profile_period | int4 | Period for profile sampling in milliseconds | 10 | -| pg_wait_sampling.profile_pid | bool | Whether profile should be per pid | true | -| pg_wait_sampling.profile_queries | bool | Whether profile should be per query | true | +| Parameter name | Data type | Description | Default value | Change policy | +| ------------------------------------ | --------- | ----------------------------------------------------------------------------------- | ------------- | ------------- | +| pg_wait_sampling.max_profile_entries | int4 | Maximum number of entries in profile hash table | 5000 | restart | +| pg_wait_sampling.history_size | int4 | Size of history in-memory ring buffer | 5000 | restart | +| pg_wait_sampling.profile_period | int4 | Period for profile sampling in milliseconds (zero value disables profile gathering) | 10 | reload | +| pg_wait_sampling.history_period | int4 | Period for history sampling in milliseconds (zero value disables history gathering) | 0 | reload | +| pg_wait_sampling.profile_pid | bool | Whether profile should be per pid | true | restart | +| pg_wait_sampling.profile_queries | bool | Whether profile should be per query | true | restart | If `pg_wait_sampling.profile_pid` is set to false, sampling profile wouldn't be -collected in per-process manner. 
In this case the value of pid could would -be always zero and corresponding row contain samples among all the processes. +collected in per-process manner. In this case the value of pid will be NULL and +corresponding rows contain samples among all the processes. -While `pg_wait_sampling.profile_queries` is set to false `queryid` field in -views will be zero. +__Caution__: +When sampling per pid is enabled, all profile entries for already completed +processes are left in hash table. Therefore, it's necessary to take into +account periodic flushing of profile to prevent recycling of 32-bit pid values +in profile hash table and as consequence possible increments to profile entries +belonging to some old processes with the same pid values as for current ones. -These GUCs are allowed to be changed by superuser. Also, they are placed into -shared memory. Thus, they could be changed from any backend and affects worker -runtime. +While `pg_wait_sampling.profile_queries` is set to false `queryid` field in +views will be NULL. 
See [PostgreSQL documentation](http://www.postgresql.org/docs/devel/static/monitoring-stats.html#WAIT-EVENT-TABLE) diff --git a/collector.c b/collector.c index dcb9695..af3337f 100644 --- a/collector.c +++ b/collector.c @@ -5,301 +5,221 @@ * Copyright (c) 2015-2016, Postgres Professional * * IDENTIFICATION - * contrib/pg_wait_sampling/pg_wait_sampling.c + * contrib/pg_wait_sampling/collector.c */ #include "postgres.h" -#include "catalog/pg_type.h" #if PG_VERSION_NUM >= 130000 #include "common/hashfn.h" #endif -#include "funcapi.h" -#include "miscadmin.h" +#include "pgstat.h" #include "postmaster/bgworker.h" +#if PG_VERSION_NUM >= 130000 +#include "postmaster/interrupt.h" +#endif #include "storage/ipc.h" -#include "storage/procarray.h" +#include "storage/proc.h" #include "storage/procsignal.h" -#include "storage/shm_mq.h" -#include "storage/shm_toc.h" -#include "storage/spin.h" -#include "utils/memutils.h" -#include "utils/resowner.h" -#include "pgstat.h" +#include "utils/guc.h" #include "compat.h" #include "pg_wait_sampling.h" +static const double USAGE_INIT = 1.0; +static const double USAGE_INCREASE = 1.0; +static const double USAGE_DECREASE_FACTOR = 0.99; +static const int USAGE_DEALLOC_PERCENT = 5; +static const int USAGE_DEALLOC_MIN_NUM = 10; static volatile sig_atomic_t shutdown_requested = false; static void handle_sigterm(SIGNAL_ARGS); -/* - * Register background worker for collecting waits history. 
- */ -void -pgws_register_wait_collector(void) +static void +handle_sigterm(SIGNAL_ARGS) { - BackgroundWorker worker; - - /* Set up background worker parameters */ - memset(&worker, 0, sizeof(worker)); - worker.bgw_flags = BGWORKER_SHMEM_ACCESS; - worker.bgw_start_time = BgWorkerStart_ConsistentState; - worker.bgw_restart_time = 1; - worker.bgw_notify_pid = 0; - snprintf(worker.bgw_library_name, BGW_MAXLEN, "pg_wait_sampling"); - snprintf(worker.bgw_function_name, BGW_MAXLEN, CppAsString(pgws_collector_main)); - snprintf(worker.bgw_name, BGW_MAXLEN, "pg_wait_sampling collector"); - worker.bgw_main_arg = (Datum) 0; - RegisterBackgroundWorker(&worker); + int save_errno = errno; + shutdown_requested = true; + if (MyProc) + SetLatch(&MyProc->procLatch); + errno = save_errno; } /* - * Allocate memory for waits history. + * qsort comparator for sorting into increasing usage order */ -static void -alloc_history(History *observations, int count) +static int +entry_cmp(const void *lhs, const void *rhs) { - observations->items = (HistoryItem *) palloc0(sizeof(HistoryItem) * count); - observations->index = 0; - observations->count = count; - observations->wraparound = false; + double l_usage = (*(ProfileHashEntry *const *) lhs)->usage; + double r_usage = (*(ProfileHashEntry *const *) rhs)->usage; + + if (l_usage < r_usage) + return -1; + else if (l_usage > r_usage) + return +1; + else + return 0; } /* - * Reallocate memory for changed number of history items. + * Deallocate least used entries in profile hashtable. + * Caller must hold an exclusive lock. 
*/ static void -realloc_history(History *observations, int count) +pgws_entry_dealloc() { - HistoryItem *newitems; - int copyCount, - i, - j; - - /* Allocate new array for history */ - newitems = (HistoryItem *) palloc0(sizeof(HistoryItem) * count); - - /* Copy entries from old array to the new */ - if (observations->wraparound) - copyCount = observations->count; - else - copyCount = observations->index; + HASH_SEQ_STATUS hash_seq; + ProfileHashEntry **entries; + ProfileHashEntry *entry; + int nvictims; + int i; - copyCount = Min(copyCount, count); + /* + * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them. + * While we're scanning the table, apply the decay factor to the usage + * values. + */ + entries = palloc( + hash_get_num_entries(pgws_profile_hash) * sizeof(ProfileHashEntry *) + ); i = 0; - if (observations->wraparound) - j = observations->index + 1; - else - j = 0; - while (i < copyCount) + hash_seq_init(&hash_seq, pgws_profile_hash); + while ((entry = hash_seq_search(&hash_seq)) != NULL) { - if (j >= observations->count) - j = 0; - memcpy(&newitems[i], &observations->items[j], sizeof(HistoryItem)); - i++; - j++; + entries[i++] = entry; + entry->usage *= USAGE_DECREASE_FACTOR; } - /* Switch to new history array */ - pfree(observations->items); - observations->items = newitems; - observations->index = copyCount; - observations->count = count; - observations->wraparound = false; -} - -static void -handle_sigterm(SIGNAL_ARGS) -{ - int save_errno = errno; - shutdown_requested = true; - if (MyProc) - SetLatch(&MyProc->procLatch); - errno = save_errno; -} + qsort(entries, i, sizeof(ProfileHashEntry *), entry_cmp); -/* - * Get next item of history with rotation. 
- */ -static HistoryItem * -get_next_observation(History *observations) -{ - HistoryItem *result; + /* + * We remove USAGE_DEALLOC_PERCENT number of entries or at least + * USAGE_DEALLOC_MIN_NUM entries if full number of existing entries is not + * less + */ + nvictims = Max(USAGE_DEALLOC_MIN_NUM, i * USAGE_DEALLOC_PERCENT / 100); + nvictims = Min(nvictims, i); - if (observations->index >= observations->count) + for (i = 0; i < nvictims; i++) { - observations->index = 0; - observations->wraparound = true; + hash_search(pgws_profile_hash, &entries[i]->key, HASH_REMOVE, NULL); } - result = &observations->items[observations->index]; - observations->index++; - return result; + + pfree(entries); } /* - * Read current waits from backends and write them to history array - * and/or profile hash. + * Read current waits from backends and write them to shared structures */ static void -probe_waits(History *observations, HTAB *profile_hash, - bool write_history, bool write_profile, bool profile_pid) +probe_waits(const bool write_history, const bool write_profile) { - int i, - newSize; - TimestampTz ts = GetCurrentTimestamp(); - - /* Realloc waits history if needed */ - newSize = pgws_collector_hdr->historySize; - if (observations->count != newSize) - realloc_history(observations, newSize); + if (write_profile) + LWLockAcquire(pgws_profile_lock, LW_EXCLUSIVE); + if (write_history) + LWLockAcquire(pgws_history_lock, LW_EXCLUSIVE); - /* Iterate PGPROCs under shared lock */ + /* + * Iterate PGPROCs under shared lock. + * + * TODO: + * ProcArrayLock is heavy enough and in current case we might perform the + * non-trivial deallocation routine for profile hash table under this lock. + * Therefore to reduce possible contention it's worth to segregate the logic + * of PGPROCs iteration under ProcArrayLock and storing results to profile + * and/or history under corresponding another lock. 
+ */ LWLockAcquire(ProcArrayLock, LW_SHARED); - for (i = 0; i < ProcGlobal->allProcCount; i++) + for (int i = 0; i < ProcGlobal->allProcCount; i++) { - HistoryItem item, - *observation; - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = GetPGProcByNumber(i); + pgwsQueryId queryId = WhetherProfileQueryId ? pgws_proc_queryids[i] : 0; + int32 wait_event_info = proc->wait_event_info, + pid = proc->pid; - if (proc->pid == 0) + /* + * FIXME: zero pid actually doesn't indicate that process slot is freed. + * After process termination this field remains unchanged and thereby + * stores the pid of previous process. The possible indicator of process + * termination might be a condition `proc->procLatch->owner_pid == 0`. + * But even in this case ProcArrayLock doesn't protect `owner_pid` + * field from concurrent modifications that might cause race conditions. + * + * Another option is to use the lists of freed PGPROCs from ProcGlobal: + * freeProcs, walsenderFreeProcs, bgworkerFreeProcs and autovacFreeProcs + * to define indexes of all freed slots in allProcs. But this requires + * acquiring ProcStructLock spinlock that is impractical for iteration + * over so long lists. + * + * The most appropriate solution here is to iterate over ProcArray items + * under ProcArrayLock and over AuxiliaryProcs under ProcStructLock + * spinlock (AuxiliaryProcs contains just NUM_AUXILIARY_PROCS=5 slots) + * or without any locks as it's done in pg_stat_get_activity() function. + * These arrays are not accessible externally and require to add some + * iterator object into corresponding containing modules. 
+ */ + if (pid == 0) continue; - if (proc->wait_event_info == 0) + // TODO: take into account the state without waiting as CPU time + if (wait_event_info == 0) continue; - /* Collect next wait event sample */ - item.pid = proc->pid; - item.wait_event_info = proc->wait_event_info; - - if (pgws_collector_hdr->profileQueries) - item.queryId = pgws_proc_queryids[i]; - else - item.queryId = 0; - - item.ts = ts; - /* Write to the history if needed */ if (write_history) { - observation = get_next_observation(observations); - *observation = item; + int index = pgws_history_ring->index % HistoryBufferSize; + + pgws_history_ring->items[index] = (HistoryItem) { + pid, wait_event_info, queryId, GetCurrentTimestamp() + }; + pgws_history_ring->index++; } /* Write to the profile if needed */ if (write_profile) { - ProfileItem *profileItem; - bool found; + ProfileHashKey key; + ProfileHashEntry *entry; - if (!profile_pid) - item.pid = 0; + /* Set up key for hashtable search */ + key.pid = WhetherProfilePid ? pid : 0; + key.wait_event_info = wait_event_info; + key.queryid = queryId; - profileItem = (ProfileItem *) hash_search(profile_hash, &item, HASH_ENTER, &found); - if (found) - profileItem->count++; - else - profileItem->count = 1; - } - } - LWLockRelease(ProcArrayLock); -} + /* Lookup the hash table entry with exclusive lock */ + entry = (ProfileHashEntry *) + hash_search(pgws_profile_hash, &key, HASH_FIND, NULL); -/* - * Send waits history to shared memory queue. 
- */ -static void -send_history(History *observations, shm_mq_handle *mqh) -{ - Size count, - i; - shm_mq_result mq_result; - - if (observations->wraparound) - count = observations->count; - else - count = observations->index; + /* Create new entry, if not present */ + if (!entry) + { - mq_result = shm_mq_send_compat(mqh, sizeof(count), &count, false, true); - if (mq_result == SHM_MQ_DETACHED) - { - ereport(WARNING, - (errmsg("pg_wait_sampling collector: " - "receiver of message queue has been detached"))); - return; - } - for (i = 0; i < count; i++) - { - mq_result = shm_mq_send_compat(mqh, - sizeof(HistoryItem), - &observations->items[i], - false, - true); - if (mq_result == SHM_MQ_DETACHED) - { - ereport(WARNING, - (errmsg("pg_wait_sampling collector: " - "receiver of message queue has been detached"))); - return; - } - } -} + /* Make space if needed */ + while (hash_get_num_entries(pgws_profile_hash) >= MaxProfileEntries) + pgws_entry_dealloc(); -/* - * Send profile to shared memory queue. 
- */ -static void -send_profile(HTAB *profile_hash, shm_mq_handle *mqh) -{ - HASH_SEQ_STATUS scan_status; - ProfileItem *item; - Size count = hash_get_num_entries(profile_hash); - shm_mq_result mq_result; + entry = (ProfileHashEntry *) + hash_search(pgws_profile_hash, &key, HASH_ENTER_NULL, NULL); + Assert(entry); - mq_result = shm_mq_send_compat(mqh, sizeof(count), &count, false, true); - if (mq_result == SHM_MQ_DETACHED) - { - ereport(WARNING, - (errmsg("pg_wait_sampling collector: " - "receiver of message queue has been detached"))); - return; - } - hash_seq_init(&scan_status, profile_hash); - while ((item = (ProfileItem *) hash_seq_search(&scan_status)) != NULL) - { - mq_result = shm_mq_send_compat(mqh, sizeof(ProfileItem), item, false, - true); - if (mq_result == SHM_MQ_DETACHED) - { - hash_seq_term(&scan_status); - ereport(WARNING, - (errmsg("pg_wait_sampling collector: " - "receiver of message queue has been detached"))); - return; + entry->counter = 1; + entry->usage = USAGE_INIT; + } + else + { + entry->counter++; + entry->usage += USAGE_INCREASE; + } } } -} - -/* - * Make hash table for wait profile. 
- */ -static HTAB * -make_profile_hash() -{ - HASHCTL hash_ctl; - - hash_ctl.hash = tag_hash; - hash_ctl.hcxt = TopMemoryContext; - - if (pgws_collector_hdr->profileQueries) - hash_ctl.keysize = offsetof(ProfileItem, count); - else - hash_ctl.keysize = offsetof(ProfileItem, queryId); + LWLockRelease(ProcArrayLock); - hash_ctl.entrysize = sizeof(ProfileItem); - return hash_create("Waits profile hash", 1024, &hash_ctl, - HASH_FUNCTION | HASH_ELEM); + if (write_history) + LWLockRelease(pgws_history_lock); + if (write_profile) + LWLockRelease(pgws_profile_lock); } /* @@ -323,10 +243,6 @@ millisecs_diff(TimestampTz tz1, TimestampTz tz2) void pgws_collector_main(Datum main_arg) { - HTAB *profile_hash = NULL; - History observations; - MemoryContext old_context, - collector_context; TimestampTz current_ts, history_ts, profile_ts; @@ -350,6 +266,13 @@ pgws_collector_main(Datum main_arg) */ pqsignal(SIGTERM, handle_sigterm); pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, +#if PG_VERSION_NUM >= 130000 + SignalHandlerForConfigReload +#else + PostgresSigHupHandler +#endif + ); BackgroundWorkerUnblockSignals(); InitPostgresCompat(NULL, InvalidOid, NULL, InvalidOid, false, false, NULL); SetProcessingMode(NormalProcessing); @@ -357,16 +280,6 @@ pgws_collector_main(Datum main_arg) /* Make pg_wait_sampling recognisable in pg_stat_activity */ pgstat_report_appname("pg_wait_sampling collector"); - profile_hash = make_profile_hash(); - pgws_collector_hdr->latch = &MyProc->procLatch; - - CurrentResourceOwner = ResourceOwnerCreate(NULL, "pg_wait_sampling collector"); - collector_context = AllocSetContextCreate(TopMemoryContext, - "pg_wait_sampling context", ALLOCSET_DEFAULT_SIZES); - old_context = MemoryContextSwitchTo(collector_context); - alloc_history(&observations, pgws_collector_hdr->historySize); - MemoryContextSwitchTo(old_context); - ereport(LOG, (errmsg("pg_wait_sampling collector started"))); /* Start counting time for history and profile samples */ @@ 
-374,33 +287,45 @@ pgws_collector_main(Datum main_arg) while (1) { - int rc; - shm_mq_handle *mqh; - int64 history_diff, - profile_diff; - int history_period, - profile_period; - bool write_history, - write_profile; + int rc; + int64 history_diff, + profile_diff; + bool write_history, + write_profile; + int history_timeout, + profile_timeout, + actual_timeout; + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); /* We need an explicit call for at least ProcSignal notifications. */ CHECK_FOR_INTERRUPTS(); - /* Wait calculate time to next sample for history or profile */ - current_ts = GetCurrentTimestamp(); + /* Process any requests or signals received recently */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + /* Shutdown if requested */ + if (shutdown_requested) + break; + /* Calculate time for the next sample of history or profile */ + current_ts = GetCurrentTimestamp(); history_diff = millisecs_diff(history_ts, current_ts); profile_diff = millisecs_diff(profile_ts, current_ts); - history_period = pgws_collector_hdr->historyPeriod; - profile_period = pgws_collector_hdr->profilePeriod; - - write_history = (history_diff >= (int64)history_period); - write_profile = (profile_diff >= (int64)profile_period); + /* Write profile or history */ + write_history = HistoryPeriod && + (history_diff >= (int64) HistoryPeriod); + write_profile = ProfilePeriod && + (profile_diff >= (int64) ProfilePeriod); if (write_history || write_profile) { - probe_waits(&observations, profile_hash, - write_history, write_profile, pgws_collector_hdr->profilePid); + probe_waits(write_history, write_profile); if (write_history) { @@ -415,87 +340,29 @@ pgws_collector_main(Datum main_arg) } } - /* Shutdown if requested */ - if (shutdown_requested) - break; - - /* - * Wait until next sample time or request to do something through - * shared memory. 
- */ -#if PG_VERSION_NUM >= 100000 - rc = WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, - Min(history_period - (int)history_diff, - profile_period - (int)profile_diff), PG_WAIT_EXTENSION); -#else - rc = WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, - Min(history_period - (int)history_diff, - profile_period - (int)profile_diff)); -#endif + /* Wait until next sample time */ + history_timeout = HistoryPeriod >= (int) history_diff ? + HistoryPeriod - (int) history_diff : 0; + profile_timeout = ProfilePeriod >= (int) profile_diff ? + ProfilePeriod - (int) profile_diff : 0; + + actual_timeout = 0; + if (ProfilePeriod && !HistoryPeriod) + actual_timeout = profile_timeout; + else if (HistoryPeriod && !ProfilePeriod) + actual_timeout = history_timeout; + else if (HistoryPeriod && ProfilePeriod) + actual_timeout = Min(history_timeout, profile_timeout); + + rc = WaitLatchCompat(MyLatch, + WL_LATCH_SET | WL_POSTMASTER_DEATH | + (HistoryPeriod || ProfilePeriod ? 
WL_TIMEOUT : 0), + actual_timeout, PG_WAIT_EXTENSION); if (rc & WL_POSTMASTER_DEATH) proc_exit(1); - - ResetLatch(&MyProc->procLatch); - - /* Handle request if any */ - if (pgws_collector_hdr->request != NO_REQUEST) - { - LOCKTAG tag; - SHMRequest request; - - pgws_init_lock_tag(&tag, PGWS_COLLECTOR_LOCK); - - LockAcquire(&tag, ExclusiveLock, false, false); - request = pgws_collector_hdr->request; - pgws_collector_hdr->request = NO_REQUEST; - - if (request == HISTORY_REQUEST || request == PROFILE_REQUEST) - { - shm_mq_result mq_result; - - /* Send history or profile */ - shm_mq_set_sender(pgws_collector_mq, MyProc); - mqh = shm_mq_attach(pgws_collector_mq, NULL, NULL); - mq_result = shm_mq_wait_for_attach(mqh); - switch (mq_result) - { - case SHM_MQ_SUCCESS: - switch (request) - { - case HISTORY_REQUEST: - send_history(&observations, mqh); - break; - case PROFILE_REQUEST: - send_profile(profile_hash, mqh); - break; - default: - Assert(false); - } - break; - case SHM_MQ_DETACHED: - ereport(WARNING, - (errmsg("pg_wait_sampling collector: " - "receiver of message queue have been " - "detached"))); - break; - default: - Assert(false); - } - shm_mq_detach_compat(mqh, pgws_collector_mq); - } - else if (request == PROFILE_RESET) - { - /* Reset profile hash */ - hash_destroy(profile_hash); - profile_hash = make_profile_hash(); - } - LockRelease(&tag, ExclusiveLock, false); - } } - MemoryContextReset(collector_context); - /* * We're done. Explicitly detach the shared memory segment so that we * don't get a resource leak warning at commit time. 
This will fire any diff --git a/compat.h b/compat.h index 32874f7..76aa874 100644 --- a/compat.h +++ b/compat.h @@ -14,8 +14,13 @@ #include "access/tupdesc.h" #include "miscadmin.h" -#include "storage/shm_mq.h" -#include "utils/guc_tables.h" +#include "storage/latch.h" + +#if PG_VERSION_NUM >= 110000 +typedef uint64 pgwsQueryId; +#else +typedef uint32 pgwsQueryId; +#endif static inline TupleDesc CreateTemplateTupleDescCompat(int nattrs, bool hasoid) @@ -27,27 +32,6 @@ CreateTemplateTupleDescCompat(int nattrs, bool hasoid) #endif } -static inline void -shm_mq_detach_compat(shm_mq_handle *mqh, shm_mq *mq) -{ -#if PG_VERSION_NUM >= 100000 - shm_mq_detach(mqh); -#else - shm_mq_detach(mq); -#endif -} - -static inline shm_mq_result -shm_mq_send_compat(shm_mq_handle *mqh, Size nbytes, const void *data, - bool nowait, bool force_flush) -{ -#if PG_VERSION_NUM >= 150000 - return shm_mq_send(mqh, nbytes, data, nowait, force_flush); -#else - return shm_mq_send(mqh, nbytes, data, nowait); -#endif -} - static inline void InitPostgresCompat(const char *in_dbname, Oid dboid, const char *username, Oid useroid, @@ -66,18 +50,20 @@ InitPostgresCompat(const char *in_dbname, Oid dboid, #endif } -static inline void -get_guc_variables_compat(struct config_generic ***vars, int *num_vars) +static inline int +WaitLatchCompat(Latch *latch, int wakeEvents, long timeout, + uint32 wait_event_info) { - Assert(vars != NULL); - Assert(num_vars != NULL); - -#if PG_VERSION_NUM >= 160000 - *vars = get_guc_variables(num_vars); +#if PG_VERSION_NUM >= 100000 + return WaitLatch(latch, wakeEvents, timeout, wait_event_info); #else - *vars = get_guc_variables(); - *num_vars = GetNumConfigOptions(); +#define PG_WAIT_EXTENSION -1 + return WaitLatch(latch, wakeEvents, timeout); #endif } +#if PG_VERSION_NUM < 100000 +#define GetPGProcByNumber(n) (&ProcGlobal->allProcs[(n)]) +#endif + #endif diff --git a/pg_wait_sampling.c b/pg_wait_sampling.c index eaa0327..8c7e781 100644 --- a/pg_wait_sampling.c +++ 
b/pg_wait_sampling.c @@ -10,27 +10,22 @@ #include "postgres.h" #include "access/htup_details.h" -#include "access/twophase.h" #include "catalog/pg_type.h" -#include "fmgr.h" #include "funcapi.h" -#include "miscadmin.h" #include "optimizer/planner.h" #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "postmaster/bgworker.h" #if PG_VERSION_NUM >= 120000 #include "replication/walsender.h" #endif #include "storage/ipc.h" -#include "storage/pg_shmem.h" -#include "storage/procarray.h" -#include "storage/shm_mq.h" -#include "storage/shm_toc.h" -#include "storage/spin.h" +#include "storage/proc.h" #include "utils/builtins.h" -#include "utils/datetime.h" -#include "utils/guc_tables.h" #include "utils/guc.h" +#if PG_VERSION_NUM >= 140000 +#include "utils/queryjumble.h" +#endif #include "utils/memutils.h" /* TopMemoryContext. Actually for PG 9.6 only, * but there should be no harm for others. */ @@ -39,29 +34,28 @@ PG_MODULE_MAGIC; -void _PG_init(void); - +/* Marker whether extension is setup in shared mode */ static bool shmem_initialized = false; +/* Global settings */ +int MaxProfileEntries = 5000; +int HistoryBufferSize = 5000; +int HistoryPeriod = 0; +int ProfilePeriod = 10; +bool WhetherProfilePid = true; +bool WhetherProfileQueryId = true; + +/* Function declarations */ +void _PG_init(void); +// TODO: add void _PG_fini(void); + /* Hooks */ static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; static planner_hook_type planner_hook_next = NULL; - -/* Pointers to shared memory objects */ -shm_mq *pgws_collector_mq = NULL; -uint64 *pgws_proc_queryids = NULL; -CollectorShmqHeader *pgws_collector_hdr = NULL; - -/* Receiver (backend) local shm_mq pointers and lock */ -static shm_mq *recv_mq = NULL; -static shm_mq_handle *recv_mqh = NULL; -static LOCKTAG queueTag; - #if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook = NULL; +static shmem_request_hook_type prev_shmem_request_hook = NULL; #endif -static shmem_startup_hook_type 
prev_shmem_startup_hook = NULL; -static PGPROC * search_proc(int backendPid); +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; static PlannedStmt *pgws_planner_hook(Query *parse, #if PG_VERSION_NUM >= 130000 const char *query_string, @@ -69,6 +63,13 @@ static PlannedStmt *pgws_planner_hook(Query *parse, int cursorOptions, ParamListInfo boundParams); static void pgws_ExecutorEnd(QueryDesc *queryDesc); +/* Pointers to shared memory objects */ +pgwsQueryId *pgws_proc_queryids = NULL; +HTAB *pgws_profile_hash = NULL; +LWLock *pgws_profile_lock = NULL; +History *pgws_history_ring = NULL; +LWLock *pgws_history_lock = NULL; + /* * Calculate max processes count. * @@ -135,149 +136,63 @@ get_max_procs_count(void) static Size pgws_shmem_size(void) { - shm_toc_estimator e; - Size size; - int nkeys; - - shm_toc_initialize_estimator(&e); + Size size = 0; - nkeys = 3; - - shm_toc_estimate_chunk(&e, sizeof(CollectorShmqHeader)); - shm_toc_estimate_chunk(&e, (Size) COLLECTOR_QUEUE_SIZE); - shm_toc_estimate_chunk(&e, sizeof(uint64) * get_max_procs_count()); - - shm_toc_estimate_keys(&e, nkeys); - size = shm_toc_estimate(&e); + size = add_size(size, sizeof(pgwsQueryId) * get_max_procs_count()); + size = add_size(size, hash_estimate_size(MaxProfileEntries, + sizeof(ProfileHashEntry))); + size = add_size(size, + sizeof(History) + sizeof(HistoryItem) * HistoryBufferSize); return size; } -static bool -shmem_int_guc_check_hook(int *newval, void **extra, GucSource source) -{ - if (UsedShmemSegAddr == NULL) - return false; - return true; -} - -static bool -shmem_bool_guc_check_hook(bool *newval, void **extra, GucSource source) +static void +pgwsEnableQueryId(bool newval, void *extra) { - if (UsedShmemSegAddr == NULL) - return false; - return true; +#if PG_VERSION_NUM >= 140000 + if (newval) + EnableQueryId(); +#endif } -/* - * This union allows us to mix the numerous different types of structs - * that we are organizing. 
- */ -typedef union -{ - struct config_generic generic; - struct config_bool _bool; - struct config_real real; - struct config_int integer; - struct config_string string; - struct config_enum _enum; -} mixedStruct; - /* * Setup new GUCs or modify existsing. */ static void setup_gucs() { - struct config_generic **guc_vars; - int numOpts, - i; - bool history_size_found = false, - history_period_found = false, - profile_period_found = false, - profile_pid_found = false, - profile_queries_found = false; - - get_guc_variables_compat(&guc_vars, &numOpts); - - for (i = 0; i < numOpts; i++) - { - mixedStruct *var = (mixedStruct *) guc_vars[i]; - const char *name = var->generic.name; - - if (var->generic.flags & GUC_CUSTOM_PLACEHOLDER) - continue; - - if (!strcmp(name, "pg_wait_sampling.history_size")) - { - history_size_found = true; - var->integer.variable = &pgws_collector_hdr->historySize; - pgws_collector_hdr->historySize = 5000; - } - else if (!strcmp(name, "pg_wait_sampling.history_period")) - { - history_period_found = true; - var->integer.variable = &pgws_collector_hdr->historyPeriod; - pgws_collector_hdr->historyPeriod = 10; - } - else if (!strcmp(name, "pg_wait_sampling.profile_period")) - { - profile_period_found = true; - var->integer.variable = &pgws_collector_hdr->profilePeriod; - pgws_collector_hdr->profilePeriod = 10; - } - else if (!strcmp(name, "pg_wait_sampling.profile_pid")) - { - profile_pid_found = true; - var->_bool.variable = &pgws_collector_hdr->profilePid; - pgws_collector_hdr->profilePid = true; - } - else if (!strcmp(name, "pg_wait_sampling.profile_queries")) - { - profile_queries_found = true; - var->_bool.variable = &pgws_collector_hdr->profileQueries; - pgws_collector_hdr->profileQueries = true; - } - } - - if (!history_size_found) - DefineCustomIntVariable("pg_wait_sampling.history_size", - "Sets size of waits history.", NULL, - &pgws_collector_hdr->historySize, 5000, 100, INT_MAX, - PGC_SUSET, 0, shmem_int_guc_check_hook, NULL, NULL); - - 
if (!history_period_found) - DefineCustomIntVariable("pg_wait_sampling.history_period", - "Sets period of waits history sampling.", NULL, - &pgws_collector_hdr->historyPeriod, 10, 1, INT_MAX, - PGC_SUSET, 0, shmem_int_guc_check_hook, NULL, NULL); - - if (!profile_period_found) - DefineCustomIntVariable("pg_wait_sampling.profile_period", - "Sets period of waits profile sampling.", NULL, - &pgws_collector_hdr->profilePeriod, 10, 1, INT_MAX, - PGC_SUSET, 0, shmem_int_guc_check_hook, NULL, NULL); - - if (!profile_pid_found) - DefineCustomBoolVariable("pg_wait_sampling.profile_pid", - "Sets whether profile should be collected per pid.", NULL, - &pgws_collector_hdr->profilePid, true, - PGC_SUSET, 0, shmem_bool_guc_check_hook, NULL, NULL); - - if (!profile_queries_found) - DefineCustomBoolVariable("pg_wait_sampling.profile_queries", - "Sets whether profile should be collected per query.", NULL, - &pgws_collector_hdr->profileQueries, true, - PGC_SUSET, 0, shmem_bool_guc_check_hook, NULL, NULL); - - if (history_size_found - || history_period_found - || profile_period_found - || profile_pid_found - || profile_queries_found) - { - ProcessConfigFile(PGC_SIGHUP); - } + DefineCustomIntVariable("pg_wait_sampling.max_profile_entries", + "Sets maximum number of entries in bounded profile table.", NULL, + &MaxProfileEntries, 5000, 100, INT_MAX, + PGC_POSTMASTER, 0, NULL, NULL, NULL); + + DefineCustomIntVariable("pg_wait_sampling.history_size", + "Sets size for ring buffer for waits history in bytes.", NULL, + &HistoryBufferSize, 5000, 100, INT_MAX, + PGC_POSTMASTER, 0, NULL, NULL, NULL); + + DefineCustomIntVariable("pg_wait_sampling.history_period", + "Sets period of waits history sampling in milliseconds.", + "0 disables history populating.", + &HistoryPeriod, 0, 0, INT_MAX, + PGC_SIGHUP, 0, NULL, NULL, NULL); + + DefineCustomIntVariable("pg_wait_sampling.profile_period", + "Sets period of waits profile sampling in milliseconds.", + "0 disables profiling.", + &ProfilePeriod, 10, 0, 
INT_MAX, + PGC_SIGHUP, 0, NULL, NULL, NULL); + + DefineCustomBoolVariable("pg_wait_sampling.profile_pid", + "Sets whether profile should be collected per pid.", NULL, + &WhetherProfilePid, true, + PGC_POSTMASTER, 0, NULL, NULL, NULL); + + DefineCustomBoolVariable("pg_wait_sampling.profile_queries", + "Sets whether profile should be collected per query.", NULL, + &WhetherProfileQueryId, true, + PGC_POSTMASTER, 0, NULL, pgwsEnableQueryId, NULL); } #if PG_VERSION_NUM >= 150000 @@ -294,6 +209,7 @@ pgws_shmem_request(void) prev_shmem_request_hook(); RequestAddinShmemSpace(pgws_shmem_size()); + RequestNamedLWLockTranche("pg_wait_sampling", 2); } #endif @@ -303,48 +219,45 @@ pgws_shmem_request(void) static void pgws_shmem_startup(void) { - bool found; - Size segsize = pgws_shmem_size(); - void *pgws; - shm_toc *toc; + bool found; + HASHCTL info; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); - pgws = ShmemInitStruct("pg_wait_sampling", segsize, &found); + /* Create or attach to the shared memory state */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + pgws_proc_queryids = ShmemInitStruct( + "pg_wait_sampling queryids", + sizeof(pgwsQueryId) * get_max_procs_count(), + &found); + MemSet(pgws_proc_queryids, 0, sizeof(pgwsQueryId) * get_max_procs_count()); if (!found) { - toc = shm_toc_create(PG_WAIT_SAMPLING_MAGIC, pgws, segsize); - - pgws_collector_hdr = shm_toc_allocate(toc, sizeof(CollectorShmqHeader)); - shm_toc_insert(toc, 0, pgws_collector_hdr); - pgws_collector_mq = shm_toc_allocate(toc, COLLECTOR_QUEUE_SIZE); - shm_toc_insert(toc, 1, pgws_collector_mq); - pgws_proc_queryids = shm_toc_allocate(toc, - sizeof(uint64) * get_max_procs_count()); - shm_toc_insert(toc, 2, pgws_proc_queryids); - MemSet(pgws_proc_queryids, 0, sizeof(uint64) * get_max_procs_count()); - - /* Initialize GUC variables in shared memory */ - setup_gucs(); - } - else - { - toc = shm_toc_attach(PG_WAIT_SAMPLING_MAGIC, pgws); + /* First time through ... 
*/ + LWLockPadded *locks = GetNamedLWLockTranche("pg_wait_sampling"); -#if PG_VERSION_NUM >= 100000 - pgws_collector_hdr = shm_toc_lookup(toc, 0, false); - pgws_collector_mq = shm_toc_lookup(toc, 1, false); - pgws_proc_queryids = shm_toc_lookup(toc, 2, false); -#else - pgws_collector_hdr = shm_toc_lookup(toc, 0); - pgws_collector_mq = shm_toc_lookup(toc, 1); - pgws_proc_queryids = shm_toc_lookup(toc, 2); -#endif + pgws_profile_lock = &(locks[0]).lock; + pgws_history_lock = &(locks[1]).lock; } - shmem_initialized = true; + pgws_history_ring = ShmemInitStruct( + "pg_wait_sampling history ring", + sizeof(History) + sizeof(HistoryItem) * HistoryBufferSize, + &found); + pgws_history_ring->index = 0; - if (prev_shmem_startup_hook) - prev_shmem_startup_hook(); + memset(&info, 0, sizeof(info)); + info.keysize = sizeof(ProfileHashKey); + info.entrysize = sizeof(ProfileHashEntry); + pgws_profile_hash = ShmemInitHash("pg_wait_sampling hash", + MaxProfileEntries, MaxProfileEntries, + &info, HASH_ELEM | HASH_BLOBS); + + LWLockRelease(AddinShmemInitLock); + + shmem_initialized = true; } /* @@ -360,14 +273,28 @@ check_shmem(void) } } +/* + * Register background worker for collecting waits history. 
+ */ static void -pgws_cleanup_callback(int code, Datum arg) +pgws_register_wait_collector(void) { - elog(DEBUG3, "pg_wait_sampling cleanup: detaching shm_mq and releasing queue lock"); - shm_mq_detach_compat(recv_mqh, recv_mq); - LockRelease(&queueTag, ExclusiveLock, false); + BackgroundWorker worker; + + /* Set up background worker parameters */ + memset(&worker, 0, sizeof(worker)); + worker.bgw_flags = BGWORKER_SHMEM_ACCESS; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = 1; + worker.bgw_notify_pid = 0; + snprintf(worker.bgw_library_name, BGW_MAXLEN, "pg_wait_sampling"); + snprintf(worker.bgw_function_name, BGW_MAXLEN, CppAsString(pgws_collector_main)); + snprintf(worker.bgw_name, BGW_MAXLEN, "pg_wait_sampling collector"); + worker.bgw_main_arg = (Datum) 0; + RegisterBackgroundWorker(&worker); } + /* * Module load callback */ @@ -377,6 +304,8 @@ _PG_init(void) if (!process_shared_preload_libraries_in_progress) return; + setup_gucs(); + #if PG_VERSION_NUM < 150000 /* * Request additional shared resources. (These are no-ops if we're not in @@ -387,6 +316,7 @@ _PG_init(void) * in pgsp_shmem_request() for pg15 and later. 
*/ RequestAddinShmemSpace(pgws_shmem_size()); + RequestNamedLWLockTranche("pg_wait_sampling", 2); #endif pgws_register_wait_collector(); @@ -555,111 +485,13 @@ pg_wait_sampling_get_current(PG_FUNCTION_ARGS) } } -typedef struct -{ - Size count; - ProfileItem *items; -} Profile; - -void -pgws_init_lock_tag(LOCKTAG *tag, uint32 lock) -{ - tag->locktag_field1 = PG_WAIT_SAMPLING_MAGIC; - tag->locktag_field2 = lock; - tag->locktag_field3 = 0; - tag->locktag_field4 = 0; - tag->locktag_type = LOCKTAG_USERLOCK; - tag->locktag_lockmethodid = USER_LOCKMETHOD; -} - -static void * -receive_array(SHMRequest request, Size item_size, Size *count) -{ - LOCKTAG collectorTag; - shm_mq_result res; - Size len, - i; - void *data; - Pointer result, - ptr; - MemoryContext oldctx; - - /* Ensure nobody else trying to send request to queue */ - pgws_init_lock_tag(&queueTag, PGWS_QUEUE_LOCK); - LockAcquire(&queueTag, ExclusiveLock, false, false); - - pgws_init_lock_tag(&collectorTag, PGWS_COLLECTOR_LOCK); - LockAcquire(&collectorTag, ExclusiveLock, false, false); - recv_mq = shm_mq_create(pgws_collector_mq, COLLECTOR_QUEUE_SIZE); - pgws_collector_hdr->request = request; - LockRelease(&collectorTag, ExclusiveLock, false); - - if (!pgws_collector_hdr->latch) - ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("pg_wait_sampling collector wasn't started"))); - - SetLatch(pgws_collector_hdr->latch); - - shm_mq_set_receiver(recv_mq, MyProc); - - /* - * We switch to TopMemoryContext, so that recv_mqh is allocated there - * and is guaranteed to survive until before_shmem_exit callbacks are - * fired. Anyway, shm_mq_detach() will free handler on its own. - * - * NB: we do not pass `seg` to shm_mq_attach(), so it won't set its own - * callback, i.e. we do not interfere here with shm_mq_detach_callback(). 
- */ - oldctx = MemoryContextSwitchTo(TopMemoryContext); - recv_mqh = shm_mq_attach(recv_mq, NULL, NULL); - MemoryContextSwitchTo(oldctx); - - /* - * Now we surely attached to the shm_mq and got collector's attention. - * If anything went wrong (e.g. Ctrl+C received from the client) we have - * to cleanup some things, i.e. detach from the shm_mq, so collector was - * able to continue responding to other requests. - * - * PG_ENSURE_ERROR_CLEANUP() guaranties that cleanup callback will be - * fired for both ERROR and FATAL. - */ - PG_ENSURE_ERROR_CLEANUP(pgws_cleanup_callback, 0); - { - res = shm_mq_receive(recv_mqh, &len, &data, false); - if (res != SHM_MQ_SUCCESS || len != sizeof(*count)) - elog(ERROR, "error reading mq"); - - memcpy(count, data, sizeof(*count)); - - result = palloc(item_size * (*count)); - ptr = result; - - for (i = 0; i < *count; i++) - { - res = shm_mq_receive(recv_mqh, &len, &data, false); - if (res != SHM_MQ_SUCCESS || len != item_size) - elog(ERROR, "error reading mq"); - - memcpy(ptr, data, item_size); - ptr += item_size; - } - } - PG_END_ENSURE_ERROR_CLEANUP(pgws_cleanup_callback, 0); - - /* We still have to detach and release lock during normal operation. 
*/ - shm_mq_detach_compat(recv_mqh, recv_mq); - LockRelease(&queueTag, ExclusiveLock, false); - - return result; -} - PG_FUNCTION_INFO_V1(pg_wait_sampling_get_profile); Datum pg_wait_sampling_get_profile(PG_FUNCTION_ARGS) { - Profile *profile; - FuncCallContext *funcctx; + ProfileHashEntry *profile; + FuncCallContext *funcctx; check_shmem(); @@ -667,17 +499,31 @@ pg_wait_sampling_get_profile(PG_FUNCTION_ARGS) { MemoryContext oldcontext; TupleDesc tupdesc; + HASH_SEQ_STATUS hash_seq; + ProfileHashEntry *entry; + int profile_count, + entry_index; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - /* Receive profile from shmq */ - profile = (Profile *) palloc0(sizeof(Profile)); - profile->items = (ProfileItem *) receive_array(PROFILE_REQUEST, - sizeof(ProfileItem), &profile->count); + /* Extract profile from shared memory */ + profile_count = hash_get_num_entries(pgws_profile_hash); + profile = (ProfileHashEntry *) + palloc(sizeof(ProfileHashEntry) * profile_count); + entry_index = 0; + LWLockAcquire(pgws_profile_lock, LW_SHARED); + hash_seq_init(&hash_seq, pgws_profile_hash); + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + profile[entry_index++] = *entry; + } + LWLockRelease(pgws_profile_lock); + + /* Build result rows */ funcctx->user_fctx = profile; - funcctx->max_calls = profile->count; + funcctx->max_calls = profile_count; /* Make tuple descriptor */ tupdesc = CreateTemplateTupleDescCompat(5, false); @@ -699,7 +545,7 @@ pg_wait_sampling_get_profile(PG_FUNCTION_ARGS) /* stuff done on every call of the function */ funcctx = SRF_PERCALL_SETUP(); - profile = (Profile *) funcctx->user_fctx; + profile = (ProfileHashEntry *) funcctx->user_fctx; if (funcctx->call_cntr < funcctx->max_calls) { @@ -707,19 +553,22 @@ pg_wait_sampling_get_profile(PG_FUNCTION_ARGS) Datum values[5]; bool nulls[5]; HeapTuple tuple; - ProfileItem *item; + ProfileHashEntry *item; const char *event_type, *event; - item = 
&profile->items[funcctx->call_cntr]; + item = &profile[funcctx->call_cntr]; MemSet(values, 0, sizeof(values)); MemSet(nulls, 0, sizeof(nulls)); /* Make and return next tuple to caller */ - event_type = pgstat_get_wait_event_type(item->wait_event_info); - event = pgstat_get_wait_event(item->wait_event_info); - values[0] = Int32GetDatum(item->pid); + event_type = pgstat_get_wait_event_type(item->key.wait_event_info); + event = pgstat_get_wait_event(item->key.wait_event_info); + if (WhetherProfilePid) + values[0] = Int32GetDatum(item->key.pid); + else + nulls[0] = true; if (event_type) values[1] = PointerGetDatum(cstring_to_text(event_type)); else @@ -729,12 +578,12 @@ pg_wait_sampling_get_profile(PG_FUNCTION_ARGS) else nulls[2] = true; - if (pgws_collector_hdr->profileQueries) - values[3] = UInt64GetDatum(item->queryId); + if (WhetherProfileQueryId) + values[3] = UInt64GetDatum(item->key.queryid); else - values[3] = (Datum) 0; + nulls[3] = true; - values[4] = UInt64GetDatum(item->count); + values[4] = UInt64GetDatum(item->counter); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); @@ -751,22 +600,29 @@ PG_FUNCTION_INFO_V1(pg_wait_sampling_reset_profile); Datum pg_wait_sampling_reset_profile(PG_FUNCTION_ARGS) { - LOCKTAG collectorTag; + HASH_SEQ_STATUS hash_seq; + ProfileHashEntry *entry; check_shmem(); - pgws_init_lock_tag(&queueTag, PGWS_QUEUE_LOCK); - - LockAcquire(&queueTag, ExclusiveLock, false, false); + LWLockAcquire(pgws_profile_lock, LW_EXCLUSIVE); - pgws_init_lock_tag(&collectorTag, PGWS_COLLECTOR_LOCK); - LockAcquire(&collectorTag, ExclusiveLock, false, false); - pgws_collector_hdr->request = PROFILE_RESET; - LockRelease(&collectorTag, ExclusiveLock, false); + /* Remove all profile entries. 
*/ + hash_seq_init(&hash_seq, pgws_profile_hash); + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + hash_search(pgws_profile_hash, &entry->key, HASH_REMOVE, NULL); + } - SetLatch(pgws_collector_hdr->latch); + LWLockRelease(pgws_profile_lock); - LockRelease(&queueTag, ExclusiveLock, false); + /* + * TODO: consider saving of the time of statistics reset to more easily + * compute the differential counters. It might look as global time + * accessible via separate function call as it's done in pg_stat_statements + * or more granular time accounting per profile entries to take into account + * evictions of these entries from restricted by size hashtable. + */ PG_RETURN_VOID(); } @@ -775,7 +631,7 @@ PG_FUNCTION_INFO_V1(pg_wait_sampling_get_history); Datum pg_wait_sampling_get_history(PG_FUNCTION_ARGS) { - History *history; + HistoryItem *history; FuncCallContext *funcctx; check_shmem(); @@ -784,17 +640,25 @@ pg_wait_sampling_get_history(PG_FUNCTION_ARGS) { MemoryContext oldcontext; TupleDesc tupdesc; + int history_size; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - /* Receive history from shmq */ - history = (History *) palloc0(sizeof(History)); - history->items = (HistoryItem *) receive_array(HISTORY_REQUEST, - sizeof(HistoryItem), &history->count); + /* Extract history from shared ring buffer */ + LWLockAcquire(pgws_history_lock, LW_SHARED); + + history_size = pgws_history_ring->index < HistoryBufferSize ?
+ pgws_history_ring->index : HistoryBufferSize; + history = (HistoryItem *) palloc(history_size * sizeof(HistoryItem)); + memcpy(history, pgws_history_ring->items, + history_size * sizeof(HistoryItem)); + LWLockRelease(pgws_history_lock); + + /* Save function context */ funcctx->user_fctx = history; - funcctx->max_calls = history->count; + funcctx->max_calls = history_size; /* Make tuple descriptor */ tupdesc = CreateTemplateTupleDescCompat(5, false); @@ -816,9 +680,9 @@ pg_wait_sampling_get_history(PG_FUNCTION_ARGS) /* stuff done on every call of the function */ funcctx = SRF_PERCALL_SETUP(); - history = (History *) funcctx->user_fctx; + history = (HistoryItem *) funcctx->user_fctx; - if (history->index < history->count) + if (funcctx->call_cntr < funcctx->max_calls) { HeapTuple tuple; HistoryItem *item; @@ -827,7 +691,7 @@ pg_wait_sampling_get_history(PG_FUNCTION_ARGS) const char *event_type, *event; - item = &history->items[history->index]; + item = &history[funcctx->call_cntr]; /* Make and return next tuple to caller */ MemSet(values, 0, sizeof(values)); @@ -849,7 +713,6 @@ pg_wait_sampling_get_history(PG_FUNCTION_ARGS) values[4] = UInt64GetDatum(item->queryId); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); - history->index++; SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); } else @@ -875,20 +738,11 @@ pgws_planner_hook(Query *parse, if (MyProc) { int i = MyProc - ProcGlobal->allProcs; -#if PG_VERSION_NUM >= 110000 - /* - * since we depend on queryId we need to check that its size - * is uint64 as we coded in pg_wait_sampling - */ - StaticAssertExpr(sizeof(parse->queryId) == sizeof(uint64), - "queryId size is not uint64"); -#else - StaticAssertExpr(sizeof(parse->queryId) == sizeof(uint32), - "queryId size is not uint32"); -#endif + + StaticAssertExpr(sizeof(parse->queryId) == sizeof(pgwsQueryId), + "queryId size is not correct"); if (!pgws_proc_queryids[i]) pgws_proc_queryids[i] = parse->queryId; - } /* Invoke original hook if needed */ diff 
--git a/pg_wait_sampling.h b/pg_wait_sampling.h index 29425fc..56d1f91 100644 --- a/pg_wait_sampling.h +++ b/pg_wait_sampling.h @@ -17,67 +17,60 @@ #error "You are trying to build pg_wait_sampling with PostgreSQL version lower than 9.6. Please, check you environment." #endif -#include "storage/proc.h" -#include "storage/shm_mq.h" #include "utils/timestamp.h" #define PG_WAIT_SAMPLING_MAGIC 0xCA94B107 -#define COLLECTOR_QUEUE_SIZE (16 * 1024) -#define HISTORY_TIME_MULTIPLIER 10 -#define PGWS_QUEUE_LOCK 0 -#define PGWS_COLLECTOR_LOCK 1 typedef struct { uint32 pid; uint32 wait_event_info; - uint64 queryId; - uint64 count; -} ProfileItem; - -typedef struct -{ - uint32 pid; - uint32 wait_event_info; - uint64 queryId; + pgwsQueryId queryId; TimestampTz ts; } HistoryItem; typedef struct { - bool wraparound; - Size index; - Size count; - HistoryItem *items; + Size index; + HistoryItem items[FLEXIBLE_ARRAY_MEMBER]; } History; -typedef enum +/* + * Hashtable key that defines the identity of a hashtable entry + */ +typedef struct { - NO_REQUEST, - HISTORY_REQUEST, - PROFILE_REQUEST, - PROFILE_RESET -} SHMRequest; + int32 pid; /* pid of observable process */ + uint32 wait_event_info;/* proc's wait information */ + pgwsQueryId queryid; /* query identifier */ +} ProfileHashKey; +/* + * Wait statistics entry + */ typedef struct { - Latch *latch; - SHMRequest request; - int historySize; - int historyPeriod; - int profilePeriod; - bool profilePid; - bool profileQueries; -} CollectorShmqHeader; + ProfileHashKey key; /* hash key of entry - MUST BE FIRST */ + int64 counter; /* cumulative counter for this entry */ + double usage; /* usage factor */ +} ProfileHashEntry; /* pg_wait_sampling.c */ -extern CollectorShmqHeader *pgws_collector_hdr; -extern shm_mq *pgws_collector_mq; -extern uint64 *pgws_proc_queryids; -extern void pgws_init_lock_tag(LOCKTAG *tag, uint32 lock); +extern pgwsQueryId *pgws_proc_queryids; +extern HTAB *pgws_profile_hash; +extern LWLock *pgws_profile_lock; +extern
History *pgws_history_ring; +extern LWLock *pgws_history_lock; + +/* global settings */ +extern int MaxProfileEntries; +extern int HistoryBufferSize; +extern int HistoryPeriod; +extern int ProfilePeriod; +extern bool WhetherProfilePid; +extern bool WhetherProfileQueryId; /* collector.c */ -extern void pgws_register_wait_collector(void); extern PGDLLEXPORT void pgws_collector_main(Datum main_arg); #endif