diff --git a/.github/workflows/mamonsu-tests-dev.yml b/.github/workflows/mamonsu-tests-dev.yml index 0336b7c..2427a7d 100644 --- a/.github/workflows/mamonsu-tests-dev.yml +++ b/.github/workflows/mamonsu-tests-dev.yml @@ -86,7 +86,7 @@ jobs: echo "zabbix_address=$(hostname -I | awk '{print $1}')" >> $GITHUB_OUTPUT id: zabbix_address - name: Edit Zabbix address in agent.conf - run: sed -i "s/\(address *= *\).*/\1 ${{ steps.zabbix_address.outputs.zabbix_address }}/" ${{ env.MAMONSU_PATH }}/github-actions-tests/sources/agent_3.5.12.conf + run: sed -i "s/\(address *= *\).*/\1 ${{ steps.zabbix_address.outputs.zabbix_address }}/" ${{ env.MAMONSU_PATH }}/github-actions-tests/sources/agent_3.5.13.conf - name: Copy test scripts to container run: docker exec $( echo "${{ matrix.docker_os }}" | sed 's/://' | sed 's/\.//' ) mkdir -p -m 755 /mamonsu/ diff --git a/.github/workflows/mamonsu-tests-master.yml b/.github/workflows/mamonsu-tests-master.yml index 6dfb86d..e14042c 100644 --- a/.github/workflows/mamonsu-tests-master.yml +++ b/.github/workflows/mamonsu-tests-master.yml @@ -91,7 +91,7 @@ jobs: echo "zabbix_address=$(hostname -I | awk '{print $1}')" >> $GITHUB_OUTPUT id: zabbix_address - name: Edit Zabbix address in agent.conf - run: sed -i "s/\(address *= *\).*/\1 ${{ steps.zabbix_address.outputs.zabbix_address }}/" ${{ env.MAMONSU_PATH }}/github-actions-tests/sources/agent_3.5.12.conf + run: sed -i "s/\(address *= *\).*/\1 ${{ steps.zabbix_address.outputs.zabbix_address }}/" ${{ env.MAMONSU_PATH }}/github-actions-tests/sources/agent_3.5.13.conf - name: Copy test scripts to container run: docker exec $( echo "${{ matrix.docker_os }}" | sed 's/://' | sed 's/\.//' ) mkdir -p -m 755 /mamonsu/ diff --git a/README.md b/README.md index a4f3076..fd6b6bf 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ Pre-built _mamonsu_ packages are provided in official Postgres Pro repository: [ **Install from repository using script:** ```shell -$ wget https://repo.postgrespro.ru/mamonsu/keys/pgpro-repo-add.sh +$ wget https://repo.postgrespro.ru/mamonsu/mamonsu/keys/pgpro-repo-add.sh $ sudo chmod 700 ./pgpro-repo-add.sh $ sudo ./pgpro-repo-add.sh ``` diff --git a/documentation/metrics.md b/documentation/metrics.md index 15ef4a1..d507a8e 100644 --- a/documentation/metrics.md +++ b/documentation/metrics.md @@ -3691,7 +3691,8 @@ Default config: ### Replication Default config: -        lag_more_than_in_sec = 300 +        lag_more_than_in_sec = 300\ +        critical_bytes_held_by_non_active_slot = 1073741824 bytes ### Items @@ -3763,6 +3764,37 @@ Default config: *Non-active Replication Slots* calculates as count of slots with `false` active status. +- **Bytes Held By Non-active Replication Slots** + + Zabbix item: + + + + + + + + + + + + + + + + + + + + + + + + + +
NamePostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}
Keypgsql.replication.non_active_slots_discovery[]
TypeNumeric (float)
UnitsBytes
DeltaAs Is
Supported Version10+
+ + *Non-active Replication Slots* calculates as count of slots with `false` active status. - **Streaming Replication Lag** @@ -3861,12 +3893,40 @@ Default config: +- **PostgreSQL Replication: Non-active Slots Discovery** + + Items: + + + + + + + + + + + + + + + + + + + + + +
NamePostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}
Keypgsql.replication.non_active_slots_discovery[]
TypeNumeric (float)
UnitsBytes
DeltaAs Is
+ ### Triggers - **PostgreSQL Instance: server mode has been changed on {HOSTNAME} to {ITEM.LASTVALUE}** - **PostgreSQL number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})** - + Disabled by default +- **PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})** + Triggers if *PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}* exceeds `critical_bytes_held_by_non_active_slot`. - **PostgreSQL streaming lag too high on {HOSTNAME} (value={ITEM.LASTVALUE})** Triggers if *PostgreSQL Replication: Streaming Replication Lag* exceeds `lag_more_than_in_sec`. diff --git a/github-actions-tests/mamonsu_build.sh b/github-actions-tests/mamonsu_build.sh index a766806..6c24eb9 100644 --- a/github-actions-tests/mamonsu_build.sh +++ b/github-actions-tests/mamonsu_build.sh @@ -41,7 +41,7 @@ if [ "${OS%:*}" = "centos" ]; then python3 setup.py build && python3 setup.py install make rpm sudo rpm -i ./mamonsu*.rpm - cat /mamonsu/github-actions-tests/sources/agent_3.5.12.conf > /etc/mamonsu/agent.conf + cat /mamonsu/github-actions-tests/sources/agent_3.5.13.conf > /etc/mamonsu/agent.conf # ensuring mamonsu can actually start sudo su -s /bin/bash -c "mamonsu bootstrap -x --user postgres -d mamonsu_test_db" mamonsu /etc/init.d/mamonsu restart @@ -65,7 +65,7 @@ elif [ "${OS%:*}" = "ubuntu" ]; then python3 setup.py build && python3 setup.py install make deb sudo dpkg -i ./mamonsu*.deb - cat /mamonsu/github-actions-tests/sources/agent_3.5.12.conf > /etc/mamonsu/agent.conf + cat /mamonsu/github-actions-tests/sources/agent_3.5.13.conf > /etc/mamonsu/agent.conf # ensuring mamonsu can actually start sudo su -s /bin/bash -c "mamonsu bootstrap -x --user postgres -d mamonsu_test_db" mamonsu service mamonsu restart diff --git a/github-actions-tests/sources/agent_3.5.12.conf b/github-actions-tests/sources/agent_3.5.13.conf similarity index 100% rename from github-actions-tests/sources/agent_3.5.12.conf rename to github-actions-tests/sources/agent_3.5.13.conf diff --git a/mamonsu/__init__.py b/mamonsu/__init__.py index b43c491..9264cb8 100644 --- a/mamonsu/__init__.py +++ b/mamonsu/__init__.py @@ -1,7 +1,7 @@ __author__ = 'Dmitry Vasilyev' __author_email__ = 'info@postgrespro.ru' __description__ = 'Monitoring agent for PostgreSQL' -__version__ = '3.5.12' +__version__ = '3.5.13' __licence__ = 'BSD' __url__ = 'https://github.com/postgrespro/mamonsu' diff --git a/mamonsu/lib/default_config.py b/mamonsu/lib/default_config.py index c7f2d98..12791a1 100644 --- a/mamonsu/lib/default_config.py +++ b/mamonsu/lib/default_config.py @@ -35,6 +35,8 @@ def default_host(): host = os.environ.get('PGHOST') or 'auto' if platform.FREEBSD: host = os.environ.get('PGHOST') or 'auto' + if platform.DARWIN: + host = os.environ.get('PGHOST') or 'auto' return host @staticmethod diff --git a/mamonsu/lib/platform.py b/mamonsu/lib/platform.py index 5ea5faa..279200d 100644 --- a/mamonsu/lib/platform.py +++ b/mamonsu/lib/platform.py @@ -3,5 +3,6 @@ LINUX = (sys.platform == 'linux' or sys.platform == 'linux2') WINDOWS = (sys.platform == 'win32' or sys.platform == 'win64') FREEBSD = ('freebsd' in sys.platform) +DARWIN = sys.platform == 'darwin' UNIX = LINUX or FREEBSD INTEGER_TYPES = int, diff --git a/mamonsu/lib/queue.py b/mamonsu/lib/queue.py index 96ceadf..e348fc4 100644 --- a/mamonsu/lib/queue.py +++ b/mamonsu/lib/queue.py @@ -10,25 +10,21 @@ def __init__(self): self.lock = threading.Lock() def add(self, metric): - self.lock.acquire() - self.queue.insert(0, metric) - self.lock.release() + with self.lock: + self.queue.insert(0, metric) # replace last metric def replace(self, metric): - self.lock.acquire() - self.queue.pop() - self.queue.append(metric) - self.lock.release() + with self.lock: + if self.queue: + self.queue.pop() + self.queue.append(metric) def size(self): - self.lock.acquire() - result = len(self.queue) - self.lock.release() - return result + with self.lock: + return len(self.queue) def flush(self): - self.lock.acquire() - result, self.queue = self.queue, [] - self.lock.release() - return result + with self.lock: + result, self.queue = self.queue, [] + return result diff --git a/mamonsu/plugins/pgsql/driver/pool.py b/mamonsu/plugins/pgsql/driver/pool.py index 6576f92..a8433d9 100644 --- a/mamonsu/plugins/pgsql/driver/pool.py +++ b/mamonsu/plugins/pgsql/driver/pool.py @@ -86,7 +86,7 @@ class Pool(object): """ SELECT application_name, {0} - coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_lsn))::int, 0) AS total_lag + coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_{2}))::int, 0) AS total_lag FROM pg_stat_replication; """, """ @@ -95,6 +95,30 @@ class Pool(object): total_lag FROM mamonsu.count_{1}_lag_lsn(); """ + ), + "wal_held_bytes_master": ( + """ + SELECT slot_name, + coalesce((pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn))::int, 0) AS wal_held_bytes + FROM pg_replication_slots; + """, + """ + SELECT slot_name, + wal_held_bytes + FROM mamonsu.bytes_held_by_inactive_slot_on_master(); + """ + ), + "wal_held_bytes_replica": ( + """ + SELECT slot_name, + coalesce((pg_wal_lsn_diff(pg_last_wal_replay_lsn(), restart_lsn))::int, 0) AS wal_held_bytes + FROM pg_replication_slots; + """, + """ + SELECT slot_name, + wal_held_bytes + FROM mamonsu.bytes_held_by_inactive_slot_on_replica(); + """ ) } diff --git a/mamonsu/plugins/pgsql/replication.py b/mamonsu/plugins/pgsql/replication.py index 8a51889..7ed701c 100644 --- a/mamonsu/plugins/pgsql/replication.py +++ b/mamonsu/plugins/pgsql/replication.py @@ -13,7 +13,8 @@ class Replication(Plugin): AgentPluginType = "pg" # key: (macro, value) plugin_macros = { - "critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)] + "critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)], + "critical_bytes_held_by_none_active_slot": [("macro", "{$CRITICAL_BYTES_HELD_BY_NON_ACTIVE_SLOT}"), ("value", 1024 * 1024 * 1024)] } # get time of replication lag @@ -30,8 +31,15 @@ class Replication(Plugin): WHERE active = 'false'; """ + query_bytes_held_by_non_active_slot = """ + SELECT slot_name, coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::bigint, 0) AS wal_size_bytes + FROM pg_replication_slots + WHERE active = 'false'; + """ + # for discovery rule for name of each replica key_lsn_replication_discovery = "pgsql.replication.discovery{0}" + key_replication_non_active_slots_discovery = "pgsql.replication.non_active_slots_discovery{0}" key_total_lag = "pgsql.replication.total_lag{0}" # for PG 10 and higher key_flush = "pgsql.replication.flush_lag{0}" @@ -42,6 +50,7 @@ class Replication(Plugin): key_replication = "pgsql.replication_lag{0}" key_non_active_slots = "pgsql.replication.non_active_slots{0}" + key_non_active_slots_held_bytes = "pgsql.replication.non_active_slots_held_bytes{0}" def run(self, zbx): @@ -79,6 +88,14 @@ def run(self, zbx): zbx.send("pgsql.replication.replay_lag[{0}]".format(info[0]), float(info[5])) zbx.send("pgsql.replication.discovery[]", zbx.json({"data": lags})) del lags + bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_master", args=[]) + if bytes_held_by_non_active_slot: + discovery = [] + for info in bytes_held_by_non_active_slot: + discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]}) + zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1])) + zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery})) + del discovery elif Pooler.is_superuser() or Pooler.is_bootstraped(): result_lags = Pooler.run_sql_type("wal_lag_lsn", args=[" ", "xlog", "location"]) if result_lags: @@ -90,7 +107,15 @@ def run(self, zbx): del lags else: self.disable_and_exit_if_not_superuser() - + else: + bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_replica", args=[]) + if bytes_held_by_non_active_slot: + discovery = [] + for info in bytes_held_by_non_active_slot: + discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]}) + zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1])) + zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery})) + del discovery non_active_slots = Pooler.query(self.query_non_active_slots) zbx.send(self.key_non_active_slots.format("[]"), int(non_active_slots[0][0])) @@ -132,7 +157,8 @@ def triggers(self, template, dashboard=False): }) + template.trigger({ "name": "PostgreSQL Replication: number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})", "expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots) + ".last()}>" + str( - NUMBER_NON_ACTIVE_SLOTS) + NUMBER_NON_ACTIVE_SLOTS), + "status": 1 }) return triggers @@ -198,7 +224,42 @@ def discovery_rules(self, template, dashboard=False): ] } ] - return template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs) + active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs) + + rule = { + "name": "PostgreSQL Replication: Non Active Slots Discovery", + "key": self.key_replication_non_active_slots_discovery.format("[{0}]".format(self.Macros[self.Type])) + } + if Plugin.old_zabbix: + conditions = [] + rule["filter"] = "{#NON_ACTIVE_SLOT_NAME}:.*" + else: + conditions = [{ + "condition": [ + {"macro": "{#NON_ACTIVE_SLOT_NAME}", + "value": ".*", + "operator": 8, + "formulaid": "A"} + ] + }] + items = [ + {"key": self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},"), + "name": "PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}", + "value_type": Plugin.VALUE_TYPE.numeric_float, + "delay": self.plugin_config("interval"), + "drawtype": 2} + ] + graphs = [] + triggers = [ + { + "name": "PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})", + "expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},") + ".last()}>" + + self.plugin_macros["critical_bytes_held_by_none_active_slot"][0][1] + } + ] + non_active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs, triggers=triggers) + + return active_slots_discovery_rule + non_active_slots_discovery_rule def keys_and_queries(self, template_zabbix): result = [] diff --git a/mamonsu/plugins/system/linux/disk_sizes.py b/mamonsu/plugins/system/linux/disk_sizes.py index 898c2c0..d461812 100644 --- a/mamonsu/plugins/system/linux/disk_sizes.py +++ b/mamonsu/plugins/system/linux/disk_sizes.py @@ -20,7 +20,7 @@ class DiskSizes(Plugin): ExcludeFsTypes = [ "none", "unknown", "rootfs", "iso9660", "squashfs", "udf", "romfs", "ramfs", "debugfs", "cgroup", "cgroup_root", - "pstore", "devtmpfs", "autofs", "cgroup", "configfs", "devpts", "efivarfs", "fusectl", "fuse.gvfsd-fuse", + "pstore", "devtmpfs", "autofs", "cgroup2", "configfs", "devpts", "efivarfs", "fusectl", "fuse.gvfsd-fuse", "hugetlbfs", "mqueue", "binfmt_misc", "nfsd", "proc", "pstore", "selinuxfs", "rpc_pipefs", "securityfs", "sysfs", "nsfs", "tmpfs", "tracefs" ] diff --git a/mamonsu/tools/bootstrap/sql.py b/mamonsu/tools/bootstrap/sql.py index f37be0f..bf99442 100644 --- a/mamonsu/tools/bootstrap/sql.py +++ b/mamonsu/tools/bootstrap/sql.py @@ -236,6 +236,23 @@ coalesce((pg_{7}_diff(pg_current_{7}(), replay_{9}))::bigint, 0) AS total_lag FROM pg_stat_replication $$ LANGUAGE SQL SECURITY DEFINER; + +DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_master(); +CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_master() +RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$ +SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_current_wal_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes +FROM pg_replication_slots +WHERE active = 'false' +$$ LANGUAGE SQL SECURITY DEFINER; + +DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_replica(); +CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_replica() +RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$ +SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_last_wal_replay_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes +FROM pg_replication_slots +WHERE active = 'false' +$$ LANGUAGE SQL SECURITY DEFINER; + """ CreatePgBuffercacheFunctionsSQL = """ diff --git a/packaging/debian/changelog b/packaging/debian/changelog index 218931a..6efa097 100644 --- a/packaging/debian/changelog +++ b/packaging/debian/changelog @@ -1,3 +1,9 @@ +mamonsu (3.5.13-1) stable; urgency=low + * Added a new metric that displays the bytes held by non-active replication slots, along with the corresponding trigger.; + * Set the trigger for 'number of non-active replication slots' to be disabled by default.; + * Fixed the Linux plugin to ensure compatibility with recent Linux versions that use cgroups2.; + * Resolved a deadlock issue in the send queue that caused Mamonsu to hang after network problems.; + mamonsu (3.5.12-1) stable; urgency=low * Port version parser code from public archive of pypa/pkg_resources; * Thread-safe implementation of connection cache; diff --git a/packaging/rpm/SPECS/mamonsu.spec b/packaging/rpm/SPECS/mamonsu.spec index dcc7c9f..dcfd2bd 100644 --- a/packaging/rpm/SPECS/mamonsu.spec +++ b/packaging/rpm/SPECS/mamonsu.spec @@ -1,5 +1,5 @@ Name: mamonsu -Version: 3.5.12 +Version: 3.5.13 Release: 1%{?dist} Summary: Monitoring agent for PostgreSQL Group: Applications/Internet @@ -73,6 +73,12 @@ chown -R mamonsu:mamonsu /var/log/mamonsu chown -R mamonsu:mamonsu /etc/mamonsu %changelog +* Thu May 29 2025 Andrey Papsuyko - 3.5.13-1 + - Added a new metric that displays the bytes held by non-active replication slots, along with the corresponding trigger.; + - Set the trigger for 'number of non-active replication slots' to be disabled by default.; + - Fixed the Linux plugin to ensure compatibility with recent Linux versions that use cgroups2.; + - Resolved a deadlock issue in the send queue that caused Mamonsu to hang after network problems.; + * Wed Mar 5 2025 Maxim Styushin - 3.5.12-1 - Port version parser code from public archive of pypa/pkg_resources; - Thread-safe implementation of connection cache; diff --git a/packaging/win/mamonsu.def.nsh b/packaging/win/mamonsu.def.nsh index 1b60f1c..5afbfdc 100644 --- a/packaging/win/mamonsu.def.nsh +++ b/packaging/win/mamonsu.def.nsh @@ -1,5 +1,5 @@ !define NAME Mamonsu -!define VERSION 3.5.12 +!define VERSION 3.5.13 !define MAMONSU_REG_PATH "Software\PostgresPro\Mamonsu" !define MAMONSU_REG_UNINSTALLER_PATH "Software\Microsoft\Windows\CurrentVersion\Uninstall" !define EDB_REG "SOFTWARE\Postgresql"