diff --git a/.github/workflows/mamonsu-tests-dev.yml b/.github/workflows/mamonsu-tests-dev.yml
index 0336b7c..2427a7d 100644
--- a/.github/workflows/mamonsu-tests-dev.yml
+++ b/.github/workflows/mamonsu-tests-dev.yml
@@ -86,7 +86,7 @@ jobs:
echo "zabbix_address=$(hostname -I | awk '{print $1}')" >> $GITHUB_OUTPUT
id: zabbix_address
- name: Edit Zabbix address in agent.conf
- run: sed -i "s/\(address *= *\).*/\1 ${{ steps.zabbix_address.outputs.zabbix_address }}/" ${{ env.MAMONSU_PATH }}/github-actions-tests/sources/agent_3.5.12.conf
+ run: sed -i "s/\(address *= *\).*/\1 ${{ steps.zabbix_address.outputs.zabbix_address }}/" ${{ env.MAMONSU_PATH }}/github-actions-tests/sources/agent_3.5.13.conf
- name: Copy test scripts to container
run: docker exec $( echo "${{ matrix.docker_os }}" | sed 's/://' | sed 's/\.//' ) mkdir -p -m 755 /mamonsu/
diff --git a/.github/workflows/mamonsu-tests-master.yml b/.github/workflows/mamonsu-tests-master.yml
index 6dfb86d..e14042c 100644
--- a/.github/workflows/mamonsu-tests-master.yml
+++ b/.github/workflows/mamonsu-tests-master.yml
@@ -91,7 +91,7 @@ jobs:
echo "zabbix_address=$(hostname -I | awk '{print $1}')" >> $GITHUB_OUTPUT
id: zabbix_address
- name: Edit Zabbix address in agent.conf
- run: sed -i "s/\(address *= *\).*/\1 ${{ steps.zabbix_address.outputs.zabbix_address }}/" ${{ env.MAMONSU_PATH }}/github-actions-tests/sources/agent_3.5.12.conf
+ run: sed -i "s/\(address *= *\).*/\1 ${{ steps.zabbix_address.outputs.zabbix_address }}/" ${{ env.MAMONSU_PATH }}/github-actions-tests/sources/agent_3.5.13.conf
- name: Copy test scripts to container
run: docker exec $( echo "${{ matrix.docker_os }}" | sed 's/://' | sed 's/\.//' ) mkdir -p -m 755 /mamonsu/
diff --git a/README.md b/README.md
index a4f3076..fd6b6bf 100644
--- a/README.md
+++ b/README.md
@@ -179,7 +179,7 @@ Pre-built _mamonsu_ packages are provided in official Postgres Pro repository: [
**Install from repository using script:**
```shell
-$ wget https://repo.postgrespro.ru/mamonsu/keys/pgpro-repo-add.sh
+$ wget https://repo.postgrespro.ru/mamonsu/mamonsu/keys/pgpro-repo-add.sh
$ sudo chmod 700 ./pgpro-repo-add.sh
$ sudo ./pgpro-repo-add.sh
```
diff --git a/documentation/metrics.md b/documentation/metrics.md
index 15ef4a1..d507a8e 100644
--- a/documentation/metrics.md
+++ b/documentation/metrics.md
@@ -3691,7 +3691,8 @@ Default config:
### Replication
Default config:
- lag_more_than_in_sec = 300
+ lag_more_than_in_sec = 300\
+ critical_bytes_held_by_non_active_slot = 1073741824 bytes
### Items
@@ -3763,6 +3764,37 @@ Default config:
*Non-active Replication Slots* calculates as count of slots with `false` active status.
+- **Bytes Held By Non-active Replication Slots**
+
+ Zabbix item:
+
+
+ Name |
+ PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME} |
+
+
+ Key |
+ pgsql.replication.non_active_slots_held_bytes[{#NON_ACTIVE_SLOT_NAME}] |
+
+
+ Type |
+ Numeric (float) |
+
+
+ Units |
+ Bytes |
+
+
+ Delta |
+ As Is |
+
+
+ Supported Version |
+ 10+ |
+
+
+
+ *Bytes Held By Non-active Replication Slots* is calculated as the amount of WAL, in bytes, retained by each replication slot with `false` active status.
- **Streaming Replication Lag**
@@ -3861,12 +3893,40 @@ Default config:
+- **PostgreSQL Replication: Non-active Slots Discovery**
+
+ Items:
+
+
+ Name |
+ PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME} |
+
+
+ Key |
+ pgsql.replication.non_active_slots_held_bytes[{#NON_ACTIVE_SLOT_NAME}] |
+
+
+ Type |
+ Numeric (float) |
+
+
+ Units |
+ Bytes |
+
+
+ Delta |
+ As Is |
+
+
+
### Triggers
- **PostgreSQL Instance: server mode has been changed on {HOSTNAME} to {ITEM.LASTVALUE}**
- **PostgreSQL number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})**
-
+ Disabled by default
+- **PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})**
+ Triggers if *PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}* exceeds `critical_bytes_held_by_non_active_slot`.
- **PostgreSQL streaming lag too high on {HOSTNAME} (value={ITEM.LASTVALUE})**
Triggers if *PostgreSQL Replication: Streaming Replication Lag* exceeds `lag_more_than_in_sec`.
diff --git a/github-actions-tests/mamonsu_build.sh b/github-actions-tests/mamonsu_build.sh
index a766806..6c24eb9 100644
--- a/github-actions-tests/mamonsu_build.sh
+++ b/github-actions-tests/mamonsu_build.sh
@@ -41,7 +41,7 @@ if [ "${OS%:*}" = "centos" ]; then
python3 setup.py build && python3 setup.py install
make rpm
sudo rpm -i ./mamonsu*.rpm
- cat /mamonsu/github-actions-tests/sources/agent_3.5.12.conf > /etc/mamonsu/agent.conf
+ cat /mamonsu/github-actions-tests/sources/agent_3.5.13.conf > /etc/mamonsu/agent.conf
# ensuring mamonsu can actually start
sudo su -s /bin/bash -c "mamonsu bootstrap -x --user postgres -d mamonsu_test_db" mamonsu
/etc/init.d/mamonsu restart
@@ -65,7 +65,7 @@ elif [ "${OS%:*}" = "ubuntu" ]; then
python3 setup.py build && python3 setup.py install
make deb
sudo dpkg -i ./mamonsu*.deb
- cat /mamonsu/github-actions-tests/sources/agent_3.5.12.conf > /etc/mamonsu/agent.conf
+ cat /mamonsu/github-actions-tests/sources/agent_3.5.13.conf > /etc/mamonsu/agent.conf
# ensuring mamonsu can actually start
sudo su -s /bin/bash -c "mamonsu bootstrap -x --user postgres -d mamonsu_test_db" mamonsu
service mamonsu restart
diff --git a/github-actions-tests/sources/agent_3.5.12.conf b/github-actions-tests/sources/agent_3.5.13.conf
similarity index 100%
rename from github-actions-tests/sources/agent_3.5.12.conf
rename to github-actions-tests/sources/agent_3.5.13.conf
diff --git a/mamonsu/__init__.py b/mamonsu/__init__.py
index b43c491..9264cb8 100644
--- a/mamonsu/__init__.py
+++ b/mamonsu/__init__.py
@@ -1,7 +1,7 @@
__author__ = 'Dmitry Vasilyev'
__author_email__ = 'info@postgrespro.ru'
__description__ = 'Monitoring agent for PostgreSQL'
-__version__ = '3.5.12'
+__version__ = '3.5.13'
__licence__ = 'BSD'
__url__ = 'https://github.com/postgrespro/mamonsu'
diff --git a/mamonsu/lib/default_config.py b/mamonsu/lib/default_config.py
index c7f2d98..12791a1 100644
--- a/mamonsu/lib/default_config.py
+++ b/mamonsu/lib/default_config.py
@@ -35,6 +35,8 @@ def default_host():
host = os.environ.get('PGHOST') or 'auto'
if platform.FREEBSD:
host = os.environ.get('PGHOST') or 'auto'
+ if platform.DARWIN:
+ host = os.environ.get('PGHOST') or 'auto'
return host
@staticmethod
diff --git a/mamonsu/lib/platform.py b/mamonsu/lib/platform.py
index 5ea5faa..279200d 100644
--- a/mamonsu/lib/platform.py
+++ b/mamonsu/lib/platform.py
@@ -3,5 +3,6 @@
LINUX = (sys.platform == 'linux' or sys.platform == 'linux2')
WINDOWS = (sys.platform == 'win32' or sys.platform == 'win64')
FREEBSD = ('freebsd' in sys.platform)
+DARWIN = sys.platform == 'darwin'
UNIX = LINUX or FREEBSD
INTEGER_TYPES = int,
diff --git a/mamonsu/lib/queue.py b/mamonsu/lib/queue.py
index 96ceadf..e348fc4 100644
--- a/mamonsu/lib/queue.py
+++ b/mamonsu/lib/queue.py
@@ -10,25 +10,21 @@ def __init__(self):
self.lock = threading.Lock()
def add(self, metric):
- self.lock.acquire()
- self.queue.insert(0, metric)
- self.lock.release()
+ with self.lock:
+ self.queue.insert(0, metric)
# replace last metric
def replace(self, metric):
- self.lock.acquire()
- self.queue.pop()
- self.queue.append(metric)
- self.lock.release()
+ with self.lock:
+ if self.queue:
+ self.queue.pop()
+ self.queue.append(metric)
def size(self):
- self.lock.acquire()
- result = len(self.queue)
- self.lock.release()
- return result
+ with self.lock:
+ return len(self.queue)
def flush(self):
- self.lock.acquire()
- result, self.queue = self.queue, []
- self.lock.release()
- return result
+ with self.lock:
+ result, self.queue = self.queue, []
+ return result
diff --git a/mamonsu/plugins/pgsql/driver/pool.py b/mamonsu/plugins/pgsql/driver/pool.py
index 6576f92..a8433d9 100644
--- a/mamonsu/plugins/pgsql/driver/pool.py
+++ b/mamonsu/plugins/pgsql/driver/pool.py
@@ -86,7 +86,7 @@ class Pool(object):
"""
SELECT application_name,
{0}
- coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_lsn))::int, 0) AS total_lag
+ coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_{2}))::int, 0) AS total_lag
FROM pg_stat_replication;
""",
"""
@@ -95,6 +95,30 @@ class Pool(object):
total_lag
FROM mamonsu.count_{1}_lag_lsn();
"""
+ ),
+ "wal_held_bytes_master": (
+ """
+ SELECT slot_name, coalesce((pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes
+ FROM pg_replication_slots
+ WHERE active = 'false';
+ """,
+ """
+ SELECT slot_name,
+ wal_held_bytes
+ FROM mamonsu.bytes_held_by_inactive_slot_on_master();
+ """
+ ),
+ "wal_held_bytes_replica": (
+ """
+ SELECT slot_name, coalesce((pg_wal_lsn_diff(pg_last_wal_replay_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes
+ FROM pg_replication_slots
+ WHERE active = 'false';
+ """,
+ """
+ SELECT slot_name,
+ wal_held_bytes
+ FROM mamonsu.bytes_held_by_inactive_slot_on_replica();
+ """
)
}
diff --git a/mamonsu/plugins/pgsql/replication.py b/mamonsu/plugins/pgsql/replication.py
index 8a51889..7ed701c 100644
--- a/mamonsu/plugins/pgsql/replication.py
+++ b/mamonsu/plugins/pgsql/replication.py
@@ -13,7 +13,8 @@ class Replication(Plugin):
AgentPluginType = "pg"
# key: (macro, value)
plugin_macros = {
- "critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)]
+ "critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)],
+ "critical_bytes_held_by_none_active_slot": [("macro", "{$CRITICAL_BYTES_HELD_BY_NON_ACTIVE_SLOT}"), ("value", 1024 * 1024 * 1024)]
}
# get time of replication lag
@@ -30,8 +31,15 @@ class Replication(Plugin):
WHERE active = 'false';
"""
+ query_bytes_held_by_non_active_slot = """
+ SELECT slot_name, coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::bigint, 0) AS wal_size_bytes
+ FROM pg_replication_slots
+ WHERE active = 'false';
+ """
+
# for discovery rule for name of each replica
key_lsn_replication_discovery = "pgsql.replication.discovery{0}"
+ key_replication_non_active_slots_discovery = "pgsql.replication.non_active_slots_discovery{0}"
key_total_lag = "pgsql.replication.total_lag{0}"
# for PG 10 and higher
key_flush = "pgsql.replication.flush_lag{0}"
@@ -42,6 +50,7 @@ class Replication(Plugin):
key_replication = "pgsql.replication_lag{0}"
key_non_active_slots = "pgsql.replication.non_active_slots{0}"
+ key_non_active_slots_held_bytes = "pgsql.replication.non_active_slots_held_bytes{0}"
def run(self, zbx):
@@ -79,6 +88,14 @@ def run(self, zbx):
zbx.send("pgsql.replication.replay_lag[{0}]".format(info[0]), float(info[5]))
zbx.send("pgsql.replication.discovery[]", zbx.json({"data": lags}))
del lags
+ bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_master", args=[])
+ if bytes_held_by_non_active_slot:
+ discovery = []
+ for info in bytes_held_by_non_active_slot:
+ discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]})
+ zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1]))
+ zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery}))
+ del discovery
elif Pooler.is_superuser() or Pooler.is_bootstraped():
result_lags = Pooler.run_sql_type("wal_lag_lsn", args=[" ", "xlog", "location"])
if result_lags:
@@ -90,7 +107,15 @@ def run(self, zbx):
del lags
else:
self.disable_and_exit_if_not_superuser()
-
+ else:
+ bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_replica", args=[])
+ if bytes_held_by_non_active_slot:
+ discovery = []
+ for info in bytes_held_by_non_active_slot:
+ discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]})
+ zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1]))
+ zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery}))
+ del discovery
non_active_slots = Pooler.query(self.query_non_active_slots)
zbx.send(self.key_non_active_slots.format("[]"), int(non_active_slots[0][0]))
@@ -132,7 +157,8 @@ def triggers(self, template, dashboard=False):
}) + template.trigger({
"name": "PostgreSQL Replication: number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})",
"expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots) + ".last()}>" + str(
- NUMBER_NON_ACTIVE_SLOTS)
+ NUMBER_NON_ACTIVE_SLOTS),
+ "status": 1
})
return triggers
@@ -198,7 +224,42 @@ def discovery_rules(self, template, dashboard=False):
]
}
]
- return template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs)
+ active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs)
+
+ rule = {
+ "name": "PostgreSQL Replication: Non Active Slots Discovery",
+ "key": self.key_replication_non_active_slots_discovery.format("[{0}]".format(self.Macros[self.Type]))
+ }
+ if Plugin.old_zabbix:
+ conditions = []
+ rule["filter"] = "{#NON_ACTIVE_SLOT_NAME}:.*"
+ else:
+ conditions = [{
+ "condition": [
+ {"macro": "{#NON_ACTIVE_SLOT_NAME}",
+ "value": ".*",
+ "operator": 8,
+ "formulaid": "A"}
+ ]
+ }]
+ items = [
+ {"key": self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},"),
+ "name": "PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}",
+ "value_type": Plugin.VALUE_TYPE.numeric_float,
+ "delay": self.plugin_config("interval"),
+ "drawtype": 2}
+ ]
+ graphs = []
+ triggers = [
+ {
+ "name": "PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})",
+ "expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},") + ".last()}>" +
+ self.plugin_macros["critical_bytes_held_by_none_active_slot"][0][1]
+ }
+ ]
+ non_active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs, triggers=triggers)
+
+ return active_slots_discovery_rule + non_active_slots_discovery_rule
def keys_and_queries(self, template_zabbix):
result = []
diff --git a/mamonsu/plugins/system/linux/disk_sizes.py b/mamonsu/plugins/system/linux/disk_sizes.py
index 898c2c0..d461812 100644
--- a/mamonsu/plugins/system/linux/disk_sizes.py
+++ b/mamonsu/plugins/system/linux/disk_sizes.py
@@ -20,7 +20,7 @@ class DiskSizes(Plugin):
ExcludeFsTypes = [
"none", "unknown", "rootfs", "iso9660", "squashfs", "udf", "romfs", "ramfs", "debugfs", "cgroup", "cgroup_root",
- "pstore", "devtmpfs", "autofs", "cgroup", "configfs", "devpts", "efivarfs", "fusectl", "fuse.gvfsd-fuse",
+ "pstore", "devtmpfs", "autofs", "cgroup2", "configfs", "devpts", "efivarfs", "fusectl", "fuse.gvfsd-fuse",
"hugetlbfs", "mqueue", "binfmt_misc", "nfsd", "proc", "pstore", "selinuxfs", "rpc_pipefs", "securityfs",
"sysfs", "nsfs", "tmpfs", "tracefs"
]
diff --git a/mamonsu/tools/bootstrap/sql.py b/mamonsu/tools/bootstrap/sql.py
index f37be0f..bf99442 100644
--- a/mamonsu/tools/bootstrap/sql.py
+++ b/mamonsu/tools/bootstrap/sql.py
@@ -236,6 +236,23 @@
coalesce((pg_{7}_diff(pg_current_{7}(), replay_{9}))::bigint, 0) AS total_lag
FROM pg_stat_replication
$$ LANGUAGE SQL SECURITY DEFINER;
+
+DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_master();
+CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_master()
+RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$
+SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_current_{7}(), restart_lsn))::bigint, 0) AS wal_held_bytes
+FROM pg_replication_slots
+WHERE active = 'false'
+$$ LANGUAGE SQL SECURITY DEFINER;
+
+DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_replica();
+CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_replica()
+RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$
+SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_last_wal_replay_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes
+FROM pg_replication_slots
+WHERE active = 'false'
+$$ LANGUAGE SQL SECURITY DEFINER;
+
"""
CreatePgBuffercacheFunctionsSQL = """
diff --git a/packaging/debian/changelog b/packaging/debian/changelog
index 218931a..6efa097 100644
--- a/packaging/debian/changelog
+++ b/packaging/debian/changelog
@@ -1,3 +1,9 @@
+mamonsu (3.5.13-1) stable; urgency=low
+ * Added a new metric that displays the bytes held by non-active replication slots, along with the corresponding trigger;
+ * Set the trigger for 'number of non-active replication slots' to be disabled by default;
+ * Fixed the Linux plugin to ensure compatibility with recent Linux versions that use cgroups2;
+ * Resolved a deadlock issue in the send queue that caused Mamonsu to hang after network problems;
+
mamonsu (3.5.12-1) stable; urgency=low
* Port version parser code from public archive of pypa/pkg_resources;
* Thread-safe implementation of connection cache;
diff --git a/packaging/rpm/SPECS/mamonsu.spec b/packaging/rpm/SPECS/mamonsu.spec
index dcc7c9f..dcfd2bd 100644
--- a/packaging/rpm/SPECS/mamonsu.spec
+++ b/packaging/rpm/SPECS/mamonsu.spec
@@ -1,5 +1,5 @@
Name: mamonsu
-Version: 3.5.12
+Version: 3.5.13
Release: 1%{?dist}
Summary: Monitoring agent for PostgreSQL
Group: Applications/Internet
@@ -73,6 +73,12 @@ chown -R mamonsu:mamonsu /var/log/mamonsu
chown -R mamonsu:mamonsu /etc/mamonsu
%changelog
+* Thu May 29 2025 Andrey Papsuyko - 3.5.13-1
+ - Added a new metric that displays the bytes held by non-active replication slots, along with the corresponding trigger;
+ - Set the trigger for 'number of non-active replication slots' to be disabled by default;
+ - Fixed the Linux plugin to ensure compatibility with recent Linux versions that use cgroups2;
+ - Resolved a deadlock issue in the send queue that caused Mamonsu to hang after network problems;
+
* Wed Mar 5 2025 Maxim Styushin - 3.5.12-1
- Port version parser code from public archive of pypa/pkg_resources;
- Thread-safe implementation of connection cache;
diff --git a/packaging/win/mamonsu.def.nsh b/packaging/win/mamonsu.def.nsh
index 1b60f1c..5afbfdc 100644
--- a/packaging/win/mamonsu.def.nsh
+++ b/packaging/win/mamonsu.def.nsh
@@ -1,5 +1,5 @@
!define NAME Mamonsu
-!define VERSION 3.5.12
+!define VERSION 3.5.13
!define MAMONSU_REG_PATH "Software\PostgresPro\Mamonsu"
!define MAMONSU_REG_UNINSTALLER_PATH "Software\Microsoft\Windows\CurrentVersion\Uninstall"
!define EDB_REG "SOFTWARE\Postgresql"