From 4bec1289e79e0754838ac280f9b0e0237bd18918 Mon Sep 17 00:00:00 2001 From: Zhijie Hou Date: Mon, 22 Sep 2025 11:22:55 +0800 Subject: [PATCH] Fix unintended drop of active replication origins Currently, if two backends configure the same replication origin and one backend resets it first, the acquired_by flag is cleared without recognizing the active usage by the first backend. This can result in the unintended dropping of the origin, potentially leading to issues if the shared memory of the dropped origin is reused for a newly created origin. Such reuse could cause unpredictable advancement of a different slot by the remaining backend holding the memory of the dropped origin. This commit addresses the issue by introducing a reference count for replication origins. The count is incremented when a backend sets up the origin and decremented upon a reset. As a result, the replication origin is only dropped when the reference count reaches zero. --- .../expected/parallel_session_origin.out | 44 +++++++++++ .../specs/parallel_session_origin.spec | 4 + src/backend/replication/logical/origin.c | 79 ++++++++++++------- 3 files changed, 97 insertions(+), 30 deletions(-) diff --git a/contrib/test_decoding/expected/parallel_session_origin.out b/contrib/test_decoding/expected/parallel_session_origin.out index e515b39f7ce8..546d89339543 100644 --- a/contrib/test_decoding/expected/parallel_session_origin.out +++ b/contrib/test_decoding/expected/parallel_session_origin.out @@ -77,3 +77,47 @@ pg_replication_origin_session_reset (1 row) + +starting permutation: s0_setup s0_is_setup s1_setup s1_is_setup s0_reset s1_drop s1_reset +step s0_setup: SELECT pg_replication_origin_session_setup('origin'); +pg_replication_origin_session_setup +----------------------------------- + +(1 row) + +step s0_is_setup: SELECT pg_replication_origin_session_is_setup(); +pg_replication_origin_session_is_setup +-------------------------------------- +t +(1 row) + +step s1_setup: + SELECT 
pg_replication_origin_session_setup('origin', pid) + FROM pg_stat_activity + WHERE application_name = 'isolation/parallel_session_origin/s0'; + +pg_replication_origin_session_setup +----------------------------------- + +(1 row) + +step s1_is_setup: SELECT pg_replication_origin_session_is_setup(); +pg_replication_origin_session_is_setup +-------------------------------------- +t +(1 row) + +step s0_reset: SELECT pg_replication_origin_session_reset(); +pg_replication_origin_session_reset +----------------------------------- + +(1 row) + +step s1_drop: SELECT pg_replication_origin_drop('origin'); +ERROR: could not drop replication origin with ID 1, in use by another process +step s1_reset: SELECT pg_replication_origin_session_reset(); +pg_replication_origin_session_reset +----------------------------------- + +(1 row) + diff --git a/contrib/test_decoding/specs/parallel_session_origin.spec b/contrib/test_decoding/specs/parallel_session_origin.spec index c0e5fda07236..8e9c81e4419a 100644 --- a/contrib/test_decoding/specs/parallel_session_origin.spec +++ b/contrib/test_decoding/specs/parallel_session_origin.spec @@ -49,8 +49,12 @@ step "s1_store_lsn" { SELECT 1, local_lsn FROM pg_replication_origin_status; } step "s1_reset" { SELECT pg_replication_origin_session_reset(); } +step "s1_drop" { SELECT pg_replication_origin_drop('origin'); } # Firstly s0 attaches to a origin and s1 attaches to the same. Both sessions # commits a transaction and store the local_lsn of the replication origin. # Compare LSNs and expect latter transaction (done by s1) has larger local_lsn. permutation "s0_setup" "s0_is_setup" "s1_setup" "s1_is_setup" "s0_add_message" "s0_store_lsn" "s1_add_message" "s1_store_lsn" "s0_compare" "s0_reset" "s1_reset" + +# Test that the origin cannot be dropped if any session is actively using it. 
+permutation "s0_setup" "s0_is_setup" "s1_setup" "s1_is_setup" "s0_reset" "s1_drop" "s1_reset" diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index 2380f369578e..536e524f4d50 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -130,6 +130,9 @@ typedef struct ReplicationState */ int acquired_by; + /* Number of backends that are currently using this origin. */ + int refcount; + /* * Condition variable that's signaled when acquired_by changes. */ @@ -383,16 +386,19 @@ replorigin_state_clear(RepOriginId roident, bool nowait) if (state->roident == roident) { /* found our slot, is it busy? */ - if (state->acquired_by != 0) + if (state->refcount > 0) { ConditionVariable *cv; if (nowait) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("could not drop replication origin with ID %d, in use by PID %d", - state->roident, - state->acquired_by))); + (state->acquired_by != 0) + ? errmsg("could not drop replication origin with ID %d, in use by PID %d", + state->roident, + state->acquired_by) + : errmsg("could not drop replication origin with ID %d, in use by another process", + state->roident))); /* * We must wait and then retry. Since we don't know which CV @@ -1069,32 +1075,47 @@ replorigin_get_progress(RepOriginId node, bool flush) return remote_lsn; } -/* - * Tear down a (possibly) configured session replication origin during process - * exit. 
- */ +/* Helper function to reset the session replication origin */ static void -ReplicationOriginExitCleanup(int code, Datum arg) +replorigin_session_reset_internal(void) { - ConditionVariable *cv = NULL; + ConditionVariable *cv; - if (session_replication_state == NULL) - return; + Assert(session_replication_state != NULL); LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); - if (session_replication_state->acquired_by == MyProcPid) - { - cv = &session_replication_state->origin_cv; + Assert(session_replication_state->refcount > 0); + /* + * Reset the PID only if the current backend is the first to set up this + * origin. This prevents resetting the PID when other backends are still + * using this origin. + */ + if (session_replication_state->acquired_by == MyProcPid) session_replication_state->acquired_by = 0; - session_replication_state = NULL; - } + + session_replication_state->refcount--; + + cv = &session_replication_state->origin_cv; + session_replication_state = NULL; LWLockRelease(ReplicationOriginLock); - if (cv) - ConditionVariableBroadcast(cv); + ConditionVariableBroadcast(cv); +} + +/* + * Tear down a (possibly) configured session replication origin during process + * exit. 
+ */ +static void +ReplicationOriginExitCleanup(int code, Datum arg) +{ + if (session_replication_state == NULL) + return; + + replorigin_session_reset_internal(); } /* @@ -1205,9 +1226,17 @@ replorigin_session_setup(RepOriginId node, int acquired_by) Assert(session_replication_state->roident != InvalidRepOriginId); if (acquired_by == 0) + { session_replication_state->acquired_by = MyProcPid; + Assert(session_replication_state->refcount == 0); + } else + { Assert(session_replication_state->acquired_by == acquired_by); + Assert(session_replication_state->refcount > 0); + } + + session_replication_state->refcount++; LWLockRelease(ReplicationOriginLock); @@ -1224,8 +1253,6 @@ replorigin_session_setup(RepOriginId node, int acquired_by) void replorigin_session_reset(void) { - ConditionVariable *cv; - Assert(max_active_replication_origins != 0); if (session_replication_state == NULL) @@ -1233,15 +1260,7 @@ replorigin_session_reset(void) (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("no replication origin is configured"))); - LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); - - session_replication_state->acquired_by = 0; - cv = &session_replication_state->origin_cv; - session_replication_state = NULL; - - LWLockRelease(ReplicationOriginLock); - - ConditionVariableBroadcast(cv); + replorigin_session_reset_internal(); } /*