From 484a54c91511fd3aeee0de24b5cdb331d1374e93 Mon Sep 17 00:00:00 2001 From: Spike Curtis Date: Wed, 29 Nov 2023 15:24:20 +0400 Subject: [PATCH] feat: add cleanup of lost tailnet peers and tunnels to PGCoordinator --- enterprise/tailnet/pgcoord.go | 19 ++-- enterprise/tailnet/pgcoord_internal_test.go | 108 +++++++++++++++++++- enterprise/tailnet/pgcoord_test.go | 2 + 3 files changed, 122 insertions(+), 7 deletions(-) diff --git a/enterprise/tailnet/pgcoord.go b/enterprise/tailnet/pgcoord.go index 16c7123c9ddde..635d1d4fce098 100644 --- a/enterprise/tailnet/pgcoord.go +++ b/enterprise/tailnet/pgcoord.go @@ -1723,15 +1723,22 @@ func (h *heartbeats) cleanupLoop() { } } -// cleanup issues a DB command to clean out any old expired coordinators state. The cleanup is idempotent, so no need -// to synchronize with other coordinators. +// cleanup issues a DB command to clean out any old expired coordinators or lost peer state. The +// cleanup is idempotent, so no need to synchronize with other coordinators. func (h *heartbeats) cleanup() { + // the records we are attempting to clean up do no serious harm other than + // accumulating in the tables, so we don't bother retrying if it fails. err := h.store.CleanTailnetCoordinators(h.ctx) if err != nil { - // the records we are attempting to clean up do no serious harm other than - // accumulating in the tables, so we don't bother retrying if it fails. h.logger.Error(h.ctx, "failed to cleanup old coordinators", slog.Error(err)) - return } - h.logger.Debug(h.ctx, "cleaned up old coordinators") + err = h.store.CleanTailnetLostPeers(h.ctx) + if err != nil { + h.logger.Error(h.ctx, "failed to cleanup lost peers", slog.Error(err)) + } + err = h.store.CleanTailnetTunnels(h.ctx) + if err != nil { + h.logger.Error(h.ctx, "failed to cleanup abandoned tunnels", slog.Error(err)) + } + h.logger.Debug(h.ctx, "completed cleanup") } diff --git a/enterprise/tailnet/pgcoord_internal_test.go b/enterprise/tailnet/pgcoord_internal_test.go index a580a5fedda75..9df920639e031 100644 --- a/enterprise/tailnet/pgcoord_internal_test.go +++ b/enterprise/tailnet/pgcoord_internal_test.go @@ -47,6 +47,14 @@ func TestHeartbeat_Cleanup(t *testing.T) { <-waitForCleanup return nil }) + mStore.EXPECT().CleanTailnetLostPeers(gomock.Any()).MinTimes(2).DoAndReturn(func(_ context.Context) error { + <-waitForCleanup + return nil + }) + mStore.EXPECT().CleanTailnetTunnels(gomock.Any()).MinTimes(2).DoAndReturn(func(_ context.Context) error { + <-waitForCleanup + return nil + }) uut := &heartbeats{ ctx: ctx, @@ -56,7 +64,7 @@ func TestHeartbeat_Cleanup(t *testing.T) { } go uut.cleanupLoop() - for i := 0; i < 2; i++ { + for i := 0; i < 6; i++ { select { case <-ctx.Done(): t.Fatal("timeout") @@ -67,6 +75,104 @@ func TestHeartbeat_Cleanup(t *testing.T) { close(waitForCleanup) } +// TestLostPeerCleanupQueries tests that our SQL queries to clean up lost peers do what we expect, +// that is, clean up peers and associated tunnels that have been lost for over 24 hours. 
+func TestLostPeerCleanupQueries(t *testing.T) {
+	t.Parallel()
+	if !dbtestutil.WillUsePostgres() {
+		t.Skip("test only with postgres")
+	}
+	store, _, sqlDB := dbtestutil.NewDBWithSQLDB(t, dbtestutil.WithDumpOnFailure())
+	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
+	defer cancel()
+
+	coordID := uuid.New()
+	_, err := store.UpsertTailnetCoordinator(ctx, coordID)
+	require.NoError(t, err)
+
+	peerID := uuid.New()
+	_, err = store.UpsertTailnetPeer(ctx, database.UpsertTailnetPeerParams{
+		ID:            peerID,
+		CoordinatorID: coordID,
+		Node:          []byte("test"),
+		Status:        database.TailnetStatusLost,
+	})
+	require.NoError(t, err)
+
+	otherID := uuid.New()
+	_, err = store.UpsertTailnetTunnel(ctx, database.UpsertTailnetTunnelParams{
+		CoordinatorID: coordID,
+		SrcID:         peerID,
+		DstID:         otherID,
+	})
+	require.NoError(t, err)
+
+	peers, err := store.GetAllTailnetPeers(ctx)
+	require.NoError(t, err)
+	require.Len(t, peers, 1)
+	require.Equal(t, peerID, peers[0].ID)
+
+	tunnels, err := store.GetAllTailnetTunnels(ctx)
+	require.NoError(t, err)
+	require.Len(t, tunnels, 1)
+	require.Equal(t, peerID, tunnels[0].SrcID)
+	require.Equal(t, otherID, tunnels[0].DstID)
+
+	// this clean is a noop since the peer and tunnel are less than 24h old
+	err = store.CleanTailnetLostPeers(ctx)
+	require.NoError(t, err)
+	err = store.CleanTailnetTunnels(ctx)
+	require.NoError(t, err)
+
+	peers, err = store.GetAllTailnetPeers(ctx)
+	require.NoError(t, err)
+	require.Len(t, peers, 1)
+	require.Equal(t, peerID, peers[0].ID)
+
+	tunnels, err = store.GetAllTailnetTunnels(ctx)
+	require.NoError(t, err)
+	require.Len(t, tunnels, 1)
+	require.Equal(t, peerID, tunnels[0].SrcID)
+	require.Equal(t, otherID, tunnels[0].DstID)
+
+	// set the age of the tunnel to >24h
+	sqlDB.Exec("UPDATE tailnet_tunnels SET updated_at = $1", time.Now().Add(-25*time.Hour))
+
+	// this clean is still a noop since the peer hasn't been lost for 24 hours
+	err = store.CleanTailnetLostPeers(ctx)
+	require.NoError(t, err)
+	err = store.CleanTailnetTunnels(ctx)
+	require.NoError(t, err)
+
+	peers, err = store.GetAllTailnetPeers(ctx)
+	require.NoError(t, err)
+	require.Len(t, peers, 1)
+	require.Equal(t, peerID, peers[0].ID)
+
+	tunnels, err = store.GetAllTailnetTunnels(ctx)
+	require.NoError(t, err)
+	require.Len(t, tunnels, 1)
+	require.Equal(t, peerID, tunnels[0].SrcID)
+	require.Equal(t, otherID, tunnels[0].DstID)
+
+	// set the age of the peer to >24h
+	sqlDB.Exec("UPDATE tailnet_peers SET updated_at = $1", time.Now().Add(-25*time.Hour))
+
+	// this clean removes the peer and the associated tunnel
+	err = store.CleanTailnetLostPeers(ctx)
+	require.NoError(t, err)
+	err = store.CleanTailnetTunnels(ctx)
+	require.NoError(t, err)
+
+	peers, err = store.GetAllTailnetPeers(ctx)
+	require.NoError(t, err)
+	require.Len(t, peers, 0)
+
+	tunnels, err = store.GetAllTailnetTunnels(ctx)
+	require.NoError(t, err)
+	require.Len(t, tunnels, 0)
+}
+
 func TestDebugTemplate(t *testing.T) {
 	t.Parallel()
 	if runtime.GOOS == "windows" {
diff --git a/enterprise/tailnet/pgcoord_test.go b/enterprise/tailnet/pgcoord_test.go
index 20f5f39621a00..920bb8a2c81a7 100644
--- a/enterprise/tailnet/pgcoord_test.go
+++ b/enterprise/tailnet/pgcoord_test.go
@@ -545,6 +545,8 @@ func TestPGCoordinator_Unhealthy(t *testing.T) {
 		Return(database.TailnetCoordinator{}, nil)
 	// extra calls we don't particularly care about for this test
 	mStore.EXPECT().CleanTailnetCoordinators(gomock.Any()).AnyTimes().Return(nil)
+	mStore.EXPECT().CleanTailnetLostPeers(gomock.Any()).AnyTimes().Return(nil)
+	mStore.EXPECT().CleanTailnetTunnels(gomock.Any()).AnyTimes().Return(nil)
 	mStore.EXPECT().GetTailnetTunnelPeerIDs(gomock.Any(), gomock.Any()).AnyTimes().Return(nil, nil)
 	mStore.EXPECT().GetTailnetTunnelPeerBindings(gomock.Any(), gomock.Any()).
 		AnyTimes().Return(nil, nil)
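
Note: the CleanTailnetLostPeers and CleanTailnetTunnels queries themselves live in coderd/database
and are not part of this diff. As a rough, non-authoritative sketch of what TestLostPeerCleanupQueries
above implies they do -- the table and column names come from the test, while the exact predicates,
the 'lost' status value, and the 24-hour interval are assumptions -- they would look roughly like:

	-- assumed sketch of CleanTailnetLostPeers: drop peers marked lost for over 24 hours
	DELETE FROM tailnet_peers
	WHERE status = 'lost'
	  AND updated_at < now() - INTERVAL '24 hours';

	-- assumed sketch of CleanTailnetTunnels: drop stale tunnels whose source peer is gone
	DELETE FROM tailnet_tunnels
	WHERE updated_at < now() - INTERVAL '24 hours'
	  AND NOT EXISTS (
	    SELECT 1 FROM tailnet_peers
	    WHERE tailnet_peers.id = tailnet_tunnels.src_id
	      AND tailnet_peers.coordinator_id = tailnet_tunnels.coordinator_id
	  );

The tunnel sketch needs the NOT EXISTS guard because the test asserts that a tunnel older than 24
hours survives cleanup while its source peer still exists, and is only removed after the lost peer
itself has been cleaned up. Failures of either query are logged but not retried in
heartbeats.cleanup(), since leftover rows only accumulate in the tables and do no other harm.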