Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit cb302b6

Browse files
committed
Checking for, and specifically handling, database unreachability in tailnet control protocol dialer
Signed-off-by: Danny Kopping <[email protected]>
1 parent 3f95841 commit cb302b6

File tree

9 files changed

+194
-20
lines changed

9 files changed

+194
-20
lines changed

coderd/coderd.go

+4
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,10 @@ func New(options *Options) *API {
679679
DERPFn: api.DERPMap,
680680
Logger: options.Logger,
681681
ClientID: uuid.New(),
682+
DatabaseHealthcheckFn: func(ctx context.Context) error {
683+
_, err := api.Database.Ping(ctx)
684+
return err
685+
},
682686
}
683687
stn, err := NewServerTailnet(api.ctx,
684688
options.Logger,

coderd/tailnet.go

+14-5
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ import (
2424
"tailscale.com/tailcfg"
2525

2626
"cdr.dev/slog"
27+
2728
"github.com/coder/coder/v2/coderd/tracing"
2829
"github.com/coder/coder/v2/coderd/workspaceapps"
2930
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
31+
"github.com/coder/coder/v2/codersdk"
3032
"github.com/coder/coder/v2/codersdk/workspacesdk"
3133
"github.com/coder/coder/v2/site"
3234
"github.com/coder/coder/v2/tailnet"
@@ -537,13 +539,20 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
537539
// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap
538540
// service running in the same memory space.
539541
type InmemTailnetDialer struct {
540-
CoordPtr *atomic.Pointer[tailnet.Coordinator]
541-
DERPFn func() *tailcfg.DERPMap
542-
Logger slog.Logger
543-
ClientID uuid.UUID
542+
CoordPtr *atomic.Pointer[tailnet.Coordinator]
543+
DERPFn func() *tailcfg.DERPMap
544+
Logger slog.Logger
545+
ClientID uuid.UUID
546+
DatabaseHealthcheckFn func(ctx context.Context) error
544547
}
545548

546-
func (a *InmemTailnetDialer) Dial(_ context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
549+
func (a *InmemTailnetDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
550+
if a.DatabaseHealthcheckFn != nil {
551+
if err := a.DatabaseHealthcheckFn(ctx); err != nil {
552+
return tailnet.ControlProtocolClients{}, xerrors.Errorf("%s: %w", codersdk.DatabaseNotReachable, err)
553+
}
554+
}
555+
547556
coord := a.CoordPtr.Load()
548557
if coord == nil {
549558
return tailnet.ControlProtocolClients{}, xerrors.Errorf("tailnet coordinator not initialized")

coderd/tailnet_test.go

+74-9
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/stretchr/testify/assert"
1919
"github.com/stretchr/testify/require"
2020
"go.opentelemetry.io/otel/trace"
21+
"golang.org/x/xerrors"
2122
"tailscale.com/tailcfg"
2223

2324
"github.com/coder/coder/v2/agent"
@@ -56,8 +57,7 @@ func TestServerTailnet_AgentConn_NoSTUN(t *testing.T) {
5657
defer cancel()
5758

5859
// Connect through the ServerTailnet
59-
agents, serverTailnet := setupServerTailnetAgent(t, 1,
60-
tailnettest.DisableSTUN, tailnettest.DERPIsEmbedded)
60+
agents, serverTailnet := setupServerTailnetAgent(t, 1, withDERPAndStunOptions(tailnettest.DisableSTUN, tailnettest.DERPIsEmbedded))
6161
a := agents[0]
6262

6363
conn, release, err := serverTailnet.AgentConn(ctx, a.id)
@@ -340,7 +340,7 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
340340
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
341341
defer cancel()
342342

343-
agents, serverTailnet := setupServerTailnetAgent(t, 1, tailnettest.DisableSTUN)
343+
agents, serverTailnet := setupServerTailnetAgent(t, 1, withDERPAndStunOptions(tailnettest.DisableSTUN))
344344
a := agents[0]
345345

346346
require.True(t, serverTailnet.Conn().GetBlockEndpoints(), "expected BlockEndpoints to be set")
@@ -365,6 +365,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
365365
})
366366
}
367367

368+
// TestServerTailnet_Healthcheck exercises ServerTailnet.AgentConn when the in-memory dialer is
// configured (via withHealthcheckFn) with a healthcheck that always succeeds, and with one that
// always fails.
func TestServerTailnet_Healthcheck(t *testing.T) {
369+
t.Parallel()
370+
371+
// Verifies that a non-nil healthcheck which returns a non-error response behaves as expected.
372+
t.Run("Passing", func(t *testing.T) {
373+
t.Parallel()
374+
375+
ctx := testutil.Context(t, testutil.WaitMedium)
376+
fn := func(ctx context.Context) error { return nil }
377+
378+
agents, serverTailnet := setupServerTailnetAgent(t, 1, withHealthcheckFn(fn))
379+
380+
a := agents[0]
381+
conn, release, err := serverTailnet.AgentConn(ctx, a.id)
382+
// Check the error before registering the cleanup: the "Failing" case below shows that release
// is nil when AgentConn errors, and a nil cleanup func would panic and mask the real failure.
require.NoError(t, err)
383+
t.Cleanup(release)
384+
assert.True(t, conn.AwaitReachable(ctx))
385+
})
386+
387+
// If the healthcheck fails, we have no insight into this at this level.
388+
// The dial against the control plane is retried, so we wait for the context to time out as an indication that the
389+
// healthcheck is performing as expected.
390+
t.Run("Failing", func(t *testing.T) {
391+
t.Parallel()
392+
393+
ctx := testutil.Context(t, testutil.WaitMedium)
394+
fn := func(ctx context.Context) error { return xerrors.Errorf("oops, db gone") }
395+
396+
agents, serverTailnet := setupServerTailnetAgent(t, 1, withHealthcheckFn(fn))
397+
398+
a := agents[0]
399+
_, release, err := serverTailnet.AgentConn(ctx, a.id)
400+
require.Nil(t, release)
401+
require.ErrorContains(t, err, "agent is unreachable")
402+
})
403+
}
404+
368405
type wrappedListener struct {
369406
net.Listener
370407
dials int32
@@ -389,9 +426,36 @@ type agentWithID struct {
389426
agent.Agent
390427
}
391428

392-
func setupServerTailnetAgent(t *testing.T, agentNum int, opts ...tailnettest.DERPAndStunOption) ([]agentWithID, *coderd.ServerTailnet) {
429+
// serverOption carries optional settings for setupServerTailnetAgent. Each option value may set
// either field; setupServerTailnetAgent concatenates DERPAndStunOptions across all options and
// uses the last non-nil HealthcheckFn.
type serverOption struct {
430+
HealthcheckFn func(ctx context.Context) error
431+
DERPAndStunOptions []tailnettest.DERPAndStunOption
432+
}
433+
434+
// withHealthcheckFn returns a serverOption that sets only HealthcheckFn to fn.
func withHealthcheckFn(fn func(ctx context.Context) error) serverOption {
435+
return serverOption{
436+
HealthcheckFn: fn,
437+
}
438+
}
439+
440+
// withDERPAndStunOptions returns a serverOption that sets only DERPAndStunOptions to opts.
func withDERPAndStunOptions(opts ...tailnettest.DERPAndStunOption) serverOption {
441+
return serverOption{
442+
DERPAndStunOptions: opts,
443+
}
444+
}
445+
446+
func setupServerTailnetAgent(t *testing.T, agentNum int, opts ...serverOption) ([]agentWithID, *coderd.ServerTailnet) {
393447
logger := testutil.Logger(t)
394-
derpMap, derpServer := tailnettest.RunDERPAndSTUN(t, opts...)
448+
449+
var healthcheckFn func(ctx context.Context) error
450+
var derpAndStunOptions []tailnettest.DERPAndStunOption
451+
for _, opt := range opts {
452+
derpAndStunOptions = append(derpAndStunOptions, opt.DERPAndStunOptions...)
453+
if opt.HealthcheckFn != nil {
454+
healthcheckFn = opt.HealthcheckFn
455+
}
456+
}
457+
458+
derpMap, derpServer := tailnettest.RunDERPAndSTUN(t, derpAndStunOptions...)
395459

396460
coord := tailnet.NewCoordinator(logger)
397461
t.Cleanup(func() {
@@ -431,10 +495,11 @@ func setupServerTailnetAgent(t *testing.T, agentNum int, opts ...tailnettest.DER
431495
}
432496

433497
dialer := &coderd.InmemTailnetDialer{
434-
CoordPtr: &coordPtr,
435-
DERPFn: func() *tailcfg.DERPMap { return derpMap },
436-
Logger: logger,
437-
ClientID: uuid.UUID{5},
498+
CoordPtr: &coordPtr,
499+
DERPFn: func() *tailcfg.DERPMap { return derpMap },
500+
Logger: logger,
501+
ClientID: uuid.UUID{5},
502+
DatabaseHealthcheckFn: healthcheckFn,
438503
}
439504
serverTailnet, err := coderd.NewServerTailnet(
440505
context.Background(),

coderd/workspaceagents.go

+10
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,16 @@ func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) {
997997
func (api *API) workspaceAgentClientCoordinate(rw http.ResponseWriter, r *http.Request) {
998998
ctx := r.Context()
999999

1000+
// Ensure the database is reachable before proceeding.
1001+
_, err := api.Database.Ping(ctx)
1002+
if err != nil {
1003+
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
1004+
Message: codersdk.DatabaseNotReachable,
1005+
Detail: err.Error(),
1006+
})
1007+
return
1008+
}
1009+
10001010
// This route accepts user API key auth and workspace proxy auth. The moon actor has
10011011
// full permissions so should be able to pass this authz check.
10021012
workspace := httpmw.WorkspaceParam(r)

coderd/workspaceagents_test.go

+60
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/coder/coder/v2/coderd/database/dbfake"
4646
"github.com/coder/coder/v2/coderd/database/dbgen"
4747
"github.com/coder/coder/v2/coderd/database/dbmem"
48+
"github.com/coder/coder/v2/coderd/database/dbtestutil"
4849
"github.com/coder/coder/v2/coderd/database/dbtime"
4950
"github.com/coder/coder/v2/coderd/database/pubsub"
5051
"github.com/coder/coder/v2/coderd/externalauth"
@@ -55,6 +56,7 @@ import (
5556
"github.com/coder/coder/v2/codersdk"
5657
"github.com/coder/coder/v2/codersdk/agentsdk"
5758
"github.com/coder/coder/v2/codersdk/workspacesdk"
59+
"github.com/coder/coder/v2/enterprise/coderd/coderdenttest"
5860
"github.com/coder/coder/v2/provisioner/echo"
5961
"github.com/coder/coder/v2/provisionersdk/proto"
6062
"github.com/coder/coder/v2/tailnet"
@@ -495,6 +497,45 @@ func TestWorkspaceAgentConnectRPC(t *testing.T) {
495497
// Then: we should get a 401 Unauthorized response
496498
require.Equal(t, http.StatusUnauthorized, sdkErr.StatusCode())
497499
})
500+
501+
// This test validates that the tailnet controller will retry connecting to the control plane until context timeout
502+
// when the dialer fails its healthcheck.
503+
t.Run("DatabaseUnreachable", func(t *testing.T) {
504+
t.Parallel()
505+
506+
store, ps := dbtestutil.NewDB(t)
507+
508+
// Given: a database which will fail its Ping(ctx) call.
509+
// NOTE: The Ping(ctx) call is made by the Dialer.
510+
pdb := &pingFailingDB{
511+
Store: store,
512+
}
513+
client, user := coderdenttest.New(t, &coderdenttest.Options{
514+
Options: &coderdtest.Options{
515+
Database: pdb,
516+
Pubsub: ps,
517+
IncludeProvisionerDaemon: true,
518+
},
519+
})
520+
521+
// When: a workspace agent is set up and we try to dial it.
522+
r := dbfake.WorkspaceBuild(t, pdb, database.WorkspaceTable{
523+
OrganizationID: user.OrganizationID,
524+
OwnerID: user.UserID,
525+
}).WithAgent().Do()
526+
_ = agenttest.New(t, client.URL, r.AgentToken)
527+
resources := coderdtest.AwaitWorkspaceAgents(t, client, r.Workspace.ID)
528+
529+
// When: the db is marked as unhealthy (i.e. will fail its Ping).
530+
// This needs to be done *after* the server "starts" otherwise it'll fail straight away when trying to initialize.
531+
pdb.MarkUnhealthy()
532+
533+
// Then: the tailnet controller will continually try to dial the coordination endpoint, exceeding its context timeout.
534+
ctx := testutil.Context(t, testutil.WaitMedium)
535+
conn, err := workspacesdk.New(client).DialAgent(ctx, resources[0].Agents[0].ID, nil)
536+
require.ErrorContains(t, err, codersdk.DatabaseNotReachable)
537+
require.Nil(t, conn)
538+
})
498539
}
499540

500541
func TestWorkspaceAgentTailnet(t *testing.T) {
@@ -2591,3 +2632,22 @@ func TestAgentConnectionInfo(t *testing.T) {
25912632
require.True(t, info.DisableDirectConnections)
25922633
require.True(t, info.DERPForceWebSockets)
25932634
}
2635+
2636+
// pingFailingDB embeds a database.Store and overrides Ping so a test can make pings start failing
// on demand by calling MarkUnhealthy.
//
// NOTE(review): unhealthy is a plain bool with no synchronization visible here. If Ping is invoked
// from a different goroutine than the one calling MarkUnhealthy, this may be a data race under
// `go test -race` — confirm, and consider an atomic.Bool.
type pingFailingDB struct {
2637+
database.Store
2638+
2639+
unhealthy bool
2640+
}
2641+
2642+
// Ping reports success with a one-nanosecond latency until MarkUnhealthy has been called; after
// that it returns a zero duration and an "oops" error.
func (p *pingFailingDB) Ping(context.Context) (time.Duration, error) {
2643+
if !p.unhealthy {
2644+
return time.Nanosecond, nil
2645+
}
2646+
2647+
// Simulate a database connection error.
2648+
return 0, xerrors.New("oops")
2649+
}
2650+
2651+
// MarkUnhealthy makes every subsequent Ping call fail. There is no way to mark it healthy again.
func (p *pingFailingDB) MarkUnhealthy() {
2652+
p.unhealthy = true
2653+
}

codersdk/database.go

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package codersdk
2+
3+
import "errors"
4+
5+
// DatabaseNotReachable is the message text used to report that the database could not be reached.
const DatabaseNotReachable = "database not reachable"
6+
7+
// ErrDatabaseNotReachable is a sentinel error whose message is DatabaseNotReachable.
var ErrDatabaseNotReachable = errors.New(DatabaseNotReachable)

codersdk/workspacesdk/dialer.go

+11-4
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,19 @@ import (
1111
"golang.org/x/xerrors"
1212

1313
"cdr.dev/slog"
14+
"github.com/coder/websocket"
15+
1416
"github.com/coder/coder/v2/buildinfo"
1517
"github.com/coder/coder/v2/codersdk"
1618
"github.com/coder/coder/v2/tailnet"
1719
"github.com/coder/coder/v2/tailnet/proto"
18-
"github.com/coder/websocket"
1920
)
2021

2122
var permanentErrorStatuses = []int{
22-
http.StatusConflict, // returned if client/agent connections disabled (browser only)
23-
http.StatusBadRequest, // returned if API mismatch
24-
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
23+
http.StatusConflict, // returned if client/agent connections disabled (browser only)
24+
http.StatusBadRequest, // returned if API mismatch
25+
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
26+
http.StatusInternalServerError, // returned if database is not reachable,
2527
}
2628

2729
type WebsocketDialer struct {
@@ -89,6 +91,11 @@ func (w *WebsocketDialer) Dial(ctx context.Context, r tailnet.ResumeTokenControl
8991
"Ensure your client release version (%s, different than the API version) matches the server release version",
9092
buildinfo.Version())
9193
}
94+
95+
if sdkErr.Message == codersdk.DatabaseNotReachable &&
96+
sdkErr.StatusCode() == http.StatusInternalServerError {
97+
err = xerrors.Errorf("%s: %w", codersdk.DatabaseNotReachable, err)
98+
}
9299
}
93100
w.connected <- err
94101
return tailnet.ControlProtocolClients{}, err

site/src/api/typesGenerated.ts

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tailnet/controllers.go

+11-2
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@ import (
2121
"tailscale.com/util/dnsname"
2222

2323
"cdr.dev/slog"
24+
"github.com/coder/quartz"
25+
"github.com/coder/retry"
26+
2427
"github.com/coder/coder/v2/coderd/util/ptr"
2528
"github.com/coder/coder/v2/codersdk"
2629
"github.com/coder/coder/v2/tailnet/proto"
27-
"github.com/coder/quartz"
28-
"github.com/coder/retry"
2930
)
3031

3132
// A Controller connects to the tailnet control plane, and then uses the control protocols to
@@ -1381,6 +1382,14 @@ func (c *Controller) Run(ctx context.Context) {
13811382
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
13821383
return
13831384
}
1385+
1386+
// If the database is unreachable by the control plane, there's not much we can do, so we'll just retry later.
1387+
if strings.Contains(err.Error(), codersdk.DatabaseNotReachable) {
1388+
c.logger.Warn(c.ctx, "control plane lost connection to database, retrying",
1389+
slog.Error(err), slog.F("retry_in_ms", retrier.Delay.Milliseconds()))
1390+
continue
1391+
}
1392+
13841393
errF := slog.Error(err)
13851394
var sdkErr *codersdk.Error
13861395
if xerrors.As(err, &sdkErr) {

0 commit comments

Comments
 (0)