Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 2ba4a62

Browse files
kylecarbscoadler
andauthored
feat: Add high availability for multiple replicas (#4555)
* feat: HA tailnet coordinator * fixup! feat: HA tailnet coordinator * fixup! feat: HA tailnet coordinator * remove printlns * close all connections on coordinator * impelement high availability feature * fixup! impelement high availability feature * fixup! impelement high availability feature * fixup! impelement high availability feature * fixup! impelement high availability feature * Add replicas * Add DERP meshing to arbitrary addresses * Move packages to highavailability folder * Move coordinator to high availability package * Add flags for HA * Rename to replicasync * Denest packages for replicas * Add test for multiple replicas * Fix coordination test * Add HA to the helm chart * Rename function pointer * Add warnings for HA * Add the ability to block endpoints * Add flag to disable P2P connections * Wow, I made the tests pass * Add replicas endpoint * Ensure close kills replica * Update sql * Add database latency to high availability * Pipe TLS to DERP mesh * Fix DERP mesh with TLS * Add tests for TLS * Fix replica sync TLS * Fix RootCA for replica meshing * Remove ID from replicasync * Fix getting certificates for meshing * Remove excessive locking * Fix linting * Store mesh key in the database * Fix replica key for tests * Fix types gen * Fix unlocking unlocked * Fix race in tests * Update enterprise/derpmesh/derpmesh.go Co-authored-by: Colin Adler <[email protected]> * Rename to syncReplicas * Reuse http client * Delete old replicas on a CRON * Fix race condition in connection tests * Fix linting * Fix nil type * Move pubsub to in-memory for twenty test * Add comment for configuration tweaking * Fix leak with transport * Fix close leak in derpmesh * Fix race when creating server * Remove handler update * Skip test on Windows * Fix DERP mesh test * Wrap HTTP handler replacement in mutex * Fix error message for relay * Fix API handler for normal tests * Fix speedtest * Fix replica resend * Fix derpmesh send * Ping async * Increase wait time of template 
version jobd * Fix race when closing replica sync * Add name to client * Log the derpmap being used * Don't connect if DERP is empty * Improve agent coordinator logging * Fix lock in coordinator * Fix relay addr * Fix race when updating durations * Fix client publish race * Run pubsub loop in a queue * Store agent nodes in order * Fix coordinator locking * Check for closed pipe Co-authored-by: Colin Adler <[email protected]>
1 parent dc3519e commit 2ba4a62

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+3434
-401
lines changed

.vscode/settings.json

+3
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"derphttp",
2020
"derpmap",
2121
"devel",
22+
"dflags",
2223
"drpc",
2324
"drpcconn",
2425
"drpcmux",
@@ -86,8 +87,10 @@
8687
"ptytest",
8788
"quickstart",
8889
"reconfig",
90+
"replicasync",
8991
"retrier",
9092
"rpty",
93+
"SCIM",
9194
"sdkproto",
9295
"sdktrace",
9396
"Signup",

agent/agent.go

+1
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ func (a *agent) runTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) {
170170
if a.isClosed() {
171171
return
172172
}
173+
a.logger.Debug(ctx, "running tailnet with derpmap", slog.F("derpmap", derpMap))
173174
if a.network != nil {
174175
a.network.SetDERPMap(derpMap)
175176
return

agent/agent_test.go

+2-4
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ func TestAgent(t *testing.T) {
465465

466466
conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
467467
require.Eventually(t, func() bool {
468-
_, err := conn.Ping()
468+
_, err := conn.Ping(context.Background())
469469
return err == nil
470470
}, testutil.WaitMedium, testutil.IntervalFast)
471471
conn1, err := conn.DialContext(context.Background(), l.Addr().Network(), l.Addr().String())
@@ -483,9 +483,7 @@ func TestAgent(t *testing.T) {
483483

484484
t.Run("Speedtest", func(t *testing.T) {
485485
t.Parallel()
486-
if testing.Short() {
487-
t.Skip("The minimum duration for a speedtest is hardcoded in Tailscale to 5s!")
488-
}
486+
t.Skip("This test is relatively flakey because of Tailscale's speedtest code...")
489487
derpMap := tailnettest.RunDERPAndSTUN(t)
490488
conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{
491489
DERPMap: derpMap,

cli/agent_test.go

+6-8
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ import (
77
"github.com/stretchr/testify/assert"
88
"github.com/stretchr/testify/require"
99

10-
"cdr.dev/slog"
11-
1210
"github.com/coder/coder/cli/clitest"
1311
"github.com/coder/coder/coderd/coderdtest"
1412
"github.com/coder/coder/provisioner/echo"
@@ -67,11 +65,11 @@ func TestWorkspaceAgent(t *testing.T) {
6765
if assert.NotEmpty(t, workspace.LatestBuild.Resources) && assert.NotEmpty(t, resources[0].Agents) {
6866
assert.NotEmpty(t, resources[0].Agents[0].Version)
6967
}
70-
dialer, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, resources[0].Agents[0].ID)
68+
dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil)
7169
require.NoError(t, err)
7270
defer dialer.Close()
7371
require.Eventually(t, func() bool {
74-
_, err := dialer.Ping()
72+
_, err := dialer.Ping(ctx)
7573
return err == nil
7674
}, testutil.WaitMedium, testutil.IntervalFast)
7775
cancelFunc()
@@ -128,11 +126,11 @@ func TestWorkspaceAgent(t *testing.T) {
128126
if assert.NotEmpty(t, resources) && assert.NotEmpty(t, resources[0].Agents) {
129127
assert.NotEmpty(t, resources[0].Agents[0].Version)
130128
}
131-
dialer, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, resources[0].Agents[0].ID)
129+
dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil)
132130
require.NoError(t, err)
133131
defer dialer.Close()
134132
require.Eventually(t, func() bool {
135-
_, err := dialer.Ping()
133+
_, err := dialer.Ping(ctx)
136134
return err == nil
137135
}, testutil.WaitMedium, testutil.IntervalFast)
138136
cancelFunc()
@@ -189,11 +187,11 @@ func TestWorkspaceAgent(t *testing.T) {
189187
if assert.NotEmpty(t, resources) && assert.NotEmpty(t, resources[0].Agents) {
190188
assert.NotEmpty(t, resources[0].Agents[0].Version)
191189
}
192-
dialer, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, resources[0].Agents[0].ID)
190+
dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil)
193191
require.NoError(t, err)
194192
defer dialer.Close()
195193
require.Eventually(t, func() bool {
196-
_, err := dialer.Ping()
194+
_, err := dialer.Ping(ctx)
197195
return err == nil
198196
}, testutil.WaitMedium, testutil.IntervalFast)
199197
cancelFunc()

cli/config/file.go

+5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ func (r Root) Session() File {
1313
return File(filepath.Join(string(r), "session"))
1414
}
1515

16+
// ReplicaID returns the file that stores the unique identifier for this Coder server.
17+
func (r Root) ReplicaID() File {
18+
return File(filepath.Join(string(r), "replica_id"))
19+
}
20+
1621
func (r Root) URL() File {
1722
return File(filepath.Join(string(r), "url"))
1823
}

cli/configssh_test.go

+1-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"github.com/stretchr/testify/assert"
2020
"github.com/stretchr/testify/require"
2121

22-
"cdr.dev/slog"
2322
"cdr.dev/slog/sloggers/slogtest"
2423

2524
"github.com/coder/coder/agent"
@@ -115,7 +114,7 @@ func TestConfigSSH(t *testing.T) {
115114
_ = agentCloser.Close()
116115
}()
117116
resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID)
118-
agentConn, err := client.DialWorkspaceAgentTailnet(context.Background(), slog.Logger{}, resources[0].Agents[0].ID)
117+
agentConn, err := client.DialWorkspaceAgent(context.Background(), resources[0].Agents[0].ID, nil)
119118
require.NoError(t, err)
120119
defer agentConn.Close()
121120

cli/deployment/flags.go

+7
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ func Flags() *codersdk.DeploymentFlags {
8585
Description: "Addresses for STUN servers to establish P2P connections. Set empty to disable P2P connections.",
8686
Default: []string{"stun.l.google.com:19302"},
8787
},
88+
DerpServerRelayAddress: &codersdk.StringFlag{
89+
Name: "DERP Server Relay Address",
90+
Flag: "derp-server-relay-address",
91+
EnvVar: "CODER_DERP_SERVER_RELAY_ADDRESS",
92+
Description: "An HTTP address that is accessible by other replicas to relay DERP traffic. Required for high availability.",
93+
Enterprise: true,
94+
},
8895
DerpConfigURL: &codersdk.StringFlag{
8996
Name: "DERP Config URL",
9097
Flag: "derp-config-url",

cli/portforward.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import (
1616
"github.com/spf13/cobra"
1717
"golang.org/x/xerrors"
1818

19-
"cdr.dev/slog"
2019
"github.com/coder/coder/agent"
2120
"github.com/coder/coder/cli/cliflag"
2221
"github.com/coder/coder/cli/cliui"
@@ -96,7 +95,7 @@ func portForward() *cobra.Command {
9695
return xerrors.Errorf("await agent: %w", err)
9796
}
9897

99-
conn, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, workspaceAgent.ID)
98+
conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, nil)
10099
if err != nil {
101100
return err
102101
}
@@ -156,7 +155,7 @@ func portForward() *cobra.Command {
156155
case <-ticker.C:
157156
}
158157

159-
_, err = conn.Ping()
158+
_, err = conn.Ping(ctx)
160159
if err != nil {
161160
continue
162161
}

cli/root.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"flag"
66
"fmt"
7+
"io"
78
"net/http"
89
"net/url"
910
"os"
@@ -100,8 +101,9 @@ func Core() []*cobra.Command {
100101
}
101102

102103
func AGPL() []*cobra.Command {
103-
all := append(Core(), Server(deployment.Flags(), func(_ context.Context, o *coderd.Options) (*coderd.API, error) {
104-
return coderd.New(o), nil
104+
all := append(Core(), Server(deployment.Flags(), func(_ context.Context, o *coderd.Options) (*coderd.API, io.Closer, error) {
105+
api := coderd.New(o)
106+
return api, api, nil
105107
}))
106108
return all
107109
}

cli/server.go

+16-7
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ import (
6969
)
7070

7171
// nolint:gocyclo
72-
func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *coderd.Options) (*coderd.API, error)) *cobra.Command {
72+
func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *coderd.Options) (*coderd.API, io.Closer, error)) *cobra.Command {
7373
root := &cobra.Command{
7474
Use: "server",
7575
Short: "Start a Coder server",
@@ -167,9 +167,10 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code
167167
}
168168
defer listener.Close()
169169

170+
var tlsConfig *tls.Config
170171
if dflags.TLSEnable.Value {
171-
listener, err = configureServerTLS(
172-
listener, dflags.TLSMinVersion.Value,
172+
tlsConfig, err = configureTLS(
173+
dflags.TLSMinVersion.Value,
173174
dflags.TLSClientAuth.Value,
174175
dflags.TLSCertFiles.Value,
175176
dflags.TLSKeyFiles.Value,
@@ -178,6 +179,7 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code
178179
if err != nil {
179180
return xerrors.Errorf("configure tls: %w", err)
180181
}
182+
listener = tls.NewListener(listener, tlsConfig)
181183
}
182184

183185
tcpAddr, valid := listener.Addr().(*net.TCPAddr)
@@ -328,6 +330,9 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code
328330
Experimental: ExperimentalEnabled(cmd),
329331
DeploymentFlags: dflags,
330332
}
333+
if tlsConfig != nil {
334+
options.TLSCertificates = tlsConfig.Certificates
335+
}
331336

332337
if dflags.OAuth2GithubClientSecret.Value != "" {
333338
options.GithubOAuth2Config, err = configureGithubOAuth2(accessURLParsed,
@@ -471,11 +476,14 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code
471476
), dflags.PromAddress.Value, "prometheus")()
472477
}
473478

474-
coderAPI, err := newAPI(ctx, options)
479+
// We use a separate closer so the Enterprise API
480+
// can have its own close functions. This is cleaner
481+
// than abstracting the Coder API itself.
482+
coderAPI, closer, err := newAPI(ctx, options)
475483
if err != nil {
476484
return err
477485
}
478-
defer coderAPI.Close()
486+
defer closer.Close()
479487

480488
client := codersdk.New(localURL)
481489
if dflags.TLSEnable.Value {
@@ -893,7 +901,7 @@ func loadCertificates(tlsCertFiles, tlsKeyFiles []string) ([]tls.Certificate, er
893901
return certs, nil
894902
}
895903

896-
func configureServerTLS(listener net.Listener, tlsMinVersion, tlsClientAuth string, tlsCertFiles, tlsKeyFiles []string, tlsClientCAFile string) (net.Listener, error) {
904+
func configureTLS(tlsMinVersion, tlsClientAuth string, tlsCertFiles, tlsKeyFiles []string, tlsClientCAFile string) (*tls.Config, error) {
897905
tlsConfig := &tls.Config{
898906
MinVersion: tls.VersionTLS12,
899907
}
@@ -929,6 +937,7 @@ func configureServerTLS(listener net.Listener, tlsMinVersion, tlsClientAuth stri
929937
if err != nil {
930938
return nil, xerrors.Errorf("load certificates: %w", err)
931939
}
940+
tlsConfig.Certificates = certs
932941
tlsConfig.GetCertificate = func(hi *tls.ClientHelloInfo) (*tls.Certificate, error) {
933942
// If there's only one certificate, return it.
934943
if len(certs) == 1 {
@@ -963,7 +972,7 @@ func configureServerTLS(listener net.Listener, tlsMinVersion, tlsClientAuth stri
963972
tlsConfig.ClientCAs = caPool
964973
}
965974

966-
return tls.NewListener(listener, tlsConfig), nil
975+
return tlsConfig, nil
967976
}
968977

969978
func configureGithubOAuth2(accessURL *url.URL, clientID, clientSecret string, allowSignups bool, allowOrgs []string, rawTeams []string, enterpriseBaseURL string) (*coderd.GithubOAuth2Config, error) {

cli/speedtest.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@ func speedtest() *cobra.Command {
5555
if cliflag.IsSetBool(cmd, varVerbose) {
5656
logger = logger.Leveled(slog.LevelDebug)
5757
}
58-
conn, err := client.DialWorkspaceAgentTailnet(ctx, logger, workspaceAgent.ID)
58+
conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{
59+
Logger: logger,
60+
})
5961
if err != nil {
6062
return err
6163
}
@@ -68,7 +70,7 @@ func speedtest() *cobra.Command {
6870
return ctx.Err()
6971
case <-ticker.C:
7072
}
71-
dur, err := conn.Ping()
73+
dur, err := conn.Ping(ctx)
7274
if err != nil {
7375
continue
7476
}

cli/ssh.go

+1-3
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@ import (
2020
"golang.org/x/term"
2121
"golang.org/x/xerrors"
2222

23-
"cdr.dev/slog"
24-
2523
"github.com/coder/coder/cli/cliflag"
2624
"github.com/coder/coder/cli/cliui"
2725
"github.com/coder/coder/coderd/autobuild/notify"
@@ -86,7 +84,7 @@ func ssh() *cobra.Command {
8684
return xerrors.Errorf("await agent: %w", err)
8785
}
8886

89-
conn, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, workspaceAgent.ID)
87+
conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, nil)
9088
if err != nil {
9189
return err
9290
}

coderd/activitybump_test.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ func TestWorkspaceActivityBump(t *testing.T) {
7272
"deadline %v never updated", firstDeadline,
7373
)
7474

75-
require.WithinDuration(t, database.Now().Add(time.Hour), workspace.LatestBuild.Deadline.Time, time.Second)
75+
require.WithinDuration(t, database.Now().Add(time.Hour), workspace.LatestBuild.Deadline.Time, 3*time.Second)
7676
}
7777
}
7878

@@ -82,7 +82,9 @@ func TestWorkspaceActivityBump(t *testing.T) {
8282
client, workspace, assertBumped := setupActivityTest(t)
8383

8484
resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID)
85-
conn, err := client.DialWorkspaceAgentTailnet(ctx, slogtest.Make(t, nil), resources[0].Agents[0].ID)
85+
conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{
86+
Logger: slogtest.Make(t, nil),
87+
})
8688
require.NoError(t, err)
8789
defer conn.Close()
8890

0 commit comments

Comments
 (0)