Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 82f494c

Browse files
authored
fix: Improve tailnet connections by reducing timeouts (#5043)
* fix: Improve tailnet connections by reducing timeouts This awaits connection ping before running a dial. Before, we were hitting the TCP retransmission and handshake timeouts, which could intermittently add 1 or 5 seconds to a connection being initialized. * Update Tailscale
1 parent 4646f58 commit 82f494c

12 files changed

+112
-108
lines changed

agent/agent_test.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -498,12 +498,7 @@ func TestAgent(t *testing.T) {
498498
}()
499499

500500
conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
501-
require.Eventually(t, func() bool {
502-
ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalFast)
503-
defer cancelFunc()
504-
_, err := conn.Ping(ctx)
505-
return err == nil
506-
}, testutil.WaitMedium, testutil.IntervalFast)
501+
require.True(t, conn.AwaitReachable(context.Background()))
507502
conn1, err := conn.DialContext(context.Background(), l.Addr().Network(), l.Addr().String())
508503
require.NoError(t, err)
509504
defer conn1.Close()

cli/agent_test.go

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import (
1414
"github.com/coder/coder/coderd/coderdtest"
1515
"github.com/coder/coder/provisioner/echo"
1616
"github.com/coder/coder/provisionersdk/proto"
17-
"github.com/coder/coder/testutil"
1817
)
1918

2019
func TestWorkspaceAgent(t *testing.T) {
@@ -71,12 +70,7 @@ func TestWorkspaceAgent(t *testing.T) {
7170
dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil)
7271
require.NoError(t, err)
7372
defer dialer.Close()
74-
require.Eventually(t, func() bool {
75-
ctx, cancelFunc := context.WithTimeout(ctx, testutil.IntervalFast)
76-
defer cancelFunc()
77-
_, err := dialer.Ping(ctx)
78-
return err == nil
79-
}, testutil.WaitMedium, testutil.IntervalFast)
73+
require.True(t, dialer.AwaitReachable(context.Background()))
8074
cancelFunc()
8175
err = <-errC
8276
require.NoError(t, err)
@@ -134,12 +128,7 @@ func TestWorkspaceAgent(t *testing.T) {
134128
dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil)
135129
require.NoError(t, err)
136130
defer dialer.Close()
137-
require.Eventually(t, func() bool {
138-
ctx, cancelFunc := context.WithTimeout(ctx, testutil.IntervalFast)
139-
defer cancelFunc()
140-
_, err := dialer.Ping(ctx)
141-
return err == nil
142-
}, testutil.WaitMedium, testutil.IntervalFast)
131+
require.True(t, dialer.AwaitReachable(context.Background()))
143132
cancelFunc()
144133
err = <-errC
145134
require.NoError(t, err)
@@ -197,13 +186,7 @@ func TestWorkspaceAgent(t *testing.T) {
197186
dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil)
198187
require.NoError(t, err)
199188
defer dialer.Close()
200-
require.Eventually(t, func() bool {
201-
ctx, cancelFunc := context.WithTimeout(ctx, testutil.IntervalFast)
202-
defer cancelFunc()
203-
_, err := dialer.Ping(ctx)
204-
return err == nil
205-
}, testutil.WaitMedium, testutil.IntervalFast)
206-
189+
require.True(t, dialer.AwaitReachable(context.Background()))
207190
sshClient, err := dialer.SSHClient(ctx)
208191
require.NoError(t, err)
209192
defer sshClient.Close()

cli/portforward.go

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"strings"
1111
"sync"
1212
"syscall"
13-
"time"
1413

1514
"github.com/pion/udp"
1615
"github.com/spf13/cobra"
@@ -145,22 +144,7 @@ func portForward() *cobra.Command {
145144
closeAllListeners()
146145
}()
147146

148-
ticker := time.NewTicker(250 * time.Millisecond)
149-
defer ticker.Stop()
150-
for {
151-
select {
152-
case <-ctx.Done():
153-
return ctx.Err()
154-
case <-ticker.C:
155-
}
156-
157-
_, err = conn.Ping(ctx)
158-
if err != nil {
159-
continue
160-
}
161-
break
162-
}
163-
ticker.Stop()
147+
conn.AwaitReachable(ctx)
164148
_, _ = fmt.Fprintln(cmd.OutOrStderr(), "Ready!")
165149
wg.Wait()
166150
return closeErr

cli/speedtest.go

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -62,33 +62,37 @@ func speedtest() *cobra.Command {
6262
return err
6363
}
6464
defer conn.Close()
65-
ticker := time.NewTicker(time.Second)
66-
defer ticker.Stop()
67-
for {
68-
select {
69-
case <-ctx.Done():
70-
return ctx.Err()
71-
case <-ticker.C:
65+
if direct {
66+
ticker := time.NewTicker(time.Second)
67+
defer ticker.Stop()
68+
for {
69+
select {
70+
case <-ctx.Done():
71+
return ctx.Err()
72+
case <-ticker.C:
73+
}
74+
dur, err := conn.Ping(ctx)
75+
if err != nil {
76+
continue
77+
}
78+
status := conn.Status()
79+
if len(status.Peers()) != 1 {
80+
continue
81+
}
82+
peer := status.Peer[status.Peers()[0]]
83+
if peer.CurAddr == "" && direct {
84+
cmd.Printf("Waiting for a direct connection... (%dms via %s)\n", dur.Milliseconds(), peer.Relay)
85+
continue
86+
}
87+
via := peer.Relay
88+
if via == "" {
89+
via = "direct"
90+
}
91+
cmd.Printf("%dms via %s\n", dur.Milliseconds(), via)
92+
break
7293
}
73-
dur, err := conn.Ping(ctx)
74-
if err != nil {
75-
continue
76-
}
77-
status := conn.Status()
78-
if len(status.Peers()) != 1 {
79-
continue
80-
}
81-
peer := status.Peer[status.Peers()[0]]
82-
if peer.CurAddr == "" && direct {
83-
cmd.Printf("Waiting for a direct connection... (%dms via %s)\n", dur.Milliseconds(), peer.Relay)
84-
continue
85-
}
86-
via := peer.Relay
87-
if via == "" {
88-
via = "direct"
89-
}
90-
cmd.Printf("%dms via %s\n", dur.Milliseconds(), via)
91-
break
94+
} else {
95+
conn.AwaitReachable(ctx)
9296
}
9397
dir := tsspeedtest.Download
9498
if reverse {

cli/ssh.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,12 @@ func ssh() *cobra.Command {
9090
return xerrors.Errorf("await agent: %w", err)
9191
}
9292

93-
conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, nil)
93+
conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{})
9494
if err != nil {
9595
return err
9696
}
9797
defer conn.Close()
98-
98+
conn.AwaitReachable(ctx)
9999
stopPolling := tryPollWorkspaceAutostop(ctx, client, workspace)
100100
defer stopPolling()
101101

coderd/workspaceagents_test.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,7 @@ func TestWorkspaceAgentListen(t *testing.T) {
178178
defer func() {
179179
_ = conn.Close()
180180
}()
181-
require.Eventually(t, func() bool {
182-
ctx, cancelFunc := context.WithTimeout(ctx, testutil.IntervalFast)
183-
defer cancelFunc()
184-
_, err := conn.Ping(ctx)
185-
return err == nil
186-
}, testutil.WaitLong, testutil.IntervalFast)
181+
conn.AwaitReachable(ctx)
187182
})
188183

189184
t.Run("FailNonLatestBuild", func(t *testing.T) {

codersdk/agentconn.go

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@ import (
1616

1717
"golang.org/x/crypto/ssh"
1818
"golang.org/x/xerrors"
19-
"tailscale.com/ipn/ipnstate"
2019
"tailscale.com/net/speedtest"
21-
"tailscale.com/tailcfg"
2220

2321
"github.com/coder/coder/coderd/tracing"
2422
"github.com/coder/coder/tailnet"
@@ -133,27 +131,18 @@ type AgentConn struct {
133131
CloseFunc func()
134132
}
135133

134+
func (c *AgentConn) AwaitReachable(ctx context.Context) bool {
135+
ctx, span := tracing.StartSpan(ctx)
136+
defer span.End()
137+
138+
return c.Conn.AwaitReachable(ctx, TailnetIP)
139+
}
140+
136141
func (c *AgentConn) Ping(ctx context.Context) (time.Duration, error) {
137142
ctx, span := tracing.StartSpan(ctx)
138143
defer span.End()
139144

140-
errCh := make(chan error, 1)
141-
durCh := make(chan time.Duration, 1)
142-
go c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) {
143-
if pr.Err != "" {
144-
errCh <- xerrors.New(pr.Err)
145-
return
146-
}
147-
durCh <- time.Duration(pr.LatencySeconds * float64(time.Second))
148-
})
149-
select {
150-
case err := <-errCh:
151-
return 0, err
152-
case <-ctx.Done():
153-
return 0, ctx.Err()
154-
case dur := <-durCh:
155-
return dur, nil
156-
}
145+
return c.Conn.Ping(ctx, TailnetIP)
157146
}
158147

159148
func (c *AgentConn) CloseWithError(_ error) error {

codersdk/workspaceagents.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -447,13 +447,14 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti
447447
_ = conn.Close()
448448
return nil, err
449449
}
450+
450451
return &AgentConn{
451452
Conn: conn,
452453
CloseFunc: func() {
453454
cancelFunc()
454455
<-closed
455456
},
456-
}, err
457+
}, nil
457458
}
458459

459460
// WorkspaceAgent returns an agent by ID.

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ replace github.com/tcnksm/go-httpstat => github.com/kylecarbs/go-httpstat v0.0.0
4040

4141
// There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here:
4242
// https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main
43-
replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20221104170440-ef53dca69a41
43+
replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20221113171243-7d90f070c5dc
4444

4545
// Switch to our fork that imports fixes from http://github.com/tailscale/ssh.
4646
// See: https://github.com/coder/coder/issues/3371

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,8 +355,8 @@ github.com/coder/retry v1.3.0 h1:5lAAwt/2Cm6lVmnfBY7sOMXcBOwcwJhmV5QGSELIVWY=
355355
github.com/coder/retry v1.3.0/go.mod h1:tXuRgZgWjUnU5LZPT4lJh4ew2elUhexhlnXzrJWdyFY=
356356
github.com/coder/ssh v0.0.0-20220811105153-fcea99919338 h1:tN5GKFT68YLVzJoA8AHuiMNJ0qlhoD3pGN3JY9gxSko=
357357
github.com/coder/ssh v0.0.0-20220811105153-fcea99919338/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914=
358-
github.com/coder/tailscale v1.1.1-0.20221104170440-ef53dca69a41 h1:/mjNjfUarvH8BdmvDVLvtIIENoe3PevqCyZQmAlILuw=
359-
github.com/coder/tailscale v1.1.1-0.20221104170440-ef53dca69a41/go.mod h1:lkCb74eSJwxeNq8YwyILoHD5vtHktiZnTOxBxo3tbNc=
358+
github.com/coder/tailscale v1.1.1-0.20221113171243-7d90f070c5dc h1:qozpteSLz0ifMasetJ+/Qac5Ud/NRNIlgTubGf6TAaQ=
359+
github.com/coder/tailscale v1.1.1-0.20221113171243-7d90f070c5dc/go.mod h1:lkCb74eSJwxeNq8YwyILoHD5vtHktiZnTOxBxo3tbNc=
360360
github.com/containerd/aufs v0.0.0-20200908144142-dab0cbea06f4/go.mod h1:nukgQABAEopAHvB6j7cnP5zJ+/3aVcE7hCYqvIwAHyE=
361361
github.com/containerd/aufs v0.0.0-20201003224125-76a6863f2989/go.mod h1:AkGGQs9NM2vtYHaUen+NljV0/baGCAPELGm2q9ZXpWU=
362362
github.com/containerd/aufs v0.0.0-20210316121734-20793ff83c97/go.mod h1:kL5kd6KM5TzQjR79jljyi4olc1Vrx6XBlcyj3gNv2PU=

tailnet/conn.go

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package tailnet
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"io"
78
"net"
@@ -198,7 +199,7 @@ func NewConn(options *Options) (*Conn, error) {
198199
wireguardEngine: wireguardEngine,
199200
}
200201
wireguardEngine.SetStatusCallback(func(s *wgengine.Status, err error) {
201-
server.logger.Info(context.Background(), "wireguard status", slog.F("status", s), slog.F("err", err))
202+
server.logger.Debug(context.Background(), "wireguard status", slog.F("status", s), slog.F("err", err))
202203
if err != nil {
203204
return
204205
}
@@ -217,7 +218,7 @@ func NewConn(options *Options) (*Conn, error) {
217218
server.sendNode()
218219
})
219220
wireguardEngine.SetNetInfoCallback(func(ni *tailcfg.NetInfo) {
220-
server.logger.Info(context.Background(), "netinfo callback", slog.F("netinfo", ni))
221+
server.logger.Debug(context.Background(), "netinfo callback", slog.F("netinfo", ni))
221222
// If the lastMutex is blocked, it's possible that
222223
// multiple NetInfo callbacks occur at the same time.
223224
//
@@ -383,6 +384,9 @@ func (c *Conn) UpdateNodes(nodes []*Node) error {
383384
if c.isClosed() {
384385
return nil
385386
}
387+
if errors.Is(err, wgengine.ErrNoChanges) {
388+
return nil
389+
}
386390
return xerrors.Errorf("reconfig: %w", err)
387391
}
388392
return nil
@@ -395,9 +399,56 @@ func (c *Conn) Status() *ipnstate.Status {
395399
return sb.Status()
396400
}
397401

398-
// Ping sends a ping to the Wireguard engine.
399-
func (c *Conn) Ping(ip netip.Addr, pingType tailcfg.PingType, cb func(*ipnstate.PingResult)) {
400-
c.wireguardEngine.Ping(ip, pingType, cb)
402+
// Ping sends a Disco ping to the Wireguard engine.
403+
func (c *Conn) Ping(ctx context.Context, ip netip.Addr) (time.Duration, error) {
404+
errCh := make(chan error, 1)
405+
durCh := make(chan time.Duration, 1)
406+
go c.wireguardEngine.Ping(ip, tailcfg.PingDisco, func(pr *ipnstate.PingResult) {
407+
if pr.Err != "" {
408+
errCh <- xerrors.New(pr.Err)
409+
return
410+
}
411+
durCh <- time.Duration(pr.LatencySeconds * float64(time.Second))
412+
})
413+
select {
414+
case err := <-errCh:
415+
return 0, err
416+
case <-ctx.Done():
417+
return 0, ctx.Err()
418+
case dur := <-durCh:
419+
return dur, nil
420+
}
421+
}
422+
423+
// AwaitReachable pings the provided IP continually until the
424+
// address is reachable. It's the callers responsibility to provide
425+
// a timeout, otherwise this function will block forever.
426+
func (c *Conn) AwaitReachable(ctx context.Context, ip netip.Addr) bool {
427+
ticker := time.NewTicker(time.Millisecond * 100)
428+
defer ticker.Stop()
429+
completedCtx, completed := context.WithCancel(ctx)
430+
run := func() {
431+
ctx, cancelFunc := context.WithTimeout(completedCtx, time.Second)
432+
defer cancelFunc()
433+
_, err := c.Ping(ctx, ip)
434+
if err == nil {
435+
completed()
436+
}
437+
}
438+
go run()
439+
defer completed()
440+
for {
441+
select {
442+
case <-completedCtx.Done():
443+
return true
444+
case <-ticker.C:
445+
// Pings can take a while, so we can run multiple
446+
// in parallel to return ASAP.
447+
go run()
448+
case <-ctx.Done():
449+
return false
450+
}
451+
}
401452
}
402453

403454
// Closed is a channel that ends when the connection has
@@ -466,7 +517,7 @@ func (c *Conn) sendNode() {
466517
}
467518
c.nodeSending = true
468519
go func() {
469-
c.logger.Info(context.Background(), "sending node", slog.F("node", node))
520+
c.logger.Debug(context.Background(), "sending node", slog.F("node", node))
470521
nodeCallback(node)
471522
c.lastMutex.Lock()
472523
c.nodeSending = false

tailnet/conn_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,16 @@ func TestTailnet(t *testing.T) {
6262
err := w1.UpdateNodes([]*tailnet.Node{node})
6363
require.NoError(t, err)
6464
})
65-
65+
require.True(t, w2.AwaitReachable(context.Background(), w1IP))
6666
conn := make(chan struct{})
6767
go func() {
6868
listener, err := w1.Listen("tcp", ":35565")
6969
assert.NoError(t, err)
7070
defer listener.Close()
7171
nc, err := listener.Accept()
72-
assert.NoError(t, err)
72+
if !assert.NoError(t, err) {
73+
return
74+
}
7375
_ = nc.Close()
7476
conn <- struct{}{}
7577
}()

0 commit comments

Comments
 (0)