Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 3292f1f

Browse files
committed
fix: never use STUN latency for DERP region netcheck (#29)
The STUN probe still runs as usual, but its latency is no longer stored in the region report. Storing it caused Google STUN to make the default region appear to have an artificially low latency. Also fixes the HTTPS latency check so that it works on ForceHTTP nodes.
1 parent 7471222 commit 3292f1f

File tree

2 files changed

+75
-21
lines changed

2 files changed

+75
-21
lines changed

derp/derphttp/derphttp_client.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien
399399
tcpConn, err = c.dialURL(ctx)
400400
default:
401401
c.logf("%s: connecting to derp-%d (%v)", caller, reg.RegionID, reg.RegionCode)
402-
tcpConn, node, err = c.dialRegion(ctx, reg)
402+
tcpConn, node, err = c.DialRegion(ctx, reg)
403403
}
404404
if err != nil {
405405
return nil, 0, err
@@ -612,10 +612,10 @@ func (c *Client) dialURL(ctx context.Context) (net.Conn, error) {
612612
return tcpConn, nil
613613
}
614614

615-
// dialRegion returns a TCP connection to the provided region, trying
615+
// DialRegion returns a TCP connection to the provided region, trying
616616
// each node in order (with dialNode) until one connects or ctx is
617617
// done.
618-
func (c *Client) dialRegion(ctx context.Context, reg *tailcfg.DERPRegion) (net.Conn, *tailcfg.DERPNode, error) {
618+
func (c *Client) DialRegion(ctx context.Context, reg *tailcfg.DERPRegion) (net.Conn, *tailcfg.DERPNode, error) {
619619
if len(reg.Nodes) == 0 {
620620
return nil, nil, fmt.Errorf("no nodes for %s", c.targetString(reg))
621621
}
@@ -663,7 +663,7 @@ func (c *Client) tlsClient(nc net.Conn, node *tailcfg.DERPNode) *tls.Conn {
663663
// in the DERP map. TLS is initiated on the first node where a socket is
664664
// established.
665665
func (c *Client) DialRegionTLS(ctx context.Context, reg *tailcfg.DERPRegion) (tlsConn *tls.Conn, connClose io.Closer, node *tailcfg.DERPNode, err error) {
666-
tcpConn, node, err := c.dialRegion(ctx, reg)
666+
tcpConn, node, err := c.DialRegion(ctx, reg)
667667
if err != nil {
668668
return nil, nil, nil, err
669669
}

net/netcheck/netcheck.go

Lines changed: 71 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ package netcheck
77
import (
88
"bufio"
99
"context"
10-
"crypto/tls"
10+
"encoding/hex"
1111
"errors"
1212
"fmt"
1313
"io"
@@ -678,7 +678,9 @@ func (rs *reportState) addNodeLatency(node *tailcfg.DERPNode, ipp netip.AddrPort
678678
ret := rs.report
679679

680680
ret.UDP = true
681-
updateLatency(ret.RegionLatency, node.RegionID, d)
681+
682+
// Coder: don't actually store the latency.
683+
//updateLatency(ret.RegionLatency, node.RegionID, d)
682684

683685
// Once we've heard from enough regions (3), start a timer to
684686
// give up on the other ones. The timer's duration is a
@@ -696,13 +698,13 @@ func (rs *reportState) addNodeLatency(node *tailcfg.DERPNode, ipp netip.AddrPort
696698

697699
switch {
698700
case ipp.Addr().Is6():
699-
updateLatency(ret.RegionV6Latency, node.RegionID, d)
701+
//updateLatency(ret.RegionV6Latency, node.RegionID, d)
700702
ret.IPv6 = true
701703
ret.GlobalV6 = ipPortStr
702704
// TODO: track MappingVariesByDestIP for IPv6
703705
// too? Would be sad if so, but who knows.
704706
case ipp.Addr().Is4():
705-
updateLatency(ret.RegionV4Latency, node.RegionID, d)
707+
//updateLatency(ret.RegionV4Latency, node.RegionID, d)
706708
ret.IPv4 = true
707709
if rs.gotEP4 == "" {
708710
rs.gotEP4 = ipPortStr
@@ -1016,7 +1018,9 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (_ *Report,
10161018
// Try HTTPS and ICMP latency check if all STUN probes failed due to
10171019
// UDP presumably being blocked.
10181020
// TODO: this should be moved into the probePlan, using probeProto probeHTTPS.
1019-
if !rs.anyUDP() && ctx.Err() == nil {
1021+
// Coder: always run this because we don't store STUN latency in the report.
1022+
//if !rs.anyUDP() && ctx.Err() == nil {
1023+
if ctx.Err() == nil {
10201024
var wg sync.WaitGroup
10211025
var need []*tailcfg.DERPRegion
10221026
for rid, reg := range dm.Regions {
@@ -1039,13 +1043,14 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (_ *Report,
10391043
}()
10401044

10411045
wg.Add(len(need))
1042-
c.logf("netcheck: UDP is blocked, trying HTTPS")
1046+
// Coder: this is misleading because we always log it.
1047+
//c.logf("netcheck: UDP is blocked, trying HTTPS")
10431048
}
10441049
for _, reg := range need {
10451050
go func(reg *tailcfg.DERPRegion) {
10461051
defer wg.Done()
1047-
if d, ip, err := c.measureHTTPSLatency(ctx, reg); err != nil {
1048-
c.logf("[v1] netcheck: measuring HTTPS latency of %v (%d): %v", reg.RegionCode, reg.RegionID, err)
1052+
if d, ip, err := c.measureHTTPLatency(ctx, reg); err != nil {
1053+
c.logf("[v1] netcheck: measuring HTTP(S) latency of %v (%d): %v", reg.RegionCode, reg.RegionID, err)
10491054
} else {
10501055
rs.mu.Lock()
10511056
if l, ok := rs.report.RegionLatency[reg.RegionID]; !ok {
@@ -1216,7 +1221,9 @@ func (c *Client) runHTTPOnlyChecks(ctx context.Context, last *Report, rs *report
12161221
return nil
12171222
}
12181223

1219-
func (c *Client) measureHTTPSLatency(ctx context.Context, reg *tailcfg.DERPRegion) (time.Duration, netip.Addr, error) {
1224+
// measureHTTPLatency measures the latency to the given DERP region over HTTP
1225+
// or HTTPS.
1226+
func (c *Client) measureHTTPLatency(ctx context.Context, reg *tailcfg.DERPRegion) (time.Duration, netip.Addr, error) {
12201227
metricHTTPSend.Add(1)
12211228
var result httpstat.Result
12221229
ctx, cancel := context.WithTimeout(httpstat.WithHTTPStat(ctx, &result), overallProbeTimeout)
@@ -1227,28 +1234,60 @@ func (c *Client) measureHTTPSLatency(ctx context.Context, reg *tailcfg.DERPRegio
12271234
dc := derphttp.NewNetcheckClient(c.logf)
12281235
defer dc.Close()
12291236

1230-
tlsConn, tcpConn, node, err := dc.DialRegionTLS(ctx, reg)
1237+
var hasForceHTTPNode = false
1238+
for _, node := range reg.Nodes {
1239+
if node.STUNOnly {
1240+
continue
1241+
}
1242+
if node.ForceHTTP {
1243+
hasForceHTTPNode = true
1244+
break
1245+
}
1246+
}
1247+
1248+
var (
1249+
conn net.Conn
1250+
closer io.Closer
1251+
node *tailcfg.DERPNode
1252+
err error
1253+
)
1254+
if hasForceHTTPNode {
1255+
conn, node, err = dc.DialRegion(ctx, reg)
1256+
closer = conn
1257+
} else {
1258+
conn, closer, node, err = dc.DialRegionTLS(ctx, reg)
1259+
}
12311260
if err != nil {
12321261
return 0, ip, err
12331262
}
1234-
defer tcpConn.Close()
1235-
1236-
if ta, ok := tlsConn.RemoteAddr().(*net.TCPAddr); ok {
1263+
defer closer.Close()
1264+
if ta, ok := conn.RemoteAddr().(*net.TCPAddr); ok {
12371265
ip, _ = netip.AddrFromSlice(ta.IP)
12381266
ip = ip.Unmap()
12391267
}
12401268
if ip == (netip.Addr{}) {
1241-
return 0, ip, fmt.Errorf("no unexpected RemoteAddr %#v", tlsConn.RemoteAddr())
1269+
return 0, ip, fmt.Errorf("no unexpected RemoteAddr %#v", conn.RemoteAddr())
12421270
}
12431271

1244-
connc := make(chan *tls.Conn, 1)
1245-
connc <- tlsConn
1272+
connc := make(chan net.Conn, 1)
1273+
connc <- conn
12461274

12471275
tr := &http.Transport{
12481276
DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
1249-
return nil, errors.New("unexpected DialContext dial")
1277+
if !hasForceHTTPNode {
1278+
return nil, errors.New("unexpected DialContext dial")
1279+
}
1280+
select {
1281+
case nc := <-connc:
1282+
return nc, nil
1283+
default:
1284+
return nil, errors.New("only one conn expected")
1285+
}
12501286
},
12511287
DialTLSContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
1288+
if hasForceHTTPNode {
1289+
return nil, errors.New("unexpected DialTLSContext dial")
1290+
}
12521291
select {
12531292
case nc := <-connc:
12541293
return nc, nil
@@ -1530,6 +1569,12 @@ func (rs *reportState) runProbe(ctx context.Context, dm *tailcfg.DERPMap, probe
15301569
return
15311570
}
15321571

1572+
// Coder: The address below won't be valid if the node doesn't have a
1573+
// STUNPort.
1574+
if node.STUNPort < 1 {
1575+
return
1576+
}
1577+
15331578
addr := c.nodeAddr(ctx, node, probe.proto)
15341579
if !addr.IsValid() {
15351580
return
@@ -1542,7 +1587,16 @@ func (rs *reportState) runProbe(ctx context.Context, dm *tailcfg.DERPMap, probe
15421587

15431588
rs.mu.Lock()
15441589
rs.inFlight[txID] = func(ipp netip.AddrPort) {
1590+
// Coder: we don't want to store the latency of STUN netchecks because
1591+
// Coder doesn't contain a built-in STUN server and customers often use
1592+
// Google STUN which has extremely low latency everywhere on the planet.
1593+
// This means that latency checks to any regions containing the Google
1594+
// STUN server will always muddy that region's latency results.
1595+
//
1596+
// rs.addNodeLatency has been updated to not store latency but will
1597+
// still set the appropriate values in the report.
15451598
rs.addNodeLatency(node, ipp, time.Since(sent))
1599+
c.logf("netcheck.runProbe: got STUN response for %s from %s (%s) in %s", node.Name, ipp.String(), hex.EncodeToString(txID[:]), time.Since(sent).String())
15461600
cancelSet() // abort other nodes in this set
15471601
}
15481602
rs.mu.Unlock()

0 commit comments

Comments
 (0)