Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit a684f00

Browse files
authored
[filebeat][streaming] - Improved websocket exponential backoff logic to produce a smoother backoff curve (#44069)
The previous waitTime calculation for the exponential backoff strategy produced an extremely sharp curve: depending on the values of waitMin and waitMax and the number of attempts, waitMax (the cap) was easily reached after the first couple of attempts, limiting the growth in wait time relative to the total number of attempts. Put simply, because the jitter was uncapped, the waitTime growth curve would hit the cap and flatten out after 1-2 retry attempts. This change makes the waitTime growth curve increase more smoothly with the number of attempts, providing a smoother backoff function.
1 parent 14ac057 commit a684f00

2 files changed

Lines changed: 34 additions & 11 deletions

File tree

CHANGELOG-developer.next.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ client will aggregate metrics by input on this registry. It'll collect
230230
- Add new API to libbeat/monitoring/inputmon. The API allows to register and
231231
unregister input metrics without relying on the global 'dataset' namespace.{pull}42618[42618] {issue}42761[42761]
232232
- Added exponential backoff retry logic for vSphere client logout operation in metricbeat. {issue}43449[43449]
233+
- Improved Filebeat Websocket input exponential backoff logic to produce a smoother backoff curve. {pull}44069[44069]
233234

234235
==== Deprecated
235236

x-pack/filebeat/input/streaming/websocket.go

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,13 @@ import (
3030
"github.com/elastic/elastic-agent-libs/transport/tlscommon"
3131
)
3232

33+
const (
34+
// spread is the induced jitter for the backoff logic. This is not user configurable
35+
// as it is not a common use case. The value of 1.0 is chosen as it produces a relatively
36+
// smooth backoff curve.
37+
spread = 1.0
38+
)
39+
3340
type websocketStream struct {
3441
processor
3542

@@ -422,7 +429,7 @@ func connectWebSocket(ctx context.Context, cfg config, url string, log *logp.Log
422429
} else {
423430
log.Errorf("attempt %d: webSocket connection failed with error %v and no response, retrying...\n", attempt, err)
424431
}
425-
waitTime := calculateWaitTime(retryConfig.WaitMin, retryConfig.WaitMax, attempt)
432+
waitTime := calculateWaitTime(retryConfig.WaitMin, retryConfig.WaitMax, attempt, retryConfig.MaxAttempts)
426433
time.Sleep(waitTime)
427434
}
428435
if response == nil {
@@ -446,7 +453,7 @@ func connectWebSocket(ctx context.Context, cfg config, url string, log *logp.Log
446453
} else {
447454
log.Errorf("attempt %d: webSocket connection failed with error %v and no response, retrying...\n", attempt, err)
448455
}
449-
waitTime := calculateWaitTime(retryConfig.WaitMin, retryConfig.WaitMax, attempt)
456+
waitTime := calculateWaitTime(retryConfig.WaitMin, retryConfig.WaitMax, attempt, retryConfig.MaxAttempts)
450457
time.Sleep(waitTime)
451458
}
452459
}
@@ -456,16 +463,9 @@ func connectWebSocket(ctx context.Context, cfg config, url string, log *logp.Log
456463
}
457464

458465
// calculateWaitTime calculates the wait time for the next attempt based on the exponential backoff algorithm.
459-
func calculateWaitTime(waitMin, waitMax time.Duration, attempt int) time.Duration {
466+
func calculateWaitTime(waitMin, waitMax time.Duration, attempt, maxAttempts int) time.Duration {
460467
// calculate exponential backoff
461-
base := float64(waitMin)
462-
backoff := base * math.Pow(2, float64(attempt-1))
463-
464-
// calculate jitter proportional to the backoff
465-
maxJitter := float64(waitMax-waitMin) * math.Pow(2, float64(attempt-1))
466-
jitter := rand.Float64() * maxJitter
467-
468-
waitTime := time.Duration(backoff + jitter)
468+
waitTime := wait(waitMin, waitMax, attempt, maxAttempts, spread)
469469
// caps the wait time to the maximum wait time
470470
if waitTime > waitMax {
471471
waitTime = waitMax
@@ -474,6 +474,28 @@ func calculateWaitTime(waitMin, waitMax time.Duration, attempt int) time.Duratio
474474
return waitTime
475475
}
476476

477+
// wait returns a logistic backoff duration with jitter. The duration increases
478+
// from min to max in n steps, with i indicating the step. Jitter is added around
479+
// the logistic based on the value of spread. Zero spread results in no jitter,
480+
// and unit spread is maximal. Spread values above one may result in durations
481+
// outside [min, max]. min must not be greater than max.
482+
func wait(min, max time.Duration, i, n int, spread float64) time.Duration {
483+
l := logistic(i, n-1) // n-1 because of fence posts.
484+
return min + time.Duration(float64(max-min)*(l+spread*jitter(l))) // l scales the [min, max] span; the jitter term shrinks as l nears 0 or 1.
485+
}
486+
487+
// logistic returns 1/(1+exp(n/2-i)): the logistic function at step i, shifted n/2
488+
// right so that it is 0.5 at i = n/2. The value is in (0, 1) for all sensible inputs.
489+
func logistic(i, n int) float64 {
490+
return 1 / (1 + math.Exp(float64(n)/2-float64(i)))
491+
}
492+
493+
// jitter returns a random offset in [-eps/2, eps/2), where eps is f(1-f). The
494+
// offset is meant to be added to f by the caller. f must be in [0, 1].
495+
func jitter(f float64) float64 {
496+
return (rand.Float64() - 0.5) * f * (1 - f)
497+
}
498+
477499
// now is time.Now with a modifiable time source.
478500
func (s *websocketStream) now() time.Time {
479501
if s.time == nil {

0 commit comments

Comments
 (0)