Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit f63bb92

Browse files
jsfakian authored and eriknordmark committed
Allow download retrying if the computed and configured checksums differ.
- We have identified that downloading images from datastores under poor networking conditions might fail silently due to the SDK libraries we use for Azure and AWS. This happens when we are installing new applications. - Another observation we made is that when we manually retry installing the applications, the installation eventually succeeds. - With this patch, we automatically retry the image download a fixed number of times (5 by default) if the computed and configured checksums differ. Signed-off-by: Ioannis Sfakianakis <[email protected]>
1 parent 0566cd7 commit f63bb92

12 files changed

Lines changed: 84 additions & 12 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@ tools/compare-sbom-sources/vendor
2727
tools/get-deps/get-deps
2828
tools/get-deps/vendor
2929
pkg/installer/target
30+
pkg/installer/vendor

docs/CONFIG-PROPERTIES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
| timer.port.testbetterinterval | timer in seconds | 600 | test a higher prio port config |
3030
| network.fallback.any.eth | "enabled" or "disabled" | disabled (enabled forcefully during onboarding if no network config) | if no connectivity try any Ethernet, WiFi, or LTE with DHCP client |
3131
| network.download.max.cost | 0-255 | 0 | [max port cost for download](DEVICE-CONNECTIVITY.md) to avoid e.g., LTE ports |
32+
| blob.download.max.retries | 1-10 | 5 | max download retries when image verification fails |
3233
| debug.enable.usb | boolean | false | allow USB e.g. keyboards on device |
3334
| debug.enable.vga | boolean | false | allow VGA console on device |
3435
| debug.enable.ssh | authorized ssh key | empty string(ssh disabled) | allow ssh to EVE |

pkg/pillar/cmd/downloader/downloader.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,9 +399,22 @@ func handleModify(ctx *downloaderContext, key string,
399399

400400
// If RefCount from zero to non-zero and status has error
401401
// or status is not downloaded then do install
402-
if config.RefCount != 0 && (status.HasError() || status.State != types.DOWNLOADED) {
402+
if config.RefCount != 0 && (status.HasError() || status.State != types.DOWNLOADED ||
403+
status.LastRetry != config.LastRetry) {
403404
log.Functionf("handleModify installing %s", config.Name)
405+
if status.LastRetry != config.LastRetry {
406+
log.Functionf("handleModify retry download %s", config.Name)
407+
status.CurrentSize = 0
408+
status.Size = 0
409+
status.Progress = 0
410+
status.State = types.DOWNLOADING
411+
status.LastRetry = config.LastRetry
412+
publishDownloaderStatus(ctx, status)
413+
}
404414
handleCreate(ctx, config, status, key, receiveChan)
415+
// Retrying the download due to image verification failure
416+
// This happens when: RefCount is non-zero, config.LastRetry differs from status.LastRetry,
417+
// and DownloaderStatus state is "downloaded"
405418
} else if status.RefCount != config.RefCount {
406419
log.Functionf("handleModify RefCount change %s from %d to %d",
407420
config.Name, status.RefCount, config.RefCount)

pkg/pillar/cmd/volumemgr/blob.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -484,12 +484,10 @@ func unpublishBlobStatus(ctx *volumemgrContext, blobs ...*types.BlobStatus) {
484484
// But the BlobStatus pointer might appear several times in
485485
// the list hence we better clear the Has*Ref
486486
if blob.HasDownloaderRef {
487-
MaybeRemoveDownloaderConfig(ctx, blob.Sha256)
488-
blob.HasDownloaderRef = false
487+
MaybeRemoveDownloaderConfig(ctx, blob)
489488
}
490489
if blob.HasVerifierRef {
491-
MaybeRemoveVerifyImageConfig(ctx, blob.Sha256)
492-
blob.HasVerifierRef = false
490+
MaybeRemoveVerifyImageConfig(ctx, blob)
493491
}
494492
//If blob is loaded, then remove it from CAS
495493
if blob.State == types.LOADED {

pkg/pillar/cmd/volumemgr/handledownloader.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"encoding/hex"
99
"path"
1010
"strconv"
11+
"time"
1112

1213
"github.com/lf-edge/eve/pkg/pillar/types"
1314
)
@@ -18,10 +19,12 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus)
1819
log.Functionf("AddOrRefcountDownloaderConfig for %s", blob.Sha256)
1920

2021
refCount := uint(1)
22+
lastRetry := time.Now()
2123
m := lookupDownloaderConfig(ctx, blob.Sha256)
2224
if m != nil {
2325
log.Functionf("downloader config exists for %s to refcount %d", blob.Sha256, m.RefCount)
2426
refCount = m.RefCount + 1
27+
lastRetry = m.LastRetry
2528
// We need to update datastore id before publishing the
2629
// datastore config because datastore id can be updated
2730
// in some cases. For example:
@@ -64,6 +67,7 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus)
6467
Size: size,
6568
Target: locFilename,
6669
RefCount: refCount,
70+
LastRetry: lastRetry,
6771
}
6872
log.Functionf("AddOrRefcountDownloaderConfig: DownloaderConfig: %+v", n)
6973
publishDownloaderConfig(ctx, &n)
@@ -83,7 +87,8 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus)
8387
// ignored silently.
8488
// > If DownloaderConfig's Refcount was incremented before #3, then expired notification from the
8589
// Downloader will be ignored silently.
86-
func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, imageSha string) {
90+
func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, blob *types.BlobStatus) {
91+
imageSha := blob.Sha256
8792
log.Functionf("MaybeRemoveDownloaderConfig(%s)", imageSha)
8893

8994
m := lookupDownloaderConfig(ctx, imageSha)
@@ -102,9 +107,30 @@ func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, imageSha string) {
102107
m.RefCount, imageSha)
103108

104109
publishDownloaderConfig(ctx, m)
110+
111+
// Remove downloader config reference from blob
112+
blob.HasDownloaderRef = false
105113
log.Functionf("MaybeRemoveDownloaderConfig done for %s", imageSha)
106114
}
107115

116+
func retryDownload(ctx *volumemgrContext, imageSha string) {
117+
m := lookupDownloaderConfig(ctx, imageSha)
118+
if m == nil {
119+
log.Functionf("retryDownload: config missing for %s",
120+
imageSha)
121+
return
122+
}
123+
if m.RefCount == 0 {
124+
log.Warnf("retryDownload: Attempting to retry when "+
125+
"RefCount is 0. Image Details - Name: %s, ImageSha: %s, ",
126+
m.Name, m.ImageSha256)
127+
}
128+
m.LastRetry = time.Now()
129+
130+
publishDownloaderConfig(ctx, m)
131+
log.Functionf("retryDownload done for %s", imageSha)
132+
}
133+
108134
func publishDownloaderConfig(ctx *volumemgrContext,
109135
config *types.DownloaderConfig) {
110136

pkg/pillar/cmd/volumemgr/handleverifier.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ func MaybeAddVerifyImageConfigBlob(ctx *volumemgrContext, blob types.BlobStatus)
8686
// However, MaybeAddVerifyImageConfig can be called to increment the refcount
8787
// since the handshake with the verifier will not conclude until the
8888
// VerifyImageConfig is unpublished
89-
func MaybeRemoveVerifyImageConfig(ctx *volumemgrContext, imageSha string) {
89+
func MaybeRemoveVerifyImageConfig(ctx *volumemgrContext, blob *types.BlobStatus) {
90+
imageSha := blob.Sha256
9091

9192
log.Functionf("MaybeRemoveVerifyImageConfig(%s)", imageSha)
9293

@@ -111,6 +112,9 @@ func MaybeRemoveVerifyImageConfig(ctx *volumemgrContext, imageSha string) {
111112
publishVerifyImageConfig(ctx, m)
112113
}
113114
log.Functionf("MaybeRemoveVerifyImageConfig done for %s", imageSha)
115+
116+
// Remove the has verifier reference from blob status
117+
blob.HasVerifierRef = false
114118
}
115119

116120
// deleteVerifyImageConfig checks the refcount and if it is zero it

pkg/pillar/cmd/volumemgr/updatestatus.go

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,20 @@ func doUpdateContentTree(ctx *volumemgrContext, status *types.ContentTreeStatus)
227227
blobErrorEntities = append(blobErrorEntities, &types.ErrorEntity{EntityID: blob.Sha256, EntityType: types.ErrorEntityContentBlob})
228228

229229
leftToProcess = true
230+
if blob.IsErrorSource(types.VerifyImageStatus{}) && blob.RetryCount < blobDownloadMaxRetries {
231+
232+
// Remove VerifyImage config and retry download
233+
MaybeRemoveVerifyImageConfig(ctx, blob)
234+
retryDownload(ctx, blobSha)
235+
236+
// Increment retry count
237+
blob.RetryCount++
238+
blob.State = types.DOWNLOADING
239+
blob.ClearErrorWithSource()
240+
241+
log.Errorf("EVE failed to verify Blob(%s), retrying %d/%d ...", blobSha, blob.RetryCount, blobDownloadMaxRetries)
242+
publishBlobStatus(ctx, blob)
243+
}
230244
}
231245
}
232246

@@ -412,15 +426,13 @@ func doUpdateContentTree(ctx *volumemgrContext, status *types.ContentTreeStatus)
412426
if blob.HasDownloaderRef {
413427
log.Functionf("doUpdateContentTree(%s): removing downloaderRef from Blob %s",
414428
status.Key(), blob.Sha256)
415-
MaybeRemoveDownloaderConfig(ctx, blob.Sha256)
416-
blob.HasDownloaderRef = false
429+
MaybeRemoveDownloaderConfig(ctx, blob)
417430
}
418431
if blob.HasVerifierRef {
419432
log.Functionf("doUpdateContentTree(%s): removing verifyRef from Blob %s",
420433
status.Key(), blob.Sha256)
421-
MaybeRemoveVerifyImageConfig(ctx, blob.Sha256)
434+
MaybeRemoveVerifyImageConfig(ctx, blob)
422435
// Set the path to "" as we delete the verifier path
423-
blob.HasVerifierRef = false
424436
blob.Path = ""
425437
}
426438
publishBlobStatus(ctx, blob)

pkg/pillar/cmd/volumemgr/volumemgr.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,10 @@ const (
4242
blankVolumeFormat = zconfig.Format_RAW // format of blank volume TODO: make configurable
4343
)
4444

45-
var volumeFormat = make(map[string]zconfig.Format)
45+
var (
46+
blobDownloadMaxRetries uint32 = 5 // Unless from GlobalConfig
47+
volumeFormat = make(map[string]zconfig.Format)
48+
)
4649

4750
type volumemgrContext struct {
4851
agentbase.AgentBase
@@ -746,6 +749,10 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string,
746749
gcp := agentlog.HandleGlobalConfig(log, ctx.subGlobalConfig, agentName,
747750
ctx.CLIParams().DebugOverride, logger)
748751
if gcp != nil {
752+
// Set max retries for blob download from global config
753+
if gcp.GlobalValueInt(types.BlobDownloadMaxRetries) != 0 {
754+
blobDownloadMaxRetries = gcp.GlobalValueInt(types.BlobDownloadMaxRetries)
755+
}
749756
maybeUpdateConfigItems(ctx, gcp)
750757
ctx.globalConfig = gcp
751758
ctx.GCInitialized = true

pkg/pillar/types/blob.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ type BlobStatus struct {
4747
Progress uint
4848
// ErrorAndTimeWithSource provide common error handling capabilities
4949
ErrorAndTimeWithSource
50+
RetryCount uint32
5051
}
5152

5253
const (

pkg/pillar/types/downloadertypes.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ type DownloaderConfig struct {
2323
Size uint64 // In bytes
2424
FinalObjDir string // final Object Store
2525
RefCount uint
26+
LastRetry time.Time
2627
}
2728

2829
func (config DownloaderConfig) Key() string {
@@ -119,6 +120,8 @@ type DownloaderStatus struct {
119120
RetryCount int
120121
// We save the original error when we do a retry
121122
OrigError string
123+
// Used only when image verification fails after the download
124+
LastRetry time.Time
122125
}
123126

124127
func (status DownloaderStatus) Key() string {

0 commit comments

Comments
 (0)