@@ -40,26 +40,29 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error {
4040 continue
4141 }
4242
43- var toDelete , found bool
44- for _ , w := range workers {
45- if workerName , ok := labels [LABEL_WORKER_NAME ]; ok && workerName == w .Name {
46- found = true
43+ var toDelete bool
44+ for _ , container := range pod .Status .ContainerStatuses {
45+ terminated := (container .State .Terminated != nil && (container .State .Terminated .Reason == "Completed" || container .State .Terminated .Reason == "Error" ))
46+ errImagePull := (container .State .Waiting != nil && container .State .Waiting .Reason == "ErrImagePull" )
47+ if terminated || errImagePull {
48+ toDelete = true
49+ log .Debug (ctx , "pod %s/%s is terminated or in error" , pod .Namespace , pod .Name )
4750 break
4851 }
4952 }
50- if ! found {
51- toDelete = true
52- }
5353
5454 if ! toDelete {
55- for _ , container := range pod .Status .ContainerStatuses {
56- terminated := (container .State .Terminated != nil && (container .State .Terminated .Reason == "Completed" || container .State .Terminated .Reason == "Error" ))
57- errImagePull := (container .State .Waiting != nil && container .State .Waiting .Reason == "ErrImagePull" )
58- if terminated || errImagePull {
59- toDelete = true
55+ var found bool
56+ for _ , w := range workers {
57+ if workerName , ok := labels [LABEL_WORKER_NAME ]; ok && workerName == w .Name {
58+ found = true
6059 break
6160 }
6261 }
62+ if ! found && time .Since (pod .CreationTimestamp .Time ) > 3 * time .Minute {
63+ toDelete = true
64+ log .Debug (ctx , "pod %s/%s didn't match a registered worker and was started since %v" , pod .Namespace , pod .Name , pod .CreationTimestamp .Time )
65+ }
6366 }
6467
6568 if toDelete {
@@ -130,6 +133,7 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error {
130133 globalErr = err
131134 log .Error (ctx , "hatchery:kubernetes> killAwolWorkers> Cannot delete pod %s (%s)" , pod .Name , err )
132135 }
136+ log .Debug (ctx , "pod %s/%s killed" , pod .Namespace , pod .Name )
133137 }
134138 }
135139 return globalErr
0 commit comments