[arvados] created: 2.1.0-2865-g9c254acbd

git repository hosting git at public.arvados.org
Fri Sep 2 14:08:03 UTC 2022


        at  9c254acbd78ed50e1e9fec508fb9ec4164867dda (commit)


commit 9c254acbd78ed50e1e9fec508fb9ec4164867dda
Author: Tom Clegg <tom at curii.com>
Date:   Fri Sep 2 10:06:03 2022 -0400

    19437: Don't cancel until 3 consecutive docker-inspect failures.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/crunchrun/docker.go b/lib/crunchrun/docker.go
index 54d0e680f..bb6352658 100644
--- a/lib/crunchrun/docker.go
+++ b/lib/crunchrun/docker.go
@@ -31,6 +31,11 @@ const minDockerRAM = int64(16 * 1024 * 1024)
 // https://docs.docker.com/engine/api/.
 const DockerAPIVersion = "1.35"
 
+// Number of consecutive "inspect container" failures before
+// concluding Docker is unresponsive, giving up, and cancelling the
+// container.
+const dockerWatchdogThreshold = 3
+
 type dockerExecutor struct {
 	containerUUID    string
 	logf             func(string, ...interface{})
@@ -225,17 +230,17 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) {
 				// kill it.
 				return
 			} else if err != nil {
-				e.logf("Error inspecting container: %s", err)
-				watchdogErr <- err
-				return
+				watchdogErr <- fmt.Errorf("error inspecting container: %s", err)
 			} else if ctr.State == nil || !(ctr.State.Running || ctr.State.Status == "created") {
-				watchdogErr <- fmt.Errorf("Container is not running: State=%v", ctr.State)
-				return
+				watchdogErr <- fmt.Errorf("container is not running: State=%v", ctr.State)
+			} else {
+				watchdogErr <- nil
 			}
 		}
 	}()
 
 	waitOk, waitErr := e.dockerclient.ContainerWait(ctx, e.containerID, dockercontainer.WaitConditionNotRunning)
+	errors := 0
 	for {
 		select {
 		case waitBody := <-waitOk:
@@ -250,7 +255,16 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) {
 			return -1, ctx.Err()
 
 		case err := <-watchdogErr:
-			return -1, err
+			if err == nil {
+				errors = 0
+			} else {
+				e.logf("docker watchdog: %s", err)
+				errors++
+				if errors >= dockerWatchdogThreshold {
+					e.logf("docker watchdog: giving up")
+					return -1, err
+				}
+			}
 		}
 	}
 }

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list