[arvados] updated: 2.4.2-22-g8ad66154d

git repository hosting git at public.arvados.org
Fri Sep 16 15:12:31 UTC 2022


Summary of changes:
 lib/crunchrun/docker.go | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

       via  8ad66154df528ad2020e80bc255896537f1c712a (commit)
      from  6d04694d27d32591404310be790a212f9804142a (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 8ad66154df528ad2020e80bc255896537f1c712a
Author: Tom Clegg <tom at curii.com>
Date:   Fri Sep 2 10:06:03 2022 -0400

    19437: Don't cancel until 3 consecutive docker-inspect failures.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/crunchrun/docker.go b/lib/crunchrun/docker.go
index eee8f1d76..7d8f312a8 100644
--- a/lib/crunchrun/docker.go
+++ b/lib/crunchrun/docker.go
@@ -23,6 +23,11 @@ import (
 // Docker daemon won't let you set a limit less than ~10 MiB
 const minDockerRAM = int64(16 * 1024 * 1024)
 
+// Number of consecutive "inspect container" failures before
+// concluding Docker is unresponsive, giving up, and cancelling the
+// container.
+const dockerWatchdogThreshold = 3
+
 type dockerExecutor struct {
 	containerUUID    string
 	logf             func(string, ...interface{})
@@ -217,17 +222,17 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) {
 				// kill it.
 				return
 			} else if err != nil {
-				e.logf("Error inspecting container: %s", err)
-				watchdogErr <- err
-				return
+				watchdogErr <- fmt.Errorf("error inspecting container: %s", err)
 			} else if ctr.State == nil || !(ctr.State.Running || ctr.State.Status == "created") {
-				watchdogErr <- fmt.Errorf("Container is not running: State=%v", ctr.State)
-				return
+				watchdogErr <- fmt.Errorf("container is not running: State=%v", ctr.State)
+			} else {
+				watchdogErr <- nil
 			}
 		}
 	}()
 
 	waitOk, waitErr := e.dockerclient.ContainerWait(ctx, e.containerID, dockercontainer.WaitConditionNotRunning)
+	errors := 0
 	for {
 		select {
 		case waitBody := <-waitOk:
@@ -242,7 +247,16 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) {
 			return -1, ctx.Err()
 
 		case err := <-watchdogErr:
-			return -1, err
+			if err == nil {
+				errors = 0
+			} else {
+				e.logf("docker watchdog: %s", err)
+				errors++
+				if errors >= dockerWatchdogThreshold {
+					e.logf("docker watchdog: giving up")
+					return -1, err
+				}
+			}
 		}
 	}
 }

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list