[ARVADOS] updated: 1.1.0-192-gce911bd

Git user git at public.curoverse.com
Tue Nov 28 16:40:05 EST 2017


Summary of changes:
 services/crunch-run/crunchrun.go      | 35 +++++++++++---------
 services/crunch-run/crunchrun_test.go | 61 +++++++++++++++++++++++++++++++++--
 2 files changed, 78 insertions(+), 18 deletions(-)

       via  ce911bd75fb998c2ac1d88a67db55f7f0e362752 (commit)
      from  84bc109580e503b4b8bbb5ddcd5f1d909745141f (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit ce911bd75fb998c2ac1d88a67db55f7f0e362752
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date:   Tue Nov 28 16:35:15 2017 -0500

    12614: Add tests.
    
    Also reorder run sequence slightly so that failure to start container due to
    broken node is retryable.
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>

diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go
index 72a2f1a..3c6d7d2 100644
--- a/services/crunch-run/crunchrun.go
+++ b/services/crunch-run/crunchrun.go
@@ -228,15 +228,15 @@ func (runner *ContainerRunner) stopSignals() {
 	}
 }
 
-var dockerErrorBlacklist = []string{"Cannot connect to the Docker daemon"}
+var errorBlacklist = []string{"Cannot connect to the Docker daemon"}
 var brokenNodeHook *string
 
 func (runner *ContainerRunner) checkBrokenNode(goterr error) bool {
-	for _, d := range dockerErrorBlacklist {
+	for _, d := range errorBlacklist {
 		if strings.Index(goterr.Error(), d) != -1 {
-			runner.CrunchLog.Printf("Error suggests node is unable to run containers.")
-			if *brokenNodeHook == "" {
-				runner.CrunchLog.Printf("No broken node hook provided")
+			runner.CrunchLog.Printf("Error suggests node is unable to run containers: %v", goterr)
+			if brokenNodeHook == nil || *brokenNodeHook == "" {
+				runner.CrunchLog.Printf("No broken node hook provided, cannot mark node as broken.")
 			} else {
 				runner.CrunchLog.Printf("Running broken node hook %q", *brokenNodeHook)
 				// run killme script
@@ -1474,11 +1474,6 @@ func (runner *ContainerRunner) Run() (err error) {
 				return
 			}
 			runner.CrunchLog.Print(e)
-			if runner.checkBrokenNode(e) {
-				// A container failing due to "broken node"
-				// error should back into queue to run again.
-				runner.finalState = "Queued"
-			}
 			if err == nil {
 				err = e
 			}
@@ -1518,7 +1513,11 @@ func (runner *ContainerRunner) Run() (err error) {
 	// check for and/or load image
 	err = runner.LoadImage()
 	if err != nil {
-		runner.finalState = "Cancelled"
+		if !runner.checkBrokenNode(err) {
+			// Failed to load image but not due to a "broken node"
+			// condition, probably user error.
+			runner.finalState = "Cancelled"
+		}
 		err = fmt.Errorf("While loading container image: %v", err)
 		return
 	}
@@ -1547,19 +1546,25 @@ func (runner *ContainerRunner) Run() (err error) {
 		return
 	}
 
-	runner.StartCrunchstat()
-
 	if runner.IsCancelled() {
 		return
 	}
 
-	err = runner.UpdateContainerRunning()
+	runner.StartCrunchstat()
+
+	err = runner.StartContainer()
 	if err != nil {
+		if !runner.checkBrokenNode(err) {
+			// Failed to start container but not a "broken node"
+			// condition, assume user error.
+			runner.finalState = "Cancelled"
+		}
 		return
 	}
+
 	runner.finalState = "Cancelled"
 
-	err = runner.StartContainer()
+	err = runner.UpdateContainerRunning()
 	if err != nil {
 		return
 	}
diff --git a/services/crunch-run/crunchrun_test.go b/services/crunch-run/crunchrun_test.go
index bc0b312..6581c6b 100644
--- a/services/crunch-run/crunchrun_test.go
+++ b/services/crunch-run/crunchrun_test.go
@@ -156,6 +156,10 @@ func (t *TestDockerClient) ContainerWait(ctx context.Context, container string,
 }
 
 func (t *TestDockerClient) ImageInspectWithRaw(ctx context.Context, image string) (dockertypes.ImageInspect, []byte, error) {
+	if t.finish == 2 {
+		return dockertypes.ImageInspect{}, nil, fmt.Errorf("Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?")
+	}
+
 	if t.imageLoaded == image {
 		return dockertypes.ImageInspect{}, nil, nil
 	} else {
@@ -164,6 +168,9 @@ func (t *TestDockerClient) ImageInspectWithRaw(ctx context.Context, image string
 }
 
 func (t *TestDockerClient) ImageLoad(ctx context.Context, input io.Reader, quiet bool) (dockertypes.ImageLoadResponse, error) {
+	if t.finish == 2 {
+		return dockertypes.ImageLoadResponse{}, fmt.Errorf("Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?")
+	}
 	_, err := io.Copy(ioutil.Discard, input)
 	if err != nil {
 		return dockertypes.ImageLoadResponse{}, err
@@ -668,9 +675,10 @@ func FullRunHelper(c *C, record string, extraMounts []string, exitCode int, fn f
 	if api.CalledWith("container.state", "Complete") != nil {
 		c.Check(err, IsNil)
 	}
-	c.Check(api.WasSetRunning, Equals, true)
-
-	c.Check(api.Content[api.Calls-2]["container"].(arvadosclient.Dict)["log"], NotNil)
+	if exitCode != 2 {
+		c.Check(api.WasSetRunning, Equals, true)
+		c.Check(api.Content[api.Calls-2]["container"].(arvadosclient.Dict)["log"], NotNil)
+	}
 
 	if err != nil {
 		for k, v := range api.Logs {
@@ -1759,3 +1767,50 @@ func (s *TestSuite) TestEvalSymlinkDir(c *C) {
 	_, err = cr.UploadOutputFile(realTemp+"/"+v, info, err, []string{}, nil, "", "", 0)
 	c.Assert(err, NotNil)
 }
+
+func (s *TestSuite) TestFullBrokenDocker1(c *C) {
+	api, _, _ := FullRunHelper(c, `{
+    "command": ["echo", "hello world"],
+    "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+    "cwd": ".",
+    "environment": {},
+    "mounts": {"/tmp": {"kind": "tmp"} },
+    "output_path": "/tmp",
+    "priority": 1,
+    "runtime_constraints": {}
+}`, nil, 2, func(t *TestDockerClient) {
+		t.logWriter.Write(dockerLog(1, "hello world\n"))
+		t.logWriter.Close()
+	})
+
+	ech := "/bin/echo"
+	brokenNodeHook = &ech
+
+	c.Check(api.CalledWith("container.state", "Queued"), NotNil)
+	c.Check(strings.Index(api.Logs["crunch-run"].String(), "unable to run containers"), Not(Equals), -1)
+	c.Check(strings.Index(api.Logs["crunch-run"].String(), "Running broken node hook '/bin/echo'"), Not(Equals), -1)
+
+}
+
+func (s *TestSuite) TestFullBrokenDocker2(c *C) {
+	api, _, _ := FullRunHelper(c, `{
+    "command": ["echo", "hello world"],
+    "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+    "cwd": ".",
+    "environment": {},
+    "mounts": {"/tmp": {"kind": "tmp"} },
+    "output_path": "/tmp",
+    "priority": 1,
+    "runtime_constraints": {}
+}`, nil, 2, func(t *TestDockerClient) {
+		t.logWriter.Write(dockerLog(1, "hello world\n"))
+		t.logWriter.Close()
+	})
+
+	ech := ""
+	brokenNodeHook = &ech
+
+	c.Check(api.CalledWith("container.state", "Queued"), NotNil)
+	c.Check(strings.Index(api.Logs["crunch-run"].String(), "unable to run containers"), Not(Equals), -1)
+	c.Check(strings.Index(api.Logs["crunch-run"].String(), "No broken node hook"), Not(Equals), -1)
+}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list