[ARVADOS] updated: 1.2.0-219-ga7c56b505

Git user git at public.curoverse.com
Thu Oct 18 17:19:27 EDT 2018


Summary of changes:
 services/crunch-run/cmd_watchdog.go      |  76 --------------------
 services/crunch-run/cmd_watchdog_test.go | 117 -------------------------------
 services/crunch-run/crunchrun.go         |  32 ++++++++-
 services/crunch-run/crunchrun_test.go    |  31 ++++++++
 4 files changed, 61 insertions(+), 195 deletions(-)
 delete mode 100644 services/crunch-run/cmd_watchdog.go
 delete mode 100644 services/crunch-run/cmd_watchdog_test.go

  discards  09a692078f06f2cded026f0a5b95f71ea706f41f (commit)
       via  a7c56b505a1f6907eaa5b2978fdeb1e55b659441 (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (09a692078f06f2cded026f0a5b95f71ea706f41f)
            \
             N -- N -- N (a7c56b505a1f6907eaa5b2978fdeb1e55b659441)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit a7c56b505a1f6907eaa5b2978fdeb1e55b659441
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Thu Oct 18 17:18:57 2018 -0400

    14328: Cancel if container ends but ContainerWait does not return.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go
index be98a3ee1..367169e86 100644
--- a/services/crunch-run/crunchrun.go
+++ b/services/crunch-run/crunchrun.go
@@ -79,6 +79,7 @@ type ThinDockerClient interface {
 	ContainerStart(ctx context.Context, container string, options dockertypes.ContainerStartOptions) error
 	ContainerRemove(ctx context.Context, container string, options dockertypes.ContainerRemoveOptions) error
 	ContainerWait(ctx context.Context, container string, condition dockercontainer.WaitCondition) (<-chan dockercontainer.ContainerWaitOKBody, <-chan error)
+	ContainerList(ctx context.Context, opts dockertypes.ContainerListOptions) ([]dockertypes.Container, error)
 	ImageInspectWithRaw(ctx context.Context, image string) (dockertypes.ImageInspect, []byte, error)
 	ImageLoad(ctx context.Context, input io.Reader, quiet bool) (dockertypes.ImageLoadResponse, error)
 	ImageRemove(ctx context.Context, image string, options dockertypes.ImageRemoveOptions) ([]dockertypes.ImageDeleteResponseItem, error)
@@ -154,6 +155,8 @@ type ContainerRunner struct {
 	networkMode     string // passed through to HostConfig.NetworkMode
 	arvMountLog     *ThrottledLogger
 	checkContainerd time.Duration
+
+	containerWaitGracePeriod time.Duration
 }
 
 // setupSignals sets up signal handling to gracefully terminate the underlying
@@ -1124,6 +1127,33 @@ func (runner *ContainerRunner) WaitFinish() error {
 		runTimeExceeded = time.After(time.Duration(timeout) * time.Second)
 	}
 
+	containerGone := make(chan struct{})
+	go func() {
+		defer close(containerGone)
+		if runner.containerWaitGracePeriod < 1 {
+			runner.containerWaitGracePeriod = 30 * time.Second
+		}
+		found := time.Now()
+	polling:
+		for range time.NewTicker(runner.containerWaitGracePeriod / 30).C {
+			ctrs, err := runner.Docker.ContainerList(context.Background(), dockertypes.ContainerListOptions{})
+			if err != nil {
+				runner.CrunchLog.Printf("error checking container list: %s", err)
+				continue polling
+			}
+			for _, ctr := range ctrs {
+				if ctr.ID == runner.ContainerID {
+					found = time.Now()
+					continue polling
+				}
+			}
+			if time.Since(found) > runner.containerWaitGracePeriod {
+				runner.CrunchLog.Printf("container %s no longer exists", runner.ContainerID)
+				return
+			}
+		}
+	}()
+
 	containerdGone := make(chan error)
 	defer close(containerdGone)
 	if runner.checkContainerd > 0 {
@@ -1171,6 +1201,9 @@ func (runner *ContainerRunner) WaitFinish() error {
 			runner.stop(nil)
 			runTimeExceeded = nil
 
+		case <-containerGone:
+			return errors.New("docker client never returned status")
+
 		case err := <-containerdGone:
 			return err
 		}
diff --git a/services/crunch-run/crunchrun_test.go b/services/crunch-run/crunchrun_test.go
index 217d4236b..0e48a50aa 100644
--- a/services/crunch-run/crunchrun_test.go
+++ b/services/crunch-run/crunchrun_test.go
@@ -47,6 +47,7 @@ var _ = Suite(&TestSuite{})
 type TestSuite struct {
 	client *arvados.Client
 	docker *TestDockerClient
+	runner *ContainerRunner
 }
 
 func (s *TestSuite) SetUpTest(c *C) {
@@ -103,6 +104,7 @@ type TestDockerClient struct {
 	api         *ArvTestClient
 	realTemp    string
 	calledWait  bool
+	ctrExited   bool
 }
 
 func NewTestDockerClient() *TestDockerClient {
@@ -176,6 +178,14 @@ func (t *TestDockerClient) ContainerWait(ctx context.Context, container string,
 	return body, err
 }
 
+func (t *TestDockerClient) ContainerList(ctx context.Context, options dockertypes.ContainerListOptions) ([]dockertypes.Container, error) {
+	if t.ctrExited {
+		return nil, nil
+	} else {
+		return []dockertypes.Container{{ID: "abcde"}}, nil
+	}
+}
+
 func (t *TestDockerClient) ImageInspectWithRaw(ctx context.Context, image string) (dockertypes.ImageInspect, []byte, error) {
 	if t.exitCode == 2 {
 		return dockertypes.ImageInspect{}, nil, fmt.Errorf("Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?")
@@ -736,6 +746,7 @@ func (s *TestSuite) fullRunHelper(c *C, record string, extraMounts []string, exi
 	defer kc.Close()
 	cr, err = NewContainerRunner(s.client, api, kc, s.docker, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
 	c.Assert(err, IsNil)
+	s.runner = cr
 	cr.statInterval = 100 * time.Millisecond
 	am := &ArvMountCmdLine{}
 	cr.RunArvMount = am.ArvMountTest
@@ -830,6 +841,26 @@ func (s *TestSuite) TestRunTimeExceeded(c *C) {
 	c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*maximum run time exceeded.*")
 }
 
+func (s *TestSuite) TestContainerWaitFails(c *C) {
+	api, _, _ := s.fullRunHelper(c, `{
+    "command": ["sleep", "3"],
+    "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+    "cwd": ".",
+    "mounts": {"/tmp": {"kind": "tmp"} },
+    "output_path": "/tmp",
+    "priority": 1
+}`, nil, 0, func(t *TestDockerClient) {
+		s.runner.containerWaitGracePeriod = time.Second
+		t.ctrExited = true
+		time.Sleep(10 * time.Second)
+		t.logWriter.Close()
+	})
+
+	c.Check(api.CalledWith("container.state", "Cancelled"), NotNil)
+	c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*container.*no longer exists.*")
+	c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*docker client never returned status.*")
+}
+
 func (s *TestSuite) TestCrunchstat(c *C) {
 	api, _, _ := s.fullRunHelper(c, `{
 		"command": ["sleep", "1"],

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list