[ARVADOS] updated: 1.2.0-219-ga7c56b505
Git user
git at public.curoverse.com
Thu Oct 18 17:19:27 EDT 2018
Summary of changes:
services/crunch-run/cmd_watchdog.go | 76 --------------------
services/crunch-run/cmd_watchdog_test.go | 117 -------------------------------
services/crunch-run/crunchrun.go | 32 ++++++++-
services/crunch-run/crunchrun_test.go | 31 ++++++++
4 files changed, 61 insertions(+), 195 deletions(-)
delete mode 100644 services/crunch-run/cmd_watchdog.go
delete mode 100644 services/crunch-run/cmd_watchdog_test.go
discards 09a692078f06f2cded026f0a5b95f71ea706f41f (commit)
via a7c56b505a1f6907eaa5b2978fdeb1e55b659441 (commit)
This update added new revisions after undoing existing revisions. That is
to say, the old revision is not a strict subset of the new revision. This
situation occurs when you --force push a change and generate a repository
containing something like this:
* -- * -- B -- O -- O -- O (09a692078f06f2cded026f0a5b95f71ea706f41f)
\
N -- N -- N (a7c56b505a1f6907eaa5b2978fdeb1e55b659441)
When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit a7c56b505a1f6907eaa5b2978fdeb1e55b659441
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Thu Oct 18 17:18:57 2018 -0400
14328: Cancel if container ends but ContainerWait does not return.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go
index be98a3ee1..367169e86 100644
--- a/services/crunch-run/crunchrun.go
+++ b/services/crunch-run/crunchrun.go
@@ -79,6 +79,7 @@ type ThinDockerClient interface {
ContainerStart(ctx context.Context, container string, options dockertypes.ContainerStartOptions) error
ContainerRemove(ctx context.Context, container string, options dockertypes.ContainerRemoveOptions) error
ContainerWait(ctx context.Context, container string, condition dockercontainer.WaitCondition) (<-chan dockercontainer.ContainerWaitOKBody, <-chan error)
+ ContainerList(ctx context.Context, opts dockertypes.ContainerListOptions) ([]dockertypes.Container, error)
ImageInspectWithRaw(ctx context.Context, image string) (dockertypes.ImageInspect, []byte, error)
ImageLoad(ctx context.Context, input io.Reader, quiet bool) (dockertypes.ImageLoadResponse, error)
ImageRemove(ctx context.Context, image string, options dockertypes.ImageRemoveOptions) ([]dockertypes.ImageDeleteResponseItem, error)
@@ -154,6 +155,8 @@ type ContainerRunner struct {
networkMode string // passed through to HostConfig.NetworkMode
arvMountLog *ThrottledLogger
checkContainerd time.Duration
+
+ containerWaitGracePeriod time.Duration
}
// setupSignals sets up signal handling to gracefully terminate the underlying
@@ -1124,6 +1127,33 @@ func (runner *ContainerRunner) WaitFinish() error {
runTimeExceeded = time.After(time.Duration(timeout) * time.Second)
}
+ containerGone := make(chan struct{})
+ go func() {
+ defer close(containerGone)
+ if runner.containerWaitGracePeriod < 1 {
+ runner.containerWaitGracePeriod = 30 * time.Second
+ }
+ found := time.Now()
+ polling:
+ for range time.NewTicker(runner.containerWaitGracePeriod / 30).C {
+ ctrs, err := runner.Docker.ContainerList(context.Background(), dockertypes.ContainerListOptions{})
+ if err != nil {
+ runner.CrunchLog.Printf("error checking container list: %s", err)
+ continue polling
+ }
+ for _, ctr := range ctrs {
+ if ctr.ID == runner.ContainerID {
+ found = time.Now()
+ continue polling
+ }
+ }
+ if time.Since(found) > runner.containerWaitGracePeriod {
+ runner.CrunchLog.Printf("container %s no longer exists", runner.ContainerID)
+ return
+ }
+ }
+ }()
+
containerdGone := make(chan error)
defer close(containerdGone)
if runner.checkContainerd > 0 {
@@ -1171,6 +1201,9 @@ func (runner *ContainerRunner) WaitFinish() error {
runner.stop(nil)
runTimeExceeded = nil
+ case <-containerGone:
+ return errors.New("docker client never returned status")
+
case err := <-containerdGone:
return err
}
diff --git a/services/crunch-run/crunchrun_test.go b/services/crunch-run/crunchrun_test.go
index 217d4236b..0e48a50aa 100644
--- a/services/crunch-run/crunchrun_test.go
+++ b/services/crunch-run/crunchrun_test.go
@@ -47,6 +47,7 @@ var _ = Suite(&TestSuite{})
type TestSuite struct {
client *arvados.Client
docker *TestDockerClient
+ runner *ContainerRunner
}
func (s *TestSuite) SetUpTest(c *C) {
@@ -103,6 +104,7 @@ type TestDockerClient struct {
api *ArvTestClient
realTemp string
calledWait bool
+ ctrExited bool
}
func NewTestDockerClient() *TestDockerClient {
@@ -176,6 +178,14 @@ func (t *TestDockerClient) ContainerWait(ctx context.Context, container string,
return body, err
}
+func (t *TestDockerClient) ContainerList(ctx context.Context, options dockertypes.ContainerListOptions) ([]dockertypes.Container, error) {
+ if t.ctrExited {
+ return nil, nil
+ } else {
+ return []dockertypes.Container{{ID: "abcde"}}, nil
+ }
+}
+
func (t *TestDockerClient) ImageInspectWithRaw(ctx context.Context, image string) (dockertypes.ImageInspect, []byte, error) {
if t.exitCode == 2 {
return dockertypes.ImageInspect{}, nil, fmt.Errorf("Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?")
@@ -736,6 +746,7 @@ func (s *TestSuite) fullRunHelper(c *C, record string, extraMounts []string, exi
defer kc.Close()
cr, err = NewContainerRunner(s.client, api, kc, s.docker, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
c.Assert(err, IsNil)
+ s.runner = cr
cr.statInterval = 100 * time.Millisecond
am := &ArvMountCmdLine{}
cr.RunArvMount = am.ArvMountTest
@@ -830,6 +841,26 @@ func (s *TestSuite) TestRunTimeExceeded(c *C) {
c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*maximum run time exceeded.*")
}
+func (s *TestSuite) TestContainerWaitFails(c *C) {
+ api, _, _ := s.fullRunHelper(c, `{
+ "command": ["sleep", "3"],
+ "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+ "cwd": ".",
+ "mounts": {"/tmp": {"kind": "tmp"} },
+ "output_path": "/tmp",
+ "priority": 1
+}`, nil, 0, func(t *TestDockerClient) {
+ s.runner.containerWaitGracePeriod = time.Second
+ t.ctrExited = true
+ time.Sleep(10 * time.Second)
+ t.logWriter.Close()
+ })
+
+ c.Check(api.CalledWith("container.state", "Cancelled"), NotNil)
+ c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*container.*no longer exists.*")
+ c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*docker client never returned status.*")
+}
+
func (s *TestSuite) TestCrunchstat(c *C) {
api, _, _ := s.fullRunHelper(c, `{
"command": ["sleep", "1"],
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list