[ARVADOS] updated: 1.3.0-191-g630601173

Git user git at public.curoverse.com
Thu Jan 24 16:10:54 EST 2019


Summary of changes:
 lib/dispatchcloud/worker/worker_test.go | 51 +++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 8 deletions(-)

  discards  788d0a18592e246b6db62fdb4618fdf9b5eed9b8 (commit)
       via  630601173bda46a7c02b5fbf43eaf5422a95b7d7 (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (788d0a18592e246b6db62fdb4618fdf9b5eed9b8)
            \
             N -- N -- N (630601173bda46a7c02b5fbf43eaf5422a95b7d7)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 630601173bda46a7c02b5fbf43eaf5422a95b7d7
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Thu Jan 24 16:10:48 2019 -0500

    14325: Don't shut down busy VMs even if boot probe fails.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 64afa3acf..2277a3f23 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -50,9 +50,7 @@ type StubDriver struct {
 }
 
 // InstanceSet returns a new *StubInstanceSet.
-func (sd *StubDriver) InstanceSet(params map[string]interface{}, id cloud.InstanceSetID,
-	logger logrus.FieldLogger) (cloud.InstanceSet, error) {
-
+func (sd *StubDriver) InstanceSet(params map[string]interface{}, id cloud.InstanceSetID, logger logrus.FieldLogger) (cloud.InstanceSet, error) {
 	sis := StubInstanceSet{
 		driver:  sd,
 		servers: map[cloud.InstanceID]*StubVM{},
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
index 26087193f..aa8195843 100644
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -5,7 +5,6 @@
 package worker
 
 import (
-	"io"
 	"time"
 
 	"git.curoverse.com/arvados.git/lib/cloud"
@@ -50,7 +49,7 @@ func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
 	type3 := arvados.InstanceType{Name: "a2l", ProviderType: "a2.large", VCPUs: 4, RAM: 4 * GiB, Price: .04}
 	pool := &Pool{
 		logger:      logrus.StandardLogger(),
-		newExecutor: func(cloud.Instance) Executor { return &stubExecutor{} },
+		newExecutor: func(cloud.Instance) Executor { return stubExecutor{} },
 		instanceSet: &throttledInstanceSet{InstanceSet: lameInstanceSet},
 		instanceTypes: arvados.InstanceTypeMap{
 			type1.Name: type1,
@@ -186,13 +185,3 @@ func (suite *PoolSuite) wait(c *check.C, pool *Pool, notify <-chan struct{}, rea
 	}
 	c.Check(ready(), check.Equals, true)
 }
-
-type stubExecutor struct{}
-
-func (*stubExecutor) SetTarget(cloud.ExecutorTarget) {}
-
-func (*stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) ([]byte, []byte, error) {
-	return nil, nil, nil
-}
-
-func (*stubExecutor) Close() {}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 0872e594f..a75d2bbb8 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -6,6 +6,7 @@ package worker
 
 import (
 	"bytes"
+	"fmt"
 	"strings"
 	"sync"
 	"time"
@@ -144,38 +145,65 @@ func (wkr *worker) ProbeAndUpdate() {
 	}
 }
 
-// should be called in a new goroutine
+// probeAndUpdate calls probeBooted and/or probeRunning if needed, and
+// updates state accordingly.
+//
+// In StateUnknown: Call both probeBooted and probeRunning.
+// In StateBooting: Call probeBooted; if successful, call probeRunning.
+// In StateRunning: Call probeRunning.
+// In StateIdle: Call probeRunning.
+// In StateShutdown: Do nothing.
+//
+// If both probes succeed, wkr.state changes to
+// StateIdle/StateRunning.
+//
+// If probeRunning succeeds, wkr.running is updated. (This means
+// wkr.running might be non-empty even in StateUnknown, if the boot
+// probe failed.)
+//
+// probeAndUpdate should be called in a new goroutine.
 func (wkr *worker) probeAndUpdate() {
 	wkr.mtx.Lock()
 	updated := wkr.updated
-	needProbeRunning := wkr.state == StateRunning || wkr.state == StateIdle
-	needProbeBooted := wkr.state == StateUnknown || wkr.state == StateBooting
+	initialState := wkr.state
 	wkr.mtx.Unlock()
-	if !needProbeBooted && !needProbeRunning {
-		return
-	}
 
 	var (
+		booted   bool
 		ctrUUIDs []string
 		ok       bool
 		stderr   []byte
 	)
-	if needProbeBooted {
-		ok, stderr = wkr.probeBooted()
-		wkr.mtx.Lock()
-		if ok || wkr.state == StateRunning || wkr.state == StateIdle {
+
+	switch initialState {
+	case StateShutdown:
+		return
+	case StateIdle, StateRunning:
+		booted = true
+	case StateUnknown, StateBooting:
+	default:
+		panic(fmt.Sprintf("unknown state %s", initialState))
+	}
+
+	if !booted {
+		booted, stderr = wkr.probeBooted()
+		if !booted {
+			// Pretend this probe succeeded if another
+			// concurrent attempt succeeded.
+			wkr.mtx.Lock()
+			booted = wkr.state == StateRunning || wkr.state == StateIdle
+			wkr.mtx.Unlock()
+		} else {
 			wkr.logger.Info("instance booted; will try probeRunning")
-			needProbeRunning = true
 		}
-		wkr.mtx.Unlock()
 	}
-	if needProbeRunning {
+	if booted || wkr.state == StateUnknown {
 		ctrUUIDs, ok, stderr = wkr.probeRunning()
 	}
 	logger := wkr.logger.WithField("stderr", string(stderr))
 	wkr.mtx.Lock()
 	defer wkr.mtx.Unlock()
-	if !ok {
+	if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
 		if wkr.state == StateShutdown && wkr.updated.After(updated) {
 			// Skip the logging noise if shutdown was
 			// initiated during probe.
@@ -186,10 +214,10 @@ func (wkr *worker) probeAndUpdate() {
 			"Duration": dur,
 			"State":    wkr.state,
 		})
-		if wkr.state == StateBooting && !needProbeRunning {
-			// If we know the instance has never passed a
-			// boot probe, it's not noteworthy that it
-			// hasn't passed this probe.
+		if !booted {
+			// While we're polling the VM to see if it's
+			// finished booting, failures are not
+			// noteworthy, so we log at Debug level.
 			logger.Debug("new instance not responding")
 		} else {
 			logger.Info("instance not responding")
@@ -234,11 +262,16 @@ func (wkr *worker) probeAndUpdate() {
 			changed = true
 		}
 	}
-	if wkr.state == StateUnknown || wkr.state == StateBooting {
+	if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
 		// Note: this will change again below if
 		// len(wkr.starting)+len(wkr.running) > 0.
 		wkr.state = StateIdle
 		changed = true
+	} else if wkr.state == StateUnknown && len(running) != len(wkr.running) {
+		logger.WithFields(logrus.Fields{
+			"RunningContainers": len(running),
+			"State":             wkr.state,
+		}).Info("crunch-run probe succeeded, but boot probe is still failing")
 	}
 	if !changed {
 		return
@@ -251,7 +284,7 @@ func (wkr *worker) probeAndUpdate() {
 		wkr.state = StateIdle
 	}
 	wkr.updated = updateTime
-	if needProbeBooted {
+	if booted && (initialState == StateUnknown || initialState == StateBooting) {
 		logger.WithFields(logrus.Fields{
 			"RunningContainers": len(running),
 			"State":             wkr.state,
diff --git a/lib/dispatchcloud/worker/worker_test.go b/lib/dispatchcloud/worker/worker_test.go
new file mode 100644
index 000000000..570d3b5a9
--- /dev/null
+++ b/lib/dispatchcloud/worker/worker_test.go
@@ -0,0 +1,240 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+	"errors"
+	"io"
+	"time"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/sirupsen/logrus"
+	check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&WorkerSuite{})
+
+type WorkerSuite struct{}
+
+func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
+	logger := logrus.StandardLogger()
+	bootTimeout := time.Minute
+	probeTimeout := time.Second
+
+	is, err := (&test.StubDriver{}).InstanceSet(nil, "", logger)
+	c.Assert(err, check.IsNil)
+	inst, err := is.Create(arvados.InstanceType{}, "", nil, nil)
+	c.Assert(err, check.IsNil)
+
+	type trialT struct {
+		testCaseComment string // displayed in test output to help identify failure case
+		age             time.Duration
+		state           State
+		running         int
+		starting        int
+		respBoot        stubResp // zero value is success
+		respRun         stubResp // zero value is success + nothing running
+		expectState     State
+		expectRunning   int
+	}
+
+	errFail := errors.New("failed")
+	respFail := stubResp{"", "command failed\n", errFail}
+	respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
+	for _, trial := range []trialT{
+		{
+			testCaseComment: "Unknown, probes fail",
+			state:           StateUnknown,
+			respBoot:        respFail,
+			respRun:         respFail,
+			expectState:     StateUnknown,
+		},
+		{
+			testCaseComment: "Unknown, boot probe fails, but one container is running",
+			state:           StateUnknown,
+			respBoot:        respFail,
+			respRun:         respContainerRunning,
+			expectState:     StateUnknown,
+			expectRunning:   1,
+		},
+		{
+			testCaseComment: "Unknown, boot probe fails, previously running container has exited",
+			state:           StateUnknown,
+			running:         1,
+			respBoot:        respFail,
+			expectState:     StateUnknown,
+			expectRunning:   0,
+		},
+		{
+			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
+			state:           StateUnknown,
+			age:             bootTimeout + time.Second,
+			respBoot:        respFail,
+			respRun:         respFail,
+			expectState:     StateShutdown,
+		},
+		{
+			testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
+			state:           StateUnknown,
+			age:             bootTimeout * 2,
+			respRun:         respFail,
+			expectState:     StateShutdown,
+		},
+		{
+			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
+			state:           StateUnknown,
+			age:             bootTimeout * 2,
+			respBoot:        respFail,
+			expectState:     StateShutdown,
+		},
+		{
+			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
+			state:           StateUnknown,
+			age:             bootTimeout * 2,
+			respBoot:        respFail,
+			respRun:         respContainerRunning,
+			expectState:     StateUnknown,
+			expectRunning:   1,
+		},
+		{
+			testCaseComment: "Booting, boot probe fails, run probe fails",
+			state:           StateBooting,
+			respBoot:        respFail,
+			respRun:         respFail,
+			expectState:     StateBooting,
+		},
+		{
+			testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
+			state:           StateBooting,
+			respBoot:        respFail,
+			expectState:     StateBooting,
+		},
+		{
+			testCaseComment: "Booting, boot probe succeeds, run probe fails",
+			state:           StateBooting,
+			respRun:         respFail,
+			expectState:     StateBooting,
+		},
+		{
+			testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
+			state:           StateBooting,
+			expectState:     StateIdle,
+		},
+		{
+			testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
+			state:           StateBooting,
+			respRun:         respContainerRunning,
+			expectState:     StateRunning,
+			expectRunning:   1,
+		},
+		{
+			testCaseComment: "Booting, boot timeout exceeded",
+			state:           StateBooting,
+			age:             bootTimeout * 2,
+			respRun:         respFail,
+			expectState:     StateShutdown,
+		},
+		{
+			testCaseComment: "Idle, probe timeout exceeded, one container running",
+			state:           StateIdle,
+			age:             probeTimeout * 2,
+			respRun:         respContainerRunning,
+			expectState:     StateRunning,
+			expectRunning:   1,
+		},
+		{
+			testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
+			state:           StateIdle,
+			age:             probeTimeout * 2,
+			running:         1,
+			respRun:         respFail,
+			expectState:     StateShutdown,
+			expectRunning:   1,
+		},
+		{
+			testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
+			state:           StateIdle,
+			age:             probeTimeout * 2,
+			respRun:         respFail,
+			expectState:     StateShutdown,
+		},
+		{
+			testCaseComment: "Running, one container still running",
+			state:           StateRunning,
+			running:         1,
+			respRun:         respContainerRunning,
+			expectState:     StateRunning,
+			expectRunning:   1,
+		},
+		{
+			testCaseComment: "Running, container has exited",
+			state:           StateRunning,
+			running:         1,
+			expectState:     StateIdle,
+			expectRunning:   0,
+		},
+		{
+			testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
+			state:           StateRunning,
+			age:             probeTimeout * 2,
+			starting:        1,
+			expectState:     StateRunning,
+		},
+	} {
+		c.Logf("------- %#v", trial)
+		ctime := time.Now().Add(-trial.age)
+		exr := stubExecutor{
+			"bootprobe":         trial.respBoot,
+			"crunch-run --list": trial.respRun,
+		}
+		wp := &Pool{
+			newExecutor:      func(cloud.Instance) Executor { return exr },
+			bootProbeCommand: "bootprobe",
+			timeoutBooting:   bootTimeout,
+			timeoutProbe:     probeTimeout,
+			exited:           map[string]time.Time{},
+		}
+		wkr := &worker{
+			logger:   logger,
+			executor: exr,
+			wp:       wp,
+			mtx:      &wp.mtx,
+			state:    trial.state,
+			instance: inst,
+			appeared: ctime,
+			busy:     ctime,
+			probed:   ctime,
+			updated:  ctime,
+		}
+		if trial.running > 0 {
+			wkr.running = map[string]struct{}{"zzzzz-dz642-abcdefghijklmno": struct{}{}}
+		}
+		if trial.starting > 0 {
+			wkr.starting = map[string]struct{}{"zzzzz-dz642-abcdefghijklmno": struct{}{}}
+		}
+		wkr.probeAndUpdate()
+		c.Check(wkr.state, check.Equals, trial.expectState)
+		c.Check(len(wkr.running), check.Equals, trial.expectRunning)
+	}
+}
+
+type stubResp struct {
+	stdout string
+	stderr string
+	err    error
+}
+type stubExecutor map[string]stubResp
+
+func (se stubExecutor) SetTarget(cloud.ExecutorTarget) {}
+func (se stubExecutor) Close()                         {}
+func (se stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
+	resp, ok := se[cmd]
+	if !ok {
+		return nil, []byte("command not found\n"), errors.New("command not found")
+	}
+	return []byte(resp.stdout), []byte(resp.stderr), resp.err
+}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list