[ARVADOS] updated: 1.3.0-239-gee53a267d

Git user git at public.curoverse.com
Tue Feb 5 17:25:30 EST 2019


Summary of changes:
 lib/dispatchcloud/worker/worker.go | 61 ++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 25 deletions(-)

       via  ee53a267ded17bc50eaf4dfebba5ff4a3273753c (commit)
      from  71fd4da18b22100682ae7e2079aadfd66360d310 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit ee53a267ded17bc50eaf4dfebba5ff4a3273753c
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Feb 4 11:21:32 2019 -0500

    14325: Log stderr from last boot-probe when giving up on boot.
    
    Remove duplicate log message after failed run-probe.
    
    Ensure we always make at least one probe attempt after the boot/probe
    timeout expires.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 78ebaac6e..a24747267 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -172,7 +172,7 @@ func (wkr *worker) probeAndUpdate() {
 		booted   bool
 		ctrUUIDs []string
 		ok       bool
-		stderr   []byte
+		stderr   []byte // from probeBooted
 	)
 
 	switch initialState {
@@ -185,6 +185,9 @@ func (wkr *worker) probeAndUpdate() {
 		panic(fmt.Sprintf("unknown state %s", initialState))
 	}
 
+	probeStart := time.Now()
+	logger := wkr.logger.WithField("ProbeStart", probeStart)
+
 	if !booted {
 		booted, stderr = wkr.probeBooted()
 		if !booted {
@@ -193,14 +196,14 @@ func (wkr *worker) probeAndUpdate() {
 			wkr.mtx.Lock()
 			booted = wkr.state == StateRunning || wkr.state == StateIdle
 			wkr.mtx.Unlock()
-		} else {
-			wkr.logger.Info("instance booted; will try probeRunning")
+		}
+		if booted {
+			logger.Info("instance booted; will try probeRunning")
 		}
 	}
 	if booted || wkr.state == StateUnknown {
-		ctrUUIDs, ok, stderr = wkr.probeRunning()
+		ctrUUIDs, ok = wkr.probeRunning()
 	}
-	logger := wkr.logger.WithField("stderr", string(stderr))
 	wkr.mtx.Lock()
 	defer wkr.mtx.Unlock()
 	if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
@@ -209,20 +212,27 @@ func (wkr *worker) probeAndUpdate() {
 			// initiated during probe.
 			return
 		}
-		dur := time.Since(wkr.probed)
-		logger := logger.WithFields(logrus.Fields{
-			"Duration": dur,
-			"State":    wkr.state,
-		})
-		if !booted {
-			// While we're polling the VM to see if it's
-			// finished booting, failures are not
-			// noteworthy, so we log at Debug level.
-			logger.Debug("new instance not responding")
-		} else {
-			logger.Info("instance not responding")
+		// Using the start time of the probe as the timeout
+		// threshold ensures we always initiate at least one
+		// probe attempt after the boot/probe timeout expires
+		// (otherwise, a slow probe failure could cause us to
+		// shutdown an instance even though it did in fact
+		// boot/recover before the timeout expired).
+		dur := probeStart.Sub(wkr.probed)
+		if wkr.shutdownIfBroken(dur) {
+			// stderr from failed run-probes will have
+			// been logged already, but boot-probe
+			// failures are normal so they are logged only
+			// at Debug level. This is our chance to log
+			// some evidence about why the node never
+			// booted, even in non-debug mode.
+			if !booted {
+				logger.WithFields(logrus.Fields{
+					"Duration": dur,
+					"stderr":   string(stderr),
+				}).Info("boot failed")
+			}
 		}
-		wkr.shutdownIfBroken(dur)
 		return
 	}
 
@@ -313,7 +323,7 @@ func (wkr *worker) probeAndUpdate() {
 	go wkr.wp.notify()
 }
 
-func (wkr *worker) probeRunning() (running []string, ok bool, stderr []byte) {
+func (wkr *worker) probeRunning() (running []string, ok bool) {
 	cmd := "crunch-run --list"
 	stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
 	if err != nil {
@@ -322,13 +332,13 @@ func (wkr *worker) probeRunning() (running []string, ok bool, stderr []byte) {
 			"stdout":  string(stdout),
 			"stderr":  string(stderr),
 		}).WithError(err).Warn("probe failed")
-		return nil, false, stderr
+		return nil, false
 	}
 	stdout = bytes.TrimRight(stdout, "\n")
 	if len(stdout) == 0 {
-		return nil, true, stderr
+		return nil, true
 	}
-	return strings.Split(string(stdout), "\n"), true, stderr
+	return strings.Split(string(stdout), "\n"), true
 }
 
 func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
@@ -351,17 +361,17 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
 }
 
 // caller must have lock.
-func (wkr *worker) shutdownIfBroken(dur time.Duration) {
+func (wkr *worker) shutdownIfBroken(dur time.Duration) bool {
 	if wkr.idleBehavior == IdleBehaviorHold {
 		// Never shut down.
-		return
+		return false
 	}
 	label, threshold := "", wkr.wp.timeoutProbe
 	if wkr.state == StateUnknown || wkr.state == StateBooting {
 		label, threshold = "new ", wkr.wp.timeoutBooting
 	}
 	if dur < threshold {
-		return
+		return false
 	}
 	wkr.logger.WithFields(logrus.Fields{
 		"Duration": dur,
@@ -369,6 +379,7 @@ func (wkr *worker) shutdownIfBroken(dur time.Duration) {
 		"State":    wkr.state,
 	}).Warnf("%sinstance unresponsive, shutting down", label)
 	wkr.shutdown()
+	return true
 }
 
 // caller must have lock.

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list