[arvados] created: 2.1.0-2901-g0bafd2d26

git repository hosting git at public.arvados.org
Fri Sep 16 18:18:12 UTC 2022


        at  0bafd2d2639d8e9d7c538414c8621d2625d4eb79 (commit)


commit 0bafd2d2639d8e9d7c538414c8621d2625d4eb79
Author: Tom Clegg <tom at curii.com>
Date:   Fri Sep 16 14:16:59 2022 -0400

    19520: When hitting boot timeout, log stdout of last boot probe.
    
    (not just stderr)
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index b01a820cd..8a9cbd528 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -233,6 +233,7 @@ func (wkr *worker) probeAndUpdate() {
 		booted   bool
 		ctrUUIDs []string
 		ok       bool
+		stdout   []byte // from probeBooted
 		stderr   []byte // from probeBooted
 	)
 
@@ -250,7 +251,7 @@ func (wkr *worker) probeAndUpdate() {
 	logger := wkr.logger.WithField("ProbeStart", probeStart)
 
 	if !booted {
-		booted, stderr = wkr.probeBooted()
+		booted, stdout, stderr = wkr.probeBooted()
 		if !booted {
 			// Pretend this probe succeeded if another
 			// concurrent attempt succeeded.
@@ -287,8 +288,8 @@ func (wkr *worker) probeAndUpdate() {
 		// boot/recover before the timeout expired).
 		dur := probeStart.Sub(wkr.probed)
 		if wkr.shutdownIfBroken(dur) {
-			// stderr from failed run-probes will have
-			// been logged already, but boot-probe
+			// stdout+stderr from failed run-probes will
+			// have been logged already, but boot-probe
 			// failures are normal so they are logged only
 			// at Debug level. This is our chance to log
 			// some evidence about why the node never
@@ -297,6 +298,7 @@ func (wkr *worker) probeAndUpdate() {
 				wkr.reportBootOutcome(BootOutcomeFailed)
 				logger.WithFields(logrus.Fields{
 					"Duration": dur,
+					"stdout":   string(stdout),
 					"stderr":   string(stderr),
 				}).Info("boot failed")
 			}
@@ -444,7 +446,7 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
 	return
 }
 
-func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
+func (wkr *worker) probeBooted() (ok bool, stdout, stderr []byte) {
 	cmd := wkr.wp.bootProbeCommand
 	if cmd == "" {
 		cmd = "true"
@@ -457,21 +459,25 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
 	})
 	if err != nil {
 		logger.WithError(err).Debug("boot probe failed")
-		return false, stderr
+		return false, stdout, stderr
 	}
 	logger.Info("boot probe succeeded")
 	if err = wkr.wp.loadRunnerData(); err != nil {
 		wkr.logger.WithError(err).Warn("cannot boot worker: error loading runner binary")
-		return false, stderr
+		return false, stdout, stderr
 	} else if len(wkr.wp.runnerData) == 0 {
 		// Assume crunch-run is already installed
-	} else if _, stderr2, err := wkr.copyRunnerData(); err != nil {
-		wkr.logger.WithError(err).WithField("stderr", string(stderr2)).Warn("error copying runner binary")
-		return false, stderr2
+	} else if stdout2, stderr2, err := wkr.copyRunnerData(); err != nil {
+		wkr.logger.WithError(err).WithFields(logrus.Fields{
+			"stdout": string(stdout2),
+			"stderr": string(stderr2),
+		}).Warn("error copying runner binary")
+		return false, stdout2, stderr2
 	} else {
+		stdout = append(stdout, stdout2...)
 		stderr = append(stderr, stderr2...)
 	}
-	return true, stderr
+	return true, stdout, stderr
 }
 
 func (wkr *worker) copyRunnerData() (stdout, stderr []byte, err error) {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list