[arvados] created: 2.5.0-304-g9680a4a50

git repository hosting git at public.arvados.org
Fri Mar 24 15:21:12 UTC 2023


        at  9680a4a50200ade72af3d3e35d43775efefb9f94 (commit)


commit 9680a4a50200ade72af3d3e35d43775efefb9f94
Author: Tom Clegg <tom at curii.com>
Date:   Fri Mar 24 11:18:03 2023 -0400

    20235: Install crunch-run before run-probe even if boot-probe fails.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index b2ed6c2bf..8b4be1a3c 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -253,19 +253,30 @@ func (wkr *worker) probeAndUpdate() {
 
 	if !booted {
 		booted, stderr = wkr.probeBooted()
+		shouldCopy := booted || initialState == StateUnknown
 		if !booted {
 			// Pretend this probe succeeded if another
 			// concurrent attempt succeeded.
 			wkr.mtx.Lock()
-			booted = wkr.state == StateRunning || wkr.state == StateIdle
+			if wkr.state == StateRunning || wkr.state == StateIdle {
+				booted = true
+				shouldCopy = false
+			}
 			wkr.mtx.Unlock()
 		}
+		if shouldCopy {
+			_, stderrCopy, err := wkr.copyRunnerData()
+			if err != nil {
+				booted = false
+				wkr.logger.WithError(err).WithField("stderr", string(stderrCopy)).Warn("error copying runner binary")
+			}
+		}
 		if booted {
 			logger.Info("instance booted; will try probeRunning")
 		}
 	}
 	reportedBroken := false
-	if booted || wkr.state == StateUnknown {
+	if booted || initialState == StateUnknown {
 		ctrUUIDs, reportedBroken, ok = wkr.probeRunning()
 	}
 	wkr.mtx.Lock()
@@ -467,21 +478,18 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
 		return false, stderr
 	}
 	logger.Info("boot probe succeeded")
+	return true, stderr
+}
+
+func (wkr *worker) copyRunnerData() (stdout, stderr []byte, err error) {
 	if err = wkr.wp.loadRunnerData(); err != nil {
 		wkr.logger.WithError(err).Warn("cannot boot worker: error loading runner binary")
-		return false, stderr
+		return
 	} else if len(wkr.wp.runnerData) == 0 {
 		// Assume crunch-run is already installed
-	} else if _, stderr2, err := wkr.copyRunnerData(); err != nil {
-		wkr.logger.WithError(err).WithField("stderr", string(stderr2)).Warn("error copying runner binary")
-		return false, stderr2
-	} else {
-		stderr = append(stderr, stderr2...)
+		return
 	}
-	return true, stderr
-}
 
-func (wkr *worker) copyRunnerData() (stdout, stderr []byte, err error) {
 	hash := fmt.Sprintf("%x", wkr.wp.runnerMD5)
 	dstdir, _ := filepath.Split(wkr.wp.runnerCmd)
 	logger := wkr.logger.WithFields(logrus.Fields{
diff --git a/lib/dispatchcloud/worker/worker_test.go b/lib/dispatchcloud/worker/worker_test.go
index 2ee6b7c36..d04ecbb72 100644
--- a/lib/dispatchcloud/worker/worker_test.go
+++ b/lib/dispatchcloud/worker/worker_test.go
@@ -122,6 +122,39 @@ func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
 			expectState:     StateUnknown,
 			expectRunning:   1,
 		},
+		{
+			testCaseComment: "Unknown, boot probe fails, deployRunner succeeds, container is running",
+			state:           StateUnknown,
+			respBoot:        respFail,
+			respRun:         respFail,
+			respRunDeployed: respContainerRunning,
+			deployRunner:    []byte("ELF"),
+			expectStdin:     []byte("ELF"),
+			expectState:     StateUnknown,
+			expectRunning:   1,
+		},
+		{
+			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and container is running",
+			state:           StateUnknown,
+			age:             bootTimeout * 2,
+			respBoot:        respFail,
+			respRun:         respFail,
+			respRunDeployed: respContainerRunning,
+			deployRunner:    []byte("ELF"),
+			expectStdin:     []byte("ELF"),
+			expectState:     StateUnknown,
+			expectRunning:   1,
+		},
+		{
+			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and no container is running",
+			state:           StateUnknown,
+			age:             bootTimeout * 2,
+			respBoot:        respFail,
+			respRun:         respFail,
+			deployRunner:    []byte("ELF"),
+			expectStdin:     []byte("ELF"),
+			expectState:     StateShutdown,
+		},
 		{
 			testCaseComment: "Booting, boot probe fails, run probe fails",
 			state:           StateBooting,

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list