[arvados] created: 2.5.0-304-g9680a4a50
git repository hosting
git at public.arvados.org
Fri Mar 24 15:21:12 UTC 2023
at 9680a4a50200ade72af3d3e35d43775efefb9f94 (commit)
commit 9680a4a50200ade72af3d3e35d43775efefb9f94
Author: Tom Clegg <tom at curii.com>
Date: Fri Mar 24 11:18:03 2023 -0400
20235: Install crunch-run before run-probe even if boot-probe fails.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index b2ed6c2bf..8b4be1a3c 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -253,19 +253,30 @@ func (wkr *worker) probeAndUpdate() {
if !booted {
booted, stderr = wkr.probeBooted()
+ shouldCopy := booted || initialState == StateUnknown
if !booted {
// Pretend this probe succeeded if another
// concurrent attempt succeeded.
wkr.mtx.Lock()
- booted = wkr.state == StateRunning || wkr.state == StateIdle
+ if wkr.state == StateRunning || wkr.state == StateIdle {
+ booted = true
+ shouldCopy = false
+ }
wkr.mtx.Unlock()
}
+ if shouldCopy {
+ _, stderrCopy, err := wkr.copyRunnerData()
+ if err != nil {
+ booted = false
+ wkr.logger.WithError(err).WithField("stderr", string(stderrCopy)).Warn("error copying runner binary")
+ }
+ }
if booted {
logger.Info("instance booted; will try probeRunning")
}
}
reportedBroken := false
- if booted || wkr.state == StateUnknown {
+ if booted || initialState == StateUnknown {
ctrUUIDs, reportedBroken, ok = wkr.probeRunning()
}
wkr.mtx.Lock()
@@ -467,21 +478,18 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
return false, stderr
}
logger.Info("boot probe succeeded")
+ return true, stderr
+}
+
+func (wkr *worker) copyRunnerData() (stdout, stderr []byte, err error) {
if err = wkr.wp.loadRunnerData(); err != nil {
wkr.logger.WithError(err).Warn("cannot boot worker: error loading runner binary")
- return false, stderr
+ return
} else if len(wkr.wp.runnerData) == 0 {
// Assume crunch-run is already installed
- } else if _, stderr2, err := wkr.copyRunnerData(); err != nil {
- wkr.logger.WithError(err).WithField("stderr", string(stderr2)).Warn("error copying runner binary")
- return false, stderr2
- } else {
- stderr = append(stderr, stderr2...)
+ return
}
- return true, stderr
-}
-func (wkr *worker) copyRunnerData() (stdout, stderr []byte, err error) {
hash := fmt.Sprintf("%x", wkr.wp.runnerMD5)
dstdir, _ := filepath.Split(wkr.wp.runnerCmd)
logger := wkr.logger.WithFields(logrus.Fields{
diff --git a/lib/dispatchcloud/worker/worker_test.go b/lib/dispatchcloud/worker/worker_test.go
index 2ee6b7c36..d04ecbb72 100644
--- a/lib/dispatchcloud/worker/worker_test.go
+++ b/lib/dispatchcloud/worker/worker_test.go
@@ -122,6 +122,39 @@ func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
expectState: StateUnknown,
expectRunning: 1,
},
+ {
+ testCaseComment: "Unknown, boot probe fails, deployRunner succeeds, container is running",
+ state: StateUnknown,
+ respBoot: respFail,
+ respRun: respFail,
+ respRunDeployed: respContainerRunning,
+ deployRunner: []byte("ELF"),
+ expectStdin: []byte("ELF"),
+ expectState: StateUnknown,
+ expectRunning: 1,
+ },
+ {
+ testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and container is running",
+ state: StateUnknown,
+ age: bootTimeout * 2,
+ respBoot: respFail,
+ respRun: respFail,
+ respRunDeployed: respContainerRunning,
+ deployRunner: []byte("ELF"),
+ expectStdin: []byte("ELF"),
+ expectState: StateUnknown,
+ expectRunning: 1,
+ },
+ {
+ testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and no container is running",
+ state: StateUnknown,
+ age: bootTimeout * 2,
+ respBoot: respFail,
+ respRun: respFail,
+ deployRunner: []byte("ELF"),
+ expectStdin: []byte("ELF"),
+ expectState: StateShutdown,
+ },
{
testCaseComment: "Booting, boot probe fails, run probe fails",
state: StateBooting,
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list