[arvados] created: 2.6.0-485-g471f5c792
git repository hosting
git at public.arvados.org
Mon Aug 14 16:00:04 UTC 2023
at 471f5c792f4bf06ca4fab1600074c8c335b7d70f (commit)
commit 471f5c792f4bf06ca4fab1600074c8c335b7d70f
Author: Tom Clegg <tom at curii.com>
Date: Mon Aug 14 11:14:24 2023 -0400
20649: Mention `arvados-server cloudtest` in boot timeout log msg.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 0b406ce61..10a28157e 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -555,9 +555,11 @@ func (wkr *worker) shutdownIfBroken(dur time.Duration) bool {
// Never shut down.
return false
}
- label, threshold := "", wkr.wp.timeoutProbe
+ prologue, epilogue, threshold := "", "", wkr.wp.timeoutProbe
if wkr.state == StateUnknown || wkr.state == StateBooting {
- label, threshold = "new ", wkr.wp.timeoutBooting
+ prologue = "new "
+ epilogue = " -- `arvados-server cloudtest` might help troubleshoot, see https://doc.arvados.org/main/admin/cloudtest.html"
+ threshold = wkr.wp.timeoutBooting
}
if dur < threshold {
return false
@@ -566,7 +568,7 @@ func (wkr *worker) shutdownIfBroken(dur time.Duration) bool {
"Duration": dur,
"Since": wkr.probed,
"State": wkr.state,
- }).Warnf("%sinstance unresponsive, shutting down", label)
+ }).Warnf("%sinstance unresponsive, shutting down%s", prologue, epilogue)
wkr.shutdown()
return true
}
commit e44725a3792df227f189f88ffb2cd1dbf0e93489
Author: Tom Clegg <tom at curii.com>
Date: Mon Aug 14 11:11:51 2023 -0400
20649: Fix panic on race, worker shutdown vs. container startup.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 8b9326fa9..0b406ce61 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -188,6 +188,14 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
}
wkr.mtx.Lock()
defer wkr.mtx.Unlock()
+ if wkr.starting[ctr.UUID] != rr {
+ // Someone else (e.g., wkr.probeAndUpdate() ->
+ // wkr.updateRunning() or wkr.Close()) already
+ // moved our runner from wkr.starting to
+ // wkr.running or deleted it while we were in
+ // rr.Start().
+ return
+ }
now := time.Now()
wkr.updated = now
wkr.busy = now
@@ -665,10 +673,12 @@ func (wkr *worker) Close() {
for uuid, rr := range wkr.running {
wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
rr.Close()
+ delete(wkr.running, uuid)
}
for uuid, rr := range wkr.starting {
wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
rr.Close()
+ delete(wkr.starting, uuid)
}
}
commit b38ee1ddd208d40c236361536639c811787aaea2
Author: Tom Clegg <tom at curii.com>
Date: Mon Aug 14 11:10:44 2023 -0400
20649: Don't wait timeoutSignal before sending first TERM signal.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/worker/runner.go b/lib/dispatchcloud/worker/runner.go
index ac039272c..f22b8922a 100644
--- a/lib/dispatchcloud/worker/runner.go
+++ b/lib/dispatchcloud/worker/runner.go
@@ -138,7 +138,7 @@ func (rr *remoteRunner) Kill(reason string) {
termDeadline := time.Now().Add(rr.timeoutTERM)
t := time.NewTicker(rr.timeoutSignal)
defer t.Stop()
- for range t.C {
+ for ; ; <-t.C {
switch {
case rr.isClosed():
return
commit c9e7b20bf97ea8e5ab77b5e20400e8736eb9c42b
Author: Tom Clegg <tom at curii.com>
Date: Wed Aug 9 17:07:35 2023 -0400
20457: Fix flaky test.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 4583a596e..8a8a34f2b 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -367,6 +367,7 @@ func (s *DispatcherSuite) TestInstancesAPI(c *check.C) {
sr := getInstances()
c.Check(len(sr.Items), check.Equals, 0)
+ s.stubDriver.ErrorRateCreate = 0
ch := s.disp.pool.Subscribe()
defer s.disp.pool.Unsubscribe(ch)
ok := s.disp.pool.Create(test.InstanceType(1))
commit 687b9b47f2a9be9dd7dfba5c01a452bce1a82114
Author: Tom Clegg <tom at curii.com>
Date: Fri Aug 11 17:47:32 2023 -0400
20649: Log actual error from last failed boot probe.
Not just the stderr content, which is useless if the error was
something like "connection failed".
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index e6e817458..8b9326fa9 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -240,6 +240,7 @@ func (wkr *worker) probeAndUpdate() {
ctrUUIDs []string
ok bool
stderr []byte // from probeBooted
+ errLast error // from probeBooted or copyRunnerData
)
switch initialState {
@@ -256,7 +257,8 @@ func (wkr *worker) probeAndUpdate() {
logger := wkr.logger.WithField("ProbeStart", probeStart)
if !booted {
- booted, stderr = wkr.probeBooted()
+ stderr, errLast = wkr.probeBooted()
+ booted = errLast == nil
shouldCopy := booted || initialState == StateUnknown
if !booted {
// Pretend this probe succeeded if another
@@ -273,6 +275,7 @@ func (wkr *worker) probeAndUpdate() {
if err != nil {
booted = false
wkr.logger.WithError(err).WithField("stderr", string(stderrCopy)).Warn("error copying runner binary")
+ errLast = err
}
}
if booted {
@@ -315,7 +318,7 @@ func (wkr *worker) probeAndUpdate() {
logger.WithFields(logrus.Fields{
"Duration": dur,
"stderr": string(stderr),
- }).Info("boot failed")
+ }).WithError(errLast).Info("boot failed")
}
}
return
@@ -466,7 +469,7 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
return
}
-func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
+func (wkr *worker) probeBooted() (stderr []byte, err error) {
cmd := wkr.wp.bootProbeCommand
if cmd == "" {
cmd = "true"
@@ -498,10 +501,10 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
// remain"
logger.WithError(err).Warn("boot probe failed")
}
- return false, stderr
+ return stderr, err
}
logger.Info("boot probe succeeded")
- return true, stderr
+ return stderr, nil
}
func (wkr *worker) copyRunnerData() (stdout, stderr []byte, err error) {
commit 37a55033565011d984794562022794a901155d13
Author: Tom Clegg <tom at curii.com>
Date: Fri Aug 11 17:47:06 2023 -0400
20649: Log boot probe errors other than usual wait-for-boot errors.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/sshexecutor/executor.go b/lib/dispatchcloud/sshexecutor/executor.go
index c37169921..3761c6992 100644
--- a/lib/dispatchcloud/sshexecutor/executor.go
+++ b/lib/dispatchcloud/sshexecutor/executor.go
@@ -18,6 +18,8 @@ import (
"golang.org/x/crypto/ssh"
)
+var ErrNoAddress = errors.New("instance has no address")
+
// New returns a new Executor, using the given target.
func New(t cloud.ExecutorTarget) *Executor {
return &Executor{target: t}
@@ -196,7 +198,7 @@ func (exr *Executor) TargetHostPort() (string, string) {
func (exr *Executor) setupSSHClient() (*ssh.Client, error) {
addr := net.JoinHostPort(exr.TargetHostPort())
if addr == ":" {
- return nil, errors.New("instance has no address")
+ return nil, ErrNoAddress
}
var receivedKey ssh.PublicKey
client, err := ssh.Dial("tcp", addr, &ssh.ClientConfig{
diff --git a/lib/dispatchcloud/sshexecutor/executor_test.go b/lib/dispatchcloud/sshexecutor/executor_test.go
index b4afeafa8..95b29fa6a 100644
--- a/lib/dispatchcloud/sshexecutor/executor_test.go
+++ b/lib/dispatchcloud/sshexecutor/executor_test.go
@@ -6,6 +6,7 @@ package sshexecutor
import (
"bytes"
+ "errors"
"fmt"
"io"
"io/ioutil"
@@ -146,6 +147,7 @@ func (s *ExecutorSuite) TestExecute(c *check.C) {
exr.SetTargetPort("0")
_, _, err = exr.Execute(nil, command, nil)
c.Check(err, check.ErrorMatches, `.*connection refused.*`)
+ c.Check(errors.As(err, new(*net.OpError)), check.Equals, true)
// Use the test server's listening port.
exr.SetTargetPort(target.Port())
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 8b4be1a3c..e6e817458 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -7,17 +7,21 @@ package worker
import (
"bytes"
"encoding/json"
+ "errors"
"fmt"
"io"
+ "net"
"path/filepath"
"strings"
"sync"
"time"
"git.arvados.org/arvados.git/lib/cloud"
+ "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/stats"
"github.com/sirupsen/logrus"
+ "golang.org/x/crypto/ssh"
)
const (
@@ -301,10 +305,10 @@ func (wkr *worker) probeAndUpdate() {
dur := probeStart.Sub(wkr.probed)
if wkr.shutdownIfBroken(dur) {
// stderr from failed run-probes will have
- // been logged already, but boot-probe
+ // been logged already, but some boot-probe
// failures are normal so they are logged only
- // at Debug level. This is our chance to log
- // some evidence about why the node never
+ // at Debug level. This may be our chance to
+ // log some evidence about why the node never
// booted, even in non-debug mode.
if !booted {
wkr.reportBootOutcome(BootOutcomeFailed)
@@ -474,7 +478,26 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
"stderr": string(stderr),
})
if err != nil {
- logger.WithError(err).Debug("boot probe failed")
+ if errors.Is(err, sshexecutor.ErrNoAddress) ||
+ errors.As(err, new(*net.OpError)) ||
+ errors.As(err, new(*ssh.ExitError)) {
+ // These errors are expected while the
+ // instance is booting, so we only log them at
+ // debug level.
+ logger.WithError(err).Debug("boot probe failed")
+ } else {
+ // Other errors are more likely to indicate a
+ // configuration problem, and it's more
+ // sysadmin-friendly to show them right away
+ // instead of waiting until boot timeout and
+ // only showing the last error.
+ //
+ // Example: "ssh: handshake failed: ssh:
+ // unable to authenticate, attempted methods
+ // [none publickey], no supported methods
+ // remain"
+ logger.WithError(err).Warn("boot probe failed")
+ }
return false, stderr
}
logger.Info("boot probe succeeded")
commit c7d05b1dda46923dbf807caca85767429c122341
Author: Tom Clegg <tom at curii.com>
Date: Thu Aug 10 10:45:46 2023 -0400
20649: cloudtest obeys DeployPublicKey config, like a-d-c.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/cloud/cloudtest/cmd.go b/lib/cloud/cloudtest/cmd.go
index 0c1049b76..2dc13e5a5 100644
--- a/lib/cloud/cloudtest/cmd.go
+++ b/lib/cloud/cloudtest/cmd.go
@@ -99,6 +99,7 @@ func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, s
InstanceType: it,
SSHKey: key,
SSHPort: cluster.Containers.CloudVMs.SSHPort,
+ DeployPublicKey: cluster.Containers.CloudVMs.DeployPublicKey,
BootProbeCommand: cluster.Containers.CloudVMs.BootProbeCommand,
InstanceInitCommand: cloud.InitCommand(cluster.Containers.CloudVMs.InstanceInitCommand),
ShellCommand: *shellCommand,
diff --git a/lib/cloud/cloudtest/tester.go b/lib/cloud/cloudtest/tester.go
index f7c2a3c73..a335278ed 100644
--- a/lib/cloud/cloudtest/tester.go
+++ b/lib/cloud/cloudtest/tester.go
@@ -41,6 +41,7 @@ type tester struct {
ImageID cloud.ImageID
SSHKey ssh.Signer
SSHPort string
+ DeployPublicKey bool
BootProbeCommand string
InstanceInitCommand cloud.InitCommand
ShellCommand string
@@ -174,15 +175,21 @@ func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool
bootDeadline := time.Now().Add(t.TimeoutBooting)
initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
+ installPublicKey := t.SSHKey.PublicKey()
+ if !t.DeployPublicKey {
+ installPublicKey = nil
+ }
+
t.Logger.WithFields(logrus.Fields{
"InstanceType": t.InstanceType.Name,
"ProviderInstanceType": t.InstanceType.ProviderType,
"ImageID": t.ImageID,
"Tags": tags,
"InitCommand": initCommand,
+ "DeployPublicKey": installPublicKey != nil,
}).Info("creating instance")
t0 := time.Now()
- inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
+ inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, installPublicKey)
lgrC := t.Logger.WithField("Duration", time.Since(t0))
if err != nil {
// Create() might have failed due to a bug or network
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list