[arvados] created: 2.6.0-485-g471f5c792

git repository hosting git at public.arvados.org
Mon Aug 14 16:00:04 UTC 2023


        at  471f5c792f4bf06ca4fab1600074c8c335b7d70f (commit)


commit 471f5c792f4bf06ca4fab1600074c8c335b7d70f
Author: Tom Clegg <tom at curii.com>
Date:   Mon Aug 14 11:14:24 2023 -0400

    20649: Mention `arvados-server cloudtest` in boot timeout log msg.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 0b406ce61..10a28157e 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -555,9 +555,11 @@ func (wkr *worker) shutdownIfBroken(dur time.Duration) bool {
 		// Never shut down.
 		return false
 	}
-	label, threshold := "", wkr.wp.timeoutProbe
+	prologue, epilogue, threshold := "", "", wkr.wp.timeoutProbe
 	if wkr.state == StateUnknown || wkr.state == StateBooting {
-		label, threshold = "new ", wkr.wp.timeoutBooting
+		prologue = "new "
+		epilogue = " -- `arvados-server cloudtest` might help troubleshoot, see https://doc.arvados.org/main/admin/cloudtest.html"
+		threshold = wkr.wp.timeoutBooting
 	}
 	if dur < threshold {
 		return false
@@ -566,7 +568,7 @@ func (wkr *worker) shutdownIfBroken(dur time.Duration) bool {
 		"Duration": dur,
 		"Since":    wkr.probed,
 		"State":    wkr.state,
-	}).Warnf("%sinstance unresponsive, shutting down", label)
+	}).Warnf("%sinstance unresponsive, shutting down%s", prologue, epilogue)
 	wkr.shutdown()
 	return true
 }

commit e44725a3792df227f189f88ffb2cd1dbf0e93489
Author: Tom Clegg <tom at curii.com>
Date:   Mon Aug 14 11:11:51 2023 -0400

    20649: Fix panic on race, worker shutdown vs. container startup.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 8b9326fa9..0b406ce61 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -188,6 +188,14 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
 		}
 		wkr.mtx.Lock()
 		defer wkr.mtx.Unlock()
+		if wkr.starting[ctr.UUID] != rr {
+			// Someone else (e.g., wkr.probeAndUpdate() ->
+			// wkr.updateRunning() or wkr.Close()) already
+			// moved our runner from wkr.starting to
+			// wkr.running or deleted it while we were in
+			// rr.Start().
+			return
+		}
 		now := time.Now()
 		wkr.updated = now
 		wkr.busy = now
@@ -665,10 +673,12 @@ func (wkr *worker) Close() {
 	for uuid, rr := range wkr.running {
 		wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
 		rr.Close()
+		delete(wkr.running, uuid)
 	}
 	for uuid, rr := range wkr.starting {
 		wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
 		rr.Close()
+		delete(wkr.starting, uuid)
 	}
 }
 

commit b38ee1ddd208d40c236361536639c811787aaea2
Author: Tom Clegg <tom at curii.com>
Date:   Mon Aug 14 11:10:44 2023 -0400

    20649: Don't wait timeoutSignal before sending first TERM signal.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/worker/runner.go b/lib/dispatchcloud/worker/runner.go
index ac039272c..f22b8922a 100644
--- a/lib/dispatchcloud/worker/runner.go
+++ b/lib/dispatchcloud/worker/runner.go
@@ -138,7 +138,7 @@ func (rr *remoteRunner) Kill(reason string) {
 		termDeadline := time.Now().Add(rr.timeoutTERM)
 		t := time.NewTicker(rr.timeoutSignal)
 		defer t.Stop()
-		for range t.C {
+		for ; ; <-t.C {
 			switch {
 			case rr.isClosed():
 				return

commit c9e7b20bf97ea8e5ab77b5e20400e8736eb9c42b
Author: Tom Clegg <tom at curii.com>
Date:   Wed Aug 9 17:07:35 2023 -0400

    20457: Fix flaky test.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 4583a596e..8a8a34f2b 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -367,6 +367,7 @@ func (s *DispatcherSuite) TestInstancesAPI(c *check.C) {
 	sr := getInstances()
 	c.Check(len(sr.Items), check.Equals, 0)
 
+	s.stubDriver.ErrorRateCreate = 0
 	ch := s.disp.pool.Subscribe()
 	defer s.disp.pool.Unsubscribe(ch)
 	ok := s.disp.pool.Create(test.InstanceType(1))

commit 687b9b47f2a9be9dd7dfba5c01a452bce1a82114
Author: Tom Clegg <tom at curii.com>
Date:   Fri Aug 11 17:47:32 2023 -0400

    20649: Log actual error from last failed boot probe.
    
    Not just the stderr content, which is useless if the error was
    something like "connection failed".
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index e6e817458..8b9326fa9 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -240,6 +240,7 @@ func (wkr *worker) probeAndUpdate() {
 		ctrUUIDs []string
 		ok       bool
 		stderr   []byte // from probeBooted
+		errLast  error  // from probeBooted or copyRunnerData
 	)
 
 	switch initialState {
@@ -256,7 +257,8 @@ func (wkr *worker) probeAndUpdate() {
 	logger := wkr.logger.WithField("ProbeStart", probeStart)
 
 	if !booted {
-		booted, stderr = wkr.probeBooted()
+		stderr, errLast = wkr.probeBooted()
+		booted = errLast == nil
 		shouldCopy := booted || initialState == StateUnknown
 		if !booted {
 			// Pretend this probe succeeded if another
@@ -273,6 +275,7 @@ func (wkr *worker) probeAndUpdate() {
 			if err != nil {
 				booted = false
 				wkr.logger.WithError(err).WithField("stderr", string(stderrCopy)).Warn("error copying runner binary")
+				errLast = err
 			}
 		}
 		if booted {
@@ -315,7 +318,7 @@ func (wkr *worker) probeAndUpdate() {
 				logger.WithFields(logrus.Fields{
 					"Duration": dur,
 					"stderr":   string(stderr),
-				}).Info("boot failed")
+				}).WithError(errLast).Info("boot failed")
 			}
 		}
 		return
@@ -466,7 +469,7 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
 	return
 }
 
-func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
+func (wkr *worker) probeBooted() (stderr []byte, err error) {
 	cmd := wkr.wp.bootProbeCommand
 	if cmd == "" {
 		cmd = "true"
@@ -498,10 +501,10 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
 			// remain"
 			logger.WithError(err).Warn("boot probe failed")
 		}
-		return false, stderr
+		return stderr, err
 	}
 	logger.Info("boot probe succeeded")
-	return true, stderr
+	return stderr, nil
 }
 
 func (wkr *worker) copyRunnerData() (stdout, stderr []byte, err error) {

commit 37a55033565011d984794562022794a901155d13
Author: Tom Clegg <tom at curii.com>
Date:   Fri Aug 11 17:47:06 2023 -0400

    20649: Log boot probe errors other than usual wait-for-boot errors.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/sshexecutor/executor.go b/lib/dispatchcloud/sshexecutor/executor.go
index c37169921..3761c6992 100644
--- a/lib/dispatchcloud/sshexecutor/executor.go
+++ b/lib/dispatchcloud/sshexecutor/executor.go
@@ -18,6 +18,8 @@ import (
 	"golang.org/x/crypto/ssh"
 )
 
+var ErrNoAddress = errors.New("instance has no address")
+
 // New returns a new Executor, using the given target.
 func New(t cloud.ExecutorTarget) *Executor {
 	return &Executor{target: t}
@@ -196,7 +198,7 @@ func (exr *Executor) TargetHostPort() (string, string) {
 func (exr *Executor) setupSSHClient() (*ssh.Client, error) {
 	addr := net.JoinHostPort(exr.TargetHostPort())
 	if addr == ":" {
-		return nil, errors.New("instance has no address")
+		return nil, ErrNoAddress
 	}
 	var receivedKey ssh.PublicKey
 	client, err := ssh.Dial("tcp", addr, &ssh.ClientConfig{
diff --git a/lib/dispatchcloud/sshexecutor/executor_test.go b/lib/dispatchcloud/sshexecutor/executor_test.go
index b4afeafa8..95b29fa6a 100644
--- a/lib/dispatchcloud/sshexecutor/executor_test.go
+++ b/lib/dispatchcloud/sshexecutor/executor_test.go
@@ -6,6 +6,7 @@ package sshexecutor
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -146,6 +147,7 @@ func (s *ExecutorSuite) TestExecute(c *check.C) {
 		exr.SetTargetPort("0")
 		_, _, err = exr.Execute(nil, command, nil)
 		c.Check(err, check.ErrorMatches, `.*connection refused.*`)
+		c.Check(errors.As(err, new(*net.OpError)), check.Equals, true)
 
 		// Use the test server's listening port.
 		exr.SetTargetPort(target.Port())
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 8b4be1a3c..e6e817458 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -7,17 +7,21 @@ package worker
 import (
 	"bytes"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
+	"net"
 	"path/filepath"
 	"strings"
 	"sync"
 	"time"
 
 	"git.arvados.org/arvados.git/lib/cloud"
+	"git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/stats"
 	"github.com/sirupsen/logrus"
+	"golang.org/x/crypto/ssh"
 )
 
 const (
@@ -301,10 +305,10 @@ func (wkr *worker) probeAndUpdate() {
 		dur := probeStart.Sub(wkr.probed)
 		if wkr.shutdownIfBroken(dur) {
 			// stderr from failed run-probes will have
-			// been logged already, but boot-probe
+			// been logged already, but some boot-probe
 			// failures are normal so they are logged only
-			// at Debug level. This is our chance to log
-			// some evidence about why the node never
+			// at Debug level. This may be our chance to
+			// log some evidence about why the node never
 			// booted, even in non-debug mode.
 			if !booted {
 				wkr.reportBootOutcome(BootOutcomeFailed)
@@ -474,7 +478,26 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
 		"stderr":  string(stderr),
 	})
 	if err != nil {
-		logger.WithError(err).Debug("boot probe failed")
+		if errors.Is(err, sshexecutor.ErrNoAddress) ||
+			errors.As(err, new(*net.OpError)) ||
+			errors.As(err, new(*ssh.ExitError)) {
+			// These errors are expected while the
+			// instance is booting, so we only log them at
+			// debug level.
+			logger.WithError(err).Debug("boot probe failed")
+		} else {
+			// Other errors are more likely to indicate a
+			// configuration problem, and it's more
+			// sysadmin-friendly to show them right away
+			// instead of waiting until boot timeout and
+			// only showing the last error.
+			//
+			// Example: "ssh: handshake failed: ssh:
+			// unable to authenticate, attempted methods
+			// [none publickey], no supported methods
+			// remain"
+			logger.WithError(err).Warn("boot probe failed")
+		}
 		return false, stderr
 	}
 	logger.Info("boot probe succeeded")

commit c7d05b1dda46923dbf807caca85767429c122341
Author: Tom Clegg <tom at curii.com>
Date:   Thu Aug 10 10:45:46 2023 -0400

    20649: cloudtest obeys DeployPublicKey config, like a-d-c.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/cloud/cloudtest/cmd.go b/lib/cloud/cloudtest/cmd.go
index 0c1049b76..2dc13e5a5 100644
--- a/lib/cloud/cloudtest/cmd.go
+++ b/lib/cloud/cloudtest/cmd.go
@@ -99,6 +99,7 @@ func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, s
 		InstanceType:        it,
 		SSHKey:              key,
 		SSHPort:             cluster.Containers.CloudVMs.SSHPort,
+		DeployPublicKey:     cluster.Containers.CloudVMs.DeployPublicKey,
 		BootProbeCommand:    cluster.Containers.CloudVMs.BootProbeCommand,
 		InstanceInitCommand: cloud.InitCommand(cluster.Containers.CloudVMs.InstanceInitCommand),
 		ShellCommand:        *shellCommand,
diff --git a/lib/cloud/cloudtest/tester.go b/lib/cloud/cloudtest/tester.go
index f7c2a3c73..a335278ed 100644
--- a/lib/cloud/cloudtest/tester.go
+++ b/lib/cloud/cloudtest/tester.go
@@ -41,6 +41,7 @@ type tester struct {
 	ImageID             cloud.ImageID
 	SSHKey              ssh.Signer
 	SSHPort             string
+	DeployPublicKey     bool
 	BootProbeCommand    string
 	InstanceInitCommand cloud.InitCommand
 	ShellCommand        string
@@ -174,15 +175,21 @@ func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool
 	bootDeadline := time.Now().Add(t.TimeoutBooting)
 	initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
 
+	installPublicKey := t.SSHKey.PublicKey()
+	if !t.DeployPublicKey {
+		installPublicKey = nil
+	}
+
 	t.Logger.WithFields(logrus.Fields{
 		"InstanceType":         t.InstanceType.Name,
 		"ProviderInstanceType": t.InstanceType.ProviderType,
 		"ImageID":              t.ImageID,
 		"Tags":                 tags,
 		"InitCommand":          initCommand,
+		"DeployPublicKey":      installPublicKey != nil,
 	}).Info("creating instance")
 	t0 := time.Now()
-	inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
+	inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, installPublicKey)
 	lgrC := t.Logger.WithField("Duration", time.Since(t0))
 	if err != nil {
 		// Create() might have failed due to a bug or network

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list