[ARVADOS] updated: 1.3.0-3079-g5c5ac412b

Git user git at public.arvados.org
Thu Sep 3 20:27:34 UTC 2020


Summary of changes:
 lib/dispatchcloud/dispatcher_test.go |  3 +++
 lib/dispatchcloud/worker/pool.go     | 47 +++++++++++++++++++++++++++---------
 lib/dispatchcloud/worker/verify.go   |  4 ++-
 lib/dispatchcloud/worker/worker.go   | 17 +++++++------
 4 files changed, 51 insertions(+), 20 deletions(-)

  discards  59c6fadc7aa48cf1c7a68a8fd6fa9ab420eef7ba (commit)
       via  5c5ac412b722025d1af37f81bea60a4b503ce6aa (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (59c6fadc7aa48cf1c7a68a8fd6fa9ab420eef7ba)
            \
             N -- N -- N (5c5ac412b722025d1af37f81bea60a4b503ce6aa)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 5c5ac412b722025d1af37f81bea60a4b503ce6aa
Author: Ward Vandewege <ward at curii.com>
Date:   Thu Sep 3 13:10:42 2020 -0400

    16636: a-d-c: add time-to-ssh and time-to-ready-for-container metrics
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 42decff31..6e1850410 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -215,6 +215,12 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="success"} [^0].*`)
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="shutdown"} [^0].*`)
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="unknown"} 0\n.*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds{quantile="0.95"} [0-9.]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_count [0-9]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_sum [0-9.]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 435b6e43a..6ca15de49 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -170,13 +170,15 @@ type Pool struct {
 	runnerMD5    [md5.Size]byte
 	runnerCmd    string
 
-	mContainersRunning prometheus.Gauge
-	mInstances         *prometheus.GaugeVec
-	mInstancesPrice    *prometheus.GaugeVec
-	mVCPUs             *prometheus.GaugeVec
-	mMemory            *prometheus.GaugeVec
-	mBootOutcomes      *prometheus.CounterVec
-	mDisappearances    *prometheus.CounterVec
+	mContainersRunning       prometheus.Gauge
+	mInstances               *prometheus.GaugeVec
+	mInstancesPrice          *prometheus.GaugeVec
+	mVCPUs                   *prometheus.GaugeVec
+	mMemory                  *prometheus.GaugeVec
+	mBootOutcomes            *prometheus.CounterVec
+	mDisappearances          *prometheus.CounterVec
+	mTimeToSSH               prometheus.Summary
+	mTimeToReadyForContainer prometheus.Summary
 }
 
 type createCall struct {
@@ -323,7 +325,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
 			wp.tagKeyPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
 			wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
 		}
-		initCmd := TagVerifier{nil, secret}.InitCommand()
+		initCmd := TagVerifier{nil, secret, nil}.InitCommand()
 		inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
 		wp.mtx.Lock()
 		defer wp.mtx.Unlock()
@@ -367,6 +369,23 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
 	return nil
 }
 
+// reportSSHConnected updates the mTimeToSSH metric after a connection
+// to the SSH daemon on inst. It is installed as the ReportVerified
+// callback of the TagVerifier wrapping each pool instance.
+//
+// Only the first connection to a worker that is still in StateBooting
+// is recorded. The observed value is the number of seconds since the
+// worker's "appeared" timestamp.
+func (wp *Pool) reportSSHConnected(inst cloud.Instance) {
+	// NOTE(review): wp.workers is read here without explicitly
+	// taking the pool lock -- confirm whether wkr.mtx also guards
+	// this map.
+	wkr := wp.workers[inst.ID()]
+	if wkr == nil {
+		// The pool has no worker for this instance (e.g., it was
+		// removed while the connection was being set up), so
+		// there is nothing to update.
+		return
+	}
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	if wkr.state != StateBooting || !wkr.firstSSHConnection.IsZero() {
+		// the node is not in booting state (can happen if a-d-c is restarted) OR
+		// this is not the first SSH connection
+		return
+	}
+
+	// Read the clock once so the observed duration and the stored
+	// timestamp refer to the same instant.
+	now := time.Now()
+	if wp.mTimeToSSH != nil {
+		wp.mTimeToSSH.Observe(now.Sub(wkr.appeared).Seconds())
+	}
+	wkr.firstSSHConnection = now
+}
+
 // Add or update worker attached to the given instance.
 //
 // The second return value is true if a new worker is created.
@@ -377,7 +396,7 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
 // Caller must have lock.
 func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
 	secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
-	inst = TagVerifier{inst, secret}
+	inst = TagVerifier{Instance: inst, Secret: secret, ReportVerified: wp.reportSSHConnected}
 	id := inst.ID()
 	if wkr := wp.workers[id]; wkr != nil {
 		wkr.executor.SetTarget(inst)
@@ -626,6 +645,22 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 		wp.mDisappearances.WithLabelValues(v).Add(0)
 	}
 	reg.MustRegister(wp.mDisappearances)
+	wp.mTimeToSSH = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_time_to_ssh_seconds",
+		Help:       "Number of seconds between instance creation and the first successful SSH connection.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeToSSH)
+	wp.mTimeToReadyForContainer = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_time_to_ready_for_container_seconds",
+		Help:       "Number of seconds between the first successful SSH connection and ready to run a container.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeToReadyForContainer)
 }
 
 func (wp *Pool) runMetrics() {
diff --git a/lib/dispatchcloud/worker/verify.go b/lib/dispatchcloud/worker/verify.go
index 597950fca..4e6ee86c6 100644
--- a/lib/dispatchcloud/worker/verify.go
+++ b/lib/dispatchcloud/worker/verify.go
@@ -23,7 +23,8 @@ var (
 
 type TagVerifier struct {
 	cloud.Instance
-	Secret string
+	Secret         string
+	ReportVerified func(cloud.Instance)
 }
 
 func (tv TagVerifier) InitCommand() cloud.InitCommand {
@@ -31,6 +32,7 @@ func (tv TagVerifier) InitCommand() cloud.InitCommand {
 }
 
 func (tv TagVerifier) VerifyHostKey(pubKey ssh.PublicKey, client *ssh.Client) error {
+	tv.ReportVerified(tv.Instance)
 	if err := tv.Instance.VerifyHostKey(pubKey, client); err != cloud.ErrNotImplemented || tv.Secret == "" {
 		// If the wrapped instance indicates it has a way to
 		// verify the key, return that decision.
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 5d2360f3c..9199d4baf 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -103,11 +103,13 @@ type worker struct {
 	updated             time.Time
 	busy                time.Time
 	destroyed           time.Time
+	firstSSHConnection  time.Time
 	lastUUID            string
 	running             map[string]*remoteRunner // remember to update state idle<->running when this changes
 	starting            map[string]*remoteRunner // remember to update state idle<->running when this changes
 	probing             chan struct{}
 	bootOutcomeReported bool
+	timeToReadyReported bool
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -140,6 +142,17 @@ func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
 	wkr.bootOutcomeReported = true
 }
 
+// reportTimeBetweenFirstSSHAndReadyForContainer updates the
+// mTimeToReadyForContainer metric with the number of seconds elapsed
+// since the worker's first SSH connection. It does so at most once
+// per worker; subsequent calls are no-ops.
+//
+// Nothing is observed if the metric has not been registered, or if no
+// first SSH connection time was recorded (the elapsed time would be
+// meaningless).
+//
+// caller must have lock.
+func (wkr *worker) reportTimeBetweenFirstSSHAndReadyForContainer() {
+	if wkr.timeToReadyReported {
+		return
+	}
+	wkr.timeToReadyReported = true
+	// Guard the metric that is actually used below, not mTimeToSSH.
+	if wkr.wp.mTimeToReadyForContainer == nil || wkr.firstSSHConnection.IsZero() {
+		return
+	}
+	wkr.wp.mTimeToReadyForContainer.Observe(time.Since(wkr.firstSSHConnection).Seconds())
+}
+
 // caller must have lock.
 func (wkr *worker) setIdleBehavior(idleBehavior IdleBehavior) {
 	wkr.logger.WithField("IdleBehavior", idleBehavior).Info("set idle behavior")
@@ -313,6 +326,9 @@ func (wkr *worker) probeAndUpdate() {
 
 	// Update state if this was the first successful boot-probe.
 	if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
+		if wkr.state == StateBooting {
+			wkr.reportTimeBetweenFirstSSHAndReadyForContainer()
+		}
 		// Note: this will change again below if
 		// len(wkr.starting)+len(wkr.running) > 0.
 		wkr.state = StateIdle

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list