[ARVADOS] updated: 1.3.0-3079-g5c5ac412b
Git user
git at public.arvados.org
Thu Sep 3 20:27:34 UTC 2020
Summary of changes:
lib/dispatchcloud/dispatcher_test.go | 3 +++
lib/dispatchcloud/worker/pool.go | 47 +++++++++++++++++++++++++++---------
lib/dispatchcloud/worker/verify.go | 4 ++-
lib/dispatchcloud/worker/worker.go | 17 +++++++------
4 files changed, 51 insertions(+), 20 deletions(-)
discards 59c6fadc7aa48cf1c7a68a8fd6fa9ab420eef7ba (commit)
via 5c5ac412b722025d1af37f81bea60a4b503ce6aa (commit)
This update added new revisions after undoing existing revisions. That is
to say, the old revision is not a strict subset of the new revision. This
situation occurs when you --force push a change and generate a repository
containing something like this:
 * -- * -- B -- O -- O -- O (59c6fadc7aa48cf1c7a68a8fd6fa9ab420eef7ba)
            \
             N -- N -- N (5c5ac412b722025d1af37f81bea60a4b503ce6aa)
When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 5c5ac412b722025d1af37f81bea60a4b503ce6aa
Author: Ward Vandewege <ward at curii.com>
Date: Thu Sep 3 13:10:42 2020 -0400
16636: a-d-c: add a time-to-ssh and time-to-ready-for-container metrics
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 42decff31..6e1850410 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -215,6 +215,12 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="success"} [^0].*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="shutdown"} [^0].*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="unknown"} 0\n.*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds{quantile="0.95"} [0-9.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_count [0-9]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_sum [0-9.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
}
func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 435b6e43a..6ca15de49 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -170,13 +170,15 @@ type Pool struct {
runnerMD5 [md5.Size]byte
runnerCmd string
- mContainersRunning prometheus.Gauge
- mInstances *prometheus.GaugeVec
- mInstancesPrice *prometheus.GaugeVec
- mVCPUs *prometheus.GaugeVec
- mMemory *prometheus.GaugeVec
- mBootOutcomes *prometheus.CounterVec
- mDisappearances *prometheus.CounterVec
+ mContainersRunning prometheus.Gauge
+ mInstances *prometheus.GaugeVec
+ mInstancesPrice *prometheus.GaugeVec
+ mVCPUs *prometheus.GaugeVec
+ mMemory *prometheus.GaugeVec
+ mBootOutcomes *prometheus.CounterVec
+ mDisappearances *prometheus.CounterVec
+ mTimeToSSH prometheus.Summary
+ mTimeToReadyForContainer prometheus.Summary
}
type createCall struct {
@@ -323,7 +325,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
wp.tagKeyPrefix + tagKeyIdleBehavior: string(IdleBehaviorRun),
wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
}
- initCmd := TagVerifier{nil, secret}.InitCommand()
+ initCmd := TagVerifier{nil, secret, nil}.InitCommand()
inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
wp.mtx.Lock()
defer wp.mtx.Unlock()
@@ -367,6 +369,23 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
return nil
}
+// reportSSHConnected records the time-to-SSH metric for inst.
+//
+// It is handed to TagVerifier as the ReportVerified callback. Only the
+// first call for a worker in StateBooting has any effect: it stores
+// the current time in firstSSHConnection and, if the metric has been
+// registered, observes the seconds elapsed since the worker appeared.
+//
+// NOTE(review): wp.workers is read here before any lock is taken,
+// while updateWorker documents that its caller holds the lock --
+// confirm whether this lookup also needs to happen under wp.mtx.
+func (wp *Pool) reportSSHConnected(inst cloud.Instance) {
+	wkr := wp.workers[inst.ID()]
+	if wkr == nil {
+		// No worker is tracked for this instance (a missing map key
+		// yields nil), so there is nothing to time, and locking
+		// wkr.mtx below would dereference a nil pointer.
+		return
+	}
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	if wkr.state != StateBooting || !wkr.firstSSHConnection.IsZero() {
+		// the node is not in booting state (can happen if a-d-c is
+		// restarted) OR this is not the first SSH connection
+		return
+	}
+
+	// Take a single timestamp so the stored connection time and the
+	// observed duration agree with each other.
+	wkr.firstSSHConnection = time.Now()
+	if wp.mTimeToSSH != nil {
+		wp.mTimeToSSH.Observe(wkr.firstSSHConnection.Sub(wkr.appeared).Seconds())
+	}
+}
+
// Add or update worker attached to the given instance.
//
// The second return value is true if a new worker is created.
@@ -377,7 +396,7 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
// Caller must have lock.
func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
- inst = TagVerifier{inst, secret}
+ inst = TagVerifier{Instance: inst, Secret: secret, ReportVerified: wp.reportSSHConnected}
id := inst.ID()
if wkr := wp.workers[id]; wkr != nil {
wkr.executor.SetTarget(inst)
@@ -626,6 +645,22 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
wp.mDisappearances.WithLabelValues(v).Add(0)
}
reg.MustRegister(wp.mDisappearances)
+ wp.mTimeToSSH = prometheus.NewSummary(prometheus.SummaryOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "instances_time_to_ssh_seconds",
+ Help: "Number of seconds between instance creation and the first successful SSH connection.",
+ Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+ })
+ reg.MustRegister(wp.mTimeToSSH)
+ wp.mTimeToReadyForContainer = prometheus.NewSummary(prometheus.SummaryOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "instances_time_to_ready_for_container_seconds",
+ Help: "Number of seconds between the first successful SSH connection and ready to run a container.",
+ Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+ })
+ reg.MustRegister(wp.mTimeToReadyForContainer)
}
func (wp *Pool) runMetrics() {
diff --git a/lib/dispatchcloud/worker/verify.go b/lib/dispatchcloud/worker/verify.go
index 597950fca..4e6ee86c6 100644
--- a/lib/dispatchcloud/worker/verify.go
+++ b/lib/dispatchcloud/worker/verify.go
@@ -23,7 +23,8 @@ var (
type TagVerifier struct {
cloud.Instance
- Secret string
+ Secret string
+ ReportVerified func(cloud.Instance)
}
func (tv TagVerifier) InitCommand() cloud.InitCommand {
@@ -31,6 +32,7 @@ func (tv TagVerifier) InitCommand() cloud.InitCommand {
}
func (tv TagVerifier) VerifyHostKey(pubKey ssh.PublicKey, client *ssh.Client) error {
+ tv.ReportVerified(tv.Instance)
if err := tv.Instance.VerifyHostKey(pubKey, client); err != cloud.ErrNotImplemented || tv.Secret == "" {
// If the wrapped instance indicates it has a way to
// verify the key, return that decision.
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 5d2360f3c..9199d4baf 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -103,11 +103,13 @@ type worker struct {
updated time.Time
busy time.Time
destroyed time.Time
+ firstSSHConnection time.Time
lastUUID string
running map[string]*remoteRunner // remember to update state idle<->running when this changes
starting map[string]*remoteRunner // remember to update state idle<->running when this changes
probing chan struct{}
bootOutcomeReported bool
+ timeToReadyReported bool
}
func (wkr *worker) onUnkillable(uuid string) {
@@ -140,6 +142,17 @@ func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
wkr.bootOutcomeReported = true
}
+// reportTimeBetweenFirstSSHAndReadyForContainer observes, at most once
+// per worker, the number of seconds elapsed since firstSSHConnection
+// in the pool's mTimeToReadyForContainer summary.
+//
+// caller must have lock.
+func (wkr *worker) reportTimeBetweenFirstSSHAndReadyForContainer() {
+	if wkr.timeToReadyReported {
+		return
+	}
+	// Guard the metric that is actually used (previously mTimeToSSH
+	// was tested here). Also skip the observation when no first SSH
+	// connection was recorded: time.Since on the zero Time would
+	// yield a meaningless, enormous duration.
+	if wkr.wp.mTimeToReadyForContainer != nil && !wkr.firstSSHConnection.IsZero() {
+		wkr.wp.mTimeToReadyForContainer.Observe(time.Since(wkr.firstSSHConnection).Seconds())
+	}
+	wkr.timeToReadyReported = true
+}
+
// caller must have lock.
func (wkr *worker) setIdleBehavior(idleBehavior IdleBehavior) {
wkr.logger.WithField("IdleBehavior", idleBehavior).Info("set idle behavior")
@@ -313,6 +326,9 @@ func (wkr *worker) probeAndUpdate() {
// Update state if this was the first successful boot-probe.
if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
+ if wkr.state == StateBooting {
+ wkr.reportTimeBetweenFirstSSHAndReadyForContainer()
+ }
// Note: this will change again below if
// len(wkr.starting)+len(wkr.running) > 0.
wkr.state = StateIdle
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list