[ARVADOS] created: 1.3.0-3079-g59c6fadc7
Git user
git at public.arvados.org
Thu Sep 3 17:11:10 UTC 2020
at 59c6fadc7aa48cf1c7a68a8fd6fa9ab420eef7ba (commit)
commit 59c6fadc7aa48cf1c7a68a8fd6fa9ab420eef7ba
Author: Ward Vandewege <ward at curii.com>
Date: Thu Sep 3 13:10:42 2020 -0400
16636: a-d-c: add a time-to-ssh metric
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 42decff31..3d602c08d 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -215,6 +215,9 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="success"} [^0].*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="shutdown"} [^0].*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="unknown"} 0\n.*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds{quantile="0.95"} [0-9.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_count [0-9]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_sum [0-9.]*`)
}
func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 435b6e43a..b4d75478b 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -177,6 +177,7 @@ type Pool struct {
mMemory *prometheus.GaugeVec
mBootOutcomes *prometheus.CounterVec
mDisappearances *prometheus.CounterVec
+ mTimeToSSH prometheus.Summary
}
type createCall struct {
@@ -324,6 +325,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
}
initCmd := TagVerifier{nil, secret}.InitCommand()
+ // WARD HERE FIXME
inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
wp.mtx.Lock()
defer wp.mtx.Unlock()
@@ -626,6 +628,14 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
wp.mDisappearances.WithLabelValues(v).Add(0)
}
reg.MustRegister(wp.mDisappearances)
+ wp.mTimeToSSH = prometheus.NewSummary(prometheus.SummaryOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "instances_time_to_ssh_seconds",
+ Help: "Number of seconds between instance creation and the first successful SSH connection.",
+ Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+ })
+ reg.MustRegister(wp.mTimeToSSH)
}
func (wp *Pool) runMetrics() {
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 5d2360f3c..14f649334 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -108,6 +108,7 @@ type worker struct {
starting map[string]*remoteRunner // remember to update state idle<->running when this changes
probing chan struct{}
bootOutcomeReported bool
+ timeToSSHReported bool
}
func (wkr *worker) onUnkillable(uuid string) {
@@ -140,6 +141,17 @@ func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
wkr.bootOutcomeReported = true
}
+// reportTimeToSSH observes the seconds elapsed since wkr.appeared in the pool's mTimeToSSH summary, at most once per worker. Caller must have lock.
+func (wkr *worker) reportTimeToSSH() {
+ if wkr.timeToSSHReported { // already handled for this worker; nothing more to do
+ return
+ }
+ if wkr.wp.mTimeToSSH != nil { // no summary on the pool (presumably metrics were never registered -- confirm): skip the observation
+ wkr.wp.mTimeToSSH.Observe(time.Since(wkr.appeared).Seconds()) // NOTE(review): measured from wkr.appeared; confirm this matches "instance creation" in the metric's help text
+ }
+ wkr.timeToSSHReported = true // set even when nothing was observed, so later calls return early
+}
+
// caller must have lock.
func (wkr *worker) setIdleBehavior(idleBehavior IdleBehavior) {
wkr.logger.WithField("IdleBehavior", idleBehavior).Info("set idle behavior")
@@ -365,6 +377,9 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
}).WithError(err).Warn("probe failed")
return
}
+ wkr.mtx.Lock()
+ wkr.reportTimeToSSH()
+ wkr.mtx.Unlock()
ok = true
for _, s := range strings.Split(string(stdout), "\n") {
if s == "broken" {
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list