[ARVADOS] created: 1.3.0-3170-g799f8e333
Git user
git at public.arvados.org
Wed Sep 16 20:53:35 UTC 2020
at 799f8e333e7067cee0db0ee8bbcf45a56602d1f1 (commit)
commit 799f8e333e7067cee0db0ee8bbcf45a56602d1f1
Author: Ward Vandewege <ward at curii.com>
Date: Wed Sep 16 16:53:00 2020 -0400
16838: a-d-c: metrics: add runProbe success/failure duration metrics.
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 9f1eb098e..d5d90bf35 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -227,6 +227,10 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="success"} [0-9]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="success"} [0-9e+.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="fail"} [0-9]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="fail"} [0-9e+.]*`)
}
func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index c6eaeae2b..78c51dee1 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -184,6 +184,7 @@ type Pool struct {
mTimeToReadyForContainer prometheus.Summary
mTimeFromShutdownToGone prometheus.Summary
mTimeFromQueueToCrunchRun prometheus.Summary
+ mRunProbeDuration *prometheus.SummaryVec
}
type createCall struct {
@@ -682,6 +683,17 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
})
reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
+ wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "instances_run_probe_duration_seconds",
+ Help: "Number of seconds per runProbe call.",
+ Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+ }, []string{"outcome"})
+ for _, v := range []string{"success", "fail"} {
+ wp.mRunProbeDuration.WithLabelValues(v).Observe(0)
+ }
+ reg.MustRegister(wp.mRunProbeDuration)
}
func (wp *Pool) runMetrics() {
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 5b145d7c6..9e89d7daa 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -192,7 +192,7 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
}
// ProbeAndUpdate conducts appropriate boot/running probes (if any)
-// for the worker's curent state. If a previous probe is still
+// for the worker's current state. If a previous probe is still
// running, it does nothing.
//
// It should be called in a new goroutine.
@@ -376,6 +376,7 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
if u := wkr.instance.RemoteUser(); u != "root" {
cmd = "sudo " + cmd
}
+ before := time.Now()
stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
if err != nil {
wkr.logger.WithFields(logrus.Fields{
@@ -383,8 +384,10 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
"stdout": string(stdout),
"stderr": string(stderr),
}).WithError(err).Warn("probe failed")
+ wkr.wp.mRunProbeDuration.WithLabelValues("fail").Observe(time.Now().Sub(before).Seconds())
return
}
+ wkr.wp.mRunProbeDuration.WithLabelValues("success").Observe(time.Now().Sub(before).Seconds())
ok = true
staleRunLock := false
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list