[ARVADOS] created: 1.3.0-3170-g799f8e333

Git user git at public.arvados.org
Wed Sep 16 20:53:35 UTC 2020


        at  799f8e333e7067cee0db0ee8bbcf45a56602d1f1 (commit)


commit 799f8e333e7067cee0db0ee8bbcf45a56602d1f1
Author: Ward Vandewege <ward at curii.com>
Date:   Wed Sep 16 16:53:00 2020 -0400

    16838: a-d-c: metrics: add runProbe success/failure duration metrics.
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 9f1eb098e..d5d90bf35 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -227,6 +227,10 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="success"} [0-9]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="success"} [0-9e+.]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="fail"} [0-9]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="fail"} [0-9e+.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index c6eaeae2b..78c51dee1 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -184,6 +184,7 @@ type Pool struct {
 	mTimeToReadyForContainer  prometheus.Summary
 	mTimeFromShutdownToGone   prometheus.Summary
 	mTimeFromQueueToCrunchRun prometheus.Summary
+	mRunProbeDuration         *prometheus.SummaryVec
 }
 
 type createCall struct {
@@ -682,6 +683,17 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
 	})
 	reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
+	wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_run_probe_duration_seconds",
+		Help:       "Number of seconds per runProbe call.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	}, []string{"outcome"})
+	for _, v := range []string{"success", "fail"} {
+		wp.mRunProbeDuration.WithLabelValues(v).Observe(0)
+	}
+	reg.MustRegister(wp.mRunProbeDuration)
 }
 
 func (wp *Pool) runMetrics() {
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 5b145d7c6..9e89d7daa 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -192,7 +192,7 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
 }
 
 // ProbeAndUpdate conducts appropriate boot/running probes (if any)
-// for the worker's curent state. If a previous probe is still
+// for the worker's current state. If a previous probe is still
 // running, it does nothing.
 //
 // It should be called in a new goroutine.
@@ -376,6 +376,7 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
 	if u := wkr.instance.RemoteUser(); u != "root" {
 		cmd = "sudo " + cmd
 	}
+	before := time.Now()
 	stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
 	if err != nil {
 		wkr.logger.WithFields(logrus.Fields{
@@ -383,8 +384,10 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
 			"stdout":  string(stdout),
 			"stderr":  string(stderr),
 		}).WithError(err).Warn("probe failed")
+		wkr.wp.mRunProbeDuration.WithLabelValues("fail").Observe(time.Now().Sub(before).Seconds())
 		return
 	}
+	wkr.wp.mRunProbeDuration.WithLabelValues("success").Observe(time.Now().Sub(before).Seconds())
 	ok = true
 
 	staleRunLock := false

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list