[ARVADOS] created: 1.3.0-3151-ga5dbdd1db

Mon Sep 14 19:15:54 UTC 2020

at  a5dbdd1dbcdfcb835e7c1fe741e4c00927177404 (commit)


commit a5dbdd1dbcdfcb835e7c1fe741e4c00927177404
Author: Ward Vandewege <ward at curii.com>
Date:   Mon Sep 14 15:14:32 2020 -0400

    16636: add 'containers_longest_wait_time_seconds' metric: a gauge that
           indicates the current longest wait time between seeing a
           container in the queue (or requeueing it) and starting its
           crunch-run process on a worker.
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index b9d653a82..9bbb064fe 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -34,6 +34,7 @@ func (sch *Scheduler) runQueue() {
 	dontstart := map[arvados.InstanceType]bool{}
 	var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
 	var containerAllocatedWorkerBootingCount int
+	var longestWaitTimeCandidate, previousLongestWaitTimeCandidate float64
 
 tryrun:
 	for i, ctr := range sorted {
@@ -45,6 +46,11 @@ tryrun:
 		if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
 			continue
 		}
+		previousLongestWaitTimeCandidate = longestWaitTimeCandidate
+		since := time.Since(ctr.CreatedAt).Seconds()
+		if since > longestWaitTimeCandidate {
+			longestWaitTimeCandidate = since
+		}
 		switch ctr.State {
 		case arvados.ContainerStateQueued:
 			if unalloc[it] < 1 && sch.pool.AtQuota() {
@@ -92,6 +98,7 @@ tryrun:
 				logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
 			} else if sch.pool.StartContainer(it, ctr) {
 				// Success.
+				longestWaitTimeCandidate = previousLongestWaitTimeCandidate
 			} else {
 				containerAllocatedWorkerBootingCount += 1
 				dontstart[it] = true
@@ -101,6 +108,7 @@ tryrun:
 
 	sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
 	sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+	sch.mLongestWaitTimeSinceQueue.Set(longestWaitTimeCandidate)
 
 	if len(overquota) > 0 {
 		// Unlock any containers that are unmappable while
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 6d7036498..e7963ca7d 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -391,15 +391,16 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
 	c.Check(pool.Running(), check.HasLen, 0)
 }
 
-func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
 	ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
 	queue := test.Queue{
 		ChooseType: chooseType,
 		Containers: []arvados.Container{
 			{
-				UUID:     test.ContainerUUID(1),
-				Priority: 1,
-				State:    arvados.ContainerStateLocked,
+				UUID:      test.ContainerUUID(1),
+				Priority:  1,
+				State:     arvados.ContainerStateLocked,
+				CreatedAt: time.Now().Add(-10 * time.Second),
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 1,
 					RAM:   1 << 30,
@@ -420,6 +421,7 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
 
 	c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
 	c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
 
 	// Create a pool without workers. The queued container will not be started, and the
 	// 'over quota' metric will be 1 because no workers are available and canCreate defaults
@@ -430,4 +432,35 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
 
 	c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
 	c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
+
+	// Reset the queue, and create a pool with an idle worker. The queued
+	// container will be started immediately and mLongestWaitTimeSinceQueue
+	// should be zero.
+	queue = test.Queue{
+		ChooseType: chooseType,
+		Containers: []arvados.Container{
+			{
+				UUID:      test.ContainerUUID(1),
+				Priority:  1,
+				State:     arvados.ContainerStateLocked,
+				CreatedAt: time.Now().Add(-10 * time.Second),
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
+			},
+		},
+	}
+	queue.Update()
+
+	pool = stubPool{
+		idle:    map[arvados.InstanceType]int{test.InstanceType(1): 1},
+		unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+		running: map[string]time.Time{},
+	}
+	sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+	sch.runQueue()
+
+	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
 }
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6fd47e919..c16924573 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -46,6 +46,7 @@ type Scheduler struct {
 
 	mContainersAllocatedNotStarted   prometheus.Gauge
 	mContainersNotAllocatedOverQuota prometheus.Gauge
+	mLongestWaitTimeSinceQueue       prometheus.Gauge
 }
 
 // New returns a new unstarted Scheduler.
@@ -87,6 +88,13 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
 		Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
 	})
 	reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
+	sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "containers_longest_wait_time_seconds",
+		Help:      "Current longest wait time of any container since queuing, and before the start of crunch-run.",
+	})
+	reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
 }
 
 // Start starts the scheduler.

commit 62403e0d1d95f8526d17222a66356382e2d4b028
Merge: ab439e74c f0cc3c391
Author: Ward Vandewege <ward at curii.com>
Date:   Mon Sep 14 13:03:18 2020 -0400

    16636: Merge branch 'master' into 16636-more-metrics
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>


commit ab439e74c340896f52629996f54614ea190f0f5e
Author: Ward Vandewege <ward at curii.com>
Date:   Mon Sep 14 09:53:38 2020 -0400

    16636: add 'containers_time_from_queue_to_crunch_run_seconds' metric:
           a summary of wait times (between seeing a container in the
           queue or requeueing it, and starting its crunch-run process on
           a worker) across previous starts.
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 92a42c7a2..80cb28f35 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -223,6 +223,8 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 75bc01fc0..67962c9d6 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -170,16 +170,17 @@ type Pool struct {
 	runnerMD5    [md5.Size]byte
 	runnerCmd    string
 
-	mContainersRunning       prometheus.Gauge
-	mInstances               *prometheus.GaugeVec
-	mInstancesPrice          *prometheus.GaugeVec
-	mVCPUs                   *prometheus.GaugeVec
-	mMemory                  *prometheus.GaugeVec
-	mBootOutcomes            *prometheus.CounterVec
-	mDisappearances          *prometheus.CounterVec
-	mTimeToSSH               prometheus.Summary
-	mTimeToReadyForContainer prometheus.Summary
-	mTimeFromShutdownToGone  prometheus.Summary
+	mContainersRunning        prometheus.Gauge
+	mInstances                *prometheus.GaugeVec
+	mInstancesPrice           *prometheus.GaugeVec
+	mVCPUs                    *prometheus.GaugeVec
+	mMemory                   *prometheus.GaugeVec
+	mBootOutcomes             *prometheus.CounterVec
+	mDisappearances           *prometheus.CounterVec
+	mTimeToSSH                prometheus.Summary
+	mTimeToReadyForContainer  prometheus.Summary
+	mTimeFromShutdownToGone   prometheus.Summary
+	mTimeFromQueueToCrunchRun prometheus.Summary
 }
 
 type createCall struct {
@@ -670,6 +671,14 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
 	})
 	reg.MustRegister(wp.mTimeFromShutdownToGone)
+	wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "containers_time_from_queue_to_crunch_run_seconds",
+		Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
 }
 
 func (wp *Pool) runMetrics() {
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 9199d4baf..95794d0b3 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -176,6 +176,9 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
 	}
 	go func() {
 		rr.Start()
+		if wkr.wp.mTimeFromQueueToCrunchRun != nil {
+			wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds())
+		}
 		wkr.mtx.Lock()
 		defer wkr.mtx.Unlock()
 		now := time.Now()

commit b1df618b0c33eccc4ab15d89aad4a10fbf245993
Author: Ward Vandewege <ward at curii.com>
Date:   Fri Sep 11 12:30:07 2020 -0400

    16636: add instances_time_from_shutdown_request_to_disappearance_seconds
           metric
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 6e1850410..92a42c7a2 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -221,6 +221,8 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 086887cb4..75bc01fc0 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -179,6 +179,7 @@ type Pool struct {
 	mDisappearances          *prometheus.CounterVec
 	mTimeToSSH               prometheus.Summary
 	mTimeToReadyForContainer prometheus.Summary
+	mTimeFromShutdownToGone  prometheus.Summary
 }
 
 type createCall struct {
@@ -661,6 +662,14 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
 	})
 	reg.MustRegister(wp.mTimeToReadyForContainer)
+	wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+		Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeFromShutdownToGone)
 }
 
 func (wp *Pool) runMetrics() {
@@ -930,6 +939,9 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 		if wp.mDisappearances != nil {
 			wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
 		}
+		if wp.mTimeFromShutdownToGone != nil {
+			wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
+		}
 		delete(wp.workers, id)
 		go wkr.Close()
 		notify = true

-----------------------------------------------------------------------


hooks/post-receive
--