[ARVADOS] created: 1.3.0-3151-ga5dbdd1db
Git user
git at public.arvados.org
Mon Sep 14 19:15:54 UTC 2020
at a5dbdd1dbcdfcb835e7c1fe741e4c00927177404 (commit)
commit a5dbdd1dbcdfcb835e7c1fe741e4c00927177404
Author: Ward Vandewege <ward at curii.com>
Date: Mon Sep 14 15:14:32 2020 -0400
16636: add 'containers_longest_wait_time_seconds' metric: a gauge that
indicates the current longest wait time between seeing a
container in the queue or requeueing, and starting its crunch-run
process on a worker.
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index b9d653a82..9bbb064fe 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -34,6 +34,7 @@ func (sch *Scheduler) runQueue() {
dontstart := map[arvados.InstanceType]bool{}
var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
var containerAllocatedWorkerBootingCount int
+ var longestWaitTimeCandidate, previousLongestWaitTimeCandidate float64
tryrun:
for i, ctr := range sorted {
@@ -45,6 +46,11 @@ tryrun:
if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
continue
}
+ previousLongestWaitTimeCandidate = longestWaitTimeCandidate
+ since := time.Since(ctr.CreatedAt).Seconds()
+ if since > longestWaitTimeCandidate {
+ longestWaitTimeCandidate = since
+ }
switch ctr.State {
case arvados.ContainerStateQueued:
if unalloc[it] < 1 && sch.pool.AtQuota() {
@@ -92,6 +98,7 @@ tryrun:
logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
} else if sch.pool.StartContainer(it, ctr) {
// Success.
+ longestWaitTimeCandidate = previousLongestWaitTimeCandidate
} else {
containerAllocatedWorkerBootingCount += 1
dontstart[it] = true
@@ -101,6 +108,7 @@ tryrun:
sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+ sch.mLongestWaitTimeSinceQueue.Set(longestWaitTimeCandidate)
if len(overquota) > 0 {
// Unlock any containers that are unmappable while
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 6d7036498..e7963ca7d 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -391,15 +391,16 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
c.Check(pool.Running(), check.HasLen, 0)
}
-func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
queue := test.Queue{
ChooseType: chooseType,
Containers: []arvados.Container{
{
- UUID: test.ContainerUUID(1),
- Priority: 1,
- State: arvados.ContainerStateLocked,
+ UUID: test.ContainerUUID(1),
+ Priority: 1,
+ State: arvados.ContainerStateLocked,
+ CreatedAt: time.Now().Add(-10 * time.Second),
RuntimeConstraints: arvados.RuntimeConstraints{
VCPUs: 1,
RAM: 1 << 30,
@@ -420,6 +421,7 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+ c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
// Create a pool without workers. The queued container will not be started, and the
// 'over quota' metric will be 1 because no workers are available and canCreate defaults
@@ -430,4 +432,35 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+ c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
+
+ // Reset the queue, and create a pool with an idle worker. The queued
+ // container will be started immediately and mLongestWaitTimeSinceQueue
+ // should be zero.
+ queue = test.Queue{
+ ChooseType: chooseType,
+ Containers: []arvados.Container{
+ {
+ UUID: test.ContainerUUID(1),
+ Priority: 1,
+ State: arvados.ContainerStateLocked,
+ CreatedAt: time.Now().Add(-10 * time.Second),
+ RuntimeConstraints: arvados.RuntimeConstraints{
+ VCPUs: 1,
+ RAM: 1 << 30,
+ },
+ },
+ },
+ }
+ queue.Update()
+
+ pool = stubPool{
+ idle: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+ unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+ running: map[string]time.Time{},
+ }
+ sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+ sch.runQueue()
+
+ c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
}
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6fd47e919..c16924573 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -46,6 +46,7 @@ type Scheduler struct {
mContainersAllocatedNotStarted prometheus.Gauge
mContainersNotAllocatedOverQuota prometheus.Gauge
+ mLongestWaitTimeSinceQueue prometheus.Gauge
}
// New returns a new unstarted Scheduler.
@@ -87,6 +88,13 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
Help: "Number of containers not allocated to a worker because the system has hit a quota.",
})
reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
+ sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "containers_longest_wait_time_seconds",
+ Help: "Current longest wait time of any container since queuing, and before the start of crunch-run.",
+ })
+ reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
}
// Start starts the scheduler.
commit 62403e0d1d95f8526d17222a66356382e2d4b028
Merge: ab439e74c f0cc3c391
Author: Ward Vandewege <ward at curii.com>
Date: Mon Sep 14 13:03:18 2020 -0400
16636: Merge branch 'master' into 16636-more-metrics
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
commit ab439e74c340896f52629996f54614ea190f0f5e
Author: Ward Vandewege <ward at curii.com>
Date: Mon Sep 14 09:53:38 2020 -0400
16636: add 'time_from_queue_to_crunch_run' metric: wait times (between
seeing a container in the queue or requeueing, and starting its
crunch-run process on a worker) across previous starts.
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 92a42c7a2..80cb28f35 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -223,6 +223,8 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
}
func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 75bc01fc0..67962c9d6 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -170,16 +170,17 @@ type Pool struct {
runnerMD5 [md5.Size]byte
runnerCmd string
- mContainersRunning prometheus.Gauge
- mInstances *prometheus.GaugeVec
- mInstancesPrice *prometheus.GaugeVec
- mVCPUs *prometheus.GaugeVec
- mMemory *prometheus.GaugeVec
- mBootOutcomes *prometheus.CounterVec
- mDisappearances *prometheus.CounterVec
- mTimeToSSH prometheus.Summary
- mTimeToReadyForContainer prometheus.Summary
- mTimeFromShutdownToGone prometheus.Summary
+ mContainersRunning prometheus.Gauge
+ mInstances *prometheus.GaugeVec
+ mInstancesPrice *prometheus.GaugeVec
+ mVCPUs *prometheus.GaugeVec
+ mMemory *prometheus.GaugeVec
+ mBootOutcomes *prometheus.CounterVec
+ mDisappearances *prometheus.CounterVec
+ mTimeToSSH prometheus.Summary
+ mTimeToReadyForContainer prometheus.Summary
+ mTimeFromShutdownToGone prometheus.Summary
+ mTimeFromQueueToCrunchRun prometheus.Summary
}
type createCall struct {
@@ -670,6 +671,14 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
})
reg.MustRegister(wp.mTimeFromShutdownToGone)
+ wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "containers_time_from_queue_to_crunch_run_seconds",
+ Help: "Number of seconds between the queuing of a container and the start of crunch-run.",
+ Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+ })
+ reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
}
func (wp *Pool) runMetrics() {
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 9199d4baf..95794d0b3 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -176,6 +176,9 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
}
go func() {
rr.Start()
+ if wkr.wp.mTimeFromQueueToCrunchRun != nil {
+ wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds())
+ }
wkr.mtx.Lock()
defer wkr.mtx.Unlock()
now := time.Now()
commit b1df618b0c33eccc4ab15d89aad4a10fbf245993
Author: Ward Vandewege <ward at curii.com>
Date: Fri Sep 11 12:30:07 2020 -0400
16636: add instances_time_from_shutdown_request_to_disappearance_seconds
metric
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 6e1850410..92a42c7a2 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -221,6 +221,8 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
}
func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 086887cb4..75bc01fc0 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -179,6 +179,7 @@ type Pool struct {
mDisappearances *prometheus.CounterVec
mTimeToSSH prometheus.Summary
mTimeToReadyForContainer prometheus.Summary
+ mTimeFromShutdownToGone prometheus.Summary
}
type createCall struct {
@@ -661,6 +662,14 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
})
reg.MustRegister(wp.mTimeToReadyForContainer)
+ wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "instances_time_from_shutdown_request_to_disappearance_seconds",
+ Help: "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+ Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+ })
+ reg.MustRegister(wp.mTimeFromShutdownToGone)
}
func (wp *Pool) runMetrics() {
@@ -930,6 +939,9 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
if wp.mDisappearances != nil {
wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
}
+ if wp.mTimeFromShutdownToGone != nil {
+ wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
+ }
delete(wp.workers, id)
go wkr.Close()
notify = true
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list