[ARVADOS] updated: 1.3.0-3152-g407fc461d
Git user
git at public.arvados.org
Tue Sep 15 17:59:48 UTC 2020
Summary of changes:
lib/dispatchcloud/container/queue.go | 2 +-
lib/dispatchcloud/scheduler/run_queue.go | 8 --------
lib/dispatchcloud/scheduler/run_queue_test.go | 3 +++
lib/dispatchcloud/scheduler/scheduler.go | 26 ++++++++++++++++++++++++++
lib/dispatchcloud/worker/pool.go | 3 ++-
5 files changed, 32 insertions(+), 10 deletions(-)
via 407fc461d20ece8b11b7b56f29a3caff3083ff8d (commit)
from a5dbdd1dbcdfcb835e7c1fe741e4c00927177404 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 407fc461d20ece8b11b7b56f29a3caff3083ff8d
Author: Ward Vandewege <ward at curii.com>
Date: Tue Sep 15 13:59:33 2020 -0400
16636: implement review feedback.
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 45b346383..a1ff414b7 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -382,7 +382,7 @@ func (cq *Queue) poll() (map[string]*arvados.Container, error) {
*next[upd.UUID] = upd
}
}
- selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters"}
+ selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters", "created_at"}
limitParam := 1000
mine, err := cq.fetchAll(arvados.ResourceListParams{
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 9bbb064fe..b9d653a82 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -34,7 +34,6 @@ func (sch *Scheduler) runQueue() {
dontstart := map[arvados.InstanceType]bool{}
var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
var containerAllocatedWorkerBootingCount int
- var longestWaitTimeCandidate, previousLongestWaitTimeCandidate float64
tryrun:
for i, ctr := range sorted {
@@ -46,11 +45,6 @@ tryrun:
if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
continue
}
- previousLongestWaitTimeCandidate = longestWaitTimeCandidate
- since := time.Since(ctr.CreatedAt).Seconds()
- if since > longestWaitTimeCandidate {
- longestWaitTimeCandidate = since
- }
switch ctr.State {
case arvados.ContainerStateQueued:
if unalloc[it] < 1 && sch.pool.AtQuota() {
@@ -98,7 +92,6 @@ tryrun:
logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
} else if sch.pool.StartContainer(it, ctr) {
// Success.
- longestWaitTimeCandidate = previousLongestWaitTimeCandidate
} else {
containerAllocatedWorkerBootingCount += 1
dontstart[it] = true
@@ -108,7 +101,6 @@ tryrun:
sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
- sch.mLongestWaitTimeSinceQueue.Set(longestWaitTimeCandidate)
if len(overquota) > 0 {
// Unlock any containers that are unmappable while
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index e7963ca7d..fd1d0a870 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -418,6 +418,7 @@ func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
}
sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
sch.runQueue()
+ sch.updateMetrics()
c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
@@ -429,6 +430,7 @@ func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
pool = stubPool{}
sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
sch.runQueue()
+ sch.updateMetrics()
c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
@@ -461,6 +463,7 @@ func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
}
sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
sch.runQueue()
+ sch.updateMetrics()
c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
}
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index c16924573..b1d369ed2 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -11,6 +11,7 @@ import (
"sync"
"time"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
@@ -97,6 +98,30 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
}
+func (sch *Scheduler) updateMetrics() {
+ earliest := time.Now()
+ entries, _ := sch.queue.Entries()
+ running := sch.pool.Running()
+ for _, ent := range entries {
+ if ent.Container.Priority > 0 &&
+ (ent.Container.State == arvados.ContainerStateQueued || ent.Container.State == arvados.ContainerStateLocked) {
+ // Exclude containers that are preparing to run the payload (i.e.
+ // ContainerStateLocked and running on a worker, most likely loading the
+ // payload image).
+ if _, ok := running[ent.Container.UUID]; !ok {
+ if ent.Container.CreatedAt.Before(earliest) {
+ earliest = ent.Container.CreatedAt
+ }
+ }
+ }
+ }
+ if !earliest.IsZero() {
+ sch.mLongestWaitTimeSinceQueue.Set(time.Since(earliest).Seconds())
+ } else {
+ sch.mLongestWaitTimeSinceQueue.Set(0)
+ }
+}
+
// Start starts the scheduler.
func (sch *Scheduler) Start() {
go sch.runOnce.Do(sch.run)
@@ -149,6 +174,7 @@ func (sch *Scheduler) run() {
for {
sch.runQueue()
sch.sync()
+ sch.updateMetrics()
select {
case <-sch.stop:
return
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 67962c9d6..4c90c4e6f 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -948,7 +948,8 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
if wp.mDisappearances != nil {
wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
}
- if wp.mTimeFromShutdownToGone != nil {
+ // wkr.destroyed.IsZero() can happen if instance disappeared but we weren't trying to shut it down
+ if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() {
wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
}
delete(wp.workers, id)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list