[ARVADOS] updated: 1.3.0-3125-g7f4da9388
Git user
git at public.arvados.org
Thu Sep 10 14:57:15 UTC 2020
Summary of changes:
 lib/dispatchcloud/scheduler/run_queue_test.go | 48 +++++++++++++++++++++++++--
 lib/dispatchcloud/scheduler/scheduler.go      |  2 +-
 lib/dispatchcloud/scheduler/sync_test.go      |  4 +--
 3 files changed, 49 insertions(+), 5 deletions(-)
  discards  4e683b5889f6cee2f625bae9708d9f174819b041 (commit)
       via  7f4da9388e3d5ec8f38f6d6408916d1d46dfb10f (commit)
This update added new revisions after undoing existing revisions. That is
to say, the old revision is not a strict subset of the new revision. This
situation occurs when you --force push a change and generate a repository
containing something like this:
 * -- * -- B -- O -- O -- O (4e683b5889f6cee2f625bae9708d9f174819b041)
            \
             N -- N -- N (7f4da9388e3d5ec8f38f6d6408916d1d46dfb10f)
When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 7f4da9388e3d5ec8f38f6d6408916d1d46dfb10f
Author: Ward Vandewege <ward at curii.com>
Date: Wed Sep 9 17:06:31 2020 -0400
16636: a-d-c: add two more metrics:
(gauge) number of containers allocated to VMs but not started yet (because VMs are pending/booting)
(gauge) number of containers not allocated to VMs (because provider quota is reached)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
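
With the Namespace/Subsystem/Name fields used in the diff below, the two gauges are exported as arvados_dispatchcloud_containers_allocated_not_started and arvados_dispatchcloud_containers_not_allocated_over_quota. A scrape of the registry they are registered in would include lines roughly like these (sample values only):

  # HELP arvados_dispatchcloud_containers_allocated_not_started Number of containers allocated to a worker but not started yet (worker is booting).
  # TYPE arvados_dispatchcloud_containers_allocated_not_started gauge
  arvados_dispatchcloud_containers_allocated_not_started 2
  # HELP arvados_dispatchcloud_containers_not_allocated_over_quota Number of containers not allocated to a worker because the system has hit a quota.
  # TYPE arvados_dispatchcloud_containers_not_allocated_over_quota gauge
  arvados_dispatchcloud_containers_not_allocated_over_quota 0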
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index 02b6c976a..278bcb665 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -181,7 +181,7 @@ func (disp *dispatcher) run() {
     if pollInterval <= 0 {
         pollInterval = defaultPollInterval
     }
-    sched := scheduler.New(disp.Context, disp.queue, disp.pool, staleLockTimeout, pollInterval)
+    sched := scheduler.New(disp.Context, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval)
     sched.Start()
     defer sched.Stop()
 
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 0e8e1dc2e..b9d653a82 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -33,6 +33,7 @@ func (sch *Scheduler) runQueue() {
 
     dontstart := map[arvados.InstanceType]bool{}
     var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+    var containerAllocatedWorkerBootingCount int
 
 tryrun:
     for i, ctr := range sorted {
@@ -92,11 +93,15 @@ tryrun:
             } else if sch.pool.StartContainer(it, ctr) {
                 // Success.
             } else {
+                containerAllocatedWorkerBootingCount += 1
                 dontstart[it] = true
             }
         }
     }
 
+    sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
+    sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+
     if len(overquota) > 0 {
         // Unlock any containers that are unmappable while
         // we're at quota.
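
The count above is rebuilt from scratch on every runQueue() pass and published with Gauge.Set rather than incremented, so both gauges fall back to zero as soon as the condition clears. A minimal stand-alone sketch of that publish-per-pass idea, with illustrative names only (not Arvados code):

  package main

  import (
      "fmt"

      "github.com/prometheus/client_golang/prometheus"
      "github.com/prometheus/client_golang/prometheus/testutil"
  )

  func main() {
      // Illustrative gauge; in the scheduler the equivalent lives on the Scheduler struct.
      booting := prometheus.NewGauge(prometheus.GaugeOpts{Name: "example_containers_allocated_not_started"})

      // One scheduling pass: pretend two of three allocated containers could not
      // be started yet because their workers are still booting.
      var count int
      for _, started := range []bool{false, true, false} {
          if !started {
              count++
          }
      }
      booting.Set(float64(count))

      fmt.Println(testutil.ToFloat64(booting)) // prints 2
  }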
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 530eb5db9..6d7036498 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -13,6 +13,9 @@ import (
     "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
     "git.arvados.org/arvados.git/sdk/go/arvados"
     "git.arvados.org/arvados.git/sdk/go/ctxlog"
+
+    "github.com/prometheus/client_golang/prometheus/testutil"
+
     check "gopkg.in/check.v1"
 )
 
@@ -185,7 +188,7 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
         running:   map[string]time.Time{},
         canCreate: 0,
     }
-    New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+    New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue()
     c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(1), test.InstanceType(1), test.InstanceType(1)})
     c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(4)})
     c.Check(pool.running, check.HasLen, 1)
@@ -241,7 +244,7 @@ func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) {
             starts:    []string{},
             canCreate: 0,
         }
-        New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+        New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue()
         c.Check(pool.creates, check.DeepEquals, shouldCreate)
         if len(shouldCreate) == 0 {
             c.Check(pool.starts, check.DeepEquals, []string{})
@@ -336,7 +339,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
         },
     }
     queue.Update()
-    New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+    New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue()
     c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(2), test.InstanceType(1)})
     c.Check(pool.starts, check.DeepEquals, []string{uuids[6], uuids[5], uuids[3], uuids[2]})
     running := map[string]bool{}
@@ -380,10 +383,51 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
         },
     }
     queue.Update()
-    sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+    sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
     c.Check(pool.running, check.HasLen, 1)
     sch.sync()
     for deadline := time.Now().Add(time.Second); len(pool.Running()) > 0 && time.Now().Before(deadline); time.Sleep(time.Millisecond) {
     }
     c.Check(pool.Running(), check.HasLen, 0)
 }
+
+func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+    ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
+    queue := test.Queue{
+        ChooseType: chooseType,
+        Containers: []arvados.Container{
+            {
+                UUID:     test.ContainerUUID(1),
+                Priority: 1,
+                State:    arvados.ContainerStateLocked,
+                RuntimeConstraints: arvados.RuntimeConstraints{
+                    VCPUs: 1,
+                    RAM:   1 << 30,
+                },
+            },
+        },
+    }
+    queue.Update()
+
+    // Create a pool with one unallocated (idle/booting/unknown) worker,
+    // and `idle` and `unknown` not set (empty). In other words, this worker is in
+    // the booting state, and the container will be allocated but not started yet.
+    pool := stubPool{
+        unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+    }
+    sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+    sch.runQueue()
+
+    c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
+    c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+
+    // Create a pool without workers. The queued container will not be started, and the
+    // 'over quota' metric will be 1 because no workers are available and canCreate defaults
+    // to zero.
+    pool = stubPool{}
+    sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+    sch.runQueue()
+
+    c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
+    c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+}
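
Assuming a standard arvados checkout, the new check can be run on its own with gocheck's test filter, e.g.:

  go test ./lib/dispatchcloud/scheduler -check.f ContainersAllocatedNotStartedMetric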
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6409ea031..6fd47e919 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -12,6 +12,7 @@ import (
     "time"
 
     "git.arvados.org/arvados.git/sdk/go/ctxlog"
+    "github.com/prometheus/client_golang/prometheus"
     "github.com/sirupsen/logrus"
 )
 
@@ -31,6 +32,7 @@ type Scheduler struct {
     logger              logrus.FieldLogger
     queue               ContainerQueue
     pool                WorkerPool
+    reg                 *prometheus.Registry
     staleLockTimeout    time.Duration
     queueUpdateInterval time.Duration
 
@@ -41,17 +43,21 @@ type Scheduler struct {
     runOnce sync.Once
     stop    chan struct{}
     stopped chan struct{}
+
+    mContainersAllocatedNotStarted   prometheus.Gauge
+    mContainersNotAllocatedOverQuota prometheus.Gauge
 }
 
 // New returns a new unstarted Scheduler.
 //
 // Any given queue and pool should not be used by more than one
 // scheduler at a time.
-func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, staleLockTimeout, queueUpdateInterval time.Duration) *Scheduler {
-    return &Scheduler{
+func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, reg *prometheus.Registry, staleLockTimeout, queueUpdateInterval time.Duration) *Scheduler {
+    sch := &Scheduler{
         logger:              ctxlog.FromContext(ctx),
         queue:               queue,
         pool:                pool,
+        reg:                 reg,
         staleLockTimeout:    staleLockTimeout,
         queueUpdateInterval: queueUpdateInterval,
         wakeup:              time.NewTimer(time.Second),
@@ -59,6 +65,28 @@ func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, staleLockTi
         stopped:             make(chan struct{}),
         uuidOp:              map[string]string{},
     }
+    sch.registerMetrics(reg)
+    return sch
+}
+
+func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
+    if reg == nil {
+        reg = prometheus.NewRegistry()
+    }
+    sch.mContainersAllocatedNotStarted = prometheus.NewGauge(prometheus.GaugeOpts{
+        Namespace: "arvados",
+        Subsystem: "dispatchcloud",
+        Name:      "containers_allocated_not_started",
+        Help:      "Number of containers allocated to a worker but not started yet (worker is booting).",
+    })
+    reg.MustRegister(sch.mContainersAllocatedNotStarted)
+    sch.mContainersNotAllocatedOverQuota = prometheus.NewGauge(prometheus.GaugeOpts{
+        Namespace: "arvados",
+        Subsystem: "dispatchcloud",
+        Name:      "containers_not_allocated_over_quota",
+        Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
+    })
+    reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
 }
 
 // Start starts the scheduler.
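
For context (not part of this commit): a registry wired up this way is typically exposed over HTTP with promhttp, at which point the two gauges become visible alongside whatever else is registered there. A minimal sketch, assuming the caller owns the HTTP server; the listen address below is an arbitrary example:

  package main

  import (
      "log"
      "net/http"

      "github.com/prometheus/client_golang/prometheus"
      "github.com/prometheus/client_golang/prometheus/promhttp"
  )

  func main() {
      reg := prometheus.NewRegistry()
      // In the dispatcher, this same registry is what gets passed to
      // scheduler.New(), whose registerMetrics() adds the two gauges above.
      http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
      log.Fatal(http.ListenAndServe("localhost:9999", nil)) // example address only
  }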
diff --git a/lib/dispatchcloud/scheduler/sync_test.go b/lib/dispatchcloud/scheduler/sync_test.go
index 538f5ea8c..a3ff0636e 100644
--- a/lib/dispatchcloud/scheduler/sync_test.go
+++ b/lib/dispatchcloud/scheduler/sync_test.go
@@ -48,7 +48,7 @@ func (*SchedulerSuite) TestForgetIrrelevantContainers(c *check.C) {
     ents, _ := queue.Entries()
     c.Check(ents, check.HasLen, 1)
 
-    sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+    sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
     sch.sync()
 
     ents, _ = queue.Entries()
@@ -80,7 +80,7 @@ func (*SchedulerSuite) TestCancelOrphanedContainers(c *check.C) {
     ents, _ := queue.Entries()
     c.Check(ents, check.HasLen, 1)
 
-    sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+    sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
 
     // Sync shouldn't cancel the container because it might be
     // running on the VM with state=="unknown".
-----------------------------------------------------------------------
hooks/post-receive
--