[ARVADOS] created: 1.3.0-3125-g4e683b588
Git user
git at public.arvados.org
Wed Sep 9 21:07:28 UTC 2020
at 4e683b5889f6cee2f625bae9708d9f174819b041 (commit)
commit 4e683b5889f6cee2f625bae9708d9f174819b041
Author: Ward Vandewege <ward at curii.com>
Date: Wed Sep 9 17:06:31 2020 -0400
16636: a-d-c: add two more metrics:
(gauge) number of containers allocated to VMs but not started yet (because VMs are pending/booting)
(gauge) number of containers not allocated to VMs (because provider quota is reached)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index 02b6c976a..278bcb665 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -181,7 +181,7 @@ func (disp *dispatcher) run() {
if pollInterval <= 0 {
pollInterval = defaultPollInterval
}
- sched := scheduler.New(disp.Context, disp.queue, disp.pool, staleLockTimeout, pollInterval)
+ sched := scheduler.New(disp.Context, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval)
sched.Start()
defer sched.Stop()
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 0e8e1dc2e..b9d653a82 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -33,6 +33,7 @@ func (sch *Scheduler) runQueue() {
dontstart := map[arvados.InstanceType]bool{}
var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+ var containerAllocatedWorkerBootingCount int
tryrun:
for i, ctr := range sorted {
@@ -92,11 +93,15 @@ tryrun:
} else if sch.pool.StartContainer(it, ctr) {
// Success.
} else {
+ containerAllocatedWorkerBootingCount += 1
dontstart[it] = true
}
}
}
+ sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
+ sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+
if len(overquota) > 0 {
// Unlock any containers that are unmappable while
// we're at quota.
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 530eb5db9..3547f85b3 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -336,7 +336,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
},
}
queue.Update()
- New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+ New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue()
c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(2), test.InstanceType(1)})
c.Check(pool.starts, check.DeepEquals, []string{uuids[6], uuids[5], uuids[3], uuids[2]})
running := map[string]bool{}
@@ -380,7 +380,7 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
},
}
queue.Update()
- sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+ sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
c.Check(pool.running, check.HasLen, 1)
sch.sync()
for deadline := time.Now().Add(time.Second); len(pool.Running()) > 0 && time.Now().Before(deadline); time.Sleep(time.Millisecond) {
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6409ea031..0be272a4d 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -12,6 +12,7 @@ import (
"time"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
)
@@ -31,6 +32,7 @@ type Scheduler struct {
logger logrus.FieldLogger
queue ContainerQueue
pool WorkerPool
+ reg *prometheus.Registry
staleLockTimeout time.Duration
queueUpdateInterval time.Duration
@@ -41,17 +43,21 @@ type Scheduler struct {
runOnce sync.Once
stop chan struct{}
stopped chan struct{}
+
+ mContainersAllocatedNotStarted prometheus.Gauge
+ mContainersNotAllocatedOverQuota prometheus.Gauge
}
// New returns a new unstarted Scheduler.
//
// Any given queue and pool should not be used by more than one
// scheduler at a time.
-func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, staleLockTimeout, queueUpdateInterval time.Duration) *Scheduler {
- return &Scheduler{
+func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, reg *prometheus.Registry, staleLockTimeout, queueUpdateInterval time.Duration) *Scheduler {
+ sch := &Scheduler{
logger: ctxlog.FromContext(ctx),
queue: queue,
pool: pool,
+ reg: reg,
staleLockTimeout: staleLockTimeout,
queueUpdateInterval: queueUpdateInterval,
wakeup: time.NewTimer(time.Second),
@@ -59,6 +65,28 @@ func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, staleLockTi
stopped: make(chan struct{}),
uuidOp: map[string]string{},
}
+ sch.registerMetrics(reg)
+ return sch
+}
+
+func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
+ if reg == nil {
+ reg = prometheus.NewRegistry()
+ }
+ sch.mContainersAllocatedNotStarted = prometheus.NewGauge(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "containers_allocated_not_started",
+ Help: "Number of containers allocated to a worker but not started yet (worker is booting).",
+ })
+ reg.MustRegister(sch.mContainersAllocatedNotStarted)
+ sch.mContainersNotAllocatedOverQuota = prometheus.NewGauge(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
Name:      "containers_not_allocated_over_quota",
+ Help: "Number of containers not allocated to a worker because the system has hit a quota.",
+ })
+ reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
}
// Start starts the scheduler.
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list