[ARVADOS] created: 1.3.0-3125-g4e683b588
Git user
git at public.arvados.org
Wed Sep 9 21:07:28 UTC 2020
at 4e683b5889f6cee2f625bae9708d9f174819b041 (commit)
commit 4e683b5889f6cee2f625bae9708d9f174819b041
Author: Ward Vandewege <ward at curii.com>
Date: Wed Sep 9 17:06:31 2020 -0400
16636: a-d-c: add two more metrics:
(gauge) number of containers allocated to VMs but not started yet (because VMs are pending/booting)
(gauge) number of containers not allocated to VMs (because provider quota is reached)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index 02b6c976a..278bcb665 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -181,7 +181,7 @@ func (disp *dispatcher) run() {
if pollInterval <= 0 {
pollInterval = defaultPollInterval
}
- sched := scheduler.New(disp.Context, disp.queue, disp.pool, staleLockTimeout, pollInterval)
+ sched := scheduler.New(disp.Context, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval)
sched.Start()
defer sched.Stop()
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 0e8e1dc2e..b9d653a82 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -33,6 +33,7 @@ func (sch *Scheduler) runQueue() {
dontstart := map[arvados.InstanceType]bool{}
var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+ var containerAllocatedWorkerBootingCount int
tryrun:
for i, ctr := range sorted {
@@ -92,11 +93,15 @@ tryrun:
} else if sch.pool.StartContainer(it, ctr) {
// Success.
} else {
+ containerAllocatedWorkerBootingCount += 1
dontstart[it] = true
}
}
}
+ sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
+ sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+
if len(overquota) > 0 {
// Unlock any containers that are unmappable while
// we're at quota.
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 530eb5db9..3547f85b3 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -336,7 +336,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
},
}
queue.Update()
- New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+ New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue()
c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(2), test.InstanceType(1)})
c.Check(pool.starts, check.DeepEquals, []string{uuids[6], uuids[5], uuids[3], uuids[2]})
running := map[string]bool{}
@@ -380,7 +380,7 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
},
}
queue.Update()
- sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+ sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
c.Check(pool.running, check.HasLen, 1)
sch.sync()
for deadline := time.Now().Add(time.Second); len(pool.Running()) > 0 && time.Now().Before(deadline); time.Sleep(time.Millisecond) {
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6409ea031..0be272a4d 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -12,6 +12,7 @@ import (
"time"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
)
@@ -31,6 +32,7 @@ type Scheduler struct {
logger logrus.FieldLogger
queue ContainerQueue
pool WorkerPool
+ reg *prometheus.Registry
staleLockTimeout time.Duration
queueUpdateInterval time.Duration
@@ -41,17 +43,21 @@ type Scheduler struct {
runOnce sync.Once
stop chan struct{}
stopped chan struct{}
+
+ mContainersAllocatedNotStarted prometheus.Gauge
+ mContainersNotAllocatedOverQuota prometheus.Gauge
}
// New returns a new unstarted Scheduler.
//
// Any given queue and pool should not be used by more than one
// scheduler at a time.
-func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, staleLockTimeout, queueUpdateInterval time.Duration) *Scheduler {
- return &Scheduler{
+func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, reg *prometheus.Registry, staleLockTimeout, queueUpdateInterval time.Duration) *Scheduler {
+ sch := &Scheduler{
logger: ctxlog.FromContext(ctx),
queue: queue,
pool: pool,
+ reg: reg,
staleLockTimeout: staleLockTimeout,
queueUpdateInterval: queueUpdateInterval,
wakeup: time.NewTimer(time.Second),
@@ -59,6 +65,28 @@ func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, staleLockTi
stopped: make(chan struct{}),
uuidOp: map[string]string{},
}
+ sch.registerMetrics(reg)
+ return sch
+}
+
+func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
+ if reg == nil {
+ reg = prometheus.NewRegistry()
+ }
+ sch.mContainersAllocatedNotStarted = prometheus.NewGauge(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "containers_allocated_not_started",
+ Help: "Number of containers allocated to a worker but not started yet (worker is booting).",
+ })
+ reg.MustRegister(sch.mContainersAllocatedNotStarted)
+ sch.mContainersNotAllocatedOverQuota = prometheus.NewGauge(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
Name:      "containers_not_allocated_over_quota",
+ Help: "Number of containers not allocated to a worker because the system has hit a quota.",
+ })
+ reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
}
// Start starts the scheduler.
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list