[ARVADOS] updated: 1.3.0-3125-g7f4da9388

Git user git at public.arvados.org
Thu Sep 10 14:57:15 UTC 2020


Summary of changes:
 lib/dispatchcloud/scheduler/run_queue_test.go | 48 +++++++++++++++++++++++++--
 lib/dispatchcloud/scheduler/scheduler.go      |  2 +-
 lib/dispatchcloud/scheduler/sync_test.go      |  4 +--
 3 files changed, 49 insertions(+), 5 deletions(-)

  discards  4e683b5889f6cee2f625bae9708d9f174819b041 (commit)
       via  7f4da9388e3d5ec8f38f6d6408916d1d46dfb10f (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (4e683b5889f6cee2f625bae9708d9f174819b041)
            \
             N -- N -- N (7f4da9388e3d5ec8f38f6d6408916d1d46dfb10f)

When this happens we assume that you've already received alert emails for
all of the O revisions, and so we report here only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 7f4da9388e3d5ec8f38f6d6408916d1d46dfb10f
Author: Ward Vandewege <ward at curii.com>
Date:   Wed Sep 9 17:06:31 2020 -0400

    16636: a-d-c: add two more metrics:
    
      (gauge) number of containers allocated to VMs but not started yet (because VMs are pending/booting)
      (gauge) number of containers not allocated to VMs (because provider quota is reached)
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
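
The two gauges described above are ordinary client_golang gauges, registered on
the dispatcher's prometheus registry by the new registerMetrics() shown further
down in this diff. As a point of reference only, here is a minimal,
self-contained Go sketch of the same register-and-expose pattern; it is not
part of this commit, and the listen address and the hard-coded Set() values are
purely illustrative:

    package main

    import (
        "log"
        "net/http"

        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/client_golang/prometheus/promhttp"
    )

    func main() {
        reg := prometheus.NewRegistry()

        // Gauges with the same names and help strings as the ones added
        // in scheduler.registerMetrics below.
        allocNotStarted := prometheus.NewGauge(prometheus.GaugeOpts{
            Namespace: "arvados",
            Subsystem: "dispatchcloud",
            Name:      "containers_allocated_not_started",
            Help:      "Number of containers allocated to a worker but not started yet (worker is booting).",
        })
        notAllocOverQuota := prometheus.NewGauge(prometheus.GaugeOpts{
            Namespace: "arvados",
            Subsystem: "dispatchcloud",
            Name:      "containers_not_allocated_over_quota",
            Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
        })
        reg.MustRegister(allocNotStarted, notAllocOverQuota)

        // Each scheduling pass would overwrite the gauges with the
        // current counts; these values are placeholders.
        allocNotStarted.Set(1)
        notAllocOverQuota.Set(0)

        // Expose the registry over HTTP; the address is illustrative.
        http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
        log.Fatal(http.ListenAndServe(":9090", nil))
    }

Scraping /metrics on that sketch would show the fully qualified names
arvados_dispatchcloud_containers_allocated_not_started and
arvados_dispatchcloud_containers_not_allocated_over_quota, which is what the
dispatcher exports once the registry wiring below is in place.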

diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index 02b6c976a..278bcb665 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -181,7 +181,7 @@ func (disp *dispatcher) run() {
 	if pollInterval <= 0 {
 		pollInterval = defaultPollInterval
 	}
-	sched := scheduler.New(disp.Context, disp.queue, disp.pool, staleLockTimeout, pollInterval)
+	sched := scheduler.New(disp.Context, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval)
 	sched.Start()
 	defer sched.Stop()
 
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 0e8e1dc2e..b9d653a82 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -33,6 +33,7 @@ func (sch *Scheduler) runQueue() {
 
 	dontstart := map[arvados.InstanceType]bool{}
 	var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+	var containerAllocatedWorkerBootingCount int
 
 tryrun:
 	for i, ctr := range sorted {
@@ -92,11 +93,15 @@ tryrun:
 			} else if sch.pool.StartContainer(it, ctr) {
 				// Success.
 			} else {
+				containerAllocatedWorkerBootingCount += 1
 				dontstart[it] = true
 			}
 		}
 	}
 
+	sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
+	sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+
 	if len(overquota) > 0 {
 		// Unlock any containers that are unmappable while
 		// we're at quota.
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 530eb5db9..6d7036498 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -13,6 +13,9 @@ import (
 	"git.arvados.org/arvados.git/lib/dispatchcloud/worker"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/ctxlog"
+
+	"github.com/prometheus/client_golang/prometheus/testutil"
+
 	check "gopkg.in/check.v1"
 )
 
@@ -185,7 +188,7 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
 		running:   map[string]time.Time{},
 		canCreate: 0,
 	}
-	New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+	New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue()
 	c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(1), test.InstanceType(1), test.InstanceType(1)})
 	c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(4)})
 	c.Check(pool.running, check.HasLen, 1)
@@ -241,7 +244,7 @@ func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) {
 			starts:    []string{},
 			canCreate: 0,
 		}
-		New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+		New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue()
 		c.Check(pool.creates, check.DeepEquals, shouldCreate)
 		if len(shouldCreate) == 0 {
 			c.Check(pool.starts, check.DeepEquals, []string{})
@@ -336,7 +339,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 		},
 	}
 	queue.Update()
-	New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+	New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue()
 	c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(2), test.InstanceType(1)})
 	c.Check(pool.starts, check.DeepEquals, []string{uuids[6], uuids[5], uuids[3], uuids[2]})
 	running := map[string]bool{}
@@ -380,10 +383,51 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
 		},
 	}
 	queue.Update()
-	sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+	sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
 	c.Check(pool.running, check.HasLen, 1)
 	sch.sync()
 	for deadline := time.Now().Add(time.Second); len(pool.Running()) > 0 && time.Now().Before(deadline); time.Sleep(time.Millisecond) {
 	}
 	c.Check(pool.Running(), check.HasLen, 0)
 }
+
+func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+	ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
+	queue := test.Queue{
+		ChooseType: chooseType,
+		Containers: []arvados.Container{
+			{
+				UUID:     test.ContainerUUID(1),
+				Priority: 1,
+				State:    arvados.ContainerStateLocked,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
+			},
+		},
+	}
+	queue.Update()
+
+	// Create a pool with one unallocated (idle/booting/unknown) worker,
+	// and `idle` and `unknown` not set (empty). In other words, this worker is
+	// still booting, so the container will be allocated but not started yet.
+	pool := stubPool{
+		unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+	}
+	sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+	sch.runQueue()
+
+	c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
+	c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+
+	// Create a pool without workers. The queued container will not be started, and the
+	// 'over quota' metric will be 1 because no workers are available and canCreate defaults
+	// to zero.
+	pool = stubPool{}
+	sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+	sch.runQueue()
+
+	c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
+	c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+}
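
The assertions in the new test rely on the client_golang testutil helper. For
readers unfamiliar with it, here is a tiny standalone sketch (illustrative
only, not part of this commit; the gauge name is made up) of what
testutil.ToFloat64 returns:

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/client_golang/prometheus/testutil"
    )

    func main() {
        // A throwaway gauge, unrelated to Arvados.
        g := prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "example_gauge",
            Help: "Illustrative gauge for demonstrating testutil.ToFloat64.",
        })
        g.Set(3)

        // ToFloat64 collects the single metric exposed by the collector
        // and returns its current value, so this prints 3.
        fmt.Println(testutil.ToFloat64(g))
    }
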
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6409ea031..6fd47e919 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -12,6 +12,7 @@ import (
 	"time"
 
 	"git.arvados.org/arvados.git/sdk/go/ctxlog"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 )
 
@@ -31,6 +32,7 @@ type Scheduler struct {
 	logger              logrus.FieldLogger
 	queue               ContainerQueue
 	pool                WorkerPool
+	reg                 *prometheus.Registry
 	staleLockTimeout    time.Duration
 	queueUpdateInterval time.Duration
 
@@ -41,17 +43,21 @@ type Scheduler struct {
 	runOnce sync.Once
 	stop    chan struct{}
 	stopped chan struct{}
+
+	mContainersAllocatedNotStarted   prometheus.Gauge
+	mContainersNotAllocatedOverQuota prometheus.Gauge
 }
 
 // New returns a new unstarted Scheduler.
 //
 // Any given queue and pool should not be used by more than one
 // scheduler at a time.
-func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, staleLockTimeout, queueUpdateInterval time.Duration) *Scheduler {
-	return &Scheduler{
+func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, reg *prometheus.Registry, staleLockTimeout, queueUpdateInterval time.Duration) *Scheduler {
+	sch := &Scheduler{
 		logger:              ctxlog.FromContext(ctx),
 		queue:               queue,
 		pool:                pool,
+		reg:                 reg,
 		staleLockTimeout:    staleLockTimeout,
 		queueUpdateInterval: queueUpdateInterval,
 		wakeup:              time.NewTimer(time.Second),
@@ -59,6 +65,28 @@ func New(ctx context.Context, queue ContainerQueue, pool WorkerPool, staleLockTi
 		stopped:             make(chan struct{}),
 		uuidOp:              map[string]string{},
 	}
+	sch.registerMetrics(reg)
+	return sch
+}
+
+func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
+	if reg == nil {
+		reg = prometheus.NewRegistry()
+	}
+	sch.mContainersAllocatedNotStarted = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "containers_allocated_not_started",
+		Help:      "Number of containers allocated to a worker but not started yet (worker is booting).",
+	})
+	reg.MustRegister(sch.mContainersAllocatedNotStarted)
+	sch.mContainersNotAllocatedOverQuota = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "containers_not_allocated_over_quota",
+		Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
+	})
+	reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
 }
 
 // Start starts the scheduler.
diff --git a/lib/dispatchcloud/scheduler/sync_test.go b/lib/dispatchcloud/scheduler/sync_test.go
index 538f5ea8c..a3ff0636e 100644
--- a/lib/dispatchcloud/scheduler/sync_test.go
+++ b/lib/dispatchcloud/scheduler/sync_test.go
@@ -48,7 +48,7 @@ func (*SchedulerSuite) TestForgetIrrelevantContainers(c *check.C) {
 	ents, _ := queue.Entries()
 	c.Check(ents, check.HasLen, 1)
 
-	sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+	sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
 	sch.sync()
 
 	ents, _ = queue.Entries()
@@ -80,7 +80,7 @@ func (*SchedulerSuite) TestCancelOrphanedContainers(c *check.C) {
 	ents, _ := queue.Entries()
 	c.Check(ents, check.HasLen, 1)
 
-	sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+	sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
 
 	// Sync shouldn't cancel the container because it might be
 	// running on the VM with state=="unknown".

-----------------------------------------------------------------------


hooks/post-receive
-- 