[arvados] created: 2.6.0-327-gc62492927

Mon Jun 26 15:40:09 UTC 2023

at  c624929279e70d58017eab08ed286dda88bcd215 (commit)


commit c624929279e70d58017eab08ed286dda88bcd215
Author: Tom Clegg <tom at curii.com>
Date:   Mon Jun 26 11:17:48 2023 -0400

    20667: Add at_quota metric.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index b1f8ea222..0a5a94c96 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -122,6 +122,18 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
 		Help:      "Dynamically assigned limit on number of containers scheduled concurrency, set after receiving 503 errors from API.",
 	})
 	reg.MustRegister(sch.mMaxContainerConcurrency)
+	reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "at_quota",
+		Help:      "Flag indicating the cloud driver is reporting an at-quota condition.",
+	}, func() float64 {
+		if sch.pool.AtQuota() {
+			return 1
+		} else {
+			return 0
+		}
+	}))
 }
 
 func (sch *Scheduler) updateMetrics() {

commit b36721d9501bd0ad8e455d8bdd9a3f103428c40f
Author: Tom Clegg <tom at curii.com>
Date:   Mon Jun 26 11:06:35 2023 -0400

    20667: Adjust timeout/retry settings to address occasional failures.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 273a3836d..6a8adcad9 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -69,6 +69,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 			DispatchPrivateKey:     string(dispatchprivraw),
 			StaleLockTimeout:       arvados.Duration(5 * time.Millisecond),
 			RuntimeEngine:          "stub",
+			MaxDispatchAttempts:    10,
 			CloudVMs: arvados.CloudVMsConfig{
 				Driver:               "test",
 				SyncInterval:         arvados.Duration(10 * time.Millisecond),
@@ -234,9 +235,9 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		select {
 		case <-done:
 			// loop will end because len(waiting)==0
-		case <-time.After(3 * time.Second):
+		case <-time.After(5 * time.Second):
 			if len(waiting) >= waswaiting {
-				c.Fatalf("timed out; no progress in 3s while waiting for %d containers: %q", len(waiting), waiting)
+				c.Fatalf("timed out; no progress in 5 s while waiting for %d containers: %q", len(waiting), waiting)
 			}
 		}
 	}

commit 48afa5ce8f43ba08e52a31e1ddfc8f93a3cdcf78
Author: Tom Clegg <tom at curii.com>
Date:   Mon Jun 26 11:00:33 2023 -0400

    20667: Reduce max supervisors if pool reaches cloud quota.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 63ac4a7b3..0c4634d75 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -92,6 +92,19 @@ func (sch *Scheduler) runQueue() {
 	if sch.maxInstances > 0 && sch.maxConcurrency > sch.maxInstances {
 		sch.maxConcurrency = sch.maxInstances
 	}
+	if sch.pool.AtQuota() && len(running) > 0 && (sch.maxConcurrency == 0 || sch.maxConcurrency > len(running)) {
+		// Consider current workload to be the maximum
+		// allowed, for the sake of reporting metrics and
+		// calculating max supervisors.
+		//
+		// Now that sch.maxConcurrency is set, we will only
+		// raise it past len(running) by 10%.  This helps
+		// avoid running an inappropriate number of
+		// supervisors when we reach the cloud-imposed quota
+		// (which may be based on # CPUs etc) long before the
+		// configured MaxInstances.
+		sch.maxConcurrency = len(running)
+	}
 	sch.mMaxContainerConcurrency.Set(float64(sch.maxConcurrency))
 
 	maxSupervisors := int(float64(sch.maxConcurrency) * sch.supervisorFraction)

-----------------------------------------------------------------------


hooks/post-receive
--