[arvados] created: 2.6.0-327-gc62492927
git repository hosting
git at public.arvados.org
Mon Jun 26 15:40:09 UTC 2023
at c624929279e70d58017eab08ed286dda88bcd215 (commit)
commit c624929279e70d58017eab08ed286dda88bcd215
Author: Tom Clegg <tom at curii.com>
Date: Mon Jun 26 11:17:48 2023 -0400
20667: Add at_quota metric.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index b1f8ea222..0a5a94c96 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -122,6 +122,18 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
Help: "Dynamically assigned limit on number of containers scheduled concurrency, set after receiving 503 errors from API.",
})
reg.MustRegister(sch.mMaxContainerConcurrency)
+ reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "at_quota",
+ Help: "Flag indicating the cloud driver is reporting an at-quota condition.",
+ }, func() float64 {
+ if sch.pool.AtQuota() {
+ return 1
+ } else {
+ return 0
+ }
+ }))
}
func (sch *Scheduler) updateMetrics() {
commit b36721d9501bd0ad8e455d8bdd9a3f103428c40f
Author: Tom Clegg <tom at curii.com>
Date: Mon Jun 26 11:06:35 2023 -0400
20667: Adjust timeout/retry settings to address occasional failures.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 273a3836d..6a8adcad9 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -69,6 +69,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
DispatchPrivateKey: string(dispatchprivraw),
StaleLockTimeout: arvados.Duration(5 * time.Millisecond),
RuntimeEngine: "stub",
+ MaxDispatchAttempts: 10,
CloudVMs: arvados.CloudVMsConfig{
Driver: "test",
SyncInterval: arvados.Duration(10 * time.Millisecond),
@@ -234,9 +235,9 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
select {
case <-done:
// loop will end because len(waiting)==0
- case <-time.After(3 * time.Second):
+ case <-time.After(5 * time.Second):
if len(waiting) >= waswaiting {
- c.Fatalf("timed out; no progress in 3s while waiting for %d containers: %q", len(waiting), waiting)
+ c.Fatalf("timed out; no progress in 5 s while waiting for %d containers: %q", len(waiting), waiting)
}
}
}
commit 48afa5ce8f43ba08e52a31e1ddfc8f93a3cdcf78
Author: Tom Clegg <tom at curii.com>
Date: Mon Jun 26 11:00:33 2023 -0400
20667: Reduce max supervisors if pool reaches cloud quota.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 63ac4a7b3..0c4634d75 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -92,6 +92,19 @@ func (sch *Scheduler) runQueue() {
if sch.maxInstances > 0 && sch.maxConcurrency > sch.maxInstances {
sch.maxConcurrency = sch.maxInstances
}
+ if sch.pool.AtQuota() && len(running) > 0 && (sch.maxConcurrency == 0 || sch.maxConcurrency > len(running)) {
+ // Consider current workload to be the maximum
+ // allowed, for the sake of reporting metrics and
+ // calculating max supervisors.
+ //
+ // Now that sch.maxConcurrency is set, we will only
+ // raise it past len(running) by 10%. This helps
+ // avoid running an inappropriate number of
+ // supervisors when we reach the cloud-imposed quota
+ // (which may be based on # CPUs etc) long before the
+ // configured MaxInstances.
+ sch.maxConcurrency = len(running)
+ }
sch.mMaxContainerConcurrency.Set(float64(sch.maxConcurrency))
maxSupervisors := int(float64(sch.maxConcurrency) * sch.supervisorFraction)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list