[arvados] created: 2.6.0-94-gafe02764a
git repository hosting
git at public.arvados.org
Tue May 2 21:19:22 UTC 2023
at afe02764a9209c0c4b0ec75df52b4851ec8ce01f (commit)
commit afe02764a9209c0c4b0ec75df52b4851ec8ce01f
Author: Tom Clegg <tom at curii.com>
Date: Tue May 2 17:16:05 2023 -0400
20457: Include delayed supervisor containers in overquota metric.
Previously, supervisor containers that had high enough priority to
run, but weren't scheduled because of SupervisorFraction, were not
counted in the containers_over_quota metric. This caused the
"overquota" metric to show a misleading time series as non-supervisor
containers made their way through the queue and the delayed supervisor
containers flapped between "not allocated because quota" (counted) and
"not allocated because SupervisorFraction" (not counted).
With this change, unmappable supervisors always count toward the
containers_not_allocated_over_quota metric.
This also applies the "unlock if previously locked but now delayed due
to SupervisorFraction" logic to supervisor processes, which was
previously overlooked. This prevents supervisors from staying in
Locked state after being bumped by higher-priority containers.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index dda3630ee..e6b1b06a2 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -76,7 +76,8 @@ func (sch *Scheduler) runQueue() {
}).Debug("runQueue")
dontstart := map[arvados.InstanceType]bool{}
- var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+ var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+ var overmaxsuper []container.QueueEnt // unmappable because max supervisors (these are not included in overquota)
var containerAllocatedWorkerBootingCount int
// trying is #containers running + #containers we're trying to
@@ -87,8 +88,8 @@ func (sch *Scheduler) runQueue() {
supervisors := 0
tryrun:
- for i, ctr := range sorted {
- ctr, it := ctr.Container, ctr.InstanceType
+ for i, ent := range sorted {
+ ctr, it := ent.Container, ent.InstanceType
logger := sch.logger.WithFields(logrus.Fields{
"ContainerUUID": ctr.UUID,
"InstanceType": it.Name,
@@ -96,6 +97,7 @@ tryrun:
if ctr.SchedulingParameters.Supervisor {
supervisors += 1
if sch.maxSupervisors > 0 && supervisors > sch.maxSupervisors {
+ overmaxsuper = append(overmaxsuper, sorted[i])
continue
}
}
@@ -173,14 +175,14 @@ tryrun:
}
sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
- sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+ sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota) + len(overmaxsuper)))
- if len(overquota) > 0 {
+ if len(overquota)+len(overmaxsuper) > 0 {
// Unlock any containers that are unmappable while
// we're at quota (but if they have already been
// scheduled and they're loading docker images etc.,
// let them run).
- for _, ctr := range overquota {
+ for _, ctr := range append(overmaxsuper, overquota...) {
ctr := ctr.Container
_, toolate := running[ctr.UUID]
if ctr.State == arvados.ContainerStateLocked && !toolate {
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list