[arvados] created: 2.6.0-94-gafe02764a

git repository hosting git at public.arvados.org
Tue May 2 21:19:22 UTC 2023


        at  afe02764a9209c0c4b0ec75df52b4851ec8ce01f (commit)


commit afe02764a9209c0c4b0ec75df52b4851ec8ce01f
Author: Tom Clegg <tom at curii.com>
Date:   Tue May 2 17:16:05 2023 -0400

    20457: Include delayed supervisor containers in overquota metric.
    
    Previously, supervisor containers that had high enough priority to
    run, but weren't scheduled because of SupervisorFraction, were not
    counted in the containers_not_allocated_over_quota metric. This caused the
    "overquota" metric to show a misleading time series as non-supervisor
    containers made their way through the queue and the delayed supervisor
    containers flapped between "not allocated because quota" (counted) and
    "not allocated because SupervisorFraction" (not counted).
    
    With this change, un-mappable supervisors always count toward the
    containers_not_allocated_over_quota metric.
    
    This also applies the "unlock if previously locked but now delayed due
    to SupervisorFraction" logic to supervisor processes, which was
    previously overlooked. This prevents supervisors from staying in
    Locked state after being bumped by higher-priority containers.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index dda3630ee..e6b1b06a2 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -76,7 +76,8 @@ func (sch *Scheduler) runQueue() {
 	}).Debug("runQueue")
 
 	dontstart := map[arvados.InstanceType]bool{}
-	var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+	var overquota []container.QueueEnt    // entries that are unmappable because of worker pool quota
+	var overmaxsuper []container.QueueEnt // unmappable because max supervisors (these are not included in overquota)
 	var containerAllocatedWorkerBootingCount int
 
 	// trying is #containers running + #containers we're trying to
@@ -87,8 +88,8 @@ func (sch *Scheduler) runQueue() {
 	supervisors := 0
 
 tryrun:
-	for i, ctr := range sorted {
-		ctr, it := ctr.Container, ctr.InstanceType
+	for i, ent := range sorted {
+		ctr, it := ent.Container, ent.InstanceType
 		logger := sch.logger.WithFields(logrus.Fields{
 			"ContainerUUID": ctr.UUID,
 			"InstanceType":  it.Name,
@@ -96,6 +97,7 @@ tryrun:
 		if ctr.SchedulingParameters.Supervisor {
 			supervisors += 1
 			if sch.maxSupervisors > 0 && supervisors > sch.maxSupervisors {
+				overmaxsuper = append(overmaxsuper, sorted[i])
 				continue
 			}
 		}
@@ -173,14 +175,14 @@ tryrun:
 	}
 
 	sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
-	sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+	sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota) + len(overmaxsuper)))
 
-	if len(overquota) > 0 {
+	if len(overquota)+len(overmaxsuper) > 0 {
 		// Unlock any containers that are unmappable while
 		// we're at quota (but if they have already been
 		// scheduled and they're loading docker images etc.,
 		// let them run).
-		for _, ctr := range overquota {
+		for _, ctr := range append(overmaxsuper, overquota...) {
 			ctr := ctr.Container
 			_, toolate := running[ctr.UUID]
 			if ctr.State == arvados.ContainerStateLocked && !toolate {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list