[ARVADOS] created: 1.3.0-2844-g3aa3fb78a

Git user git at public.arvados.org
Tue Aug 4 20:09:09 UTC 2020


        at  3aa3fb78afa46e98c9be345045f4fea9fea0f08c (commit)


commit 3aa3fb78afa46e98c9be345045f4fea9fea0f08c
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Tue Aug 4 16:06:29 2020 -0400

    16663: Don't kill orphaned containers when unprobed nodes exist.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index de69df982..116ca7643 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 
 	"git.arvados.org/arvados.git/lib/dispatchcloud/container"
+	"git.arvados.org/arvados.git/lib/dispatchcloud/worker"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"github.com/sirupsen/logrus"
 )
@@ -23,6 +24,7 @@ import (
 // Running containers whose crunch-run processes have exited are
 // cancelled.
 func (sch *Scheduler) sync() {
+	anyUnknownWorkers := sch.pool.CountWorkers()[worker.StateUnknown] > 0
 	running := sch.pool.Running()
 	qEntries, qUpdated := sch.queue.Entries()
 	for uuid, ent := range qEntries {
@@ -30,7 +32,9 @@ func (sch *Scheduler) sync() {
 		switch ent.Container.State {
 		case arvados.ContainerStateRunning:
 			if !running {
-				go sch.cancel(uuid, "not running on any worker")
+				if !anyUnknownWorkers {
+					go sch.cancel(uuid, "not running on any worker")
+				}
 			} else if !exited.IsZero() && qUpdated.After(exited) {
 				go sch.cancel(uuid, "state=Running after crunch-run exited")
 			} else if ent.Container.Priority == 0 {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list