[ARVADOS] created: 1.1.3-245-g0dcb389

Git user git at public.curoverse.com
Tue Mar 27 13:07:47 EDT 2018


        at  0dcb3892ad4e2a5a8954b52f18c0db56902db6c9 (commit)


commit 0dcb3892ad4e2a5a8954b52f18c0db56902db6c9
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Mar 27 13:01:38 2018 -0400

    13278: Release jobs that are pending because "launch failed".
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index ee79c6f..165f13e 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -157,7 +157,7 @@ func (sqc *SqueueChecker) check() {
 		replacing.nice = n
 		newq[uuid] = replacing
 
-		if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+		if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
 			// When using SLURM 14.x or 15.x, our queued
 			// jobs land in this state when "scontrol
 			// reconfigure" invalidates their feature
@@ -171,6 +171,10 @@ func (sqc *SqueueChecker) check() {
 			// reappeared, so rather than second-guessing
 			// whether SLURM is ready, we just keep trying
 			// this until it works.
+			//
+			// "launch failed requeued held" seems to be
+			// another manifestation of this problem,
+			// resolved the same way.
 			sqc.Slurm.Release(uuid)
 		}
 	}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list