[ARVADOS] created: 1.1.3-245-g0dcb389
Git user
git at public.curoverse.com
Tue Mar 27 13:07:47 EDT 2018
at 0dcb3892ad4e2a5a8954b52f18c0db56902db6c9 (commit)
commit 0dcb3892ad4e2a5a8954b52f18c0db56902db6c9
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Mar 27 13:01:38 2018 -0400
13278: Release jobs that are pending because "launch failed".
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index ee79c6f..165f13e 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -157,7 +157,7 @@ func (sqc *SqueueChecker) check() {
replacing.nice = n
newq[uuid] = replacing
- if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+ if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
// When using SLURM 14.x or 15.x, our queued
// jobs land in this state when "scontrol
// reconfigure" invalidates their feature
@@ -171,6 +171,10 @@ func (sqc *SqueueChecker) check() {
// reappeared, so rather than second-guessing
// whether SLURM is ready, we just keep trying
// this until it works.
+ //
+ // "launch failed requeued held" seems to be
+ // another manifestation of this problem,
+ // resolved the same way.
sqc.Slurm.Release(uuid)
}
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list