[ARVADOS] updated: 1.1.4-223-gf39ddfd
Git user
git at public.curoverse.com
Mon May 7 10:40:28 EDT 2018
Summary of changes:
services/crunch-dispatch-slurm/squeue.go | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
via f39ddfd2ab8479c6c6bbef44fc479a6f20aa2527 (commit)
via 016ac3cd9e885192e069ba314593de64bb9e94b1 (commit)
from 524c20020594ba67a2a822eccb632f8a5f5dc3ce (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit f39ddfd2ab8479c6c6bbef44fc479a6f20aa2527
Merge: 524c200 016ac3c
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Mon May 7 10:40:17 2018 -0400
Merge branch '13278-launch-failed'
refs #13278
refs #13399
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
commit 016ac3cd9e885192e069ba314593de64bb9e94b1
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Mar 27 13:01:38 2018 -0400
13278: Release jobs that are pending because "launch failed".
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 9514da8..742943f 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -157,7 +157,7 @@ func (sqc *SqueueChecker) check() {
replacing.nice = n
newq[uuid] = replacing
- if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+ if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
// When using SLURM 14.x or 15.x, our queued
// jobs land in this state when "scontrol
// reconfigure" invalidates their feature
@@ -171,6 +171,10 @@ func (sqc *SqueueChecker) check() {
// reappeared, so rather than second-guessing
// whether SLURM is ready, we just keep trying
// this until it works.
+ //
+ // "launch failed requeued held" seems to be
+ // another manifestation of this problem,
+ // resolved the same way.
log.Printf("releasing held job %q", uuid)
sqc.Slurm.Release(uuid)
} else if p < 1<<20 && replacing.wantPriority > 0 {
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list