[ARVADOS] updated: 1.1.3-148-g665d2c0
Git user
git at public.curoverse.com
Wed Mar 7 09:37:01 EST 2018
Summary of changes:
services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go | 18 +++++++++++------
services/crunch-dispatch-slurm/slurm.go | 9 +++++++--
services/crunch-dispatch-slurm/squeue.go | 23 +++++++++++++++++++---
services/crunch-dispatch-slurm/squeue_test.go | 12 +++++------
services/nodemanager/arvnodeman/jobqueue.py | 2 +-
5 files changed, 46 insertions(+), 18 deletions(-)
via 665d2c06f2c45fe4d08bd6ba630259e53c03b805 (commit)
via f7d0830ae819e2a62115642be449fa79f2fc8152 (commit)
from 53dc92af621bb2064e922b82e88572f5180c503a (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 665d2c06f2c45fe4d08bd6ba630259e53c03b805
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Wed Mar 7 09:19:40 2018 -0500
13078: Accommodate old SLURM versions.
In SLURM 14 and 15, if a queued job has feature constraints which
become invalid (e.g., when "scontrol reconfigure" clears all node
features), the job is put on hold with priority=0, and it stays in
this state even after the features reappear. "scontrol release"
recovers a job from this state.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 6852fc4..a5de3bb 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -55,10 +55,11 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
}
type slurmFake struct {
- didBatch [][]string
- didCancel []string
- didRenice [][]string
- queue string
+ didBatch [][]string
+ didCancel []string
+ didRelease []int64
+ didRenice [][]string
+ queue string
// If non-nil, run this func during the 2nd+ call to Cancel()
onCancel func()
// Error returned by Batch()
@@ -74,6 +75,11 @@ func (sf *slurmFake) QueueCommand(args []string) *exec.Cmd {
return exec.Command("echo", sf.queue)
}
+func (sf *slurmFake) Release(jobid int64) error {
+ sf.didRelease = append(sf.didRelease, jobid)
+ return nil
+}
+
func (sf *slurmFake) Renice(name string, nice int64) error {
sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
return nil
@@ -151,7 +157,7 @@ func (s *IntegrationSuite) integrationTest(c *C,
}
func (s *IntegrationSuite) TestNormal(c *C) {
- s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100\n"}
+ s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
container := s.integrationTest(c,
nil,
func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
@@ -163,7 +169,7 @@ func (s *IntegrationSuite) TestNormal(c *C) {
}
func (s *IntegrationSuite) TestCancel(c *C) {
- s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100\n"}
+ s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
readyToCancel := make(chan bool)
s.slurm.onCancel = func() { <-readyToCancel }
container := s.integrationTest(c,
diff --git a/services/crunch-dispatch-slurm/slurm.go b/services/crunch-dispatch-slurm/slurm.go
index 735e057..2e65292 100644
--- a/services/crunch-dispatch-slurm/slurm.go
+++ b/services/crunch-dispatch-slurm/slurm.go
@@ -13,10 +13,11 @@ import (
)
type Slurm interface {
+ Batch(script io.Reader, args []string) error
Cancel(name string) error
- Renice(name string, nice int64) error
QueueCommand(args []string) *exec.Cmd
- Batch(script io.Reader, args []string) error
+ Release(jobid int64) error
+ Renice(name string, nice int64) error
}
type slurmCLI struct{}
@@ -54,6 +55,10 @@ func (scli *slurmCLI) QueueCommand(args []string) *exec.Cmd {
return exec.Command("squeue", args...)
}
+func (scli *slurmCLI) Release(jobid int64) error {
+ return scli.run(nil, "scontrol", []string{"release", fmt.Sprintf("%d", jobid)})
+}
+
func (scli *slurmCLI) Renice(name string, nice int64) error {
return scli.run(nil, "scontrol", []string{"update", "JobName=" + name, fmt.Sprintf("Nice=%d", nice)})
}
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 8862d16..1cea626 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -121,7 +121,7 @@ func (sqc *SqueueChecker) check() {
sqc.L.Lock()
defer sqc.L.Unlock()
- cmd := sqc.Slurm.QueueCommand([]string{"--all", "--noheader", "--format=%j %y %Q"})
+ cmd := sqc.Slurm.QueueCommand([]string{"--all", "--noheader", "--format=%j %y %Q %T %r"})
stdout, stderr := &bytes.Buffer{}, &bytes.Buffer{}
cmd.Stdout, cmd.Stderr = stdout, stderr
if err := cmd.Run(); err != nil {
@@ -135,9 +135,9 @@ func (sqc *SqueueChecker) check() {
if line == "" {
continue
}
- var uuid string
+ var uuid, state, reason string
var n, p int64
- if _, err := fmt.Sscan(line, &uuid, &n, &p); err != nil {
+ if _, err := fmt.Sscan(line, &uuid, &n, &p, &state, &reason); err != nil {
log.Printf("warning: ignoring unparsed line in squeue output: %q", line)
continue
}
@@ -148,6 +148,23 @@ func (sqc *SqueueChecker) check() {
replacing.priority = p
replacing.nice = n
newq[uuid] = replacing
+
+ if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+ // When using SLURM 14.x or 15.x, our queued
+ // jobs land in this state when "scontrol
+ // reconfigure" invalidates their feature
+ // constraints by clearing all node features.
+ // They stay in this state even after the
+ // features reappear, until we run "scontrol
+ // release {jobid}".
+ //
+ // "scontrol release" is silent and successful
+ // regardless of whether the features have
+ // reappeared, so rather than second-guessing
+ // whether SLURM is ready, we just keep trying
+ // this until it works.
+ sqc.Slurm.Release(n)
+ }
}
sqc.queue = newq
sqc.Broadcast()
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index 11f7c48..ba257ea 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -24,31 +24,31 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
}{
{
spread: 1,
- squeue: uuids[0] + " 10000 4294000000\n",
+ squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n",
want: map[string]int64{uuids[0]: 1},
expect: [][]string{{uuids[0], "0"}},
},
{ // fake0 priority is too high
spread: 1,
- squeue: uuids[0] + " 10000 4294000777\n" + uuids[1] + " 10000 4294000444\n",
+ squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
want: map[string]int64{uuids[0]: 1, uuids[1]: 999},
expect: [][]string{{uuids[1], "0"}, {uuids[0], "334"}},
},
{ // specify spread
spread: 100,
- squeue: uuids[0] + " 10000 4294000777\n" + uuids[1] + " 10000 4294000444\n",
+ squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
want: map[string]int64{uuids[0]: 1, uuids[1]: 999},
expect: [][]string{{uuids[1], "0"}, {uuids[0], "433"}},
},
{ // ignore fake2 because SetPriority() not called
spread: 1,
- squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 4294000222\n",
+ squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 4294000222 PENDING Resources\n",
want: map[string]int64{uuids[0]: 999, uuids[1]: 1},
expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
},
{ // ignore fake2 because slurm priority=0
spread: 1,
- squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 0\n",
+ squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING Resources\n",
want: map[string]int64{uuids[0]: 999, uuids[1]: 1, uuids[2]: 997},
expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
},
@@ -103,7 +103,7 @@ func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
for {
select {
case <-tick.C:
- slurm.queue = uuidGood + " 0 12345\n"
+ slurm.queue = uuidGood + " 0 12345 PENDING Resources\n"
sqc.check()
// Avoid immediately selecting this case again
commit f7d0830ae819e2a62115642be449fa79f2fc8152
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Wed Mar 7 09:17:56 2018 -0500
13078: Fix ignoring queued jobs with reason=BadConstraints.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 0360bfc..d002a1d 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -163,7 +163,7 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
continue
if '-dz642-' not in jobname:
continue
- if not re.search(r'ReqNodeNotAvail|Resources|Priority', reason):
+ if not re.search(r'BadConstraints|ReqNodeNotAvail|Resources|Priority', reason):
continue
for feature in features.split(','):
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list