[ARVADOS] created: 1.1.3-151-g92474d9
Git user
git at public.curoverse.com
Wed Mar 7 18:12:44 EST 2018
at 92474d902612ad28b3f9e5b5c21de08162d23ffb (commit)
commit 92474d902612ad28b3f9e5b5c21de08162d23ffb
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Wed Mar 7 17:54:35 2018 -0500
13078: Add test case for "release job held because BadConstraints".
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index ba257ea..c9329fd 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -14,6 +14,36 @@ var _ = Suite(&SqueueSuite{})
type SqueueSuite struct{}
+func (s *SqueueSuite) TestReleasePending(c *C) {
+ uuids := []string{
+ "zzzzz-dz642-fake0fake0fake0",
+ "zzzzz-dz642-fake1fake1fake1",
+ "zzzzz-dz642-fake2fake2fake2",
+ }
+ slurm := &slurmFake{
+ queue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING BadConstraints\n",
+ }
+ sqc := &SqueueChecker{
+ Slurm: slurm,
+ Period: time.Hour,
+ }
+ sqc.startOnce.Do(sqc.start)
+ defer sqc.Stop()
+
+ done := make(chan struct{})
+ go func() {
+ for _, u := range uuids {
+ sqc.SetPriority(u, 1)
+ }
+ close(done)
+ }()
+ callUntilReady(sqc.check, done)
+
+ slurm.didRelease = nil
+ sqc.check()
+ c.Check(slurm.didRelease, DeepEquals, []string{uuids[2]})
+}
+
func (s *SqueueSuite) TestReniceAll(c *C) {
uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
for _, test := range []struct {
@@ -123,3 +153,16 @@ func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
}
}
}
+
+func callUntilReady(fn func(), done <-chan struct{}) {
+ tick := time.NewTicker(time.Millisecond)
+ defer tick.Stop()
+ for {
+ select {
+ case <-done:
+ return
+ case <-tick.C:
+ fn()
+ }
+ }
+}
commit bab7938e562be6ce1305c15faaffb43aef67ff77
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Wed Mar 7 10:09:20 2018 -0500
13078: Update comment.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index d002a1d..20849c9 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -153,7 +153,7 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
def _send_request(self):
queuelist = []
if self.slurm_queue:
- # cpus, memory, tempory disk space, reason, job name
+ # cpus, memory, tempory disk space, reason, job name, feature constraints
squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f"])
for out in squeue_out.splitlines():
try:
commit 55573affe3611054b88a17d3cc89c21b8bfa14e7
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Wed Mar 7 09:43:28 2018 -0500
13078: Reduce "feature kludge" poll interval.
Whenever a new node comes online, "scontrol reconfigure" invalidates
all jobs that could be scheduled onto that node *or any other node* so
it's worthwhile to recover from this quickly.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index ed33260..2ca4050 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -87,7 +87,7 @@ func SlurmNodeTypeFeatureKludge(cc *arvados.Cluster) {
}
for {
slurmKludge(features)
- time.Sleep(time.Minute)
+ time.Sleep(2 * time.Second)
}
}
commit e1f97dcf68d197525228781e2a861cc3e64a0231
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Wed Mar 7 09:19:40 2018 -0500
13078: Fix jobs stuck in "held" state in old SLURM versions.
In SLURM 14 and 15, if a queued job has feature constraints which
become invalid (e.g., when "scontrol reconfigure" clears all node
features), the job is put on hold with priority=0, and it stays in
this state even after the features reappear. "scontrol release"
recovers a job from this state.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 6852fc4..4211026 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -55,10 +55,11 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
}
type slurmFake struct {
- didBatch [][]string
- didCancel []string
- didRenice [][]string
- queue string
+ didBatch [][]string
+ didCancel []string
+ didRelease []string
+ didRenice [][]string
+ queue string
// If non-nil, run this func during the 2nd+ call to Cancel()
onCancel func()
// Error returned by Batch()
@@ -74,6 +75,11 @@ func (sf *slurmFake) QueueCommand(args []string) *exec.Cmd {
return exec.Command("echo", sf.queue)
}
+func (sf *slurmFake) Release(name string) error {
+ sf.didRelease = append(sf.didRelease, name)
+ return nil
+}
+
func (sf *slurmFake) Renice(name string, nice int64) error {
sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
return nil
@@ -151,7 +157,7 @@ func (s *IntegrationSuite) integrationTest(c *C,
}
func (s *IntegrationSuite) TestNormal(c *C) {
- s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100\n"}
+ s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
container := s.integrationTest(c,
nil,
func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
@@ -163,7 +169,7 @@ func (s *IntegrationSuite) TestNormal(c *C) {
}
func (s *IntegrationSuite) TestCancel(c *C) {
- s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100\n"}
+ s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
readyToCancel := make(chan bool)
s.slurm.onCancel = func() { <-readyToCancel }
container := s.integrationTest(c,
diff --git a/services/crunch-dispatch-slurm/slurm.go b/services/crunch-dispatch-slurm/slurm.go
index 735e057..9e9f452 100644
--- a/services/crunch-dispatch-slurm/slurm.go
+++ b/services/crunch-dispatch-slurm/slurm.go
@@ -13,10 +13,11 @@ import (
)
type Slurm interface {
+ Batch(script io.Reader, args []string) error
Cancel(name string) error
- Renice(name string, nice int64) error
QueueCommand(args []string) *exec.Cmd
- Batch(script io.Reader, args []string) error
+ Release(name string) error
+ Renice(name string, nice int64) error
}
type slurmCLI struct{}
@@ -54,6 +55,10 @@ func (scli *slurmCLI) QueueCommand(args []string) *exec.Cmd {
return exec.Command("squeue", args...)
}
+func (scli *slurmCLI) Release(name string) error {
+ return scli.run(nil, "scontrol", []string{"release", "Name=" + name})
+}
+
func (scli *slurmCLI) Renice(name string, nice int64) error {
return scli.run(nil, "scontrol", []string{"update", "JobName=" + name, fmt.Sprintf("Nice=%d", nice)})
}
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 8862d16..365b4e8 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -121,7 +121,7 @@ func (sqc *SqueueChecker) check() {
sqc.L.Lock()
defer sqc.L.Unlock()
- cmd := sqc.Slurm.QueueCommand([]string{"--all", "--noheader", "--format=%j %y %Q"})
+ cmd := sqc.Slurm.QueueCommand([]string{"--all", "--noheader", "--format=%j %y %Q %T %r"})
stdout, stderr := &bytes.Buffer{}, &bytes.Buffer{}
cmd.Stdout, cmd.Stderr = stdout, stderr
if err := cmd.Run(); err != nil {
@@ -135,9 +135,9 @@ func (sqc *SqueueChecker) check() {
if line == "" {
continue
}
- var uuid string
+ var uuid, state, reason string
var n, p int64
- if _, err := fmt.Sscan(line, &uuid, &n, &p); err != nil {
+ if _, err := fmt.Sscan(line, &uuid, &n, &p, &state, &reason); err != nil {
log.Printf("warning: ignoring unparsed line in squeue output: %q", line)
continue
}
@@ -148,6 +148,23 @@ func (sqc *SqueueChecker) check() {
replacing.priority = p
replacing.nice = n
newq[uuid] = replacing
+
+ if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+ // When using SLURM 14.x or 15.x, our queued
+ // jobs land in this state when "scontrol
+ // reconfigure" invalidates their feature
+ // constraints by clearing all node features.
+ // They stay in this state even after the
+ // features reappear, until we run "scontrol
+ // release {jobid}".
+ //
+ // "scontrol release" is silent and successful
+ // regardless of whether the features have
+ // reappeared, so rather than second-guessing
+ // whether SLURM is ready, we just keep trying
+ // this until it works.
+ sqc.Slurm.Release(uuid)
+ }
}
sqc.queue = newq
sqc.Broadcast()
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index 11f7c48..ba257ea 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -24,31 +24,31 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
}{
{
spread: 1,
- squeue: uuids[0] + " 10000 4294000000\n",
+ squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n",
want: map[string]int64{uuids[0]: 1},
expect: [][]string{{uuids[0], "0"}},
},
{ // fake0 priority is too high
spread: 1,
- squeue: uuids[0] + " 10000 4294000777\n" + uuids[1] + " 10000 4294000444\n",
+ squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
want: map[string]int64{uuids[0]: 1, uuids[1]: 999},
expect: [][]string{{uuids[1], "0"}, {uuids[0], "334"}},
},
{ // specify spread
spread: 100,
- squeue: uuids[0] + " 10000 4294000777\n" + uuids[1] + " 10000 4294000444\n",
+ squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
want: map[string]int64{uuids[0]: 1, uuids[1]: 999},
expect: [][]string{{uuids[1], "0"}, {uuids[0], "433"}},
},
{ // ignore fake2 because SetPriority() not called
spread: 1,
- squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 4294000222\n",
+ squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 4294000222 PENDING Resources\n",
want: map[string]int64{uuids[0]: 999, uuids[1]: 1},
expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
},
{ // ignore fake2 because slurm priority=0
spread: 1,
- squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 0\n",
+ squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING Resources\n",
want: map[string]int64{uuids[0]: 999, uuids[1]: 1, uuids[2]: 997},
expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
},
@@ -103,7 +103,7 @@ func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
for {
select {
case <-tick.C:
- slurm.queue = uuidGood + " 0 12345\n"
+ slurm.queue = uuidGood + " 0 12345 PENDING Resources\n"
sqc.check()
// Avoid immediately selecting this case again
commit f7d0830ae819e2a62115642be449fa79f2fc8152
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Wed Mar 7 09:17:56 2018 -0500
13078: Fix ignoring queued jobs with reason=BadConstraints.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 0360bfc..d002a1d 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -163,7 +163,7 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
continue
if '-dz642-' not in jobname:
continue
- if not re.search(r'ReqNodeNotAvail|Resources|Priority', reason):
+ if not re.search(r'BadConstraints|ReqNodeNotAvail|Resources|Priority', reason):
continue
for feature in features.split(','):
commit 53dc92af621bb2064e922b82e88572f5180c503a
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Mar 6 17:14:24 2018 -0500
13078: Ignore held jobs with priority=0 (e.g., SLURM 15.x).
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index df3c868..8862d16 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -79,6 +79,14 @@ func (sqc *SqueueChecker) reniceAll() {
// (perhaps it's not an Arvados job)
continue
}
+ if j.priority == 0 {
+ // SLURM <= 15.x implements "hold" by setting
+ // priority to 0. If we include held jobs
+ // here, we'll end up trying to push other
+ // jobs below them using negative priority,
+ // which won't help anything.
+ continue
+ }
jobs = append(jobs, j)
}
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index 694a4d6..11f7c48 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -46,6 +46,12 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
want: map[string]int64{uuids[0]: 999, uuids[1]: 1},
expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
},
+ { // ignore fake2 because slurm priority=0
+ spread: 1,
+ squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 0\n",
+ want: map[string]int64{uuids[0]: 999, uuids[1]: 1, uuids[2]: 997},
+ expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
+ },
} {
c.Logf("spread=%d squeue=%q want=%v -> expect=%v", test.spread, test.squeue, test.want, test.expect)
slurm := &slurmFake{
commit a67b550e12bcf12a8f67ef1472b2dce013747712
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Mar 6 17:11:35 2018 -0500
13078: Remove debug log.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index c0b2e41..df3c868 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -90,7 +90,6 @@ func (sqc *SqueueChecker) reniceAll() {
if renice[i] == job.nice {
continue
}
- log.Printf("updating slurm priority for %q: nice %d => %d", job.uuid, job.nice, renice[i])
sqc.Slurm.Renice(job.uuid, renice[i])
}
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list