[ARVADOS] updated: 1.1.4-550-g24dd4edbe
Git user
git at public.curoverse.com
Tue Jul 17 17:06:29 EDT 2018
Summary of changes:
services/crunch-dispatch-slurm/squeue.go | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
discards 36e0cd4cac9905d7626d13c9b1be8e4f46ad7926 (commit)
via 24dd4edbe843a0638a4f02bd84ab980230af6e1b (commit)
via bc2c281b6558f4d330f45e0174d9e44e3e9fd54d (commit)
via 8668f0edf5a0fba7c268343d240678bf5c0d98c0 (commit)
This update added new revisions after undoing existing revisions. That is
to say, the old revision is not a strict subset of the new revision. This
situation occurs when you --force push a change and generate a repository
containing something like this:
 * -- * -- B -- O -- O -- O (36e0cd4cac9905d7626d13c9b1be8e4f46ad7926)
            \
             N -- N -- N (24dd4edbe843a0638a4f02bd84ab980230af6e1b)
When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 24dd4edbe843a0638a4f02bd84ab980230af6e1b
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Jul 17 17:04:05 2018 -0400
13399: Link to wiki in warning message.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 4fa004e42..fd4851eb0 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -115,7 +115,7 @@ func (sqc *SqueueChecker) reniceAll() {
}
err := sqc.Slurm.Renice(job.uuid, niceNew)
if err != nil && niceNew > slurm15NiceLimit && strings.Contains(err.Error(), "Invalid nice value") {
- log.Printf("container %q clamping nice values at %d, priority order will not be correct", job.uuid, slurm15NiceLimit)
+ log.Printf("container %q clamping nice values at %d, priority order will not be correct -- see https://dev.arvados.org/projects/arvados/wiki/SLURM_integration#Limited-nice-values-SLURM-15", job.uuid, slurm15NiceLimit)
job.hitNiceLimit = true
}
}
commit bc2c281b6558f4d330f45e0174d9e44e3e9fd54d
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Jul 17 15:32:45 2018 -0400
13399: Recognize held jobs with priority between 1 and 20K.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 4067d27d5..4fa004e42 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -168,14 +168,17 @@ func (sqc *SqueueChecker) check() {
replacing.nice = n
newq[uuid] = replacing
- if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
+ if state == "PENDING" && ((reason == "BadConstraints" && p <= 2*slurm15NiceLimit) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
// When using SLURM 14.x or 15.x, our queued
// jobs land in this state when "scontrol
// reconfigure" invalidates their feature
// constraints by clearing all node features.
// They stay in this state even after the
// features reappear, until we run "scontrol
- // release {jobid}".
+ // release {jobid}". Priority is usually 0 in
+ // this state, but sometimes (due to a race
+ // with nice adjustments?) it's a small
+ // positive value.
//
// "scontrol release" is silent and successful
// regardless of whether the features have
@@ -186,7 +189,7 @@ func (sqc *SqueueChecker) check() {
// "launch failed requeued held" seems to be
// another manifestation of this problem,
// resolved the same way.
- log.Printf("releasing held job %q", uuid)
+ log.Printf("releasing held job %q (priority=%d, state=%q, reason=%q)", uuid, p, state, reason)
sqc.Slurm.Release(uuid)
} else if p < 1<<20 && replacing.wantPriority > 0 {
log.Printf("warning: job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason)
commit 8668f0edf5a0fba7c268343d240678bf5c0d98c0
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Jul 3 17:03:11 2018 -0400
13399: If slurm refuses to renice past 10K, stop trying.
Fixes log spam.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 23a8a0ca0..719ec98d2 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -55,11 +55,12 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
}
type slurmFake struct {
- didBatch [][]string
- didCancel []string
- didRelease []string
- didRenice [][]string
- queue string
+ didBatch [][]string
+ didCancel []string
+ didRelease []string
+ didRenice [][]string
+ queue string
+ rejectNice10K bool
// If non-nil, run this func during the 2nd+ call to Cancel()
onCancel func()
// Error returned by Batch()
@@ -82,6 +83,9 @@ func (sf *slurmFake) Release(name string) error {
func (sf *slurmFake) Renice(name string, nice int64) error {
sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
+ if sf.rejectNice10K && nice > 10000 {
+ return errors.New("scontrol: error: Invalid nice value, must be between -10000 and 10000")
+ }
return nil
}
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 742943f19..4067d27d5 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -14,11 +14,14 @@ import (
"time"
)
+const slurm15NiceLimit int64 = 10000
+
type slurmJob struct {
uuid string
wantPriority int64
priority int64 // current slurm priority (incorporates nice value)
nice int64 // current slurm nice value
+ hitNiceLimit bool
}
// Squeue implements asynchronous polling monitor of the SLURM queue using the
@@ -103,10 +106,18 @@ func (sqc *SqueueChecker) reniceAll() {
})
renice := wantNice(jobs, sqc.PrioritySpread)
for i, job := range jobs {
- if renice[i] == job.nice {
+ niceNew := renice[i]
+ if job.hitNiceLimit && niceNew > slurm15NiceLimit {
+ niceNew = slurm15NiceLimit
+ }
+ if niceNew == job.nice {
continue
}
- sqc.Slurm.Renice(job.uuid, renice[i])
+ err := sqc.Slurm.Renice(job.uuid, niceNew)
+ if err != nil && niceNew > slurm15NiceLimit && strings.Contains(err.Error(), "Invalid nice value") {
+ log.Printf("container %q clamping nice values at %d, priority order will not be correct", job.uuid, slurm15NiceLimit)
+ job.hitNiceLimit = true
+ }
}
}
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index c9329fdf9..ef036dabd 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -103,6 +103,50 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
}
}
+// If a limited nice range prevents desired priority adjustments, give
+// up and clamp nice to 10K.
+func (s *SqueueSuite) TestReniceInvalidNiceValue(c *C) {
+ uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
+ slurm := &slurmFake{
+ queue: uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 0 4294555222 PENDING Resources\n",
+ rejectNice10K: true,
+ }
+ sqc := &SqueueChecker{
+ Slurm: slurm,
+ PrioritySpread: 1,
+ Period: time.Hour,
+ }
+ sqc.startOnce.Do(sqc.start)
+ sqc.check()
+ sqc.SetPriority(uuids[0], 2)
+ sqc.SetPriority(uuids[1], 1)
+
+ // First attempt should renice to 555001, which will fail
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}})
+
+ // Next attempt should renice to 10K, which will succeed
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+ // ...so we'll change the squeue response to reflect the
+ // updated priority+nice, and make sure sqc sees that...
+ slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294545222 PENDING Resources\n"
+ sqc.check()
+
+ // Next attempt should leave nice alone because it's already
+ // at the 10K limit
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+
+ // Back to normal if desired nice value falls below 10K
+ slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n"
+ sqc.check()
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}, {uuids[1], "9890"}})
+
+ sqc.Stop()
+}
+
// If the given UUID isn't in the slurm queue yet, SetPriority()
// should wait for it to appear on the very next poll, then give up.
func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list