[ARVADOS] created: 1.1.4-548-g36e0cd4ca
Git user
git at public.curoverse.com
Tue Jul 3 17:04:45 EDT 2018
at 36e0cd4cac9905d7626d13c9b1be8e4f46ad7926 (commit)
commit 36e0cd4cac9905d7626d13c9b1be8e4f46ad7926
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Jul 3 17:03:11 2018 -0400
13399: If slurm refuses to renice past 10K, stop trying.
Fixes log spam.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 23a8a0ca0..719ec98d2 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -55,11 +55,12 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
}
type slurmFake struct {
- didBatch [][]string
- didCancel []string
- didRelease []string
- didRenice [][]string
- queue string
+ didBatch [][]string
+ didCancel []string
+ didRelease []string
+ didRenice [][]string
+ queue string
+ rejectNice10K bool
// If non-nil, run this func during the 2nd+ call to Cancel()
onCancel func()
// Error returned by Batch()
@@ -82,6 +83,9 @@ func (sf *slurmFake) Release(name string) error {
func (sf *slurmFake) Renice(name string, nice int64) error {
sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
+ if sf.rejectNice10K && nice > 10000 {
+ return errors.New("scontrol: error: Invalid nice value, must be between -10000 and 10000")
+ }
return nil
}
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 742943f19..12936a7e2 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -19,6 +19,7 @@ type slurmJob struct {
wantPriority int64
priority int64 // current slurm priority (incorporates nice value)
nice int64 // current slurm nice value
+ hitNiceLimit bool
}
// Squeue implements asynchronous polling monitor of the SLURM queue using the
@@ -101,12 +102,21 @@ func (sqc *SqueueChecker) reniceAll() {
return jobs[i].uuid > jobs[j].uuid
}
})
+ const niceLimit int64 = 10000
renice := wantNice(jobs, sqc.PrioritySpread)
for i, job := range jobs {
- if renice[i] == job.nice {
+ niceNew := renice[i]
+ if job.hitNiceLimit && niceNew > niceLimit {
+ niceNew = niceLimit
+ }
+ if niceNew == job.nice {
continue
}
- sqc.Slurm.Renice(job.uuid, renice[i])
+ err := sqc.Slurm.Renice(job.uuid, niceNew)
+ if err != nil && niceNew > niceLimit && strings.Contains(err.Error(), "Invalid nice value") {
+ log.Printf("container %q clamping nice values at %d, priority order will not be correct", job.uuid, niceLimit)
+ job.hitNiceLimit = true
+ }
}
}
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index c9329fdf9..ef036dabd 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -103,6 +103,50 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
}
}
+// If a limited nice range prevents desired priority adjustments, give
+// up and clamp nice to 10K.
+func (s *SqueueSuite) TestReniceInvalidNiceValue(c *C) {
+ uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
+ slurm := &slurmFake{
+ queue: uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 0 4294555222 PENDING Resources\n",
+ rejectNice10K: true,
+ }
+ sqc := &SqueueChecker{
+ Slurm: slurm,
+ PrioritySpread: 1,
+ Period: time.Hour,
+ }
+ sqc.startOnce.Do(sqc.start)
+ sqc.check()
+ sqc.SetPriority(uuids[0], 2)
+ sqc.SetPriority(uuids[1], 1)
+
+ // First attempt should renice to 555001, which will fail
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}})
+
+ // Next attempt should renice to 10K, which will succeed
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+ // ...so we'll change the squeue response to reflect the
+ // updated priority+nice, and make sure sqc sees that...
+ slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294545222 PENDING Resources\n"
+ sqc.check()
+
+ // Next attempt should leave nice alone because it's already
+ // at the 10K limit
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+
+ // Back to normal if desired nice value falls below 10K
+ slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n"
+ sqc.check()
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}, {uuids[1], "9890"}})
+
+ sqc.Stop()
+}
+
// If the given UUID isn't in the slurm queue yet, SetPriority()
// should wait for it to appear on the very next poll, then give up.
func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list