[ARVADOS] created: 1.1.4-548-g36e0cd4ca

Git user git at public.curoverse.com
Tue Jul 3 17:04:45 EDT 2018


        at  36e0cd4cac9905d7626d13c9b1be8e4f46ad7926 (commit)


commit 36e0cd4cac9905d7626d13c9b1be8e4f46ad7926
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Jul 3 17:03:11 2018 -0400

    13399: If slurm refuses to renice past 10K, stop trying.
    
    Fixes log spam.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 23a8a0ca0..719ec98d2 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -55,11 +55,12 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
 }
 
 type slurmFake struct {
-	didBatch   [][]string
-	didCancel  []string
-	didRelease []string
-	didRenice  [][]string
-	queue      string
+	didBatch      [][]string
+	didCancel     []string
+	didRelease    []string
+	didRenice     [][]string
+	queue         string
+	rejectNice10K bool
 	// If non-nil, run this func during the 2nd+ call to Cancel()
 	onCancel func()
 	// Error returned by Batch()
@@ -82,6 +83,9 @@ func (sf *slurmFake) Release(name string) error {
 
 func (sf *slurmFake) Renice(name string, nice int64) error {
 	sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
+	if sf.rejectNice10K && nice > 10000 {
+		return errors.New("scontrol: error: Invalid nice value, must be between -10000 and 10000")
+	}
 	return nil
 }
 
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 742943f19..12936a7e2 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -19,6 +19,7 @@ type slurmJob struct {
 	wantPriority int64
 	priority     int64 // current slurm priority (incorporates nice value)
 	nice         int64 // current slurm nice value
+	hitNiceLimit bool
 }
 
 // Squeue implements asynchronous polling monitor of the SLURM queue using the
@@ -101,12 +102,21 @@ func (sqc *SqueueChecker) reniceAll() {
 			return jobs[i].uuid > jobs[j].uuid
 		}
 	})
+	const niceLimit int64 = 10000
 	renice := wantNice(jobs, sqc.PrioritySpread)
 	for i, job := range jobs {
-		if renice[i] == job.nice {
+		niceNew := renice[i]
+		if job.hitNiceLimit && niceNew > niceLimit {
+			niceNew = niceLimit
+		}
+		if niceNew == job.nice {
 			continue
 		}
-		sqc.Slurm.Renice(job.uuid, renice[i])
+		err := sqc.Slurm.Renice(job.uuid, niceNew)
+		if err != nil && niceNew > niceLimit && strings.Contains(err.Error(), "Invalid nice value") {
+			log.Printf("container %q clamping nice values at %d, priority order will not be correct", job.uuid, niceLimit)
+			job.hitNiceLimit = true
+		}
 	}
 }
 
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index c9329fdf9..ef036dabd 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -103,6 +103,50 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
 	}
 }
 
+// If a limited nice range prevents desired priority adjustments, give
+// up and clamp nice to 10K.
+func (s *SqueueSuite) TestReniceInvalidNiceValue(c *C) {
+	uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
+	slurm := &slurmFake{
+		queue:         uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 0 4294555222 PENDING Resources\n",
+		rejectNice10K: true,
+	}
+	sqc := &SqueueChecker{
+		Slurm:          slurm,
+		PrioritySpread: 1,
+		Period:         time.Hour,
+	}
+	sqc.startOnce.Do(sqc.start)
+	sqc.check()
+	sqc.SetPriority(uuids[0], 2)
+	sqc.SetPriority(uuids[1], 1)
+
+	// First attempt should renice to 555001, which will fail
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}})
+
+	// Next attempt should renice to 10K, which will succeed
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+	// ...so we'll change the squeue response to reflect the
+	// updated priority+nice, and make sure sqc sees that...
+	slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294545222 PENDING Resources\n"
+	sqc.check()
+
+	// Next attempt should leave nice alone because it's already
+	// at the 10K limit
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+
+	// Back to normal if desired nice value falls below 10K
+	slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n"
+	sqc.check()
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}, {uuids[1], "9890"}})
+
+	sqc.Stop()
+}
+
 // If the given UUID isn't in the slurm queue yet, SetPriority()
 // should wait for it to appear on the very next poll, then give up.
 func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list