[ARVADOS] updated: 1.1.4-550-g24dd4edbe

Git user git at public.curoverse.com
Tue Jul 17 17:06:29 EDT 2018


Summary of changes:
 services/crunch-dispatch-slurm/squeue.go | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

  discards  36e0cd4cac9905d7626d13c9b1be8e4f46ad7926 (commit)
       via  24dd4edbe843a0638a4f02bd84ab980230af6e1b (commit)
       via  bc2c281b6558f4d330f45e0174d9e44e3e9fd54d (commit)
       via  8668f0edf5a0fba7c268343d240678bf5c0d98c0 (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not an ancestor of the new revision.  This
situation occurs when you push a change with --force and produce a
repository history like this:

 * -- * -- B -- O -- O -- O (36e0cd4cac9905d7626d13c9b1be8e4f46ad7926)
            \
             N -- N -- N (24dd4edbe843a0638a4f02bd84ab980230af6e1b)

When this happens, we assume that you have already received alert emails
for all of the O revisions, so this email reports only the revisions on
the N branch, starting from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 24dd4edbe843a0638a4f02bd84ab980230af6e1b
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Jul 17 17:04:05 2018 -0400

    13399: Link to wiki in warning message.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 4fa004e42..fd4851eb0 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -115,7 +115,7 @@ func (sqc *SqueueChecker) reniceAll() {
 		}
 		err := sqc.Slurm.Renice(job.uuid, niceNew)
 		if err != nil && niceNew > slurm15NiceLimit && strings.Contains(err.Error(), "Invalid nice value") {
-			log.Printf("container %q clamping nice values at %d, priority order will not be correct", job.uuid, slurm15NiceLimit)
+			log.Printf("container %q clamping nice values at %d, priority order will not be correct -- see https://dev.arvados.org/projects/arvados/wiki/SLURM_integration#Limited-nice-values-SLURM-15", job.uuid, slurm15NiceLimit)
 			job.hitNiceLimit = true
 		}
 	}

commit bc2c281b6558f4d330f45e0174d9e44e3e9fd54d
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Jul 17 15:32:45 2018 -0400

    13399: Recognize held jobs with priority between 1 and 20K.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 4067d27d5..4fa004e42 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -168,14 +168,17 @@ func (sqc *SqueueChecker) check() {
 		replacing.nice = n
 		newq[uuid] = replacing
 
-		if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
+		if state == "PENDING" && ((reason == "BadConstraints" && p <= 2*slurm15NiceLimit) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
 			// When using SLURM 14.x or 15.x, our queued
 			// jobs land in this state when "scontrol
 			// reconfigure" invalidates their feature
 			// constraints by clearing all node features.
 			// They stay in this state even after the
 			// features reappear, until we run "scontrol
-			// release {jobid}".
+			// release {jobid}". Priority is usually 0 in
+			// this state, but sometimes (due to a race
+			// with nice adjustments?) it's a small
+			// positive value.
 			//
 			// "scontrol release" is silent and successful
 			// regardless of whether the features have
@@ -186,7 +189,7 @@ func (sqc *SqueueChecker) check() {
 			// "launch failed requeued held" seems to be
 			// another manifestation of this problem,
 			// resolved the same way.
-			log.Printf("releasing held job %q", uuid)
+			log.Printf("releasing held job %q (priority=%d, state=%q, reason=%q)", uuid, p, state, reason)
 			sqc.Slurm.Release(uuid)
 		} else if p < 1<<20 && replacing.wantPriority > 0 {
 			log.Printf("warning: job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason)

commit 8668f0edf5a0fba7c268343d240678bf5c0d98c0
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Jul 3 17:03:11 2018 -0400

    13399: If slurm refuses to renice past 10K, stop trying.
    
    Fixes log spam.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 23a8a0ca0..719ec98d2 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -55,11 +55,12 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
 }
 
 type slurmFake struct {
-	didBatch   [][]string
-	didCancel  []string
-	didRelease []string
-	didRenice  [][]string
-	queue      string
+	didBatch      [][]string
+	didCancel     []string
+	didRelease    []string
+	didRenice     [][]string
+	queue         string
+	rejectNice10K bool
 	// If non-nil, run this func during the 2nd+ call to Cancel()
 	onCancel func()
 	// Error returned by Batch()
@@ -82,6 +83,9 @@ func (sf *slurmFake) Release(name string) error {
 
 func (sf *slurmFake) Renice(name string, nice int64) error {
 	sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
+	if sf.rejectNice10K && nice > 10000 {
+		return errors.New("scontrol: error: Invalid nice value, must be between -10000 and 10000")
+	}
 	return nil
 }
 
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 742943f19..4067d27d5 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -14,11 +14,14 @@ import (
 	"time"
 )
 
+const slurm15NiceLimit int64 = 10000
+
 type slurmJob struct {
 	uuid         string
 	wantPriority int64
 	priority     int64 // current slurm priority (incorporates nice value)
 	nice         int64 // current slurm nice value
+	hitNiceLimit bool
 }
 
 // Squeue implements asynchronous polling monitor of the SLURM queue using the
@@ -103,10 +106,18 @@ func (sqc *SqueueChecker) reniceAll() {
 	})
 	renice := wantNice(jobs, sqc.PrioritySpread)
 	for i, job := range jobs {
-		if renice[i] == job.nice {
+		niceNew := renice[i]
+		if job.hitNiceLimit && niceNew > slurm15NiceLimit {
+			niceNew = slurm15NiceLimit
+		}
+		if niceNew == job.nice {
 			continue
 		}
-		sqc.Slurm.Renice(job.uuid, renice[i])
+		err := sqc.Slurm.Renice(job.uuid, niceNew)
+		if err != nil && niceNew > slurm15NiceLimit && strings.Contains(err.Error(), "Invalid nice value") {
+			log.Printf("container %q clamping nice values at %d, priority order will not be correct", job.uuid, slurm15NiceLimit)
+			job.hitNiceLimit = true
+		}
 	}
 }
 
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index c9329fdf9..ef036dabd 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -103,6 +103,50 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
 	}
 }
 
+// If a limited nice range prevents desired priority adjustments, give
+// up and clamp nice to 10K.
+func (s *SqueueSuite) TestReniceInvalidNiceValue(c *C) {
+	uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
+	slurm := &slurmFake{
+		queue:         uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 0 4294555222 PENDING Resources\n",
+		rejectNice10K: true,
+	}
+	sqc := &SqueueChecker{
+		Slurm:          slurm,
+		PrioritySpread: 1,
+		Period:         time.Hour,
+	}
+	sqc.startOnce.Do(sqc.start)
+	sqc.check()
+	sqc.SetPriority(uuids[0], 2)
+	sqc.SetPriority(uuids[1], 1)
+
+	// First attempt should renice to 555001, which will fail
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}})
+
+	// Next attempt should renice to 10K, which will succeed
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+	// ...so we'll change the squeue response to reflect the
+	// updated priority+nice, and make sure sqc sees that...
+	slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294545222 PENDING Resources\n"
+	sqc.check()
+
+	// Next attempt should leave nice alone because it's already
+	// at the 10K limit
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+
+	// Back to normal if desired nice value falls below 10K
+	slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n"
+	sqc.check()
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}, {uuids[1], "9890"}})
+
+	sqc.Stop()
+}
+
 // If the given UUID isn't in the slurm queue yet, SetPriority()
 // should wait for it to appear on the very next poll, then give up.
 func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list