[ARVADOS] updated: 1.1.3-148-g665d2c0

Git user git at public.curoverse.com
Wed Mar 7 09:37:01 EST 2018


Summary of changes:
 .../crunch-dispatch-slurm_test.go                  | 18 +++++++++++------
 services/crunch-dispatch-slurm/slurm.go            |  9 +++++++--
 services/crunch-dispatch-slurm/squeue.go           | 23 +++++++++++++++++++---
 services/crunch-dispatch-slurm/squeue_test.go      | 12 +++++------
 services/nodemanager/arvnodeman/jobqueue.py        |  2 +-
 5 files changed, 46 insertions(+), 18 deletions(-)

       via  665d2c06f2c45fe4d08bd6ba630259e53c03b805 (commit)
       via  f7d0830ae819e2a62115642be449fa79f2fc8152 (commit)
      from  53dc92af621bb2064e922b82e88572f5180c503a (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 665d2c06f2c45fe4d08bd6ba630259e53c03b805
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Mar 7 09:19:40 2018 -0500

    13078: Accommodate old SLURM versions.
    
    In SLURM 14 and 15, if a queued job has feature constraints which
    become invalid (e.g., when "scontrol reconfigure" clears all node
    features), the job is put on hold with priority=0, and it stays in
    this state even after the features reappear. "scontrol release"
    recovers a job from this state.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 6852fc4..a5de3bb 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -55,10 +55,11 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
 }
 
 type slurmFake struct {
-	didBatch  [][]string
-	didCancel []string
-	didRenice [][]string
-	queue     string
+	didBatch   [][]string
+	didCancel  []string
+	didRelease []int64
+	didRenice  [][]string
+	queue      string
 	// If non-nil, run this func during the 2nd+ call to Cancel()
 	onCancel func()
 	// Error returned by Batch()
@@ -74,6 +75,11 @@ func (sf *slurmFake) QueueCommand(args []string) *exec.Cmd {
 	return exec.Command("echo", sf.queue)
 }
 
+func (sf *slurmFake) Release(jobid int64) error {
+	sf.didRelease = append(sf.didRelease, jobid)
+	return nil
+}
+
 func (sf *slurmFake) Renice(name string, nice int64) error {
 	sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
 	return nil
@@ -151,7 +157,7 @@ func (s *IntegrationSuite) integrationTest(c *C,
 }
 
 func (s *IntegrationSuite) TestNormal(c *C) {
-	s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100\n"}
+	s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
 	container := s.integrationTest(c,
 		nil,
 		func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
@@ -163,7 +169,7 @@ func (s *IntegrationSuite) TestNormal(c *C) {
 }
 
 func (s *IntegrationSuite) TestCancel(c *C) {
-	s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100\n"}
+	s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
 	readyToCancel := make(chan bool)
 	s.slurm.onCancel = func() { <-readyToCancel }
 	container := s.integrationTest(c,
diff --git a/services/crunch-dispatch-slurm/slurm.go b/services/crunch-dispatch-slurm/slurm.go
index 735e057..2e65292 100644
--- a/services/crunch-dispatch-slurm/slurm.go
+++ b/services/crunch-dispatch-slurm/slurm.go
@@ -13,10 +13,11 @@ import (
 )
 
 type Slurm interface {
+	Batch(script io.Reader, args []string) error
 	Cancel(name string) error
-	Renice(name string, nice int64) error
 	QueueCommand(args []string) *exec.Cmd
-	Batch(script io.Reader, args []string) error
+	Release(jobid int64) error
+	Renice(name string, nice int64) error
 }
 
 type slurmCLI struct{}
@@ -54,6 +55,10 @@ func (scli *slurmCLI) QueueCommand(args []string) *exec.Cmd {
 	return exec.Command("squeue", args...)
 }
 
+func (scli *slurmCLI) Release(jobid int64) error {
+	return scli.run(nil, "scontrol", []string{"release", fmt.Sprintf("%d", jobid)})
+}
+
 func (scli *slurmCLI) Renice(name string, nice int64) error {
 	return scli.run(nil, "scontrol", []string{"update", "JobName=" + name, fmt.Sprintf("Nice=%d", nice)})
 }
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 8862d16..1cea626 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -121,7 +121,7 @@ func (sqc *SqueueChecker) check() {
 	sqc.L.Lock()
 	defer sqc.L.Unlock()
 
-	cmd := sqc.Slurm.QueueCommand([]string{"--all", "--noheader", "--format=%j %y %Q"})
+	cmd := sqc.Slurm.QueueCommand([]string{"--all", "--noheader", "--format=%j %y %Q %T %r"})
 	stdout, stderr := &bytes.Buffer{}, &bytes.Buffer{}
 	cmd.Stdout, cmd.Stderr = stdout, stderr
 	if err := cmd.Run(); err != nil {
@@ -135,9 +135,9 @@ func (sqc *SqueueChecker) check() {
 		if line == "" {
 			continue
 		}
-		var uuid string
+		var uuid, state, reason string
 		var n, p int64
-		if _, err := fmt.Sscan(line, &uuid, &n, &p); err != nil {
+		if _, err := fmt.Sscan(line, &uuid, &n, &p, &state, &reason); err != nil {
 			log.Printf("warning: ignoring unparsed line in squeue output: %q", line)
 			continue
 		}
@@ -148,6 +148,23 @@ func (sqc *SqueueChecker) check() {
 		replacing.priority = p
 		replacing.nice = n
 		newq[uuid] = replacing
+
+		if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+			// When using SLURM 14.x or 15.x, our queued
+			// jobs land in this state when "scontrol
+			// reconfigure" invalidates their feature
+			// constraints by clearing all node features.
+			// They stay in this state even after the
+			// features reappear, until we run "scontrol
+			// release {jobid}".
+			//
+			// "scontrol release" is silent and successful
+			// regardless of whether the features have
+			// reappeared, so rather than second-guessing
+			// whether SLURM is ready, we just keep trying
+			// this until it works.
+			sqc.Slurm.Release(n)
+		}
 	}
 	sqc.queue = newq
 	sqc.Broadcast()
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index 11f7c48..ba257ea 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -24,31 +24,31 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
 	}{
 		{
 			spread: 1,
-			squeue: uuids[0] + " 10000 4294000000\n",
+			squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 1},
 			expect: [][]string{{uuids[0], "0"}},
 		},
 		{ // fake0 priority is too high
 			spread: 1,
-			squeue: uuids[0] + " 10000 4294000777\n" + uuids[1] + " 10000 4294000444\n",
+			squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 1, uuids[1]: 999},
 			expect: [][]string{{uuids[1], "0"}, {uuids[0], "334"}},
 		},
 		{ // specify spread
 			spread: 100,
-			squeue: uuids[0] + " 10000 4294000777\n" + uuids[1] + " 10000 4294000444\n",
+			squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 1, uuids[1]: 999},
 			expect: [][]string{{uuids[1], "0"}, {uuids[0], "433"}},
 		},
 		{ // ignore fake2 because SetPriority() not called
 			spread: 1,
-			squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 4294000222\n",
+			squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 4294000222 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 999, uuids[1]: 1},
 			expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
 		},
 		{ // ignore fake2 because slurm priority=0
 			spread: 1,
-			squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 0\n",
+			squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 999, uuids[1]: 1, uuids[2]: 997},
 			expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
 		},
@@ -103,7 +103,7 @@ func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
 	for {
 		select {
 		case <-tick.C:
-			slurm.queue = uuidGood + " 0 12345\n"
+			slurm.queue = uuidGood + " 0 12345 PENDING Resources\n"
 			sqc.check()
 
 			// Avoid immediately selecting this case again

commit f7d0830ae819e2a62115642be449fa79f2fc8152
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Mar 7 09:17:56 2018 -0500

    13078: Fix ignoring queued jobs with reason=BadConstraints.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 0360bfc..d002a1d 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -163,7 +163,7 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
                     continue
                 if '-dz642-' not in jobname:
                     continue
-                if not re.search(r'ReqNodeNotAvail|Resources|Priority', reason):
+                if not re.search(r'BadConstraints|ReqNodeNotAvail|Resources|Priority', reason):
                     continue
 
                 for feature in features.split(','):

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list