[ARVADOS] created: 1.1.3-151-g92474d9

Git user git at public.curoverse.com
Wed Mar 7 18:12:44 EST 2018


        at  92474d902612ad28b3f9e5b5c21de08162d23ffb (commit)


commit 92474d902612ad28b3f9e5b5c21de08162d23ffb
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Mar 7 17:54:35 2018 -0500

    13078: Add test case for "release job held because BadConstraints".
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index ba257ea..c9329fd 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -14,6 +14,36 @@ var _ = Suite(&SqueueSuite{})
 
 type SqueueSuite struct{}
 
+func (s *SqueueSuite) TestReleasePending(c *C) {
+	uuids := []string{
+		"zzzzz-dz642-fake0fake0fake0",
+		"zzzzz-dz642-fake1fake1fake1",
+		"zzzzz-dz642-fake2fake2fake2",
+	}
+	slurm := &slurmFake{
+		queue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING BadConstraints\n",
+	}
+	sqc := &SqueueChecker{
+		Slurm:  slurm,
+		Period: time.Hour,
+	}
+	sqc.startOnce.Do(sqc.start)
+	defer sqc.Stop()
+
+	done := make(chan struct{})
+	go func() {
+		for _, u := range uuids {
+			sqc.SetPriority(u, 1)
+		}
+		close(done)
+	}()
+	callUntilReady(sqc.check, done)
+
+	slurm.didRelease = nil
+	sqc.check()
+	c.Check(slurm.didRelease, DeepEquals, []string{uuids[2]})
+}
+
 func (s *SqueueSuite) TestReniceAll(c *C) {
 	uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
 	for _, test := range []struct {
@@ -123,3 +153,16 @@ func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
 		}
 	}
 }
+
+func callUntilReady(fn func(), done <-chan struct{}) {
+	tick := time.NewTicker(time.Millisecond)
+	defer tick.Stop()
+	for {
+		select {
+		case <-done:
+			return
+		case <-tick.C:
+			fn()
+		}
+	}
+}
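
The callUntilReady helper added at the bottom of this diff is a small
polling primitive: it invokes fn roughly once per millisecond until the
done channel closes, letting the test drive sqc.check() repeatedly while
the SetPriority goroutine is still running. Here is a minimal standalone
sketch of the same pattern; main() and the calls counter are purely
illustrative, not part of the commit:

    package main

    import (
        "fmt"
        "time"
    )

    // callUntilReady invokes fn about once per millisecond until done closes.
    func callUntilReady(fn func(), done <-chan struct{}) {
        tick := time.NewTicker(time.Millisecond)
        defer tick.Stop()
        for {
            select {
            case <-done:
                return
            case <-tick.C:
                fn()
            }
        }
    }

    func main() {
        done := make(chan struct{})
        go func() {
            time.Sleep(5 * time.Millisecond)
            close(done)
        }()
        calls := 0
        callUntilReady(func() { calls++ }, done) // fn runs in this goroutine, so no race
        fmt.Printf("fn was called %d times before done closed\n", calls)
    }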

commit bab7938e562be6ce1305c15faaffb43aef67ff77
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Mar 7 10:09:20 2018 -0500

    13078: Update comment.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index d002a1d..20849c9 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -153,7 +153,7 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
     def _send_request(self):
         queuelist = []
         if self.slurm_queue:
-            # cpus, memory, temporary disk space, reason, job name
+            # cpus, memory, temporary disk space, reason, job name, feature constraints
             squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f"])
             for out in squeue_out.splitlines():
                 try:
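
For reference, the --format=%c|%m|%d|%r|%j|%f string asks squeue for six
pipe-separated fields per job: CPU count, memory, temporary disk space,
pending reason, job name, and feature constraints. A minimal Go sketch of
splitting one such line; the sample line and its field values are made up
for illustration (the node manager's real parser is the Python code
above):

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        // Hypothetical line from:
        //   squeue --state=PENDING --noheader --format=%c|%m|%d|%r|%j|%f
        line := "1|1024|0|Resources|zzzzz-dz642-fake0fake0fake0|instancetype"
        fields := strings.Split(line, "|")
        if len(fields) != 6 {
            fmt.Println("unparseable:", line)
            return
        }
        cpus, mem, disk := fields[0], fields[1], fields[2]
        reason, jobname, features := fields[3], fields[4], fields[5]
        fmt.Println(cpus, mem, disk, reason, jobname, features)
    }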

commit 55573affe3611054b88a17d3cc89c21b8bfa14e7
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Mar 7 09:43:28 2018 -0500

    13078: Reduce "feature kludge" poll interval.
    
    Whenever a new node comes online, "scontrol reconfigure" invalidates
    all jobs that could be scheduled onto that node *or any other node*,
    so it's worthwhile to recover from this quickly.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index ed33260..2ca4050 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -87,7 +87,7 @@ func SlurmNodeTypeFeatureKludge(cc *arvados.Cluster) {
 	}
 	for {
 		slurmKludge(features)
-		time.Sleep(time.Minute)
+		time.Sleep(2 * time.Second)
 	}
 }
 

commit e1f97dcf68d197525228781e2a861cc3e64a0231
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Mar 7 09:19:40 2018 -0500

    13078: Fix jobs stuck in "held" state in old SLURM versions.
    
    In SLURM 14 and 15, if a queued job has feature constraints which
    become invalid (e.g., when "scontrol reconfigure" clears all node
    features), the job is put on hold with priority=0, and it stays in
    this state even after the features reappear. "scontrol release"
    recovers a job from this state.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 6852fc4..4211026 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -55,10 +55,11 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
 }
 
 type slurmFake struct {
-	didBatch  [][]string
-	didCancel []string
-	didRenice [][]string
-	queue     string
+	didBatch   [][]string
+	didCancel  []string
+	didRelease []string
+	didRenice  [][]string
+	queue      string
 	// If non-nil, run this func during the 2nd+ call to Cancel()
 	onCancel func()
 	// Error returned by Batch()
@@ -74,6 +75,11 @@ func (sf *slurmFake) QueueCommand(args []string) *exec.Cmd {
 	return exec.Command("echo", sf.queue)
 }
 
+func (sf *slurmFake) Release(name string) error {
+	sf.didRelease = append(sf.didRelease, name)
+	return nil
+}
+
 func (sf *slurmFake) Renice(name string, nice int64) error {
 	sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
 	return nil
@@ -151,7 +157,7 @@ func (s *IntegrationSuite) integrationTest(c *C,
 }
 
 func (s *IntegrationSuite) TestNormal(c *C) {
-	s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100\n"}
+	s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
 	container := s.integrationTest(c,
 		nil,
 		func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
@@ -163,7 +169,7 @@ func (s *IntegrationSuite) TestNormal(c *C) {
 }
 
 func (s *IntegrationSuite) TestCancel(c *C) {
-	s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100\n"}
+	s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
 	readyToCancel := make(chan bool)
 	s.slurm.onCancel = func() { <-readyToCancel }
 	container := s.integrationTest(c,
diff --git a/services/crunch-dispatch-slurm/slurm.go b/services/crunch-dispatch-slurm/slurm.go
index 735e057..9e9f452 100644
--- a/services/crunch-dispatch-slurm/slurm.go
+++ b/services/crunch-dispatch-slurm/slurm.go
@@ -13,10 +13,11 @@ import (
 )
 
 type Slurm interface {
+	Batch(script io.Reader, args []string) error
 	Cancel(name string) error
-	Renice(name string, nice int64) error
 	QueueCommand(args []string) *exec.Cmd
-	Batch(script io.Reader, args []string) error
+	Release(name string) error
+	Renice(name string, nice int64) error
 }
 
 type slurmCLI struct{}
@@ -54,6 +55,10 @@ func (scli *slurmCLI) QueueCommand(args []string) *exec.Cmd {
 	return exec.Command("squeue", args...)
 }
 
+func (scli *slurmCLI) Release(name string) error {
+	return scli.run(nil, "scontrol", []string{"release", "Name=" + name})
+}
+
 func (scli *slurmCLI) Renice(name string, nice int64) error {
 	return scli.run(nil, "scontrol", []string{"update", "JobName=" + name, fmt.Sprintf("Nice=%d", nice)})
 }
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 8862d16..365b4e8 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -121,7 +121,7 @@ func (sqc *SqueueChecker) check() {
 	sqc.L.Lock()
 	defer sqc.L.Unlock()
 
-	cmd := sqc.Slurm.QueueCommand([]string{"--all", "--noheader", "--format=%j %y %Q"})
+	cmd := sqc.Slurm.QueueCommand([]string{"--all", "--noheader", "--format=%j %y %Q %T %r"})
 	stdout, stderr := &bytes.Buffer{}, &bytes.Buffer{}
 	cmd.Stdout, cmd.Stderr = stdout, stderr
 	if err := cmd.Run(); err != nil {
@@ -135,9 +135,9 @@ func (sqc *SqueueChecker) check() {
 		if line == "" {
 			continue
 		}
-		var uuid string
+		var uuid, state, reason string
 		var n, p int64
-		if _, err := fmt.Sscan(line, &uuid, &n, &p); err != nil {
+		if _, err := fmt.Sscan(line, &uuid, &n, &p, &state, &reason); err != nil {
 			log.Printf("warning: ignoring unparsed line in squeue output: %q", line)
 			continue
 		}
@@ -148,6 +148,23 @@ func (sqc *SqueueChecker) check() {
 		replacing.priority = p
 		replacing.nice = n
 		newq[uuid] = replacing
+
+		if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+			// When using SLURM 14.x or 15.x, our queued
+			// jobs land in this state when "scontrol
+			// reconfigure" invalidates their feature
+			// constraints by clearing all node features.
+			// They stay in this state even after the
+			// features reappear, until we run "scontrol
+			// release {jobid}".
+			//
+			// "scontrol release" is silent and successful
+			// regardless of whether the features have
+			// reappeared, so rather than second-guessing
+			// whether SLURM is ready, we just keep trying
+			// this until it works.
+			sqc.Slurm.Release(uuid)
+		}
 	}
 	sqc.queue = newq
 	sqc.Broadcast()
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index 11f7c48..ba257ea 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -24,31 +24,31 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
 	}{
 		{
 			spread: 1,
-			squeue: uuids[0] + " 10000 4294000000\n",
+			squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 1},
 			expect: [][]string{{uuids[0], "0"}},
 		},
 		{ // fake0 priority is too high
 			spread: 1,
-			squeue: uuids[0] + " 10000 4294000777\n" + uuids[1] + " 10000 4294000444\n",
+			squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 1, uuids[1]: 999},
 			expect: [][]string{{uuids[1], "0"}, {uuids[0], "334"}},
 		},
 		{ // specify spread
 			spread: 100,
-			squeue: uuids[0] + " 10000 4294000777\n" + uuids[1] + " 10000 4294000444\n",
+			squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 1, uuids[1]: 999},
 			expect: [][]string{{uuids[1], "0"}, {uuids[0], "433"}},
 		},
 		{ // ignore fake2 because SetPriority() not called
 			spread: 1,
-			squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 4294000222\n",
+			squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 4294000222 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 999, uuids[1]: 1},
 			expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
 		},
 		{ // ignore fake2 because slurm priority=0
 			spread: 1,
-			squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 0\n",
+			squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING Resources\n",
 			want:   map[string]int64{uuids[0]: 999, uuids[1]: 1, uuids[2]: 997},
 			expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
 		},
@@ -103,7 +103,7 @@ func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
 	for {
 		select {
 		case <-tick.C:
-			slurm.queue = uuidGood + " 0 12345\n"
+			slurm.queue = uuidGood + " 0 12345 PENDING Resources\n"
 			sqc.check()
 
 			// Avoid immediately selecting this case again
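
Taken together, this commit makes check() ask squeue for two extra
fields (%T state and %r reason), parse all five with fmt.Sscan, and call
Release on any job that is PENDING with reason BadConstraints and
priority 0 while the dispatcher still wants it to run. A self-contained
sketch of that decision; the sample queue line and the wantPriority map
are illustrative stand-ins for the dispatcher's internal state:

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        // Hypothetical output of: squeue --all --noheader --format=%j %y %Q %T %r
        out := "zzzzz-dz642-fake2fake2fake2 10000 0 PENDING BadConstraints\n"
        wantPriority := map[string]int64{"zzzzz-dz642-fake2fake2fake2": 1}
        for _, line := range strings.Split(out, "\n") {
            if line == "" {
                continue
            }
            var uuid, state, reason string
            var nice, prio int64
            if _, err := fmt.Sscan(line, &uuid, &nice, &prio, &state, &reason); err != nil {
                continue
            }
            if state == "PENDING" && reason == "BadConstraints" && prio == 0 && wantPriority[uuid] > 0 {
                // The real code calls sqc.Slurm.Release(uuid),
                // i.e. "scontrol release Name=<job name>".
                fmt.Println("release", uuid)
            }
        }
    }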

commit f7d0830ae819e2a62115642be449fa79f2fc8152
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Mar 7 09:17:56 2018 -0500

    13078: Fix ignoring queued jobs with reason=BadConstraints.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 0360bfc..d002a1d 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -163,7 +163,7 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
                     continue
                 if '-dz642-' not in jobname:
                     continue
-                if not re.search(r'ReqNodeNotAvail|Resources|Priority', reason):
+                if not re.search(r'BadConstraints|ReqNodeNotAvail|Resources|Priority', reason):
                     continue
 
                 for feature in features.split(','):
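
The node manager counts a pending job toward cloud-node demand only when
its squeue reason matches this allowlist, so before this commit a job
held with reason=BadConstraints contributed nothing to node demand. A
small Go sketch of the same filter; the reason strings are examples of
values squeue can report:

    package main

    import (
        "fmt"
        "regexp"
    )

    func main() {
        counted := regexp.MustCompile(`BadConstraints|ReqNodeNotAvail|Resources|Priority`)
        for _, reason := range []string{"Resources", "BadConstraints", "Dependency"} {
            fmt.Printf("%-15s counted: %v\n", reason, counted.MatchString(reason))
        }
    }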

commit 53dc92af621bb2064e922b82e88572f5180c503a
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Mar 6 17:14:24 2018 -0500

    13078: Ignore held jobs with priority=0 (e.g., SLURM 15.x).
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index df3c868..8862d16 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -79,6 +79,14 @@ func (sqc *SqueueChecker) reniceAll() {
 			// (perhaps it's not an Arvados job)
 			continue
 		}
+		if j.priority == 0 {
+			// SLURM <= 15.x implements "hold" by setting
+			// priority to 0. If we include held jobs
+			// here, we'll end up trying to push other
+			// jobs below them using negative priority,
+			// which won't help anything.
+			continue
+		}
 		jobs = append(jobs, j)
 	}
 
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index 694a4d6..11f7c48 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -46,6 +46,12 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
 			want:   map[string]int64{uuids[0]: 999, uuids[1]: 1},
 			expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
 		},
+		{ // ignore fake2 because slurm priority=0
+			spread: 1,
+			squeue: uuids[0] + " 10000 4294000000\n" + uuids[1] + " 10000 4294000111\n" + uuids[2] + " 10000 0\n",
+			want:   map[string]int64{uuids[0]: 999, uuids[1]: 1, uuids[2]: 997},
+			expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
+		},
 	} {
 		c.Logf("spread=%d squeue=%q want=%v -> expect=%v", test.spread, test.squeue, test.want, test.expect)
 		slurm := &slurmFake{

commit a67b550e12bcf12a8f67ef1472b2dce013747712
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Mar 6 17:11:35 2018 -0500

    13078: Remove debug log.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index c0b2e41..df3c868 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -90,7 +90,6 @@ func (sqc *SqueueChecker) reniceAll() {
 		if renice[i] == job.nice {
 			continue
 		}
-		log.Printf("updating slurm priority for %q: nice %d => %d", job.uuid, job.nice, renice[i])
 		sqc.Slurm.Renice(job.uuid, renice[i])
 	}
 }

-----------------------------------------------------------------------


hooks/post-receive
-- 