[ARVADOS] updated: 1.1.4-617-g7f0f3ab4e

Git user git at public.curoverse.com
Tue Jul 17 17:07:08 EDT 2018


Summary of changes:
 apps/workbench/Gemfile.lock                        |   2 +-
 .../app/controllers/work_units_controller.rb       |  29 +++--
 apps/workbench/app/helpers/application_helper.rb   |  13 ++-
 apps/workbench/app/helpers/version_helper.rb       |   6 +
 .../views/application/_report_issue_popup.html.erb |   6 +-
 apps/workbench/config/application.default.yml      |   5 +
 apps/workbench/lib/app_version.rb                  |  15 +++
 doc/_config.yml                                    |  17 ++-
 doc/_includes/_install_compute_docker.liquid       |  27 +++++
 doc/api/methods/groups.html.textile.liquid         |   2 +-
 .../create-standard-objects.html.textile.liquid    |  84 --------------
 .../install-dispatch.html.textile.liquid           |  31 +++--
 .../install-slurm.html.textile.liquid              |   2 -
 doc/install/index.html.textile.liquid              |  16 +--
 .../install-arv-git-httpd.html.textile.liquid      |  45 ++++++++
 doc/install/install-components.html.textile.liquid |  28 +++++
 doc/install/install-composer.html.textile.liquid   |  59 ++++++++++
 .../install-keep-balance.html.textile.liquid       |   7 +-
 doc/install/install-keepproxy.html.textile.liquid  |  23 +++-
 doc/install/install-keepstore.html.textile.liquid  |  39 ++++++-
 ...nstall-manual-prerequisites.html.textile.liquid |   8 +-
 .../install-nodemanager.html.textile.liquid        |   2 +-
 doc/user/cwl/cwl-extensions.html.textile.liquid    |  23 ++++
 lib/cmd/cmd.go                                     |  16 +--
 lib/controller/handler.go                          |  23 ++++
 lib/controller/handler_test.go                     |   8 ++
 sdk/cwl/arvados_cwl/__init__.py                    |   4 +-
 sdk/cwl/arvados_cwl/arv-cwl-schema.yml             |  44 +++++++
 sdk/cwl/arvados_cwl/arvcontainer.py                |  14 ++-
 sdk/cwl/arvados_cwl/arvjob.py                      |  10 +-
 sdk/cwl/arvados_cwl/arvworkflow.py                 |  15 ++-
 sdk/cwl/arvados_cwl/context.py                     |   5 +
 sdk/cwl/arvados_cwl/pathmapper.py                  |  25 +++-
 sdk/cwl/arvados_cwl/runner.py                      |  20 +++-
 sdk/cwl/arvados_cwl/util.py                        |  31 +++++
 sdk/cwl/setup.py                                   |   8 +-
 .../collection_per_tool_packed.cwl                 | 126 ++++++++++++--------
 sdk/cwl/tests/makes_intermediates/echo.cwl         |  14 +++
 .../cwl/tests/makes_intermediates/hello1.txt       |   0
 .../tests/makes_intermediates/run_in_single.cwl    |  38 ++++++
 sdk/cwl/tests/makes_intermediates/subwf.cwl        |  15 +++
 sdk/cwl/tests/test_container.py                    |   7 +-
 sdk/cwl/tests/test_submit.py                       |  36 ++++++
 sdk/cwl/tests/test_util.py                         |  45 ++++++++
 sdk/cwl/tests/wf/expect_packed.cwl                 | 128 ++++++++++++++-------
 ...ubmit_wf.cwl => submit_wf_runner_resources.cwl} |   6 +
 sdk/python/arvados/__init__.py                     |   2 -
 sdk/python/arvados/api.py                          |   9 +-
 sdk/python/arvados/commands/keepdocker.py          |   2 +-
 sdk/python/arvados/safeapi.py                      |   6 +
 sdk/python/setup.py                                |   1 +
 sdk/python/tests/nginx.conf                        |  57 +++++++--
 sdk/python/tests/run_test_server.py                |  21 ++--
 services/api/Gemfile.lock                          |   2 +-
 .../controllers/arvados/v1/schema_controller.rb    |   2 +
 services/api/config/application.default.yml        |   5 +
 services/api/lib/app_version.rb                    |  15 +++
 .../arvados/v1/schema_controller_test.rb           |  14 ++-
 services/crunch-run/crunchrun.go                   | 114 +++++++++++++++---
 services/crunch-run/crunchrun_test.go              |  46 ++++++++
 services/crunch-run/logging_test.go                |   2 +-
 services/keep-web/cache.go                         |   2 +-
 services/keep-web/cadaver_test.go                  |   2 +-
 services/keep-web/handler.go                       |   4 +
 services/keep-web/handler_test.go                  |   2 +-
 services/keepproxy/keepproxy_test.go               |   2 +-
 .../arvnodeman/computenode/dispatch/slurm.py       |   2 +-
 .../arvnodeman/computenode/driver/__init__.py      |   8 +-
 .../arvnodeman/computenode/driver/azure.py         |   2 +-
 .../arvnodeman/computenode/driver/dummy.py         |   2 +-
 .../arvnodeman/computenode/driver/ec2.py           |   2 +-
 .../arvnodeman/computenode/driver/gce.py           |   3 +-
 services/nodemanager/arvnodeman/daemon.py          |   8 +-
 services/nodemanager/arvnodeman/jobqueue.py        |   2 +-
 services/nodemanager/arvnodeman/nodelist.py        |   2 +-
 services/nodemanager/setup.py                      |   4 +-
 services/nodemanager/tests/integration_test.py     |   2 +-
 .../tests/test_computenode_dispatch_slurm.py       |  14 +--
 services/nodemanager/tests/test_jobqueue.py        |  12 +-
 services/nodemanager/tests/test_nodelist.py        |   4 +-
 vendor/vendor.json                                 |  60 ++++++++++
 81 files changed, 1229 insertions(+), 335 deletions(-)
 delete mode 100644 doc/install/create-standard-objects.html.textile.liquid
 create mode 100644 doc/install/install-components.html.textile.liquid
 create mode 100644 doc/install/install-composer.html.textile.liquid
 create mode 100644 sdk/cwl/arvados_cwl/util.py
 create mode 100644 sdk/cwl/tests/makes_intermediates/echo.cwl
 copy apps/workbench/app/mailers/.gitkeep => sdk/cwl/tests/makes_intermediates/hello1.txt (100%)
 create mode 100644 sdk/cwl/tests/makes_intermediates/run_in_single.cwl
 create mode 100644 sdk/cwl/tests/makes_intermediates/subwf.cwl
 create mode 100644 sdk/cwl/tests/test_util.py
 copy sdk/cwl/tests/wf/{submit_wf.cwl => submit_wf_runner_resources.cwl} (81%)

  discards  24dd4edbe843a0638a4f02bd84ab980230af6e1b (commit)
  discards  bc2c281b6558f4d330f45e0174d9e44e3e9fd54d (commit)
  discards  8668f0edf5a0fba7c268343d240678bf5c0d98c0 (commit)
       via  7f0f3ab4e13dd808b7b761039c857d55c7843175 (commit)
       via  9a80d15b7cab21efe16ec2b543dfb566bea9def4 (commit)
       via  297c4aaf43858eff5022a1e72eb8e09660bde4b0 (commit)
       via  3c23a0a57bde2978402e5d46ca2b003becee6d58 (commit)
       via  42012c4746d6dd81eb3c72fddb2fdd36cff381e7 (commit)
       via  6fe8e52020d421797306e5c6536afbcee761510a (commit)
       via  83d08d7ccbc622ec97948929c83fb91f96743ca2 (commit)
       via  843b4e5cf7727528fb5b45629030fd0d6a364ab7 (commit)
       via  db5107dca09b786374f06a35abb51ffc3f032abd (commit)
       via  50cbdcbd67f8b0da06d3b188e7bfdea3963661a7 (commit)
       via  f9a05f61abdf33891b09d62205d009d1cae73d1b (commit)
       via  224ac505b1162837f2f84fe2735a959d71bd5ce9 (commit)
       via  f0d62ea064d32b980f723e37972788cbd693c2ff (commit)
       via  e7870334a9f2c44fcd8580d3e75074df216c647a (commit)
       via  f62cff30c9948815d3cf2b00294da01c146cb8f2 (commit)
       via  b5e8bde902d5b6d806c087fb30aea568da01a81c (commit)
       via  115d8b18837272ee53144586454a684541318b84 (commit)
       via  751cd3892cbcf7ecb65ec416dba52ee84a2fee2a (commit)
       via  d640a546f2cd55cfbd0a959bcfb84be1eecdd6a6 (commit)
       via  3712b5943d9d32346552cb91ec5c4690e14485b9 (commit)
       via  2ac43c8c20d917376b2b92c09c46fa5641d054f8 (commit)
       via  02d7f4e820b65d83a5e3709dc14b6d72b9f5ab49 (commit)
       via  8e693a9981f03d229ff2bff7dd6e5d06e0790c19 (commit)
       via  ac6a1a232a670d10ce12d19e6d2a7900d353943a (commit)
       via  e0784f339e794a64108f7c18e478d44e5751cc19 (commit)
       via  4369714821950366db98a54e4b62fdb5d09951a6 (commit)
       via  be0cdc7814a49fa093b86b698a9756971ba80fcf (commit)
       via  8ae6680ebae91af9b0aafd6c9cfe5e5fb97f8b12 (commit)
       via  5b0cb2c6292e8784b7a54b75b444e47a09c10a05 (commit)
       via  688319dca9cbbb27452a0062943bb38b6935254e (commit)
       via  eead78e2eb11528af8cb862dcb6f9a41737a8a14 (commit)
       via  cda50fb57f9b18cb769432876433b52f484692bd (commit)
       via  64937458c5bc2918fa679c97478b2931bd26cfc1 (commit)
       via  08540a58a67f2d575b1e15338c6d49b8b74e2c38 (commit)
       via  2c87b580a87a55010da626d352307343f75d6d3a (commit)
       via  e2f50c08c4fc6da7065be222c2620264b521897f (commit)
       via  6b495b26e49bb32eccaddffea36d91f34d3ba6f8 (commit)
       via  b478b8a0b4263b0e54060e3fecaab9b6ecdcb085 (commit)
       via  33c10053b22fd5065516eb7df4c58b55a70d490c (commit)
       via  c405f9adab08daa3d4edef9cddd5453a79446c7b (commit)
       via  6dd60d7c70a63ac884515387cd8c92bb97433e1b (commit)
       via  710399e0355ff8ed016a9d3c830730af2e228ad5 (commit)
       via  81fa59a1e7797da8d420a94216bd28031178ae51 (commit)
       via  f339946832e0bb7ad175acaf59733445e6915f7a (commit)
       via  e5a3fb0a69df6c27dc567949a2e64d0e7da65384 (commit)
       via  04c664d6150683e3258168eecc4440a635ef8b16 (commit)
       via  951c8a79bca7224fdf8c50463d0c9a60b43ce930 (commit)
       via  9b16a843a641159867ea390c4c2384b320183b15 (commit)
       via  77cfd1ee241b4a2d408bd3ec5ebc69f34c7496f5 (commit)
       via  d3e2fb85a39562e38ea67f84e9f065815450294a (commit)
       via  b2a50a8539be09b73c5d65719e8eae4b2a15273b (commit)
       via  bd55c808de1d4d25fea7cc047956c45ec9752fa5 (commit)
       via  a8d2a183ac7246bf368d40dba1af226a4968a9c9 (commit)
       via  69138b061aca4e0cbbb7dad080158205f6734e77 (commit)
       via  535856c28a12bb07dc986b980b0f4ccfdfd25640 (commit)
       via  335ee76030e85fa6ac4da79b598cf4c4a212443d (commit)
       via  cd6d6f1f15bc1452a09c16b1a3524b5b289c6100 (commit)
       via  c584cb69cdcfb377deed94745785330562a54ae3 (commit)
       via  3f278cea46034660149403d68e05f0f450330854 (commit)
       via  3a7481703542b0c95d5a23cb45b0d2e7c3ac4c9e (commit)
       via  66c644efeabea073428288378a39a8e988e5b69b (commit)
       via  9b6abcd0448567146b471ad02162d33fd4b1d5a8 (commit)
       via  5c4d9d38dcee73a7ffb6221c80f707c3924da64f (commit)
       via  ee4c01999aff1ebc1e2dc338a70d1d7b812c633c (commit)
       via  380e4da5aab5d24d0e90ea27880974c232538fbf (commit)
       via  4a78e8e91fdad38e567fef0cd43aa8cb6bd33580 (commit)
       via  02763280f54fd0c2a499285f8ce6afcbd8b9e082 (commit)
       via  988c59f51aa579ce8bf0eab1cc729e05a5ee5631 (commit)
       via  92ac046a8a7a0f5a3a88b8656bb171701af840a0 (commit)
       via  5d340bee3506b37f0d5f0b695dfa37661ad0fbb6 (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you force-push a change (git push --force) and
generate a repository containing something like this:

 * -- * -- B -- O -- O -- O (24dd4edbe843a0638a4f02bd84ab980230af6e1b)
            \
             N -- N -- N (7f0f3ab4e13dd808b7b761039c857d55c7843175)

When this happens, we assume that you have already received alert emails
for all of the O revisions, so we report here only the revisions on the N
branch, starting from the common base, B.

The revisions listed above that are new to this repository have not
appeared in any other notification email, so we list them in full
below.


commit 7f0f3ab4e13dd808b7b761039c857d55c7843175
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Jul 17 17:04:05 2018 -0400

    13399: Link to wiki in warning message.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 4fa004e42..fd4851eb0 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -115,7 +115,7 @@ func (sqc *SqueueChecker) reniceAll() {
 		}
 		err := sqc.Slurm.Renice(job.uuid, niceNew)
 		if err != nil && niceNew > slurm15NiceLimit && strings.Contains(err.Error(), "Invalid nice value") {
-			log.Printf("container %q clamping nice values at %d, priority order will not be correct", job.uuid, slurm15NiceLimit)
+			log.Printf("container %q clamping nice values at %d, priority order will not be correct -- see https://dev.arvados.org/projects/arvados/wiki/SLURM_integration#Limited-nice-values-SLURM-15", job.uuid, slurm15NiceLimit)
 			job.hitNiceLimit = true
 		}
 	}

commit 9a80d15b7cab21efe16ec2b543dfb566bea9def4
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Jul 17 15:32:45 2018 -0400

    13399: Recognize held jobs with priority between 1 and 20K.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 4067d27d5..4fa004e42 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -168,14 +168,17 @@ func (sqc *SqueueChecker) check() {
 		replacing.nice = n
 		newq[uuid] = replacing
 
-		if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
+		if state == "PENDING" && ((reason == "BadConstraints" && p <= 2*slurm15NiceLimit) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
 			// When using SLURM 14.x or 15.x, our queued
 			// jobs land in this state when "scontrol
 			// reconfigure" invalidates their feature
 			// constraints by clearing all node features.
 			// They stay in this state even after the
 			// features reappear, until we run "scontrol
-			// release {jobid}".
+			// release {jobid}". Priority is usually 0 in
+			// this state, but sometimes (due to a race
+			// with nice adjustments?) it's a small
+			// positive value.
 			//
 			// "scontrol release" is silent and successful
 			// regardless of whether the features have
@@ -186,7 +189,7 @@ func (sqc *SqueueChecker) check() {
 			// "launch failed requeued held" seems to be
 			// another manifestation of this problem,
 			// resolved the same way.
-			log.Printf("releasing held job %q", uuid)
+			log.Printf("releasing held job %q (priority=%d, state=%q, reason=%q)", uuid, p, state, reason)
 			sqc.Slurm.Release(uuid)
 		} else if p < 1<<20 && replacing.wantPriority > 0 {
 			log.Printf("warning: job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason)

commit 297c4aaf43858eff5022a1e72eb8e09660bde4b0
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Jul 3 17:03:11 2018 -0400

    13399: If slurm refuses to renice past 10K, stop trying.
    
    Fixes log spam.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 23a8a0ca0..719ec98d2 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -55,11 +55,12 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
 }
 
 type slurmFake struct {
-	didBatch   [][]string
-	didCancel  []string
-	didRelease []string
-	didRenice  [][]string
-	queue      string
+	didBatch      [][]string
+	didCancel     []string
+	didRelease    []string
+	didRenice     [][]string
+	queue         string
+	rejectNice10K bool
 	// If non-nil, run this func during the 2nd+ call to Cancel()
 	onCancel func()
 	// Error returned by Batch()
@@ -82,6 +83,9 @@ func (sf *slurmFake) Release(name string) error {
 
 func (sf *slurmFake) Renice(name string, nice int64) error {
 	sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
+	if sf.rejectNice10K && nice > 10000 {
+		return errors.New("scontrol: error: Invalid nice value, must be between -10000 and 10000")
+	}
 	return nil
 }
 
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 742943f19..4067d27d5 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -14,11 +14,14 @@ import (
 	"time"
 )
 
+const slurm15NiceLimit int64 = 10000
+
 type slurmJob struct {
 	uuid         string
 	wantPriority int64
 	priority     int64 // current slurm priority (incorporates nice value)
 	nice         int64 // current slurm nice value
+	hitNiceLimit bool
 }
 
 // Squeue implements asynchronous polling monitor of the SLURM queue using the
@@ -103,10 +106,18 @@ func (sqc *SqueueChecker) reniceAll() {
 	})
 	renice := wantNice(jobs, sqc.PrioritySpread)
 	for i, job := range jobs {
-		if renice[i] == job.nice {
+		niceNew := renice[i]
+		if job.hitNiceLimit && niceNew > slurm15NiceLimit {
+			niceNew = slurm15NiceLimit
+		}
+		if niceNew == job.nice {
 			continue
 		}
-		sqc.Slurm.Renice(job.uuid, renice[i])
+		err := sqc.Slurm.Renice(job.uuid, niceNew)
+		if err != nil && niceNew > slurm15NiceLimit && strings.Contains(err.Error(), "Invalid nice value") {
+			log.Printf("container %q clamping nice values at %d, priority order will not be correct", job.uuid, slurm15NiceLimit)
+			job.hitNiceLimit = true
+		}
 	}
 }
 
diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go
index c9329fdf9..ef036dabd 100644
--- a/services/crunch-dispatch-slurm/squeue_test.go
+++ b/services/crunch-dispatch-slurm/squeue_test.go
@@ -103,6 +103,50 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
 	}
 }
 
+// If a limited nice range prevents desired priority adjustments, give
+// up and clamp nice to 10K.
+func (s *SqueueSuite) TestReniceInvalidNiceValue(c *C) {
+	uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
+	slurm := &slurmFake{
+		queue:         uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 0 4294555222 PENDING Resources\n",
+		rejectNice10K: true,
+	}
+	sqc := &SqueueChecker{
+		Slurm:          slurm,
+		PrioritySpread: 1,
+		Period:         time.Hour,
+	}
+	sqc.startOnce.Do(sqc.start)
+	sqc.check()
+	sqc.SetPriority(uuids[0], 2)
+	sqc.SetPriority(uuids[1], 1)
+
+	// First attempt should renice to 555001, which will fail
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}})
+
+	// Next attempt should renice to 10K, which will succeed
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+	// ...so we'll change the squeue response to reflect the
+	// updated priority+nice, and make sure sqc sees that...
+	slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294545222 PENDING Resources\n"
+	sqc.check()
+
+	// Next attempt should leave nice alone because it's already
+	// at the 10K limit
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+
+	// Back to normal if desired nice value falls below 10K
+	slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n"
+	sqc.check()
+	sqc.reniceAll()
+	c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}, {uuids[1], "9890"}})
+
+	sqc.Stop()
+}
+
 // If the given UUID isn't in the slurm queue yet, SetPriority()
 // should wait for it to appear on the very next poll, then give up.
 func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list