[ARVADOS] updated: 2.1.0-1725-g9908d2599

Mon Dec 13 20:18:26 UTC 2021

Summary of changes:
 apps/workbench/config/initializers/lograge.rb      |  1 +
 ...ll-compute-node-singularity.html.textile.liquid | 17 +++++
 lib/crunchrun/crunchrun_test.go                    | 37 ++++++++++
 lib/crunchrun/docker.go                            |  6 +-
 lib/crunchrun/docker_test.go                       | 33 +++++++++
 lib/crunchrun/singularity.go                       | 18 +++--
 lib/crunchrun/singularity_test.go                  | 23 ++++++
 lib/service/cmd.go                                 |  3 +-
 services/api/config/initializers/lograge.rb        |  1 +
 .../crunch-dispatch-local/crunch-dispatch-local.go |  9 +--
 .../crunch-dispatch-slurm/crunch-dispatch-slurm.go |  2 +
 services/keep-web/main.go                          | 23 +++---
 services/keep-web/server.go                        |  5 +-
 services/keep-web/server_test.go                   |  5 +-
 services/keepproxy/keepproxy.go                    | 81 +++++++++++++---------
 15 files changed, 204 insertions(+), 60 deletions(-)

  discards  23cc75f39c49d20c784fc16cb1a590f023c93559 (commit)
  discards  0bf88a25aa9f1ce2f4fb9e49504646ebdcaeb633 (commit)
       via  9908d25991d607687c7691548a862d1fb73788d5 (commit)
       via  b05ec24843655e162c8c3207e1695debdca9725e (commit)
       via  d7c8ef4e435b88e9a45e5cd9fc2365fb82c9ab36 (commit)
       via  58ea9370fa7b38382dfa9eea4c42a616e0a699f3 (commit)
       via  87a18ef2c05487c4330e6fb6ce6c7934f6bea5a6 (commit)
       via  d3716fbfea120893e1a23915c5f9bcb7ca96c371 (commit)
       via  96d284a1f12ff0bdf9c376c937181b97105fba22 (commit)
       via  3993c04f1811a28399adc350511c4397e3d15321 (commit)
       via  89cab1faedd4c4209ac642ffd442b0085d9da593 (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (23cc75f39c49d20c784fc16cb1a590f023c93559)
            \
             N -- N -- N (9908d25991d607687c7691548a862d1fb73788d5)

When this happens we assume that you've already received alert emails for
all of the O revisions, and so here we report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 9908d25991d607687c7691548a862d1fb73788d5
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Mon Dec 13 13:47:28 2021 -0500

    18321: Fix config export
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/lib/config/export.go b/lib/config/export.go
index 4c4e341f5..a9abe0d55 100644
--- a/lib/config/export.go
+++ b/lib/config/export.go
@@ -143,6 +143,7 @@ var whitelist = map[string]bool{
 	"InstanceTypes":                                       true,
 	"InstanceTypes.*":                                     true,
 	"InstanceTypes.*.*":                                   true,
+	"InstanceTypes.*.*.*":                                 true,
 	"Login":                                               true,
 	"Login.Google":                                        true,
 	"Login.Google.AlternateEmailAddresses":                false,

commit b05ec24843655e162c8c3207e1695debdca9725e
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Mon Dec 13 11:55:45 2021 -0500

    18321: Incorporate CUDA request into picking a node type
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index a84dc5d31..c69cee75b 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -1303,6 +1303,11 @@ Clusters:
         AddedScratch: 0
         Price: 0.1
         Preemptible: false
+        # Include this section if the node type includes GPU (CUDA) support
+        CUDA:
+          DriverVersion: "11.0"
+          HardwareCapability: "9.0"
+          DeviceCount: 1
 
     StorageClasses:
 
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 567ac30a9..9294f7696 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -1309,6 +1309,11 @@ Clusters:
         AddedScratch: 0
         Price: 0.1
         Preemptible: false
+        # Include this section if the node type includes GPU (CUDA) support
+        CUDA:
+          DriverVersion: "11.0"
+          HardwareCapability: "9.0"
+          DeviceCount: 1
 
     StorageClasses:
 
diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index 1b10826cb..aa2cd7d56 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -83,6 +83,19 @@ func EstimateScratchSpace(ctr *arvados.Container) (needScratch int64) {
 	return
 }
 
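+// compareVersion returns true if vs1 >= vs2 when both are parsed as floating-point numbers; it returns false otherwise, including when either string fails to parse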
+func compareVersion(vs1 string, vs2 string) bool {
+	v1, err := strconv.ParseFloat(vs1, 64)
+	if err != nil {
+		return false
+	}
+	v2, err := strconv.ParseFloat(vs2, 64)
+	if err != nil {
+		return false
+	}
+	return v1 >= v2
+}
+
 // ChooseInstanceType returns the cheapest available
 // arvados.InstanceType big enough to run ctr.
 func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) {
@@ -103,14 +116,19 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
 	ok := false
 	for _, it := range cc.InstanceTypes {
 		switch {
-		case ok && it.Price > best.Price:
-		case int64(it.Scratch) < needScratch:
-		case int64(it.RAM) < needRAM:
-		case it.VCPUs < needVCPUs:
-		case it.Preemptible != ctr.SchedulingParameters.Preemptible:
-		case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
-			// Equal price, but worse specs
+		// reasons to reject a node
+		case ok && it.Price > best.Price: // already selected a node, and this one is more expensive
+		case int64(it.Scratch) < needScratch: // insufficient scratch
+		case int64(it.RAM) < needRAM: // insufficient RAM
+		case it.VCPUs < needVCPUs: // insufficient VCPUs
+		case it.Preemptible != ctr.SchedulingParameters.Preemptible: // wrong preemptible setting
+		case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): // same price, worse specs
+		case it.CUDA.DeviceCount < ctr.RuntimeConstraints.CUDADeviceCount: // insufficient CUDA devices
+		case it.CUDA.DeviceCount > 0 && !compareVersion(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDADriverVersion): // insufficient driver version
+		case it.CUDA.DeviceCount > 0 && !compareVersion(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDAHardwareCapability): // insufficient hardware capability
+			// Don't select this node
 		default:
+			// Didn't reject the node, so select it
 			// Lower price || (same price && better specs)
 			best = it
 			ok = true
diff --git a/lib/dispatchcloud/node_size_test.go b/lib/dispatchcloud/node_size_test.go
index abd292cba..cdcf4033f 100644
--- a/lib/dispatchcloud/node_size_test.go
+++ b/lib/dispatchcloud/node_size_test.go
@@ -147,3 +147,76 @@ func (*NodeSizeSuite) TestScratchForDockerImage(c *check.C) {
 	// Short manifest will return 0
 	c.Check(n, check.Equals, int64(0))
 }
+
+func (*NodeSizeSuite) TestChooseGPU(c *check.C) {
+	menu := map[string]arvados.InstanceType{
+		"costly":         {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly", CUDA: arvados.CUDAFeatures{DeviceCount: 2, HardwareCapability: "9.0", DriverVersion: "11.0"}},
+		"low_capability": {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_capability", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "8.0", DriverVersion: "11.0"}},
+		"best":           {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "11.0"}},
+		"low_driver":     {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_driver", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "10.0"}},
+		"small":          {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Name: "small"},
+	}
+	best, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
+		Mounts: map[string]arvados.Mount{
+			"/tmp": {Kind: "tmp", Capacity: 2 * int64(GiB)},
+		},
+		RuntimeConstraints: arvados.RuntimeConstraints{
+			VCPUs:                  2,
+			RAM:                    987654321,
+			KeepCacheRAM:           123456789,
+			CUDADeviceCount:        1,
+			CUDAHardwareCapability: "9.0",
+			CUDADriverVersion:      "11.0",
+		},
+	})
+	c.Check(err, check.IsNil)
+	c.Check(best.Name, check.Equals, "best")
+	c.Check(best.RAM >= 1234567890, check.Equals, true)
+	c.Check(best.VCPUs >= 2, check.Equals, true)
+	c.Check(best.CUDA.DeviceCount >= 1, check.Equals, true)
+	c.Check(best.CUDA.DriverVersion, check.Equals, "11.0")
+	c.Check(best.CUDA.HardwareCapability, check.Equals, "9.0")
+
+	best, err = ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
+		Mounts: map[string]arvados.Mount{
+			"/tmp": {Kind: "tmp", Capacity: 2 * int64(GiB)},
+		},
+		RuntimeConstraints: arvados.RuntimeConstraints{
+			VCPUs:                  2,
+			RAM:                    987654321,
+			KeepCacheRAM:           123456789,
+			CUDADeviceCount:        2,
+			CUDAHardwareCapability: "9.0",
+			CUDADriverVersion:      "11.0",
+		},
+	})
+	c.Check(err, check.IsNil)
+	c.Check(best.Name, check.Equals, "costly")
+	c.Check(best.RAM >= 1234567890, check.Equals, true)
+	c.Check(best.VCPUs >= 2, check.Equals, true)
+	c.Check(best.CUDA.DeviceCount >= 2, check.Equals, true)
+	c.Check(best.CUDA.DriverVersion, check.Equals, "11.0")
+	c.Check(best.CUDA.HardwareCapability, check.Equals, "9.0")
+
+	best, err = ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
+		Mounts: map[string]arvados.Mount{
+			"/tmp": {Kind: "tmp", Capacity: 2 * int64(GiB)},
+		},
+		RuntimeConstraints: arvados.RuntimeConstraints{
+			VCPUs:                  2,
+			RAM:                    987654321,
+			KeepCacheRAM:           123456789,
+			CUDADeviceCount:        1,
+			CUDAHardwareCapability: "8.0",
+			CUDADriverVersion:      "11.0",
+		},
+	})
+	c.Check(err, check.IsNil)
+	c.Check(best.Name, check.Equals, "low_capability")
+	c.Check(best.RAM >= 1234567890, check.Equals, true)
+	c.Check(best.VCPUs >= 2, check.Equals, true)
+	c.Check(best.CUDA.DeviceCount >= 1, check.Equals, true)
+	c.Check(best.CUDA.DriverVersion, check.Equals, "11.0")
+	c.Check(best.CUDA.HardwareCapability, check.Equals, "8.0")
+
+}
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 474ce33b0..3111d6140 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -410,6 +410,12 @@ type RemoteCluster struct {
 	ActivateUsers bool
 }
 
+type CUDAFeatures struct {
+	DriverVersion      string
+	HardwareCapability string
+	DeviceCount        int
+}
+
 type InstanceType struct {
 	Name            string
 	ProviderType    string
@@ -420,6 +426,7 @@ type InstanceType struct {
 	AddedScratch    ByteSize
 	Price           float64
 	Preemptible     bool
+	CUDA            CUDAFeatures
 }
 
 type ContainersConfig struct {

-----------------------------------------------------------------------


hooks/post-receive
--