[ARVADOS] updated: 2.1.0-1725-g9908d2599
Git user
git at public.arvados.org
Mon Dec 13 20:18:26 UTC 2021
Summary of changes:
apps/workbench/config/initializers/lograge.rb | 1 +
...ll-compute-node-singularity.html.textile.liquid | 17 +++++
lib/crunchrun/crunchrun_test.go | 37 ++++++++++
lib/crunchrun/docker.go | 6 +-
lib/crunchrun/docker_test.go | 33 +++++++++
lib/crunchrun/singularity.go | 18 +++--
lib/crunchrun/singularity_test.go | 23 ++++++
lib/service/cmd.go | 3 +-
services/api/config/initializers/lograge.rb | 1 +
.../crunch-dispatch-local/crunch-dispatch-local.go | 9 +--
.../crunch-dispatch-slurm/crunch-dispatch-slurm.go | 2 +
services/keep-web/main.go | 23 +++---
services/keep-web/server.go | 5 +-
services/keep-web/server_test.go | 5 +-
services/keepproxy/keepproxy.go | 81 +++++++++++++---------
15 files changed, 204 insertions(+), 60 deletions(-)
discards 23cc75f39c49d20c784fc16cb1a590f023c93559 (commit)
discards 0bf88a25aa9f1ce2f4fb9e49504646ebdcaeb633 (commit)
via 9908d25991d607687c7691548a862d1fb73788d5 (commit)
via b05ec24843655e162c8c3207e1695debdca9725e (commit)
via d7c8ef4e435b88e9a45e5cd9fc2365fb82c9ab36 (commit)
via 58ea9370fa7b38382dfa9eea4c42a616e0a699f3 (commit)
via 87a18ef2c05487c4330e6fb6ce6c7934f6bea5a6 (commit)
via d3716fbfea120893e1a23915c5f9bcb7ca96c371 (commit)
via 96d284a1f12ff0bdf9c376c937181b97105fba22 (commit)
via 3993c04f1811a28399adc350511c4397e3d15321 (commit)
via 89cab1faedd4c4209ac642ffd442b0085d9da593 (commit)
This update added new revisions after undoing existing revisions. That is
to say, the old revision is not a strict subset of the new revision. This
situation occurs when you --force push a change and generate a repository
containing something like this:
 * -- * -- B -- O -- O -- O   (23cc75f39c49d20c784fc16cb1a590f023c93559)
            \
             N -- N -- N   (9908d25991d607687c7691548a862d1fb73788d5)
When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 9908d25991d607687c7691548a862d1fb73788d5
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Mon Dec 13 13:47:28 2021 -0500
18321: Fix config export
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/lib/config/export.go b/lib/config/export.go
index 4c4e341f5..a9abe0d55 100644
--- a/lib/config/export.go
+++ b/lib/config/export.go
@@ -143,6 +143,7 @@ var whitelist = map[string]bool{
"InstanceTypes": true,
"InstanceTypes.*": true,
"InstanceTypes.*.*": true,
+ "InstanceTypes.*.*.*": true,
"Login": true,
"Login.Google": true,
"Login.Google.AlternateEmailAddresses": false,
commit b05ec24843655e162c8c3207e1695debdca9725e
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Mon Dec 13 11:55:45 2021 -0500
18321: Incorporate CUDA request into picking a node type
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index a84dc5d31..c69cee75b 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -1303,6 +1303,11 @@ Clusters:
AddedScratch: 0
Price: 0.1
Preemptible: false
+ # Include this section if the node type includes GPU (CUDA) support
+ CUDA:
+ DriverVersion: "11.0"
+ HardwareCapability: "9.0"
+ DeviceCount: 1
StorageClasses:
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 567ac30a9..9294f7696 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -1309,6 +1309,11 @@ Clusters:
AddedScratch: 0
Price: 0.1
Preemptible: false
+ # Include this section if the node type includes GPU (CUDA) support
+ CUDA:
+ DriverVersion: "11.0"
+ HardwareCapability: "9.0"
+ DeviceCount: 1
StorageClasses:
diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index 1b10826cb..aa2cd7d56 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -83,6 +83,19 @@ func EstimateScratchSpace(ctr *arvados.Container) (needScratch int64) {
return
}
+// compareVersion reports whether version vs1 is greater than or equal
+// to version vs2.
+//
+// Both arguments must be dot-separated sequences of unsigned decimal
+// integers such as "11.0" or "470.57.02". Components are compared
+// numerically from left to right, so "11.10" ranks above "11.2"
+// (parsing the whole string as a float would read "11.10" as 11.1 and
+// get this wrong). A missing trailing component counts as zero, so
+// "11" and "11.0" are equal.
+//
+// If either argument is empty or malformed the result is false, i.e.
+// an unparseable version never satisfies a requirement.
+func compareVersion(vs1 string, vs2 string) bool {
+	// parse splits s on '.' and converts every component. It
+	// validates the whole string so that a malformed tail is
+	// rejected even when an earlier component would already have
+	// decided the comparison.
+	parse := func(s string) ([]uint64, bool) {
+		var parts []uint64
+		for start := 0; start <= len(s); {
+			end := start
+			for end < len(s) && s[end] != '.' {
+				end++
+			}
+			// ParseUint rejects empty components and signs.
+			n, err := strconv.ParseUint(s[start:end], 10, 64)
+			if err != nil {
+				return nil, false
+			}
+			parts = append(parts, n)
+			start = end + 1
+		}
+		return parts, true
+	}
+
+	v1, ok := parse(vs1)
+	if !ok {
+		return false
+	}
+	v2, ok := parse(vs2)
+	if !ok {
+		return false
+	}
+	for i := 0; i < len(v1) || i < len(v2); i++ {
+		var c1, c2 uint64
+		if i < len(v1) {
+			c1 = v1[i]
+		}
+		if i < len(v2) {
+			c2 = v2[i]
+		}
+		if c1 != c2 {
+			return c1 > c2
+		}
+	}
+	return true
+}
+
// ChooseInstanceType returns the cheapest available
// arvados.InstanceType big enough to run ctr.
func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) {
@@ -103,14 +116,19 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
ok := false
for _, it := range cc.InstanceTypes {
switch {
- case ok && it.Price > best.Price:
- case int64(it.Scratch) < needScratch:
- case int64(it.RAM) < needRAM:
- case it.VCPUs < needVCPUs:
- case it.Preemptible != ctr.SchedulingParameters.Preemptible:
- case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
- // Equal price, but worse specs
+ // reasons to reject a node
+ case ok && it.Price > best.Price: // already selected a node, and this one is more expensive
+ case int64(it.Scratch) < needScratch: // insufficient scratch
+ case int64(it.RAM) < needRAM: // insufficient RAM
+ case it.VCPUs < needVCPUs: // insufficient VCPUs
+ case it.Preemptible != ctr.SchedulingParameters.Preemptible: // wrong preemptable setting
+ case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): // same price, worse specs
+ case it.CUDA.DeviceCount < ctr.RuntimeConstraints.CUDADeviceCount: // insufficient CUDA devices
+ case it.CUDA.DeviceCount > 0 && !compareVersion(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDADriverVersion): // insufficient driver version
+ case it.CUDA.DeviceCount > 0 && !compareVersion(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDAHardwareCapability): // insufficient hardware capability
+ // Don't select this node
default:
+ // Didn't reject the node, so select it
// Lower price || (same price && better specs)
best = it
ok = true
diff --git a/lib/dispatchcloud/node_size_test.go b/lib/dispatchcloud/node_size_test.go
index abd292cba..cdcf4033f 100644
--- a/lib/dispatchcloud/node_size_test.go
+++ b/lib/dispatchcloud/node_size_test.go
@@ -147,3 +147,76 @@ func (*NodeSizeSuite) TestScratchForDockerImage(c *check.C) {
// Short manifest will return 0
c.Check(n, check.Equals, int64(0))
}
+
+// TestChooseGPU exercises the CUDA checks in ChooseInstanceType.
+// Every trial submits the same VCPU/RAM/scratch request and varies
+// only the CUDA constraints; the menu contains cheaper instance
+// types that fall short on exactly one CUDA attribute, plus a
+// non-GPU type that is the cheapest overall.
+func (*NodeSizeSuite) TestChooseGPU(c *check.C) {
+	menu := map[string]arvados.InstanceType{
+		"costly":         {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly", CUDA: arvados.CUDAFeatures{DeviceCount: 2, HardwareCapability: "9.0", DriverVersion: "11.0"}},
+		"low_capability": {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_capability", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "8.0", DriverVersion: "11.0"}},
+		"best":           {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "11.0"}},
+		"low_driver":     {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_driver", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "10.0"}},
+		"small":          {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Name: "small"},
+	}
+	trials := []struct {
+		deviceCount        int
+		hardwareCapability string
+		driverVersion      string
+		expectName         string
+	}{
+		// One device at capability 9.0: the cheapest type meeting every CUDA constraint.
+		{deviceCount: 1, hardwareCapability: "9.0", driverVersion: "11.0", expectName: "best"},
+		// Two devices: only the expensive type has enough.
+		{deviceCount: 2, hardwareCapability: "9.0", driverVersion: "11.0", expectName: "costly"},
+		// Capability 8.0 is acceptable, so the cheaper type wins.
+		{deviceCount: 1, hardwareCapability: "8.0", driverVersion: "11.0", expectName: "low_capability"},
+	}
+	for _, trial := range trials {
+		request := &arvados.Container{
+			Mounts: map[string]arvados.Mount{
+				"/tmp": {Kind: "tmp", Capacity: 2 * int64(GiB)},
+			},
+			RuntimeConstraints: arvados.RuntimeConstraints{
+				VCPUs:                  2,
+				RAM:                    987654321,
+				KeepCacheRAM:           123456789,
+				CUDADeviceCount:        trial.deviceCount,
+				CUDAHardwareCapability: trial.hardwareCapability,
+				CUDADriverVersion:      trial.driverVersion,
+			},
+		}
+		chosen, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, request)
+		c.Check(err, check.IsNil)
+		c.Check(chosen.Name, check.Equals, trial.expectName)
+		c.Check(chosen.RAM >= 1234567890, check.Equals, true)
+		c.Check(chosen.VCPUs >= 2, check.Equals, true)
+		c.Check(chosen.CUDA.DeviceCount >= trial.deviceCount, check.Equals, true)
+		c.Check(chosen.CUDA.DriverVersion, check.Equals, trial.driverVersion)
+		c.Check(chosen.CUDA.HardwareCapability, check.Equals, trial.hardwareCapability)
+	}
+}
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 474ce33b0..3111d6140 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -410,6 +410,12 @@ type RemoteCluster struct {
ActivateUsers bool
}
+// CUDAFeatures describes the GPU (CUDA) support offered by an
+// instance type. It is configured through the optional CUDA section
+// of an InstanceTypes entry.
+type CUDAFeatures struct {
+ // DriverVersion is the CUDA driver version as a string, e.g.
+ // "11.0". lib/dispatchcloud's ChooseInstanceType compares it
+ // with the version requested by the container.
+ DriverVersion string
+ // HardwareCapability is the CUDA hardware capability as a
+ // string, e.g. "9.0"; compared the same way as DriverVersion.
+ HardwareCapability string
+ // DeviceCount is the number of CUDA devices on the instance.
+ // NOTE(review): zero presumably means "no GPU" -- confirm.
+ DeviceCount int
+}
+
type InstanceType struct {
Name string
ProviderType string
@@ -420,6 +426,7 @@ type InstanceType struct {
AddedScratch ByteSize
Price float64
Preemptible bool
+ CUDA CUDAFeatures
}
type ContainersConfig struct {
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list