[ARVADOS] updated: 2.1.0-1727-g7f88afd56
Git user
git at public.arvados.org
Fri Dec 17 20:58:23 UTC 2021
Summary of changes:
lib/dispatchcloud/node_size.go | 15 ++++++-----
lib/dispatchcloud/node_size_test.go | 27 ++++++-------------
sdk/go/arvados/container.go | 8 +++---
services/api/app/models/arvados_model.rb | 5 ++++
services/api/app/models/container.rb | 17 +++++++++++-
services/api/app/models/container_request.rb | 16 +++++++++++
services/api/test/fixtures/container_requests.yml | 4 +++
services/api/test/fixtures/containers.yml | 30 +++++++++++++++++++++
services/api/test/unit/container_test.rb | 33 ++++++++++++++++++++---
9 files changed, 122 insertions(+), 33 deletions(-)
via 7f88afd565b76903ad4b27fb896ff0cd844dfb7f (commit)
from cb2d522176c17f2d388098b70fdbaa90fb30e682 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 7f88afd565b76903ad4b27fb896ff0cd844dfb7f
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Fri Dec 17 15:57:16 2021 -0500
18321: Account for CUDA in container reuse
Also ensure reuse across versions when CUDA isn't being used.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index 6831d8d8e..7c7643bfc 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -84,16 +84,16 @@ func EstimateScratchSpace(ctr *arvados.Container) (needScratch int64) {
}
// compareVersion returns true if vs1 < vs2, otherwise false
-func versionLess(vs1 string, vs2 string) bool {
+func versionLess(vs1 string, vs2 string) (bool, error) {
v1, err := strconv.ParseFloat(vs1, 64)
if err != nil {
- return false
+ return false, err
}
v2, err := strconv.ParseFloat(vs2, 64)
if err != nil {
- return false
+ return false, err
}
- return v1 < v2
+ return v1 < v2, nil
}
// ChooseInstanceType returns the cheapest available
@@ -115,6 +115,9 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
ok := false
for _, it := range cc.InstanceTypes {
+ driverInsuff, driverErr := versionLess(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDA.DriverVersion)
+ capabilityInsuff, capabilityErr := versionLess(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDA.HardwareCapability)
+
switch {
// reasons to reject a node
case ok && it.Price > best.Price: // already selected a node, and this one is more expensive
@@ -124,8 +127,8 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
case it.Preemptible != ctr.SchedulingParameters.Preemptible: // wrong preemptable setting
case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): // same price, worse specs
case it.CUDA.DeviceCount < ctr.RuntimeConstraints.CUDA.DeviceCount: // insufficient CUDA devices
- case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && versionLess(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDA.DriverVersion): // insufficient driver version
- case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && versionLess(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDA.HardwareCapability): // insufficient hardware capability
+ case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (driverInsuff || driverErr != nil): // insufficient driver version
+ case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (capabilityInsuff || capabilityErr != nil): // insufficient hardware capability
// Don't select this node
default:
// Didn't reject the node, so select it
diff --git a/lib/dispatchcloud/node_size_test.go b/lib/dispatchcloud/node_size_test.go
index 06255462b..eb3648e8a 100644
--- a/lib/dispatchcloud/node_size_test.go
+++ b/lib/dispatchcloud/node_size_test.go
@@ -155,6 +155,7 @@ func (*NodeSizeSuite) TestChooseGPU(c *check.C) {
"best": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "11.0"}},
"low_driver": {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_driver", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "10.0"}},
"cheap_gpu": {Price: 2.0, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "cheap_gpu", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "8.0", DriverVersion: "10.0"}},
+ "invalid_gpu": {Price: 1.9, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "invalid_gpu", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "12.0.12", DriverVersion: "12.0.12"}},
"non_gpu": {Price: 1.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "non_gpu"},
}
@@ -201,23 +202,7 @@ func (*NodeSizeSuite) TestChooseGPU(c *check.C) {
HardwareCapability: "",
DriverVersion: "10.0",
},
- SelectedInstance: "cheap_gpu",
- },
- GPUTestCase{
- CUDA: arvados.CUDARuntimeConstraints{
- DeviceCount: 1,
- HardwareCapability: "8.0",
- DriverVersion: "",
- },
- SelectedInstance: "cheap_gpu",
- },
- GPUTestCase{
- CUDA: arvados.CUDARuntimeConstraints{
- DeviceCount: 1,
- HardwareCapability: "",
- DriverVersion: "",
- },
- SelectedInstance: "cheap_gpu",
+ SelectedInstance: "",
},
GPUTestCase{
CUDA: arvados.CUDARuntimeConstraints{
@@ -241,7 +226,11 @@ func (*NodeSizeSuite) TestChooseGPU(c *check.C) {
CUDA: tc.CUDA,
},
})
- c.Check(err, check.IsNil)
- c.Check(best.Name, check.Equals, tc.SelectedInstance)
+ if best.Name != "" {
+ c.Check(err, check.IsNil)
+ c.Check(best.Name, check.Equals, tc.SelectedInstance)
+ } else {
+ c.Check(err, check.Not(check.IsNil))
+ }
}
}
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index d0b127333..3510a6db0 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -94,9 +94,9 @@ type Mount struct {
}
type CUDARuntimeConstraints struct {
- DriverVersion string `json:"driver_version,omitempty"`
- HardwareCapability string `json:"hardware_capability,omitempty"`
- DeviceCount int `json:"device_count,omitempty"`
+ DriverVersion string `json:"driver_version"`
+ HardwareCapability string `json:"hardware_capability"`
+ DeviceCount int `json:"device_count"`
}
// RuntimeConstraints specify a container's compute resources (RAM,
@@ -106,7 +106,7 @@ type RuntimeConstraints struct {
RAM int64 `json:"ram"`
VCPUs int `json:"vcpus"`
KeepCacheRAM int64 `json:"keep_cache_ram"`
- CUDA CUDARuntimeConstraints `json:"cuda,omitempty"`
+ CUDA CUDARuntimeConstraints `json:"cuda"`
}
// SchedulingParameters specify a container's scheduling parameters
diff --git a/services/api/app/models/arvados_model.rb b/services/api/app/models/arvados_model.rb
index af226cde8..00934322d 100644
--- a/services/api/app/models/arvados_model.rb
+++ b/services/api/app/models/arvados_model.rb
@@ -884,6 +884,11 @@ class ArvadosModel < ApplicationRecord
def fill_container_defaults
self.runtime_constraints = {
'API' => false,
+ 'cuda' => {
+ 'device_count' => 0,
+ 'driver_version' => '',
+ 'hardware_capability' => '',
+ },
'keep_cache_ram' => 0,
'ram' => 0,
'vcpus' => 0,
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index d6a44c802..2443da455 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -289,7 +289,22 @@ class Container < ArvadosModel
candidates = candidates.where('secret_mounts_md5 = ?', secret_mounts_md5)
log_reuse_info(candidates) { "after filtering on secret_mounts_md5 #{secret_mounts_md5.inspect}" }
- candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true)
+ if attrs[:runtime_constraints]['cuda'].nil?
+ attrs[:runtime_constraints]['cuda'] = {
+ 'device_count' => 0,
+ 'driver_version' => '',
+ 'hardware_capability' => '',
+ }
+ end
+
+ candidates_inc_cuda = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true)
+ if candidates_inc_cuda.count == 0 and attrs[:runtime_constraints]['cuda']['device_count'] == 0
+ # Fallback search on containers introduced before CUDA support,
+ # exclude empty CUDA request from query
+ candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints].except('cuda')), md5: true)
+ else
+ candidates = candidates_inc_cuda
+ end
log_reuse_info(candidates) { "after filtering on runtime_constraints #{attrs[:runtime_constraints].inspect}" }
log_reuse_info { "checking for state=Complete with readable output and log..." }
diff --git a/services/api/app/models/container_request.rb b/services/api/app/models/container_request.rb
index 4a580816c..00773fcb8 100644
--- a/services/api/app/models/container_request.rb
+++ b/services/api/app/models/container_request.rb
@@ -336,6 +336,22 @@ class ContainerRequest < ArvadosModel
"[#{k}]=#{v.inspect} must be a positive integer")
end
end
+ if runtime_constraints['cuda']
+ ['device_count'].each do |k|
+ v = runtime_constraints['cuda'][k]
+ if !v.is_a?(Integer) || v < 0
+ errors.add(:runtime_constraints,
+ "[cuda.#{k}]=#{v.inspect} must be a positive or zero integer")
+ end
+ end
+ ['driver_version', 'hardware_capability'].each do |k|
+ v = runtime_constraints['cuda'][k]
+ if !v.is_a?(String)
+ errors.add(:runtime_constraints,
+ "[cuda.#{k}]=#{v.inspect} must be a string")
+ end
+ end
+ end
end
end
diff --git a/services/api/test/fixtures/container_requests.yml b/services/api/test/fixtures/container_requests.yml
index 5be132ac3..dca89f56d 100644
--- a/services/api/test/fixtures/container_requests.yml
+++ b/services/api/test/fixtures/container_requests.yml
@@ -20,6 +20,10 @@ queued:
runtime_constraints:
vcpus: 1
ram: 123
+ cuda:
+ driver_version: ""
+ hardware_capability: ""
+ device_count: 0
mounts: {}
running:
diff --git a/services/api/test/fixtures/containers.yml b/services/api/test/fixtures/containers.yml
index e7cd0abd1..53539ce1e 100644
--- a/services/api/test/fixtures/containers.yml
+++ b/services/api/test/fixtures/containers.yml
@@ -16,6 +16,10 @@ queued:
runtime_constraints:
ram: 12000000000
vcpus: 4
+ cuda:
+ driver_version: ""
+ hardware_capability: ""
+ device_count: 0
mounts:
/tmp:
kind: tmp
@@ -449,3 +453,29 @@ runtime_token:
/var/spool/cwl:
kind: tmp
capacity: 24000000000
+
+cuda_container:
+ uuid: zzzzz-dz642-cudagpcontainer
+ owner_uuid: zzzzz-tpzed-000000000000000
+ state: Complete
+ exit_code: 0
+ priority: 1
+ created_at: 2016-01-11 11:11:11.111111111 Z
+ updated_at: 2016-01-11 11:11:11.111111111 Z
+ started_at: 2016-01-11 11:11:11.111111111 Z
+ finished_at: 2016-01-12 11:12:13.111111111 Z
+ container_image: test
+ cwd: test
+ log: ea10d51bcf88862dbcc36eb292017dfd+45
+ output: 1f4b0bc7583c2a7f9102c395f4ffc5e3+45
+ output_path: test
+ command: ["echo", "hello", "/bin/sh", "-c", "'cat' '/keep/fa7aeb5140e2848d39b416daeef4ffc5+45/foobar' '/keep/fa7aeb5140e2848d39b416daeef4ffc5+45/baz' '|' 'gzip' '>' '/dev/null'"]
+ runtime_constraints:
+ ram: 12000000000
+ vcpus: 4
+ cuda:
+ driver_version: "11.0"
+ hardware_capability: "9.0"
+ device_count: 1
+ secret_mounts: {}
+ secret_mounts_md5: 99914b932bd37a50b983c5e7c90ae93b
diff --git a/services/api/test/unit/container_test.rb b/services/api/test/unit/container_test.rb
index 375ab5a7b..ac3e6bea4 100644
--- a/services/api/test/unit/container_test.rb
+++ b/services/api/test/unit/container_test.rb
@@ -14,7 +14,7 @@ class ContainerTest < ActiveSupport::TestCase
container_image: 'fa3c1a9cb6783f85f2ecda037e07b8c3+167',
output_path: '/tmp',
priority: 1,
- runtime_constraints: {"vcpus" => 1, "ram" => 1},
+ runtime_constraints: {"vcpus" => 1, "ram" => 1, "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}},
}
REUSABLE_COMMON_ATTRS = {
@@ -26,7 +26,7 @@ class ContainerTest < ActiveSupport::TestCase
"API" => false,
"keep_cache_ram" => 0,
"ram" => 12000000000,
- "vcpus" => 4,
+ "vcpus" => 4
},
mounts: {
"test" => {"kind" => "json"},
@@ -229,7 +229,7 @@ class ContainerTest < ActiveSupport::TestCase
set_user_from_auth :active
env = {"C" => "3", "B" => "2", "A" => "1"}
m = {"F" => {"kind" => "3"}, "E" => {"kind" => "2"}, "D" => {"kind" => "1"}}
- rc = {"vcpus" => 1, "ram" => 1, "keep_cache_ram" => 1, "API" => true}
+ rc = {"vcpus" => 1, "ram" => 1, "keep_cache_ram" => 1, "API" => true, "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}}
c, _ = minimal_new(environment: env, mounts: m, runtime_constraints: rc)
c.reload
assert_equal Container.deep_sort_hash(env).to_json, c.environment.to_json
@@ -590,6 +590,33 @@ class ContainerTest < ActiveSupport::TestCase
assert_equal c1.uuid, reused.uuid
end
+ test "find_reusable method with cuda" do
+ set_user_from_auth :active
+ # No cuda
+ no_cuda_attrs = REUSABLE_COMMON_ATTRS.merge({use_existing:false, priority:1, environment:{"var" => "queued"},
+ runtime_constraints: {"vcpus" => 1, "ram" => 1, "keep_cache_ram"=>268435456, "API" => false,
+ "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}},})
+ c1, _ = minimal_new(no_cuda_attrs)
+ assert_equal Container::Queued, c1.state
+
+ # has cuda
+ cuda_attrs = REUSABLE_COMMON_ATTRS.merge({use_existing:false, priority:1, environment:{"var" => "queued"},
+ runtime_constraints: {"vcpus" => 1, "ram" => 1, "keep_cache_ram"=>268435456, "API" => false,
+ "cuda" => {"device_count":1, "driver_version": "11.0", "hardware_capability": "9.0"}},})
+ c2, _ = minimal_new(cuda_attrs)
+ assert_equal Container::Queued, c2.state
+
+ # should find the no cuda one
+ reused = Container.find_reusable(no_cuda_attrs)
+ assert_not_nil reused
+ assert_equal reused.uuid, c1.uuid
+
+ # should find the cuda one
+ reused = Container.find_reusable(cuda_attrs)
+ assert_not_nil reused
+ assert_equal reused.uuid, c2.uuid
+ end
+
test "Container running" do
set_user_from_auth :active
c, _ = minimal_new priority: 1
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list