[ARVADOS] updated: 2.1.0-1728-g6fe152024
Git user
git at public.arvados.org
Mon Dec 20 22:18:51 UTC 2021
Summary of changes:
doc/_includes/_container_runtime_constraints.liquid | 6 +++---
lib/config/load.go | 20 ++++++++++++++++++++
lib/crunchrun/crunchrun.go | 13 +------------
services/api/app/models/arvados_model.rb | 13 +++++++++++--
services/api/app/models/container.rb | 15 +++++++--------
services/api/app/models/container_request.rb | 4 ++--
6 files changed, 44 insertions(+), 27 deletions(-)
via 6fe152024269d838e31bc224adbd518c43cbfee5 (commit)
from 7f88afd565b76903ad4b27fb896ff0cd844dfb7f (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 6fe152024269d838e31bc224adbd518c43cbfee5
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Mon Dec 20 17:15:46 2021 -0500
18321: Check runtime constraints md5 with/without empty cuda section
* config & API check format of DriverVersion and HardwareCapability
* crunch-run only pays attention to CUDA.DeviceCount
* update docs
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/doc/_includes/_container_runtime_constraints.liquid b/doc/_includes/_container_runtime_constraints.liquid
index 6926b9d3d..3b8df32d4 100644
--- a/doc/_includes/_container_runtime_constraints.liquid
+++ b/doc/_includes/_container_runtime_constraints.liquid
@@ -19,6 +19,6 @@ table(table table-bordered table-condensed).
h3. CUDA GPU support
table(table table-bordered table-condensed).
-|device_count|int|Number of GPUs to request.|Required to request a GPU node.|
-|driver_version|string|Minimum CUDA driver version.|Optional.|
-|hardware_capability|string|Minimum CUDA hardware capability.|Optional.|
+|device_count|int|Number of GPUs to request.|Count greater than 0 enables CUDA GPU support.|
+|driver_version|string|Minimum CUDA driver version, in "X.Y" format.|Required when device_count > 0|
+|hardware_capability|string|Minimum CUDA hardware capability, in "X.Y" format.|Required when device_count > 0|
diff --git a/lib/config/load.go b/lib/config/load.go
index 956a47b1a..888cc828a 100644
--- a/lib/config/load.go
+++ b/lib/config/load.go
@@ -14,6 +14,7 @@ import (
"io/ioutil"
"os"
"regexp"
+ "strconv"
"strings"
"git.arvados.org/arvados.git/sdk/go/arvados"
@@ -299,6 +300,7 @@ func (ldr *Loader) Load() (*arvados.Config, error) {
ldr.checkEmptyKeepstores(cc),
ldr.checkUnlistedKeepstores(cc),
ldr.checkStorageClasses(cc),
+ ldr.checkCUDAVersions(cc),
// TODO: check non-empty Rendezvous on
// services other than Keepstore
} {
@@ -399,6 +401,24 @@ func (ldr *Loader) checkStorageClasses(cc arvados.Cluster) error {
return nil
}
+func (ldr *Loader) checkCUDAVersions(cc arvados.Cluster) error {
+ for _, it := range cc.InstanceTypes {
+ if it.CUDA.DeviceCount == 0 {
+ continue
+ }
+
+ _, err := strconv.ParseFloat(it.CUDA.DriverVersion, 64)
+ if err != nil {
+ return fmt.Errorf("InstanceType %q has invalid CUDA.DriverVersion %q, expected format X.Y (%v)", it.Name, it.CUDA.DriverVersion, err)
+ }
+ _, err = strconv.ParseFloat(it.CUDA.HardwareCapability, 64)
+ if err != nil {
+ return fmt.Errorf("InstanceType %q has invalid CUDA.HardwareCapability %q, expected format X.Y (%v)", it.Name, it.CUDA.HardwareCapability, err)
+ }
+ }
+ return nil
+}
+
func checkKeyConflict(label string, m map[string]string) error {
saw := map[string]bool{}
for k := range m {
diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go
index 52d9c4b0f..b237d9fa5 100644
--- a/lib/crunchrun/crunchrun.go
+++ b/lib/crunchrun/crunchrun.go
@@ -987,17 +987,6 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
runner.executorStdout = stdout
runner.executorStderr = stderr
- cudaDeviceCount := 0
- if runner.Container.RuntimeConstraints.CUDA.DriverVersion != "" ||
- runner.Container.RuntimeConstraints.CUDA.HardwareCapability != "" ||
- runner.Container.RuntimeConstraints.CUDA.DeviceCount != 0 {
- // if any of these are set, enable CUDA GPU support
- cudaDeviceCount = runner.Container.RuntimeConstraints.CUDA.DeviceCount
- if cudaDeviceCount == 0 {
- cudaDeviceCount = 1
- }
- }
-
return runner.executor.Create(containerSpec{
Image: imageID,
VCPUs: runner.Container.RuntimeConstraints.VCPUs,
@@ -1007,7 +996,7 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
BindMounts: bindmounts,
Command: runner.Container.Command,
EnableNetwork: enableNetwork,
- CUDADeviceCount: cudaDeviceCount,
+ CUDADeviceCount: runner.Container.RuntimeConstraints.CUDA.DeviceCount,
NetworkMode: runner.networkMode,
CgroupParent: runner.setCgroupParent,
Stdin: stdin,
diff --git a/services/api/app/models/arvados_model.rb b/services/api/app/models/arvados_model.rb
index 00934322d..374c6720f 100644
--- a/services/api/app/models/arvados_model.rb
+++ b/services/api/app/models/arvados_model.rb
@@ -701,7 +701,7 @@ class ArvadosModel < ApplicationRecord
false
end
- def self.where_serialized(colname, value, md5: false)
+ def self.where_serialized(colname, value, md5: false, multivalue: false)
colsql = colname.to_s
if md5
colsql = "md5(#{colsql})"
@@ -714,7 +714,16 @@ class ArvadosModel < ApplicationRecord
sql = "#{colsql} IN (?)"
sorted = deep_sort_hash(value)
end
- params = [sorted.to_yaml, SafeJSON.dump(sorted)]
+ params = []
+ if multivalue
+ sorted.each do |v|
+ params << v.to_yaml
+ params << SafeJSON.dump(v)
+ end
+ else
+ params << sorted.to_yaml
+ params << SafeJSON.dump(sorted)
+ end
if md5
params = params.map { |x| Digest::MD5.hexdigest(x) }
end
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 2443da455..0326b1298 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -296,15 +296,14 @@ class Container < ArvadosModel
'hardware_capability' => '',
}
end
-
- candidates_inc_cuda = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true)
- if candidates_inc_cuda.count == 0 and attrs[:runtime_constraints]['cuda']['device_count'] == 0
- # Fallback search on containers introduced before CUDA support,
- # exclude empty CUDA request from query
- candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints].except('cuda')), md5: true)
- else
- candidates = candidates_inc_cuda
+ resolved_runtime_constraints = [resolve_runtime_constraints(attrs[:runtime_constraints])]
+ if resolved_runtime_constraints[0]['cuda']['device_count'] == 0
+ # If no CUDA requested, extend search to include older container
+ # records that don't have a 'cuda' section in runtime_constraints
+ resolved_runtime_constraints << resolved_runtime_constraints[0].except('cuda')
end
+
+ candidates = candidates.where_serialized(:runtime_constraints, resolved_runtime_constraints, md5: true, multivalue: true)
log_reuse_info(candidates) { "after filtering on runtime_constraints #{attrs[:runtime_constraints].inspect}" }
log_reuse_info { "checking for state=Complete with readable output and log..." }
diff --git a/services/api/app/models/container_request.rb b/services/api/app/models/container_request.rb
index 00773fcb8..a3264e419 100644
--- a/services/api/app/models/container_request.rb
+++ b/services/api/app/models/container_request.rb
@@ -346,9 +346,9 @@ class ContainerRequest < ArvadosModel
end
['driver_version', 'hardware_capability'].each do |k|
v = runtime_constraints['cuda'][k]
- if !v.is_a?(String)
+ if !v.is_a?(String) || (runtime_constraints['cuda']['device_count'] > 0 && v.to_f == 0.0)
errors.add(:runtime_constraints,
- "[cuda.#{k}]=#{v.inspect} must be a string")
+ "[cuda.#{k}]=#{v.inspect} must be a string in format 'X.Y' version")
end
end
end
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list