[arvados] created: 2.7.0-6195-g8265e843a3
git repository hosting
git at public.arvados.org
Thu Mar 14 17:19:01 UTC 2024
at 8265e843a36b9fba46791728ca8170f3b12ce333 (commit)
commit 8265e843a36b9fba46791728ca8170f3b12ce333
Author: Tom Clegg <tom at curii.com>
Date: Wed Mar 13 17:22:39 2024 -0400
21449: Propagate max_run_time to LSF scheduler.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid b/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid
index fc4393d0b6..6aeb11040c 100644
--- a/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid
+++ b/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid
@@ -75,6 +75,7 @@ Template variables starting with % will be substituted as follows:
%M memory in MB
%T tmp in MB
%G number of GPU devices (@runtime_constraints.cuda.device_count@)
+%W maximum job run time in minutes, suitable for use with @-W@ or @-We@ flags (see MaxRunTimeOverhead MaxRunTimeDefault below)
Use %% to express a literal %. The %%J in the default will be changed to %J, which is interpreted by @bsub@ itself.
@@ -83,7 +84,7 @@ For example:
<notextile>
<pre> Containers:
LSF:
- <code class="userinput">BsubArgumentsList: <b>["-o", "/tmp/crunch-run.%%J.out", "-e", "/tmp/crunch-run.%%J.err", "-J", "%U", "-n", "%C", "-D", "%MMB", "-R", "rusage[mem=%MMB:tmp=%TMB] span[hosts=1]", "-R", "select[mem>=%MMB]", "-R", "select[tmp>=%TMB]", "-R", "select[ncpus>=%C]"]</b></code>
+ <code class="userinput">BsubArgumentsList: <b>["-o", "/tmp/crunch-run.%%J.out", "-e", "/tmp/crunch-run.%%J.err", "-J", "%U", "-n", "%C", "-D", "%MMB", "-R", "rusage[mem=%MMB:tmp=%TMB] span[hosts=1]", "-R", "select[mem>=%MMB]", "-R", "select[tmp>=%TMB]", "-R", "select[ncpus>=%C]", "-We", "%W"]</b></code>
</pre>
</notextile>
@@ -100,6 +101,14 @@ If the container requests access to GPUs (@runtime_constraints.cuda.device_count
</pre>
</notextile>
+h3(#MaxRunTimeOverhead). Containers.LSF.MaxRunTimeOverhead
+
+Extra time to add to each container's @scheduling_parameters.max_run_time@ value when substituting for @%W@ in @BsubArgumentsList@, to account for time spent setting up the container image, copying output files, etc.
+
+h3(#MaxRunTimeDefault). Containers.LSF.MaxRunTimeDefault
+
+Default @max_run_time@ value to use for containers that do not specify one in @scheduling_parameters.max_run_time at . If this is zero, and @BsubArgumentsList@ contains @"-W", "%W"@ or @"-We", "%W"@, those arguments will be dropped when submitting containers that do not specify @scheduling_parameters.max_run_time at .
+
h3(#PollInterval). Containers.PollInterval
arvados-dispatch-lsf polls the API server periodically for new containers to run. The @PollInterval@ option controls how often this poll happens. Set this to a string of numbers suffixed with one of the time units @s@, @m@, or @h at . For example:
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index fa74e8e6c6..a3ae4fd56b 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -1379,15 +1379,23 @@ Clusters:
# %M memory in MB
# %T tmp in MB
# %G number of GPU devices (runtime_constraints.cuda.device_count)
+ # %W maximum run time in minutes (see MaxRunTimeOverhead and
+ # MaxRunTimeDefault below)
#
- # Use %% to express a literal %. The %%J in the default will be changed
- # to %J, which is interpreted by bsub itself.
+ # Use %% to express a literal %. For example, the %%J in the
+ # default argument list will be changed to %J, which is
+ # interpreted by bsub itself.
#
# Note that the default arguments cause LSF to write two files
# in /tmp on the compute node each time an Arvados container
# runs. Ensure you have something in place to delete old files
# from /tmp, or adjust the "-o" and "-e" arguments accordingly.
- BsubArgumentsList: ["-o", "/tmp/crunch-run.%%J.out", "-e", "/tmp/crunch-run.%%J.err", "-J", "%U", "-n", "%C", "-D", "%MMB", "-R", "rusage[mem=%MMB:tmp=%TMB] span[hosts=1]", "-R", "select[mem>=%MMB]", "-R", "select[tmp>=%TMB]", "-R", "select[ncpus>=%C]"]
+ #
+ # If ["-We", "%W"] or ["-W", "%W"] appear in this argument
+ # list, and MaxRunTimeDefault is not set (see below), both of
+ # those arguments will be dropped from the argument list when
+ # running a container that has no max_run_time value.
+ BsubArgumentsList: ["-o", "/tmp/crunch-run.%%J.out", "-e", "/tmp/crunch-run.%%J.err", "-J", "%U", "-n", "%C", "-D", "%MMB", "-R", "rusage[mem=%MMB:tmp=%TMB] span[hosts=1]", "-R", "select[mem>=%MMB]", "-R", "select[tmp>=%TMB]", "-R", "select[ncpus>=%C]", "-We", "%W"]
# Arguments that will be appended to the bsub command line
# when submitting Arvados containers as LSF jobs with
@@ -1402,6 +1410,19 @@ Clusters:
# Arvados LSF dispatcher runs ("submission host").
BsubSudoUser: "crunch"
+ # When passing the scheduling_constraints.max_run_time value
+ # to LSF via "%W", add this much time to account for
+ # crunch-run startup/shutdown overhead.
+ MaxRunTimeOverhead: 5m
+
+ # If non-zero, MaxRunTimeDefault is used as the default value
+ # for max_run_time for containers that do not specify a time
+ # limit. MaxRunTimeOverhead will be added to this.
+ #
+ # Example:
+ # MaxRunTimeDefault: 2h
+ MaxRunTimeDefault: 0
+
JobsAPI:
# Enable the legacy 'jobs' API (crunch v1). This value must be a string.
#
diff --git a/lib/lsf/dispatch.go b/lib/lsf/dispatch.go
index d1408d23cb..897e5803f2 100644
--- a/lib/lsf/dispatch.go
+++ b/lib/lsf/dispatch.go
@@ -306,6 +306,15 @@ func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error)
container.RuntimeConstraints.KeepCacheRAM+
int64(disp.Cluster.Containers.ReserveExtraRAM)) / 1048576))
+ maxruntime := time.Duration(container.SchedulingParameters.MaxRunTime) * time.Second
+ if maxruntime == 0 {
+ maxruntime = disp.Cluster.Containers.LSF.MaxRunTimeDefault.Duration()
+ }
+ if maxruntime > 0 {
+ maxruntime += disp.Cluster.Containers.LSF.MaxRunTimeOverhead.Duration()
+ }
+ maxrunminutes := int64(math.Ceil(float64(maxruntime.Seconds()) / 60))
+
repl := map[string]string{
"%%": "%",
"%C": fmt.Sprintf("%d", vcpus),
@@ -313,6 +322,7 @@ func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error)
"%T": fmt.Sprintf("%d", tmp),
"%U": container.UUID,
"%G": fmt.Sprintf("%d", container.RuntimeConstraints.CUDA.DeviceCount),
+ "%W": fmt.Sprintf("%d", maxrunminutes),
}
re := regexp.MustCompile(`%.`)
@@ -321,7 +331,16 @@ func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error)
if container.RuntimeConstraints.CUDA.DeviceCount > 0 {
argumentTemplate = append(argumentTemplate, disp.Cluster.Containers.LSF.BsubCUDAArguments...)
}
- for _, a := range argumentTemplate {
+ for idx, a := range argumentTemplate {
+ if idx > 0 && (argumentTemplate[idx-1] == "-W" || argumentTemplate[idx-1] == "-We") && a == "%W" && maxrunminutes == 0 {
+ // LSF docs don't specify an argument to "-W"
+ // or "-We" that indicates "unknown", so
+ // instead we drop the "-W %W" part of the
+ // command line entirely when max runtime is
+ // unknown.
+ args = args[:len(args)-1]
+ continue
+ }
args = append(args, re.ReplaceAllStringFunc(a, func(s string) string {
subst := repl[s]
if len(subst) == 0 {
diff --git a/lib/lsf/dispatch_test.go b/lib/lsf/dispatch_test.go
index cd41071d2c..e1e0bcae31 100644
--- a/lib/lsf/dispatch_test.go
+++ b/lib/lsf/dispatch_test.go
@@ -34,6 +34,7 @@ type suite struct {
crTooBig arvados.ContainerRequest
crPending arvados.ContainerRequest
crCUDARequest arvados.ContainerRequest
+ crMaxRunTime arvados.ContainerRequest
}
func (s *suite) TearDownTest(c *check.C) {
@@ -116,6 +117,25 @@ func (s *suite) SetUpTest(c *check.C) {
})
c.Assert(err, check.IsNil)
+ err = arvados.NewClientFromEnv().RequestAndDecode(&s.crMaxRunTime, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
+ "container_request": map[string]interface{}{
+ "runtime_constraints": arvados.RuntimeConstraints{
+ RAM: 1000000,
+ VCPUs: 1,
+ },
+ "scheduling_parameters": arvados.SchedulingParameters{
+ MaxRunTime: 124,
+ },
+ "container_image": arvadostest.DockerImage112PDH,
+ "command": []string{"sleep", "123"},
+ "mounts": map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}},
+ "output_path": "/mnt/out",
+ "state": arvados.ContainerRequestStateCommitted,
+ "priority": 1,
+ "container_count_max": 1,
+ },
+ })
+ c.Assert(err, check.IsNil)
}
type lsfstub struct {
@@ -141,12 +161,7 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ...
}
switch prog {
case "bsub":
- defaultArgs := s.disp.Cluster.Containers.LSF.BsubArgumentsList
- if args[5] == s.crCUDARequest.ContainerUUID {
- c.Assert(len(args), check.Equals, len(defaultArgs)+len(s.disp.Cluster.Containers.LSF.BsubCUDAArguments))
- } else {
- c.Assert(len(args), check.Equals, len(defaultArgs))
- }
+ c.Assert(len(args) > 5, check.Equals, true)
// %%J must have been rewritten to %J
c.Check(args[1], check.Equals, "/tmp/crunch-run.%J.out")
args = args[4:]
@@ -204,6 +219,21 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ...
fakejobq[nextjobid] = args[1]
nextjobid++
mtx.Unlock()
+ case s.crMaxRunTime.ContainerUUID:
+ c.Check(args, check.DeepEquals, []string{
+ "-J", s.crMaxRunTime.ContainerUUID,
+ "-n", "1",
+ "-D", "257MB",
+ "-R", "rusage[mem=257MB:tmp=2304MB] span[hosts=1]",
+ "-R", "select[mem>=257MB]",
+ "-R", "select[tmp>=2304MB]",
+ "-R", "select[ncpus>=1]",
+ "-We", "8", // 124s + 5m overhead + roundup = 8m
+ })
+ mtx.Lock()
+ fakejobq[nextjobid] = args[1]
+ nextjobid++
+ mtx.Unlock()
default:
c.Errorf("unexpected uuid passed to bsub: args %q", args)
return exec.Command("false")
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 9b2cf155b0..698ee20d8c 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -545,9 +545,11 @@ type ContainersConfig struct {
}
}
LSF struct {
- BsubSudoUser string
- BsubArgumentsList []string
- BsubCUDAArguments []string
+ BsubSudoUser string
+ BsubArgumentsList []string
+ BsubCUDAArguments []string
+ MaxRunTimeOverhead Duration
+ MaxRunTimeDefault Duration
}
}
diff --git a/services/api/app/models/container_request.rb b/services/api/app/models/container_request.rb
index 0a8e33d05f..f5789f31f6 100644
--- a/services/api/app/models/container_request.rb
+++ b/services/api/app/models/container_request.rb
@@ -470,8 +470,9 @@ class ContainerRequest < ArvadosModel
def validate_scheduling_parameters
if self.state == Committed
- if scheduling_parameters.include? 'partitions' and
- (!scheduling_parameters['partitions'].is_a?(Array) ||
+ if scheduling_parameters.include?('partitions') and
+ !scheduling_parameters['partitions'].nil? and
+ (!scheduling_parameters['partitions'].is_a?(Array) ||
scheduling_parameters['partitions'].reject{|x| !x.is_a?(String)}.size !=
scheduling_parameters['partitions'].size)
errors.add :scheduling_parameters, "partitions must be an array of strings"
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list