[arvados] created: 2.5.0-52-gbe5771dd0
git repository hosting
git at public.arvados.org
Fri Jan 27 19:55:27 UTC 2023
at be5771dd0fe41d5108d8fd72e62223b37948cf2d (commit)
commit be5771dd0fe41d5108d8fd72e62223b37948cf2d
Author: Tom Clegg <tom at curii.com>
Date: Fri Jan 27 14:54:47 2023 -0500
18075: Add CloudVMs.MaxInstances config, retire MaxComputeVMs.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/doc/admin/upgrading.html.textile.liquid b/doc/admin/upgrading.html.textile.liquid
index 4da168293..324658d67 100644
--- a/doc/admin/upgrading.html.textile.liquid
+++ b/doc/admin/upgrading.html.textile.liquid
@@ -28,10 +28,14 @@ TODO: extract this information based on git commit messages and generate changel
<div class="releasenotes">
</notextile>
-h2(#main). development main (as of 2023-01-16)
+h2(#main). development main (as of 2023-01-27)
"previous: Upgrading to 2.5.0":#v2_5_0
+h3. Default limit for cloud VM instances
+
+There is a new configuration entry @Containers.CloudVMs.MaxInstances@ (default 64) that limits the number of VMs the cloud dispatcher will run at a time; set it to 0 for no limit. The @Containers.MaxComputeVMs@ entry has been retired. You may need to adjust the new limit to suit your anticipated workload.
+
h3. Slow migration on upgrade
This upgrade includes a database schema update (changing an integer column in each table from 32-bit to 64-bit) that may be slow on a large installation. Expect the arvados-api-server package upgrade to take longer than usual.
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 29a4a640b..26ada44d6 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -1005,13 +1005,6 @@ Clusters:
# with the cancelled container.
MaxRetryAttempts: 3
- # The maximum number of compute nodes that can be in use simultaneously
- # If this limit is reduced, any existing nodes with slot number >= new limit
- # will not be counted against the new limit. In other words, the new limit
- # won't be strictly enforced until those nodes with higher slot numbers
- # go down.
- MaxComputeVMs: 64
-
# Schedule all child containers on preemptible instances (e.g. AWS
# Spot Instances) even if not requested by the submitter.
#
@@ -1327,6 +1320,15 @@ Clusters:
# providers too, if desired.
MaxConcurrentInstanceCreateOps: 1
+ # The maximum number of instances to run at a time, or 0 for
+ # unlimited.
+ #
+ # If more instances than this are already running and busy
+ # when the dispatcher starts up, the running containers will
+ # be allowed to finish before the excess instances are shut
+ # down.
+ MaxInstances: 64
+
# Interval between cloud provider syncs/updates ("list all
# instances").
SyncInterval: 1m
diff --git a/lib/config/export.go b/lib/config/export.go
index bc7864486..f9699c6ed 100644
--- a/lib/config/export.go
+++ b/lib/config/export.go
@@ -131,7 +131,6 @@ var whitelist = map[string]bool{
"Containers.Logging": false,
"Containers.LogReuseDecisions": false,
"Containers.LSF": false,
- "Containers.MaxComputeVMs": false,
"Containers.MaxDispatchAttempts": false,
"Containers.MaxRetryAttempts": true,
"Containers.MinRetryPeriod": true,
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 66e0bfee9..e3a7b553c 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -111,6 +111,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
instanceTypes: cluster.InstanceTypes,
maxProbesPerSecond: cluster.Containers.CloudVMs.MaxProbesPerSecond,
maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
+ maxInstances: cluster.Containers.CloudVMs.MaxInstances,
probeInterval: duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
syncInterval: duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
timeoutIdle: duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
@@ -155,6 +156,7 @@ type Pool struct {
probeInterval time.Duration
maxProbesPerSecond int
maxConcurrentInstanceCreateOps int
+ maxInstances int
timeoutIdle time.Duration
timeoutBooting time.Duration
timeoutProbe time.Duration
@@ -369,7 +371,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
func (wp *Pool) AtQuota() bool {
wp.mtx.Lock()
defer wp.mtx.Unlock()
- return time.Now().Before(wp.atQuotaUntil)
+ return time.Now().Before(wp.atQuotaUntil) || (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating))
}
// SetIdleBehavior determines how the indicated instance will behave
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 76ed7cefb..677706c08 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -498,7 +498,6 @@ type ContainersConfig struct {
DefaultKeepCacheRAM ByteSize
DispatchPrivateKey string
LogReuseDecisions bool
- MaxComputeVMs int
MaxDispatchAttempts int
MaxRetryAttempts int
MinRetryPeriod Duration
@@ -562,6 +561,7 @@ type CloudVMsConfig struct {
MaxCloudOpsPerSecond int
MaxProbesPerSecond int
MaxConcurrentInstanceCreateOps int
+ MaxInstances int
PollInterval Duration
ProbeInterval Duration
SSHPort string
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list