[arvados] created: 2.5.0-52-gbe5771dd0

Fri Jan 27 19:55:27 UTC 2023

at  be5771dd0fe41d5108d8fd72e62223b37948cf2d (commit)


commit be5771dd0fe41d5108d8fd72e62223b37948cf2d
Author: Tom Clegg <tom at curii.com>
Date:   Fri Jan 27 14:54:47 2023 -0500

    18075: Add CloudVMs.MaxInstances config, retire MaxComputeVMs.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/doc/admin/upgrading.html.textile.liquid b/doc/admin/upgrading.html.textile.liquid
index 4da168293..324658d67 100644
--- a/doc/admin/upgrading.html.textile.liquid
+++ b/doc/admin/upgrading.html.textile.liquid
@@ -28,10 +28,14 @@ TODO: extract this information based on git commit messages and generate changel
 <div class="releasenotes">
 </notextile>
 
-h2(#main). development main (as of 2023-01-16)
+h2(#main). development main (as of 2023-01-27)
 
 "previous: Upgrading to 2.5.0":#v2_5_0
 
+h3. Default limit for cloud VM instances
+
+There is a new configuration entry @CloudVMs.MaxInstances@ (default 64) that limits the number of VMs the cloud dispatcher will run at a time. This may need to be adjusted to suit your anticipated workload.
+
 h3. Slow migration on upgrade
 
 This upgrade includes a database schema update (changing an integer column in each table from 32-bit to 64-bit) that may be slow on a large installation. Expect the arvados-api-server package upgrade to take longer than usual.
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 29a4a640b..26ada44d6 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -1005,13 +1005,6 @@ Clusters:
       # with the cancelled container.
       MaxRetryAttempts: 3
 
-      # The maximum number of compute nodes that can be in use simultaneously
-      # If this limit is reduced, any existing nodes with slot number >= new limit
-      # will not be counted against the new limit. In other words, the new limit
-      # won't be strictly enforced until those nodes with higher slot numbers
-      # go down.
-      MaxComputeVMs: 64
-
       # Schedule all child containers on preemptible instances (e.g. AWS
       # Spot Instances) even if not requested by the submitter.
       #
@@ -1327,6 +1320,15 @@ Clusters:
         # providers too, if desired.
         MaxConcurrentInstanceCreateOps: 1
 
+        # The maximum number of instances to run at a time, or 0 for
+        # unlimited.
+        #
+        # If more instances than this are already running and busy
+        # when the dispatcher starts up, the running containers will
+        # be allowed to finish before the excess instances are shut
+        # down.
+        MaxInstances: 64
+
         # Interval between cloud provider syncs/updates ("list all
         # instances").
         SyncInterval: 1m
diff --git a/lib/config/export.go b/lib/config/export.go
index bc7864486..f9699c6ed 100644
--- a/lib/config/export.go
+++ b/lib/config/export.go
@@ -131,7 +131,6 @@ var whitelist = map[string]bool{
 	"Containers.Logging":                       false,
 	"Containers.LogReuseDecisions":             false,
 	"Containers.LSF":                           false,
-	"Containers.MaxComputeVMs":                 false,
 	"Containers.MaxDispatchAttempts":           false,
 	"Containers.MaxRetryAttempts":              true,
 	"Containers.MinRetryPeriod":                true,
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 66e0bfee9..e3a7b553c 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -111,6 +111,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 		instanceTypes:                  cluster.InstanceTypes,
 		maxProbesPerSecond:             cluster.Containers.CloudVMs.MaxProbesPerSecond,
 		maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
+		maxInstances:                   cluster.Containers.CloudVMs.MaxInstances,
 		probeInterval:                  duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
 		syncInterval:                   duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
 		timeoutIdle:                    duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
@@ -155,6 +156,7 @@ type Pool struct {
 	probeInterval                  time.Duration
 	maxProbesPerSecond             int
 	maxConcurrentInstanceCreateOps int
+	maxInstances                   int
 	timeoutIdle                    time.Duration
 	timeoutBooting                 time.Duration
 	timeoutProbe                   time.Duration
@@ -369,7 +371,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
 func (wp *Pool) AtQuota() bool {
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
-	return time.Now().Before(wp.atQuotaUntil)
+	return time.Now().Before(wp.atQuotaUntil) || (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating))
 }
 
 // SetIdleBehavior determines how the indicated instance will behave
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 76ed7cefb..677706c08 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -498,7 +498,6 @@ type ContainersConfig struct {
 	DefaultKeepCacheRAM           ByteSize
 	DispatchPrivateKey            string
 	LogReuseDecisions             bool
-	MaxComputeVMs                 int
 	MaxDispatchAttempts           int
 	MaxRetryAttempts              int
 	MinRetryPeriod                Duration
@@ -562,6 +561,7 @@ type CloudVMsConfig struct {
 	MaxCloudOpsPerSecond           int
 	MaxProbesPerSecond             int
 	MaxConcurrentInstanceCreateOps int
+	MaxInstances                   int
 	PollInterval                   Duration
 	ProbeInterval                  Duration
 	SSHPort                        string

-----------------------------------------------------------------------


hooks/post-receive
--