[ARVADOS] created: 1.3.0-3040-gbaa1f2569
Git user
git at public.arvados.org
Mon Aug 31 18:42:42 UTC 2020
at baa1f256924655d67b704f35981e9839743fab99 (commit)
commit baa1f256924655d67b704f35981e9839743fab99
Author: Ward Vandewege <ward at curii.com>
Date: Mon Aug 31 14:42:14 2020 -0400
16739: a-d-c: add a concurrent node creation throttle option.
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/doc/install/crunch2-cloud/install-dispatch-cloud.html.textile.liquid b/doc/install/crunch2-cloud/install-dispatch-cloud.html.textile.liquid
index 684177847..c288970bf 100644
--- a/doc/install/crunch2-cloud/install-dispatch-cloud.html.textile.liquid
+++ b/doc/install/crunch2-cloud/install-dispatch-cloud.html.textile.liquid
@@ -100,6 +100,9 @@ Using managed disks:
CloudVMs:
ImageID: "zzzzz-compute-v1597349873"
Driver: azure
+ # (azure) managed disks: set MaxConcurrentNodeCreateOps to 20 to avoid timeouts, cf
+ # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image
+ MaxConcurrentNodeCreateOps: 20
DriverParameters:
# Credentials.
SubscriptionID: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 80294afaf..010459b0e 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -945,6 +945,12 @@ Clusters:
# unlimited).
MaxCloudOpsPerSecond: 0
+ # Maximum concurrent node creation operations (0 = unlimited). (azure)
+ # managed disks: set MaxConcurrentNodeCreateOps to 20 to avoid
+ # timeouts, cf
+ # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image
+ MaxConcurrentNodeCreateOps: 0
+
# Interval between cloud provider syncs/updates ("list all
# instances").
SyncInterval: 1m
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 57204cf36..866012e09 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -951,6 +951,12 @@ Clusters:
# unlimited).
MaxCloudOpsPerSecond: 0
+ # Maximum concurrent node creation operations (0 = unlimited). (azure)
+ # managed disks: set MaxConcurrentNodeCreateOps to 20 to avoid
+ # timeouts, cf
+ # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image
+ MaxConcurrentNodeCreateOps: 0
+
# Interval between cloud provider syncs/updates ("list all
# instances").
SyncInterval: 1m
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 12bc1cdd7..2d958d837 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -96,27 +96,28 @@ func duration(conf arvados.Duration, def time.Duration) time.Duration {
// cluster configuration.
func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *prometheus.Registry, instanceSetID cloud.InstanceSetID, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, installPublicKey ssh.PublicKey, cluster *arvados.Cluster) *Pool {
wp := &Pool{
- logger: logger,
- arvClient: arvClient,
- instanceSetID: instanceSetID,
- instanceSet: &throttledInstanceSet{InstanceSet: instanceSet},
- newExecutor: newExecutor,
- bootProbeCommand: cluster.Containers.CloudVMs.BootProbeCommand,
- runnerSource: cluster.Containers.CloudVMs.DeployRunnerBinary,
- imageID: cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
- instanceTypes: cluster.InstanceTypes,
- maxProbesPerSecond: cluster.Containers.CloudVMs.MaxProbesPerSecond,
- probeInterval: duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
- syncInterval: duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
- timeoutIdle: duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
- timeoutBooting: duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
- timeoutProbe: duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
- timeoutShutdown: duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
- timeoutTERM: duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
- timeoutSignal: duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
- installPublicKey: installPublicKey,
- tagKeyPrefix: cluster.Containers.CloudVMs.TagKeyPrefix,
- stop: make(chan bool),
+ logger: logger,
+ arvClient: arvClient,
+ instanceSetID: instanceSetID,
+ instanceSet: &throttledInstanceSet{InstanceSet: instanceSet},
+ newExecutor: newExecutor,
+ bootProbeCommand: cluster.Containers.CloudVMs.BootProbeCommand,
+ runnerSource: cluster.Containers.CloudVMs.DeployRunnerBinary,
+ imageID: cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
+ instanceTypes: cluster.InstanceTypes,
+ maxProbesPerSecond: cluster.Containers.CloudVMs.MaxProbesPerSecond,
+ maxConcurrentNodeCreateOps: cluster.Containers.CloudVMs.MaxConcurrentNodeCreateOps,
+ probeInterval: duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
+ syncInterval: duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
+ timeoutIdle: duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
+ timeoutBooting: duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
+ timeoutProbe: duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
+ timeoutShutdown: duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
+ timeoutTERM: duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
+ timeoutSignal: duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+ installPublicKey: installPublicKey,
+ tagKeyPrefix: cluster.Containers.CloudVMs.TagKeyPrefix,
+ stop: make(chan bool),
}
wp.registerMetrics(reg)
go func() {
@@ -132,26 +133,27 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
// zero Pool should not be used. Call NewPool to create a new Pool.
type Pool struct {
// configuration
- logger logrus.FieldLogger
- arvClient *arvados.Client
- instanceSetID cloud.InstanceSetID
- instanceSet *throttledInstanceSet
- newExecutor func(cloud.Instance) Executor
- bootProbeCommand string
- runnerSource string
- imageID cloud.ImageID
- instanceTypes map[string]arvados.InstanceType
- syncInterval time.Duration
- probeInterval time.Duration
- maxProbesPerSecond int
- timeoutIdle time.Duration
- timeoutBooting time.Duration
- timeoutProbe time.Duration
- timeoutShutdown time.Duration
- timeoutTERM time.Duration
- timeoutSignal time.Duration
- installPublicKey ssh.PublicKey
- tagKeyPrefix string
+ logger logrus.FieldLogger
+ arvClient *arvados.Client
+ instanceSetID cloud.InstanceSetID
+ instanceSet *throttledInstanceSet
+ newExecutor func(cloud.Instance) Executor
+ bootProbeCommand string
+ runnerSource string
+ imageID cloud.ImageID
+ instanceTypes map[string]arvados.InstanceType
+ syncInterval time.Duration
+ probeInterval time.Duration
+ maxProbesPerSecond int
+ maxConcurrentNodeCreateOps int
+ timeoutIdle time.Duration
+ timeoutBooting time.Duration
+ timeoutProbe time.Duration
+ timeoutShutdown time.Duration
+ timeoutTERM time.Duration
+ timeoutSignal time.Duration
+ installPublicKey ssh.PublicKey
+ tagKeyPrefix string
// private state
subscribers map[<-chan struct{}]chan<- struct{}
@@ -281,6 +283,13 @@ func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
return unalloc
}
+type RateLimitError struct{ Retry time.Time }
+
+func (e RateLimitError) Error() string {
+ return fmt.Sprintf("node creation request failed, hit maxConcurrentNodeCreateOps, wait until %s", e.Retry)
+}
+func (e RateLimitError) EarliestRetry() time.Time { return e.Retry }
+
// Create a new instance with the given type, and add it to the worker
// pool. The worker is added immediately; instance creation runs in
// the background.
@@ -301,6 +310,17 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
if time.Now().Before(wp.atQuotaUntil) || wp.throttleCreate.Error() != nil {
return false
}
+ // The maxConcurrentNodeCreateOps knob throttles the number of node create
+ // requests in flight. It was added to work around a limitation in Azure's
+ // managed disks, which support no more than 20 concurrent node creation
+ // requests from a single disk image (cf.
+ // https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image).
+ // The code assumes that node creation, from Azure's perspective, means the
+ // period until the instance appears in the "get all instances" list.
+ if wp.maxConcurrentNodeCreateOps > 0 && len(wp.creating) >= wp.maxConcurrentNodeCreateOps {
+ wp.instanceSet.throttleCreate.CheckRateLimitError(RateLimitError{Retry: time.Now().Add(5 * time.Second)}, wp.logger, "create instance", wp.notify)
+ return false
+ }
now := time.Now()
secret := randomHex(instanceSecretLength)
wp.creating[secret] = createCall{time: now, instanceType: it}
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
index 0c173c107..6b540eee7 100644
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -199,6 +199,44 @@ func (suite *PoolSuite) TestDrain(c *check.C) {
}
}
+func (suite *PoolSuite) TestNodeCreateThrottle(c *check.C) {
+ logger := ctxlog.TestLogger(c)
+ driver := test.StubDriver{HoldCloudOps: true}
+ instanceSet, err := driver.InstanceSet(nil, "test-instance-set-id", nil, logger)
+ c.Assert(err, check.IsNil)
+
+ type1 := test.InstanceType(1)
+ pool := &Pool{
+ logger: logger,
+ instanceSet: &throttledInstanceSet{InstanceSet: instanceSet},
+ maxConcurrentNodeCreateOps: 1,
+ instanceTypes: arvados.InstanceTypeMap{
+ type1.Name: type1,
+ },
+ }
+
+ c.Check(pool.Unallocated()[type1], check.Equals, 0)
+ res := pool.Create(type1)
+ c.Check(pool.Unallocated()[type1], check.Equals, 1)
+ c.Check(res, check.Equals, true)
+
+ res = pool.Create(type1)
+ c.Check(pool.Unallocated()[type1], check.Equals, 1)
+ c.Check(res, check.Equals, false)
+
+ pool.maxConcurrentNodeCreateOps = 2
+
+ res = pool.Create(type1)
+ c.Check(pool.Unallocated()[type1], check.Equals, 2)
+ c.Check(res, check.Equals, true)
+
+ pool.maxConcurrentNodeCreateOps = 0
+
+ res = pool.Create(type1)
+ c.Check(pool.Unallocated()[type1], check.Equals, 3)
+ c.Check(res, check.Equals, true)
+}
+
func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
logger := ctxlog.TestLogger(c)
driver := test.StubDriver{HoldCloudOps: true}
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 6e1549224..1e190a947 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -446,23 +446,24 @@ type ContainersConfig struct {
type CloudVMsConfig struct {
Enable bool
- BootProbeCommand string
- DeployRunnerBinary string
- ImageID string
- MaxCloudOpsPerSecond int
- MaxProbesPerSecond int
- PollInterval Duration
- ProbeInterval Duration
- SSHPort string
- SyncInterval Duration
- TimeoutBooting Duration
- TimeoutIdle Duration
- TimeoutProbe Duration
- TimeoutShutdown Duration
- TimeoutSignal Duration
- TimeoutTERM Duration
- ResourceTags map[string]string
- TagKeyPrefix string
+ BootProbeCommand string
+ DeployRunnerBinary string
+ ImageID string
+ MaxCloudOpsPerSecond int
+ MaxProbesPerSecond int
+ MaxConcurrentNodeCreateOps int
+ PollInterval Duration
+ ProbeInterval Duration
+ SSHPort string
+ SyncInterval Duration
+ TimeoutBooting Duration
+ TimeoutIdle Duration
+ TimeoutProbe Duration
+ TimeoutShutdown Duration
+ TimeoutSignal Duration
+ TimeoutTERM Duration
+ ResourceTags map[string]string
+ TagKeyPrefix string
Driver string
DriverParameters json.RawMessage
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list