[ARVADOS] updated: 1.3.0-3042-ga5fef23f2
Git user
git at public.arvados.org
Tue Sep 1 13:46:52 UTC 2020
Summary of changes:
lib/config/config.default.yml | 10 ++--
lib/config/generated_config.go | 10 ++--
lib/dispatchcloud/worker/pool.go | 100 ++++++++++++++++------------------
lib/dispatchcloud/worker/pool_test.go | 12 ++--
lib/dispatchcloud/worker/throttle.go | 6 ++
sdk/go/arvados/config.go | 36 ++++++------
6 files changed, 88 insertions(+), 86 deletions(-)
via a5fef23f2863cd0183ff596f4579110e2ddb3b3d (commit)
from da85d6516630d06ae3c34b4a52dc5ddff9fd5ace (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit a5fef23f2863cd0183ff596f4579110e2ddb3b3d
Author: Ward Vandewege <ward at curii.com>
Date: Tue Sep 1 09:43:57 2020 -0400
16739: implement review feedback.
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 010459b0e..57baf5342 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -945,11 +945,11 @@ Clusters:
# unlimited).
MaxCloudOpsPerSecond: 0
- # Maximum concurrent node creation operations (0 = unlimited). (azure)
- # managed disks: set MaxConcurrentNodeCreateOps to 20 to avoid
- # timeouts, cf
- # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image
- MaxConcurrentNodeCreateOps: 0
+ # Maximum concurrent node creation operations (0 = unlimited). This is
+ # recommended by Azure in certain scenarios (see
+ # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image)
+ # and can be used with other cloud providers too, if desired.
+ MaxConcurrentInstanceCreateOps: 0
# Interval between cloud provider syncs/updates ("list all
# instances").
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 866012e09..3df644cf6 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -951,11 +951,11 @@ Clusters:
# unlimited).
MaxCloudOpsPerSecond: 0
- # Maximum concurrent node creation operations (0 = unlimited). (azure)
- # managed disks: set MaxConcurrentNodeCreateOps to 20 to avoid
- # timeouts, cf
- # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image
- MaxConcurrentNodeCreateOps: 0
+ # Maximum concurrent node creation operations (0 = unlimited). This is
+ # recommended by Azure in certain scenarios (see
+ # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image)
+ # and can be used with other cloud providers too, if desired.
+ MaxConcurrentInstanceCreateOps: 0
# Interval between cloud provider syncs/updates ("list all
# instances").
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index ec6a049e6..435b6e43a 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -96,28 +96,28 @@ func duration(conf arvados.Duration, def time.Duration) time.Duration {
// cluster configuration.
func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *prometheus.Registry, instanceSetID cloud.InstanceSetID, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, installPublicKey ssh.PublicKey, cluster *arvados.Cluster) *Pool {
wp := &Pool{
- logger: logger,
- arvClient: arvClient,
- instanceSetID: instanceSetID,
- instanceSet: &throttledInstanceSet{InstanceSet: instanceSet},
- newExecutor: newExecutor,
- bootProbeCommand: cluster.Containers.CloudVMs.BootProbeCommand,
- runnerSource: cluster.Containers.CloudVMs.DeployRunnerBinary,
- imageID: cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
- instanceTypes: cluster.InstanceTypes,
- maxProbesPerSecond: cluster.Containers.CloudVMs.MaxProbesPerSecond,
- maxConcurrentNodeCreateOps: cluster.Containers.CloudVMs.MaxConcurrentNodeCreateOps,
- probeInterval: duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
- syncInterval: duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
- timeoutIdle: duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
- timeoutBooting: duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
- timeoutProbe: duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
- timeoutShutdown: duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
- timeoutTERM: duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
- timeoutSignal: duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
- installPublicKey: installPublicKey,
- tagKeyPrefix: cluster.Containers.CloudVMs.TagKeyPrefix,
- stop: make(chan bool),
+ logger: logger,
+ arvClient: arvClient,
+ instanceSetID: instanceSetID,
+ instanceSet: &throttledInstanceSet{InstanceSet: instanceSet},
+ newExecutor: newExecutor,
+ bootProbeCommand: cluster.Containers.CloudVMs.BootProbeCommand,
+ runnerSource: cluster.Containers.CloudVMs.DeployRunnerBinary,
+ imageID: cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
+ instanceTypes: cluster.InstanceTypes,
+ maxProbesPerSecond: cluster.Containers.CloudVMs.MaxProbesPerSecond,
+ maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
+ probeInterval: duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
+ syncInterval: duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
+ timeoutIdle: duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
+ timeoutBooting: duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
+ timeoutProbe: duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
+ timeoutShutdown: duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
+ timeoutTERM: duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
+ timeoutSignal: duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+ installPublicKey: installPublicKey,
+ tagKeyPrefix: cluster.Containers.CloudVMs.TagKeyPrefix,
+ stop: make(chan bool),
}
wp.registerMetrics(reg)
go func() {
@@ -133,27 +133,27 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
// zero Pool should not be used. Call NewPool to create a new Pool.
type Pool struct {
// configuration
- logger logrus.FieldLogger
- arvClient *arvados.Client
- instanceSetID cloud.InstanceSetID
- instanceSet *throttledInstanceSet
- newExecutor func(cloud.Instance) Executor
- bootProbeCommand string
- runnerSource string
- imageID cloud.ImageID
- instanceTypes map[string]arvados.InstanceType
- syncInterval time.Duration
- probeInterval time.Duration
- maxProbesPerSecond int
- maxConcurrentNodeCreateOps int
- timeoutIdle time.Duration
- timeoutBooting time.Duration
- timeoutProbe time.Duration
- timeoutShutdown time.Duration
- timeoutTERM time.Duration
- timeoutSignal time.Duration
- installPublicKey ssh.PublicKey
- tagKeyPrefix string
+ logger logrus.FieldLogger
+ arvClient *arvados.Client
+ instanceSetID cloud.InstanceSetID
+ instanceSet *throttledInstanceSet
+ newExecutor func(cloud.Instance) Executor
+ bootProbeCommand string
+ runnerSource string
+ imageID cloud.ImageID
+ instanceTypes map[string]arvados.InstanceType
+ syncInterval time.Duration
+ probeInterval time.Duration
+ maxProbesPerSecond int
+ maxConcurrentInstanceCreateOps int
+ timeoutIdle time.Duration
+ timeoutBooting time.Duration
+ timeoutProbe time.Duration
+ timeoutShutdown time.Duration
+ timeoutTERM time.Duration
+ timeoutSignal time.Duration
+ installPublicKey ssh.PublicKey
+ tagKeyPrefix string
// private state
subscribers map[<-chan struct{}]chan<- struct{}
@@ -280,13 +280,6 @@ func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
return unalloc
}
-type RateLimitError struct{ Retry time.Time }
-
-func (e RateLimitError) Error() string {
- return fmt.Sprintf("node creation request failed, hit maxConcurrentNodeCreateOps, wait until %s", e.Retry)
-}
-func (e RateLimitError) EarliestRetry() time.Time { return e.Retry }
-
// Create a new instance with the given type, and add it to the worker
// pool. The worker is added immediately; instance creation runs in
// the background.
@@ -307,15 +300,16 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
if time.Now().Before(wp.atQuotaUntil) || wp.instanceSet.throttleCreate.Error() != nil {
return false
}
- // The maxConcurrentNodeCreateOps knob throttles the number of node create
+ // The maxConcurrentInstanceCreateOps knob throttles the number of node create
// requests in flight. It was added to work around a limitation in Azure's
// managed disks, which support no more than 20 concurrent node creation
// requests from a single disk image (cf.
// https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image).
// The code assumes that node creation, from Azure's perspective, means the
// period until the instance appears in the "get all instances" list.
- if wp.maxConcurrentNodeCreateOps > 0 && len(wp.creating) >= wp.maxConcurrentNodeCreateOps {
- wp.instanceSet.throttleCreate.CheckRateLimitError(RateLimitError{Retry: time.Now().Add(5 * time.Second)}, wp.logger, "create instance", wp.notify)
+ if wp.maxConcurrentInstanceCreateOps > 0 && len(wp.creating) >= wp.maxConcurrentInstanceCreateOps {
+ logger.Info("reached MaxConcurrentInstanceCreateOps")
+ wp.instanceSet.throttleCreate.ErrorUntil(errors.New("reached MaxConcurrentInstanceCreateOps"), time.Now().Add(5*time.Second), wp.notify)
return false
}
now := time.Now()
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
index 6b540eee7..7b116c3ee 100644
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -207,9 +207,9 @@ func (suite *PoolSuite) TestNodeCreateThrottle(c *check.C) {
type1 := test.InstanceType(1)
pool := &Pool{
- logger: logger,
- instanceSet: &throttledInstanceSet{InstanceSet: instanceSet},
- maxConcurrentNodeCreateOps: 1,
+ logger: logger,
+ instanceSet: &throttledInstanceSet{InstanceSet: instanceSet},
+ maxConcurrentInstanceCreateOps: 1,
instanceTypes: arvados.InstanceTypeMap{
type1.Name: type1,
},
@@ -224,13 +224,15 @@ func (suite *PoolSuite) TestNodeCreateThrottle(c *check.C) {
c.Check(pool.Unallocated()[type1], check.Equals, 1)
c.Check(res, check.Equals, false)
- pool.maxConcurrentNodeCreateOps = 2
+ pool.instanceSet.throttleCreate.ResetError()
+ pool.maxConcurrentInstanceCreateOps = 2
res = pool.Create(type1)
c.Check(pool.Unallocated()[type1], check.Equals, 2)
c.Check(res, check.Equals, true)
- pool.maxConcurrentNodeCreateOps = 0
+ pool.instanceSet.throttleCreate.ResetError()
+ pool.maxConcurrentInstanceCreateOps = 0
res = pool.Create(type1)
c.Check(pool.Unallocated()[type1], check.Equals, 3)
diff --git a/lib/dispatchcloud/worker/throttle.go b/lib/dispatchcloud/worker/throttle.go
index defbf91ff..5374afb44 100644
--- a/lib/dispatchcloud/worker/throttle.go
+++ b/lib/dispatchcloud/worker/throttle.go
@@ -61,6 +61,12 @@ func (thr *throttle) Error() error {
return thr.err
}
+func (thr *throttle) ResetError() {
+ thr.mtx.Lock()
+ defer thr.mtx.Unlock()
+ thr.err = nil
+}
+
type throttledInstanceSet struct {
cloud.InstanceSet
throttleCreate throttle
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 1e190a947..829d4097c 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -446,24 +446,24 @@ type ContainersConfig struct {
type CloudVMsConfig struct {
Enable bool
- BootProbeCommand string
- DeployRunnerBinary string
- ImageID string
- MaxCloudOpsPerSecond int
- MaxProbesPerSecond int
- MaxConcurrentNodeCreateOps int
- PollInterval Duration
- ProbeInterval Duration
- SSHPort string
- SyncInterval Duration
- TimeoutBooting Duration
- TimeoutIdle Duration
- TimeoutProbe Duration
- TimeoutShutdown Duration
- TimeoutSignal Duration
- TimeoutTERM Duration
- ResourceTags map[string]string
- TagKeyPrefix string
+ BootProbeCommand string
+ DeployRunnerBinary string
+ ImageID string
+ MaxCloudOpsPerSecond int
+ MaxProbesPerSecond int
+ MaxConcurrentInstanceCreateOps int
+ PollInterval Duration
+ ProbeInterval Duration
+ SSHPort string
+ SyncInterval Duration
+ TimeoutBooting Duration
+ TimeoutIdle Duration
+ TimeoutProbe Duration
+ TimeoutShutdown Duration
+ TimeoutSignal Duration
+ TimeoutTERM Duration
+ ResourceTags map[string]string
+ TagKeyPrefix string
Driver string
DriverParameters json.RawMessage
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list