[ARVADOS] updated: 1.3.0-3042-ga5fef23f2

Git user git at public.arvados.org
Tue Sep 1 13:46:52 UTC 2020


Summary of changes:
 lib/config/config.default.yml         |  10 ++--
 lib/config/generated_config.go        |  10 ++--
 lib/dispatchcloud/worker/pool.go      | 100 ++++++++++++++++------------------
 lib/dispatchcloud/worker/pool_test.go |  12 ++--
 lib/dispatchcloud/worker/throttle.go  |   6 ++
 sdk/go/arvados/config.go              |  36 ++++++------
 6 files changed, 88 insertions(+), 86 deletions(-)

       via  a5fef23f2863cd0183ff596f4579110e2ddb3b3d (commit)
      from  da85d6516630d06ae3c34b4a52dc5ddff9fd5ace (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit a5fef23f2863cd0183ff596f4579110e2ddb3b3d
Author: Ward Vandewege <ward at curii.com>
Date:   Tue Sep 1 09:43:57 2020 -0400

    16739: implement review feedback.
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>

diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 010459b0e..57baf5342 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -945,11 +945,11 @@ Clusters:
         # unlimited).
         MaxCloudOpsPerSecond: 0
 
-        # Maximum concurrent node creation operations (0 = unlimited).  (azure)
-        # managed disks: set MaxConcurrentNodeCreateOps to 20 to avoid
-        # timeouts, cf
-        # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image
-        MaxConcurrentNodeCreateOps: 0
+        # Maximum concurrent node creation operations (0 = unlimited). This is
+        # recommended by Azure in certain scenarios (see
+        # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image)
+        # and can be used with other cloud providers too, if desired.
+        MaxConcurrentInstanceCreateOps: 0
 
         # Interval between cloud provider syncs/updates ("list all
         # instances").
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 866012e09..3df644cf6 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -951,11 +951,11 @@ Clusters:
         # unlimited).
         MaxCloudOpsPerSecond: 0
 
-        # Maximum concurrent node creation operations (0 = unlimited).  (azure)
-        # managed disks: set MaxConcurrentNodeCreateOps to 20 to avoid
-        # timeouts, cf
-        # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image
-        MaxConcurrentNodeCreateOps: 0
+        # Maximum concurrent node creation operations (0 = unlimited). This is
+        # recommended by Azure in certain scenarios (see
+        # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image)
+        # and can be used with other cloud providers too, if desired.
+        MaxConcurrentInstanceCreateOps: 0
 
         # Interval between cloud provider syncs/updates ("list all
         # instances").
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index ec6a049e6..435b6e43a 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -96,28 +96,28 @@ func duration(conf arvados.Duration, def time.Duration) time.Duration {
 // cluster configuration.
 func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *prometheus.Registry, instanceSetID cloud.InstanceSetID, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, installPublicKey ssh.PublicKey, cluster *arvados.Cluster) *Pool {
 	wp := &Pool{
-		logger:                     logger,
-		arvClient:                  arvClient,
-		instanceSetID:              instanceSetID,
-		instanceSet:                &throttledInstanceSet{InstanceSet: instanceSet},
-		newExecutor:                newExecutor,
-		bootProbeCommand:           cluster.Containers.CloudVMs.BootProbeCommand,
-		runnerSource:               cluster.Containers.CloudVMs.DeployRunnerBinary,
-		imageID:                    cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
-		instanceTypes:              cluster.InstanceTypes,
-		maxProbesPerSecond:         cluster.Containers.CloudVMs.MaxProbesPerSecond,
-		maxConcurrentNodeCreateOps: cluster.Containers.CloudVMs.MaxConcurrentNodeCreateOps,
-		probeInterval:              duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
-		syncInterval:               duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
-		timeoutIdle:                duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
-		timeoutBooting:             duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
-		timeoutProbe:               duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
-		timeoutShutdown:            duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
-		timeoutTERM:                duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
-		timeoutSignal:              duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
-		installPublicKey:           installPublicKey,
-		tagKeyPrefix:               cluster.Containers.CloudVMs.TagKeyPrefix,
-		stop:                       make(chan bool),
+		logger:                         logger,
+		arvClient:                      arvClient,
+		instanceSetID:                  instanceSetID,
+		instanceSet:                    &throttledInstanceSet{InstanceSet: instanceSet},
+		newExecutor:                    newExecutor,
+		bootProbeCommand:               cluster.Containers.CloudVMs.BootProbeCommand,
+		runnerSource:                   cluster.Containers.CloudVMs.DeployRunnerBinary,
+		imageID:                        cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
+		instanceTypes:                  cluster.InstanceTypes,
+		maxProbesPerSecond:             cluster.Containers.CloudVMs.MaxProbesPerSecond,
+		maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
+		probeInterval:                  duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
+		syncInterval:                   duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
+		timeoutIdle:                    duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
+		timeoutBooting:                 duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
+		timeoutProbe:                   duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
+		timeoutShutdown:                duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
+		timeoutTERM:                    duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
+		timeoutSignal:                  duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+		installPublicKey:               installPublicKey,
+		tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
+		stop:                           make(chan bool),
 	}
 	wp.registerMetrics(reg)
 	go func() {
@@ -133,27 +133,27 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 // zero Pool should not be used. Call NewPool to create a new Pool.
 type Pool struct {
 	// configuration
-	logger                     logrus.FieldLogger
-	arvClient                  *arvados.Client
-	instanceSetID              cloud.InstanceSetID
-	instanceSet                *throttledInstanceSet
-	newExecutor                func(cloud.Instance) Executor
-	bootProbeCommand           string
-	runnerSource               string
-	imageID                    cloud.ImageID
-	instanceTypes              map[string]arvados.InstanceType
-	syncInterval               time.Duration
-	probeInterval              time.Duration
-	maxProbesPerSecond         int
-	maxConcurrentNodeCreateOps int
-	timeoutIdle                time.Duration
-	timeoutBooting             time.Duration
-	timeoutProbe               time.Duration
-	timeoutShutdown            time.Duration
-	timeoutTERM                time.Duration
-	timeoutSignal              time.Duration
-	installPublicKey           ssh.PublicKey
-	tagKeyPrefix               string
+	logger                         logrus.FieldLogger
+	arvClient                      *arvados.Client
+	instanceSetID                  cloud.InstanceSetID
+	instanceSet                    *throttledInstanceSet
+	newExecutor                    func(cloud.Instance) Executor
+	bootProbeCommand               string
+	runnerSource                   string
+	imageID                        cloud.ImageID
+	instanceTypes                  map[string]arvados.InstanceType
+	syncInterval                   time.Duration
+	probeInterval                  time.Duration
+	maxProbesPerSecond             int
+	maxConcurrentInstanceCreateOps int
+	timeoutIdle                    time.Duration
+	timeoutBooting                 time.Duration
+	timeoutProbe                   time.Duration
+	timeoutShutdown                time.Duration
+	timeoutTERM                    time.Duration
+	timeoutSignal                  time.Duration
+	installPublicKey               ssh.PublicKey
+	tagKeyPrefix                   string
 
 	// private state
 	subscribers  map[<-chan struct{}]chan<- struct{}
@@ -280,13 +280,6 @@ func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
 	return unalloc
 }
 
-type RateLimitError struct{ Retry time.Time }
-
-func (e RateLimitError) Error() string {
-	return fmt.Sprintf("node creation request failed, hit maxConcurrentNodeCreateOps, wait until %s", e.Retry)
-}
-func (e RateLimitError) EarliestRetry() time.Time { return e.Retry }
-
 // Create a new instance with the given type, and add it to the worker
 // pool. The worker is added immediately; instance creation runs in
 // the background.
@@ -307,15 +300,16 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
 	if time.Now().Before(wp.atQuotaUntil) || wp.instanceSet.throttleCreate.Error() != nil {
 		return false
 	}
-	// The maxConcurrentNodeCreateOps knob throttles the number of node create
+	// The maxConcurrentInstanceCreateOps knob throttles the number of node create
 	// requests in flight. It was added to work around a limitation in Azure's
 	// managed disks, which support no more than 20 concurrent node creation
 	// requests from a single disk image (cf.
 	// https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image).
 	// The code assumes that node creation, from Azure's perspective, means the
 	// period until the instance appears in the "get all instances" list.
-	if wp.maxConcurrentNodeCreateOps > 0 && len(wp.creating) >= wp.maxConcurrentNodeCreateOps {
-		wp.instanceSet.throttleCreate.CheckRateLimitError(RateLimitError{Retry: time.Now().Add(5 * time.Second)}, wp.logger, "create instance", wp.notify)
+	if wp.maxConcurrentInstanceCreateOps > 0 && len(wp.creating) >= wp.maxConcurrentInstanceCreateOps {
+		logger.Info("reached MaxConcurrentInstanceCreateOps")
+		wp.instanceSet.throttleCreate.ErrorUntil(errors.New("reached MaxConcurrentInstanceCreateOps"), time.Now().Add(5*time.Second), wp.notify)
 		return false
 	}
 	now := time.Now()
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
index 6b540eee7..7b116c3ee 100644
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -207,9 +207,9 @@ func (suite *PoolSuite) TestNodeCreateThrottle(c *check.C) {
 
 	type1 := test.InstanceType(1)
 	pool := &Pool{
-		logger:                     logger,
-		instanceSet:                &throttledInstanceSet{InstanceSet: instanceSet},
-		maxConcurrentNodeCreateOps: 1,
+		logger:                         logger,
+		instanceSet:                    &throttledInstanceSet{InstanceSet: instanceSet},
+		maxConcurrentInstanceCreateOps: 1,
 		instanceTypes: arvados.InstanceTypeMap{
 			type1.Name: type1,
 		},
@@ -224,13 +224,15 @@ func (suite *PoolSuite) TestNodeCreateThrottle(c *check.C) {
 	c.Check(pool.Unallocated()[type1], check.Equals, 1)
 	c.Check(res, check.Equals, false)
 
-	pool.maxConcurrentNodeCreateOps = 2
+	pool.instanceSet.throttleCreate.ResetError()
+	pool.maxConcurrentInstanceCreateOps = 2
 
 	res = pool.Create(type1)
 	c.Check(pool.Unallocated()[type1], check.Equals, 2)
 	c.Check(res, check.Equals, true)
 
-	pool.maxConcurrentNodeCreateOps = 0
+	pool.instanceSet.throttleCreate.ResetError()
+	pool.maxConcurrentInstanceCreateOps = 0
 
 	res = pool.Create(type1)
 	c.Check(pool.Unallocated()[type1], check.Equals, 3)
diff --git a/lib/dispatchcloud/worker/throttle.go b/lib/dispatchcloud/worker/throttle.go
index defbf91ff..5374afb44 100644
--- a/lib/dispatchcloud/worker/throttle.go
+++ b/lib/dispatchcloud/worker/throttle.go
@@ -61,6 +61,12 @@ func (thr *throttle) Error() error {
 	return thr.err
 }
 
+func (thr *throttle) ResetError() {
+	thr.mtx.Lock()
+	defer thr.mtx.Unlock()
+	thr.err = nil
+}
+
 type throttledInstanceSet struct {
 	cloud.InstanceSet
 	throttleCreate    throttle
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 1e190a947..829d4097c 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -446,24 +446,24 @@ type ContainersConfig struct {
 type CloudVMsConfig struct {
 	Enable bool
 
-	BootProbeCommand           string
-	DeployRunnerBinary         string
-	ImageID                    string
-	MaxCloudOpsPerSecond       int
-	MaxProbesPerSecond         int
-	MaxConcurrentNodeCreateOps int
-	PollInterval               Duration
-	ProbeInterval              Duration
-	SSHPort                    string
-	SyncInterval               Duration
-	TimeoutBooting             Duration
-	TimeoutIdle                Duration
-	TimeoutProbe               Duration
-	TimeoutShutdown            Duration
-	TimeoutSignal              Duration
-	TimeoutTERM                Duration
-	ResourceTags               map[string]string
-	TagKeyPrefix               string
+	BootProbeCommand               string
+	DeployRunnerBinary             string
+	ImageID                        string
+	MaxCloudOpsPerSecond           int
+	MaxProbesPerSecond             int
+	MaxConcurrentInstanceCreateOps int
+	PollInterval                   Duration
+	ProbeInterval                  Duration
+	SSHPort                        string
+	SyncInterval                   Duration
+	TimeoutBooting                 Duration
+	TimeoutIdle                    Duration
+	TimeoutProbe                   Duration
+	TimeoutShutdown                Duration
+	TimeoutSignal                  Duration
+	TimeoutTERM                    Duration
+	ResourceTags                   map[string]string
+	TagKeyPrefix                   string
 
 	Driver           string
 	DriverParameters json.RawMessage

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list