[arvados] updated: 2.7.0-5290-ga73295e64f

git repository hosting git at public.arvados.org
Thu Nov 2 20:45:29 UTC 2023


Summary of changes:
 lib/cloud/ec2/ec2.go      | 31 +++++++++++++++++++++++++------
 lib/cloud/ec2/ec2_test.go | 31 +++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 6 deletions(-)

       via  a73295e64f58fe027b995e0cca3d103d4e2289ff (commit)
      from  88aedea4fdf827524c620830ec11681e5cd5f527 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit a73295e64f58fe027b995e0cca3d103d4e2289ff
Author: Tom Clegg <tom at curii.com>
Date:   Thu Nov 2 16:45:01 2023 -0400

    20978: Return the last capacity error if failing all subnets.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go
index 55f9a1e3a3..0d181be0e9 100644
--- a/lib/cloud/ec2/ec2.go
+++ b/lib/cloud/ec2/ec2.go
@@ -288,7 +288,7 @@ func (instanceSet *ec2InstanceSet) Create(
 	}
 
 	var rsv *ec2.Reservation
-	var err error
+	var errToReturn error
 	subnets := instanceSet.ec2config.SubnetID
 	currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex))
 	for tryOffset := 0; ; tryOffset++ {
@@ -299,8 +299,15 @@ func (instanceSet *ec2InstanceSet) Create(
 			trySubnet = subnets[tryIndex]
 			rii.NetworkInterfaces[0].SubnetId = aws.String(trySubnet)
 		}
+		var err error
 		rsv, err = instanceSet.client.RunInstances(&rii)
 		instanceSet.mInstanceStarts.WithLabelValues(trySubnet, boolLabelValue[err == nil]).Add(1)
+		if !isErrorCapacity(errToReturn) || isErrorCapacity(err) {
+			// We want to return the last capacity error,
+			// if any; otherwise the last non-capacity
+			// error.
+			errToReturn = err
+		}
 		if isErrorSubnetSpecific(err) &&
 			tryOffset < len(subnets)-1 {
 			instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]).
@@ -320,9 +327,8 @@ func (instanceSet *ec2InstanceSet) Create(
 		atomic.StoreInt32(&instanceSet.currentSubnetIDIndex, int32(tryIndex))
 		break
 	}
-	err = wrapError(err, &instanceSet.throttleDelayCreate)
-	if err != nil {
-		return nil, err
+	if rsv == nil {
+		return nil, wrapError(errToReturn, &instanceSet.throttleDelayCreate)
 	}
 	return &ec2Instance{
 		provider: instanceSet,
@@ -715,6 +721,20 @@ func isErrorSubnetSpecific(err error) bool {
 		code == "Unsupported"
 }
 
+// isErrorCapacity returns true if the error indicates lack of
+// capacity (either temporary or permanent) to run a specific instance
+// type -- i.e., retrying with a different instance type might
+// succeed.
+func isErrorCapacity(err error) bool {
+	aerr, ok := err.(awserr.Error)
+	if !ok {
+		return false
+	}
+	code := aerr.Code()
+	return code == "InsufficientInstanceCapacity" ||
+		(code == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))
+}
+
 type ec2QuotaError struct {
 	error
 }
@@ -738,8 +758,7 @@ func wrapError(err error, throttleValue *atomic.Value) error {
 		return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
 	} else if isErrorQuota(err) {
 		return &ec2QuotaError{err}
-	} else if aerr, ok := err.(awserr.Error); ok && (aerr.Code() == "InsufficientInstanceCapacity" ||
-		(aerr.Code() == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))) {
+	} else if isErrorCapacity(err) {
 		return &capacityError{err, true}
 	} else if err != nil {
 		throttleValue.Store(time.Duration(0))
diff --git a/lib/cloud/ec2/ec2_test.go b/lib/cloud/ec2/ec2_test.go
index 6ce5aa3cf9..4b83005896 100644
--- a/lib/cloud/ec2/ec2_test.go
+++ b/lib/cloud/ec2/ec2_test.go
@@ -399,6 +399,37 @@ func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
 		`.*`)
 }
 
+func (*EC2InstanceSetSuite) TestCreateOneSubnetFailingCapacity(c *check.C) {
+	if *live != "" {
+		c.Skip("not applicable in live mode")
+		return
+	}
+	ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-broken"]}`)
+	ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
+		"subnet-full": &ec2stubError{
+			code:    "InsufficientFreeAddressesInSubnet",
+			message: "subnet is full",
+		},
+		"subnet-broken": &ec2stubError{
+			code:    "InsufficientInstanceCapacity",
+			message: "insufficient capacity",
+		},
+	}
+	for i := 0; i < 3; i++ {
+		_, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+		c.Check(err, check.NotNil)
+		c.Check(err, check.ErrorMatches, `.*InsufficientInstanceCapacity.*`)
+	}
+	c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 6)
+	metrics := arvadostest.GatherMetricsAsString(reg)
+	c.Check(metrics, check.Matches, `(?ms).*`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 3\n`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 3\n`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+		`.*`)
+}
+
 func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
 	ap, _, _, _ := GetInstanceSet(c, "{}")
 	l, err := ap.Instances(nil)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list