[arvados] created: 2.6.0-424-g151f91222

git repository hosting git at public.arvados.org
Fri Aug 4 14:39:27 UTC 2023


        at  151f912226d746991e46a5c782c73c902064ee60 (commit)


commit 151f912226d746991e46a5c782c73c902064ee60
Author: Tom Clegg <tom at curii.com>
Date:   Fri Aug 4 10:39:05 2023 -0400

    20755: Report metrics for ec2 instances per subnet.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go
index b98e519d0..5e4df05f4 100644
--- a/lib/cloud/ec2/ec2.go
+++ b/lib/cloud/ec2/ec2.go
@@ -113,6 +113,9 @@ type ec2InstanceSet struct {
 	prices        map[priceKey][]cloud.InstancePrice
 	pricesLock    sync.Mutex
 	pricesUpdated map[priceKey]time.Time
+
+	mInstances      *prometheus.GaugeVec
+	mInstanceStarts *prometheus.CounterVec
 }
 
 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (prv cloud.InstanceSet, err error) {
@@ -142,6 +145,36 @@ func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID
 	if instanceSet.ec2config.EBSVolumeType == "" {
 		instanceSet.ec2config.EBSVolumeType = "gp2"
 	}
+
+	// Set up metrics
+	instanceSet.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "ec2_instances",
+		Help:      "Number of instances running",
+	}, []string{"subnet_id"})
+	instanceSet.mInstanceStarts = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "ec2_instance_starts_total",
+		Help:      "Number of attempts to start a new instance",
+	}, []string{"subnet_id", "success"})
+	// Initialize all of the series we'll be reporting.  Otherwise
+	// the {subnet=A, success=0} series doesn't appear in metrics
+	// at all until there's a failure in subnet A.
+	for _, subnet := range instanceSet.ec2config.SubnetID {
+		instanceSet.mInstanceStarts.WithLabelValues(subnet, "0").Add(0)
+		instanceSet.mInstanceStarts.WithLabelValues(subnet, "1").Add(0)
+	}
+	if len(instanceSet.ec2config.SubnetID) == 0 {
+		instanceSet.mInstanceStarts.WithLabelValues("", "0").Add(0)
+		instanceSet.mInstanceStarts.WithLabelValues("", "1").Add(0)
+	}
+	if reg != nil {
+		reg.MustRegister(instanceSet.mInstances)
+		reg.MustRegister(instanceSet.mInstanceStarts)
+	}
+
 	return instanceSet, nil
 }
 
@@ -260,11 +293,14 @@ func (instanceSet *ec2InstanceSet) Create(
 	currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex))
 	for tryOffset := 0; ; tryOffset++ {
 		tryIndex := 0
+		trySubnet := ""
 		if len(subnets) > 0 {
 			tryIndex = (currentSubnetIDIndex + tryOffset) % len(subnets)
-			rii.NetworkInterfaces[0].SubnetId = aws.String(subnets[tryIndex])
+			trySubnet = subnets[tryIndex]
+			rii.NetworkInterfaces[0].SubnetId = aws.String(trySubnet)
 		}
 		rsv, err = instanceSet.client.RunInstances(&rii)
+		instanceSet.mInstanceStarts.WithLabelValues(trySubnet, boolLabelValue[err == nil]).Add(1)
 		if isErrorSubnetSpecific(err) &&
 			tryOffset < len(subnets)-1 {
 			instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]).
@@ -382,6 +418,24 @@ func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances
 		}
 		instanceSet.updateSpotPrices(instances)
 	}
+
+	// Count instances in each subnet, and report in metrics.
+	subnetInstances := map[string]int{"": 0}
+	for _, subnet := range instanceSet.ec2config.SubnetID {
+		subnetInstances[subnet] = 0
+	}
+	for _, inst := range instances {
+		subnet := inst.(*ec2Instance).instance.SubnetId
+		if subnet != nil {
+			subnetInstances[*subnet]++
+		} else {
+			subnetInstances[""]++
+		}
+	}
+	for subnet, count := range subnetInstances {
+		instanceSet.mInstances.WithLabelValues(subnet).Set(float64(count))
+	}
+
 	return instances, err
 }
 
@@ -675,3 +729,5 @@ func wrapError(err error, throttleValue *atomic.Value) error {
 	throttleValue.Store(time.Duration(0))
 	return nil
 }
+
+var boolLabelValue = map[bool]string{false: "0", true: "1"}
diff --git a/lib/cloud/ec2/ec2_test.go b/lib/cloud/ec2/ec2_test.go
index d8322e7af..9f8e6d040 100644
--- a/lib/cloud/ec2/ec2_test.go
+++ b/lib/cloud/ec2/ec2_test.go
@@ -34,12 +34,14 @@ import (
 	"git.arvados.org/arvados.git/lib/cloud"
 	"git.arvados.org/arvados.git/lib/dispatchcloud/test"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/arvadostest"
 	"git.arvados.org/arvados.git/sdk/go/config"
 	"git.arvados.org/arvados.git/sdk/go/ctxlog"
 	"github.com/aws/aws-sdk-go/aws"
 	"github.com/aws/aws-sdk-go/aws/awserr"
 	"github.com/aws/aws-sdk-go/service/ec2"
 	"github.com/ghodss/yaml"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 	check "gopkg.in/check.v1"
 )
@@ -210,7 +212,8 @@ func (err *ec2stubError) OrigErr() error  { return errors.New("stub OrigErr") }
 // Ensure ec2stubError satisfies the aws.Error interface
 var _ = awserr.Error(&ec2stubError{})
 
-func GetInstanceSet(c *check.C) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster) {
+func GetInstanceSet(c *check.C) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster, *prometheus.Registry) {
+	reg := prometheus.NewRegistry()
 	cluster := arvados.Cluster{
 		InstanceTypes: arvados.InstanceTypeMap(map[string]arvados.InstanceType{
 			"tiny": {
@@ -246,21 +249,19 @@ func GetInstanceSet(c *check.C) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster
 		err := config.LoadFile(&exampleCfg, *live)
 		c.Assert(err, check.IsNil)
 
-		ap, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger(), nil)
+		is, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger(), reg)
 		c.Assert(err, check.IsNil)
-		return ap.(*ec2InstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster
-	}
-	ap := ec2InstanceSet{
-		instanceSetID: "test123",
-		logger:        ctxlog.TestLogger(c),
-		client:        &ec2stub{c: c, reftime: time.Now().UTC()},
-		keys:          make(map[string]string),
+		return is.(*ec2InstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster, reg
+	} else {
+		is, err := newEC2InstanceSet(json.RawMessage(`{}`), "test123", nil, ctxlog.TestLogger(c), reg)
+		c.Assert(err, check.IsNil)
+		is.(*ec2InstanceSet).client = &ec2stub{c: c, reftime: time.Now().UTC()}
+		return is.(*ec2InstanceSet), cloud.ImageID("blob"), cluster, reg
 	}
-	return &ap, cloud.ImageID("blob"), cluster
 }
 
 func (*EC2InstanceSetSuite) TestCreate(c *check.C) {
-	ap, img, cluster := GetInstanceSet(c)
+	ap, img, cluster, _ := GetInstanceSet(c)
 	pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
 
 	inst, err := ap.Create(cluster.InstanceTypes["tiny"],
@@ -280,7 +281,7 @@ func (*EC2InstanceSetSuite) TestCreate(c *check.C) {
 }
 
 func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
-	ap, img, cluster := GetInstanceSet(c)
+	ap, img, cluster, _ := GetInstanceSet(c)
 	inst, err := ap.Create(cluster.InstanceTypes["tiny-with-extra-scratch"],
 		img, map[string]string{
 			"TestTagName": "test tag value",
@@ -301,7 +302,7 @@ func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
 }
 
 func (*EC2InstanceSetSuite) TestCreatePreemptible(c *check.C) {
-	ap, img, cluster := GetInstanceSet(c)
+	ap, img, cluster, _ := GetInstanceSet(c)
 	pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
 
 	inst, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"],
@@ -323,7 +324,7 @@ func (*EC2InstanceSetSuite) TestCreateFailoverSecondSubnet(c *check.C) {
 		return
 	}
 
-	ap, img, cluster := GetInstanceSet(c)
+	ap, img, cluster, reg := GetInstanceSet(c)
 	ap.ec2config.SubnetID = sliceOrSingleString{"subnet-full", "subnet-good"}
 	ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
 		"subnet-full": &ec2stubError{
@@ -335,12 +336,22 @@ func (*EC2InstanceSetSuite) TestCreateFailoverSecondSubnet(c *check.C) {
 	c.Check(err, check.IsNil)
 	c.Check(inst, check.NotNil)
 	c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
+	metrics := arvadostest.GatherMetricsAsString(reg)
+	c.Check(metrics, check.Matches, `(?ms).*`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 1\n`+
+		`.*`)
 
 	// Next RunInstances call should try the working subnet first
 	inst, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
 	c.Check(err, check.IsNil)
 	c.Check(inst, check.NotNil)
 	c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 3)
+	metrics = arvadostest.GatherMetricsAsString(reg)
+	c.Check(metrics, check.Matches, `(?ms).*`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 2\n`+
+		`.*`)
 }
 
 func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
@@ -349,7 +360,7 @@ func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
 		return
 	}
 
-	ap, img, cluster := GetInstanceSet(c)
+	ap, img, cluster, reg := GetInstanceSet(c)
 	ap.ec2config.SubnetID = sliceOrSingleString{"subnet-full", "subnet-broken"}
 	ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
 		"subnet-full": &ec2stubError{
@@ -365,15 +376,25 @@ func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
 	c.Check(err, check.NotNil)
 	c.Check(err, check.ErrorMatches, `.*InvalidSubnetId\.NotFound.*`)
 	c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
+	metrics := arvadostest.GatherMetricsAsString(reg)
+	c.Check(metrics, check.Matches, `(?ms).*`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 1\n`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
+		`.*`)
 
 	_, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
 	c.Check(err, check.NotNil)
 	c.Check(err, check.ErrorMatches, `.*InsufficientFreeAddressesInSubnet.*`)
 	c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 4)
+	metrics = arvadostest.GatherMetricsAsString(reg)
+	c.Check(metrics, check.Matches, `(?ms).*`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 2\n`+
+		`arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 2\n`+
+		`.*`)
 }
 
 func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
-	ap, _, _ := GetInstanceSet(c)
+	ap, _, _, _ := GetInstanceSet(c)
 	l, err := ap.Instances(nil)
 	c.Assert(err, check.IsNil)
 
@@ -385,7 +406,7 @@ func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
 }
 
 func (*EC2InstanceSetSuite) TestListInstances(c *check.C) {
-	ap, _, _ := GetInstanceSet(c)
+	ap, _, _, reg := GetInstanceSet(c)
 	l, err := ap.Instances(nil)
 	c.Assert(err, check.IsNil)
 
@@ -393,10 +414,15 @@ func (*EC2InstanceSetSuite) TestListInstances(c *check.C) {
 		tg := i.Tags()
 		c.Logf("%v %v %v", i.String(), i.Address(), tg)
 	}
+
+	metrics := arvadostest.GatherMetricsAsString(reg)
+	c.Check(metrics, check.Matches, `(?ms).*`+
+		`arvados_dispatchcloud_ec2_instances{subnet_id="[^"]*"} \d+\n`+
+		`.*`)
 }
 
 func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
-	ap, _, _ := GetInstanceSet(c)
+	ap, _, _, _ := GetInstanceSet(c)
 	l, err := ap.Instances(nil)
 	c.Assert(err, check.IsNil)
 
@@ -406,7 +432,7 @@ func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
 }
 
 func (*EC2InstanceSetSuite) TestInstancePriceHistory(c *check.C) {
-	ap, img, cluster := GetInstanceSet(c)
+	ap, img, cluster, _ := GetInstanceSet(c)
 	pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
 	tags := cloud.InstanceTags{"arvados-ec2-driver": "test"}
 
diff --git a/lib/config/load_test.go b/lib/config/load_test.go
index 9a0417908..75efc6a35 100644
--- a/lib/config/load_test.go
+++ b/lib/config/load_test.go
@@ -19,10 +19,10 @@ import (
 	"time"
 
 	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/arvadostest"
 	"git.arvados.org/arvados.git/sdk/go/ctxlog"
 	"github.com/ghodss/yaml"
 	"github.com/prometheus/client_golang/prometheus"
-	"github.com/prometheus/common/expfmt"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 	check "gopkg.in/check.v1"
@@ -882,15 +882,10 @@ func (s *LoadSuite) TestSourceTimestamp(c *check.C) {
 		c.Check(int(cfg.SourceTimestamp.Sub(trial.expectTime).Seconds()), check.Equals, 0)
 		c.Check(int(ldr.loadTimestamp.Sub(time.Now()).Seconds()), check.Equals, 0)
 
-		var buf bytes.Buffer
 		reg := prometheus.NewRegistry()
 		ldr.RegisterMetrics(reg)
-		enc := expfmt.NewEncoder(&buf, expfmt.FmtText)
-		got, _ := reg.Gather()
-		for _, mf := range got {
-			enc.Encode(mf)
-		}
-		c.Check(buf.String(), check.Matches, `# HELP .*
+		metrics := arvadostest.GatherMetricsAsString(reg)
+		c.Check(metrics, check.Matches, `# HELP .*
 # TYPE .*
 arvados_config_load_timestamp_seconds{sha256="83aea5d82eb1d53372cd65c936c60acc1c6ef946e61977bbca7cfea709d201a8"} \Q`+fmt.Sprintf("%g", float64(ldr.loadTimestamp.UnixNano())/1e9)+`\E
 # HELP .*
diff --git a/sdk/go/arvadostest/metrics.go b/sdk/go/arvadostest/metrics.go
new file mode 100644
index 000000000..5fe1d607b
--- /dev/null
+++ b/sdk/go/arvadostest/metrics.go
@@ -0,0 +1,22 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package arvadostest
+
+import (
+	"bytes"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/common/expfmt"
+)
+
+func GatherMetricsAsString(reg *prometheus.Registry) string {
+	buf := bytes.NewBuffer(nil)
+	enc := expfmt.NewEncoder(buf, expfmt.FmtText)
+	got, _ := reg.Gather()
+	for _, mf := range got {
+		enc.Encode(mf)
+	}
+	return buf.String()
+}
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index aeed517d0..db9a66848 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -5,7 +5,6 @@
 package keepbalance
 
 import (
-	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
@@ -24,7 +23,6 @@ import (
 	"git.arvados.org/arvados.git/sdk/go/ctxlog"
 	"github.com/jmoiron/sqlx"
 	"github.com/prometheus/client_golang/prometheus"
-	"github.com/prometheus/common/expfmt"
 	check "gopkg.in/check.v1"
 )
 
@@ -591,14 +589,12 @@ func (s *runSuite) TestCommit(c *check.C) {
 	c.Assert(err, check.IsNil)
 	c.Check(string(lost), check.Not(check.Matches), `(?ms).*acbd18db4cc2f85cedef654fccc4a4d8.*`)
 
-	buf, err := s.getMetrics(c, srv)
-	c.Check(err, check.IsNil)
-	bufstr := buf.String()
-	c.Check(bufstr, check.Matches, `(?ms).*\narvados_keep_total_bytes 15\n.*`)
-	c.Check(bufstr, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_sum [0-9\.]+\n.*`)
-	c.Check(bufstr, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
-	c.Check(bufstr, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
-	c.Check(bufstr, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
+	metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_total_bytes 15\n.*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_sum [0-9\.]+\n.*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
 }
 
 func (s *runSuite) TestChunkPrefix(c *check.C) {
@@ -674,23 +670,6 @@ func (s *runSuite) TestRunForever(c *check.C) {
 	c.Check(pullReqs.Count() >= 16, check.Equals, true)
 	c.Check(trashReqs.Count(), check.Equals, pullReqs.Count()+4)
 
-	buf, err := s.getMetrics(c, srv)
-	c.Check(err, check.IsNil)
-	c.Check(buf, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count `+fmt.Sprintf("%d", pullReqs.Count()/4)+`\n.*`)
-}
-
-func (s *runSuite) getMetrics(c *check.C, srv *Server) (*bytes.Buffer, error) {
-	mfs, err := srv.Metrics.reg.Gather()
-	if err != nil {
-		return nil, err
-	}
-
-	var buf bytes.Buffer
-	for _, mf := range mfs {
-		if _, err := expfmt.MetricFamilyToText(&buf, mf); err != nil {
-			return nil, err
-		}
-	}
-
-	return &buf, nil
+	metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count `+fmt.Sprintf("%d", pullReqs.Count()/4)+`\n.*`)
 }
diff --git a/services/keep-web/cache_test.go b/services/keep-web/cache_test.go
index 010e29a0b..e95ebcf84 100644
--- a/services/keep-web/cache_test.go
+++ b/services/keep-web/cache_test.go
@@ -5,7 +5,6 @@
 package keepweb
 
 import (
-	"bytes"
 	"net/http"
 	"net/http/httptest"
 	"regexp"
@@ -14,21 +13,12 @@ import (
 
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/arvadostest"
-	"github.com/prometheus/common/expfmt"
 	"gopkg.in/check.v1"
 )
 
 func (s *IntegrationSuite) checkCacheMetrics(c *check.C, regs ...string) {
 	s.handler.Cache.updateGauges()
-	reg := s.handler.Cache.registry
-	mfs, err := reg.Gather()
-	c.Check(err, check.IsNil)
-	buf := &bytes.Buffer{}
-	enc := expfmt.NewEncoder(buf, expfmt.FmtText)
-	for _, mf := range mfs {
-		c.Check(enc.Encode(mf), check.IsNil)
-	}
-	mm := buf.String()
+	mm := arvadostest.GatherMetricsAsString(s.handler.Cache.registry)
 	// Remove comments to make the "value vs. regexp" failure
 	// output easier to read.
 	mm = regexp.MustCompile(`(?m)^#.*\n`).ReplaceAllString(mm, "")

commit 245a36c76baa54ca6cf75f501d5367d976c68a38
Author: Tom Clegg <tom at curii.com>
Date:   Thu Aug 3 15:50:50 2023 -0400

    20755: Allow cloud drivers to register their own metrics.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/cloud/azure/azure.go b/lib/cloud/azure/azure.go
index 494db854e..71f2a23dc 100644
--- a/lib/cloud/azure/azure.go
+++ b/lib/cloud/azure/azure.go
@@ -28,6 +28,7 @@ import (
 	"github.com/Azure/go-autorest/autorest/azure/auth"
 	"github.com/Azure/go-autorest/autorest/to"
 	"github.com/jmcvetta/randutil"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/crypto/ssh"
 )
@@ -238,7 +239,7 @@ type azureInstanceSet struct {
 	logger             logrus.FieldLogger
 }
 
-func newAzureInstanceSet(config json.RawMessage, dispatcherID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) {
+func newAzureInstanceSet(config json.RawMessage, dispatcherID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (prv cloud.InstanceSet, err error) {
 	azcfg := azureInstanceSetConfig{}
 	err = json.Unmarshal(config, &azcfg)
 	if err != nil {
diff --git a/lib/cloud/azure/azure_test.go b/lib/cloud/azure/azure_test.go
index 2f88f7344..de8d655b1 100644
--- a/lib/cloud/azure/azure_test.go
+++ b/lib/cloud/azure/azure_test.go
@@ -156,7 +156,7 @@ func GetInstanceSet() (*azureInstanceSet, cloud.ImageID, arvados.Cluster, error)
 			return nil, cloud.ImageID(""), cluster, err
 		}
 
-		ap, err := newAzureInstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger())
+		ap, err := newAzureInstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger(), nil)
 		return ap.(*azureInstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster, err
 	}
 	ap := azureInstanceSet{
diff --git a/lib/cloud/cloudtest/tester.go b/lib/cloud/cloudtest/tester.go
index 41e8f658a..d53cb017e 100644
--- a/lib/cloud/cloudtest/tester.go
+++ b/lib/cloud/cloudtest/tester.go
@@ -64,7 +64,7 @@ func (t *tester) Run() bool {
 	deferredError := false
 
 	var err error
-	t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
+	t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger, nil)
 	if err != nil {
 		t.Logger.WithError(err).Info("error initializing driver")
 		return false
diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go
index 526fc1307..b98e519d0 100644
--- a/lib/cloud/ec2/ec2.go
+++ b/lib/cloud/ec2/ec2.go
@@ -29,6 +29,7 @@ import (
 	"github.com/aws/aws-sdk-go/aws/request"
 	"github.com/aws/aws-sdk-go/aws/session"
 	"github.com/aws/aws-sdk-go/service/ec2"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/crypto/ssh"
 )
@@ -114,7 +115,7 @@ type ec2InstanceSet struct {
 	pricesUpdated map[priceKey]time.Time
 }
 
-func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) {
+func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (prv cloud.InstanceSet, err error) {
 	instanceSet := &ec2InstanceSet{
 		instanceSetID: instanceSetID,
 		logger:        logger,
diff --git a/lib/cloud/ec2/ec2_test.go b/lib/cloud/ec2/ec2_test.go
index 2f3d319e0..d8322e7af 100644
--- a/lib/cloud/ec2/ec2_test.go
+++ b/lib/cloud/ec2/ec2_test.go
@@ -246,7 +246,7 @@ func GetInstanceSet(c *check.C) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster
 		err := config.LoadFile(&exampleCfg, *live)
 		c.Assert(err, check.IsNil)
 
-		ap, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger())
+		ap, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger(), nil)
 		c.Assert(err, check.IsNil)
 		return ap.(*ec2InstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster
 	}
diff --git a/lib/cloud/interfaces.go b/lib/cloud/interfaces.go
index 27cf26152..7c532fda4 100644
--- a/lib/cloud/interfaces.go
+++ b/lib/cloud/interfaces.go
@@ -11,6 +11,7 @@ import (
 	"time"
 
 	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/crypto/ssh"
 )
@@ -191,7 +192,7 @@ type InitCommand string
 //
 //	type exampleDriver struct {}
 //
-//	func (*exampleDriver) InstanceSet(config json.RawMessage, id cloud.InstanceSetID, tags cloud.SharedResourceTags, logger logrus.FieldLogger) (cloud.InstanceSet, error) {
+//	func (*exampleDriver) InstanceSet(config json.RawMessage, id cloud.InstanceSetID, tags cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (cloud.InstanceSet, error) {
 //		var is exampleInstanceSet
 //		if err := json.Unmarshal(config, &is); err != nil {
 //			return nil, err
@@ -199,20 +200,18 @@ type InitCommand string
 //		is.ownID = id
 //		return &is, nil
 //	}
-//
-//	var _ = registerCloudDriver("example", &exampleDriver{})
 type Driver interface {
-	InstanceSet(config json.RawMessage, id InstanceSetID, tags SharedResourceTags, logger logrus.FieldLogger) (InstanceSet, error)
+	InstanceSet(config json.RawMessage, id InstanceSetID, tags SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (InstanceSet, error)
 }
 
 // DriverFunc makes a Driver using the provided function as its
 // InstanceSet method. This is similar to http.HandlerFunc.
-func DriverFunc(fn func(config json.RawMessage, id InstanceSetID, tags SharedResourceTags, logger logrus.FieldLogger) (InstanceSet, error)) Driver {
+func DriverFunc(fn func(config json.RawMessage, id InstanceSetID, tags SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (InstanceSet, error)) Driver {
 	return driverFunc(fn)
 }
 
-type driverFunc func(config json.RawMessage, id InstanceSetID, tags SharedResourceTags, logger logrus.FieldLogger) (InstanceSet, error)
+type driverFunc func(config json.RawMessage, id InstanceSetID, tags SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (InstanceSet, error)
 
-func (df driverFunc) InstanceSet(config json.RawMessage, id InstanceSetID, tags SharedResourceTags, logger logrus.FieldLogger) (InstanceSet, error) {
-	return df(config, id, tags, logger)
+func (df driverFunc) InstanceSet(config json.RawMessage, id InstanceSetID, tags SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (InstanceSet, error) {
+	return df(config, id, tags, logger, reg)
 }
diff --git a/lib/cloud/loopback/loopback.go b/lib/cloud/loopback/loopback.go
index 8afaa4525..41878acd2 100644
--- a/lib/cloud/loopback/loopback.go
+++ b/lib/cloud/loopback/loopback.go
@@ -21,6 +21,7 @@ import (
 	"git.arvados.org/arvados.git/lib/cloud"
 	"git.arvados.org/arvados.git/lib/dispatchcloud/test"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/crypto/ssh"
 )
@@ -45,7 +46,7 @@ type instanceSet struct {
 	mtx           sync.Mutex
 }
 
-func newInstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (cloud.InstanceSet, error) {
+func newInstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (cloud.InstanceSet, error) {
 	is := &instanceSet{
 		instanceSetID: instanceSetID,
 		logger:        logger,
diff --git a/lib/cloud/loopback/loopback_test.go b/lib/cloud/loopback/loopback_test.go
index 5c30f5f0e..0716179cb 100644
--- a/lib/cloud/loopback/loopback_test.go
+++ b/lib/cloud/loopback/loopback_test.go
@@ -29,7 +29,7 @@ var _ = check.Suite(&suite{})
 
 func (*suite) TestCreateListExecDestroy(c *check.C) {
 	logger := ctxlog.TestLogger(c)
-	is, err := Driver.InstanceSet(json.RawMessage("{}"), "testInstanceSetID", cloud.SharedResourceTags{"sharedTag": "sharedTagValue"}, logger)
+	is, err := Driver.InstanceSet(json.RawMessage("{}"), "testInstanceSetID", cloud.SharedResourceTags{"sharedTag": "sharedTagValue"}, logger, nil)
 	c.Assert(err, check.IsNil)
 
 	clientRSAKey, err := rsa.GenerateKey(rand.Reader, 1024)
diff --git a/lib/dispatchcloud/driver.go b/lib/dispatchcloud/driver.go
index 93515defb..44adc23fd 100644
--- a/lib/dispatchcloud/driver.go
+++ b/lib/dispatchcloud/driver.go
@@ -33,7 +33,7 @@ func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID, logger
 		return nil, fmt.Errorf("unsupported cloud driver %q", cluster.Containers.CloudVMs.Driver)
 	}
 	sharedResourceTags := cloud.SharedResourceTags(cluster.Containers.CloudVMs.ResourceTags)
-	is, err := driver.InstanceSet(cluster.Containers.CloudVMs.DriverParameters, setID, sharedResourceTags, logger)
+	is, err := driver.InstanceSet(cluster.Containers.CloudVMs.DriverParameters, setID, sharedResourceTags, logger, reg)
 	is = newInstrumentedInstanceSet(is, reg)
 	if maxops := cluster.Containers.CloudVMs.MaxCloudOpsPerSecond; maxops > 0 {
 		is = rateLimitedInstanceSet{
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index e91878527..037580f8d 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -20,6 +20,7 @@ import (
 	"git.arvados.org/arvados.git/lib/cloud"
 	"git.arvados.org/arvados.git/lib/crunchrun"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/crypto/ssh"
 )
@@ -62,7 +63,7 @@ type StubDriver struct {
 }
 
 // InstanceSet returns a new *StubInstanceSet.
-func (sd *StubDriver) InstanceSet(params json.RawMessage, id cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (cloud.InstanceSet, error) {
+func (sd *StubDriver) InstanceSet(params json.RawMessage, id cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (cloud.InstanceSet, error) {
 	if sd.holdCloudOps == nil {
 		sd.holdCloudOps = make(chan bool)
 	}
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
index 7f3a1531e..8d2ba09eb 100644
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -78,7 +78,7 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) {
 
 	driver := &test.StubDriver{}
 	instanceSetID := cloud.InstanceSetID("test-instance-set-id")
-	is, err := driver.InstanceSet(nil, instanceSetID, nil, suite.logger)
+	is, err := driver.InstanceSet(nil, instanceSetID, nil, suite.logger, nil)
 	c.Assert(err, check.IsNil)
 
 	newExecutor := func(cloud.Instance) Executor {
@@ -157,7 +157,7 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) {
 
 func (suite *PoolSuite) TestDrain(c *check.C) {
 	driver := test.StubDriver{}
-	instanceSet, err := driver.InstanceSet(nil, "test-instance-set-id", nil, suite.logger)
+	instanceSet, err := driver.InstanceSet(nil, "test-instance-set-id", nil, suite.logger, nil)
 	c.Assert(err, check.IsNil)
 
 	ac := arvados.NewClientFromEnv()
@@ -210,7 +210,7 @@ func (suite *PoolSuite) TestDrain(c *check.C) {
 
 func (suite *PoolSuite) TestNodeCreateThrottle(c *check.C) {
 	driver := test.StubDriver{HoldCloudOps: true}
-	instanceSet, err := driver.InstanceSet(nil, "test-instance-set-id", nil, suite.logger)
+	instanceSet, err := driver.InstanceSet(nil, "test-instance-set-id", nil, suite.logger, nil)
 	c.Assert(err, check.IsNil)
 
 	type1 := test.InstanceType(1)
@@ -250,7 +250,7 @@ func (suite *PoolSuite) TestNodeCreateThrottle(c *check.C) {
 
 func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
 	driver := test.StubDriver{HoldCloudOps: true}
-	instanceSet, err := driver.InstanceSet(nil, "test-instance-set-id", nil, suite.logger)
+	instanceSet, err := driver.InstanceSet(nil, "test-instance-set-id", nil, suite.logger, nil)
 	c.Assert(err, check.IsNil)
 
 	type1 := arvados.InstanceType{Name: "a1s", ProviderType: "a1.small", VCPUs: 1, RAM: 1 * GiB, Price: .01}
diff --git a/lib/dispatchcloud/worker/worker_test.go b/lib/dispatchcloud/worker/worker_test.go
index d04ecbb72..5d8c67e91 100644
--- a/lib/dispatchcloud/worker/worker_test.go
+++ b/lib/dispatchcloud/worker/worker_test.go
@@ -43,7 +43,7 @@ func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
 	probeTimeout := time.Second
 
 	ac := arvados.NewClientFromEnv()
-	is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, suite.logger)
+	is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, suite.logger, nil)
 	c.Assert(err, check.IsNil)
 	inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
 	c.Assert(err, check.IsNil)

commit 349a2a7fdd230456f8a8ccddf1b9932c824ca4f3
Author: Tom Clegg <tom at curii.com>
Date:   Thu Aug 3 15:03:51 2023 -0400

    20755: Support multiple/alternate subnets on EC2.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go
index e2cf5e0f1..526fc1307 100644
--- a/lib/cloud/ec2/ec2.go
+++ b/lib/cloud/ec2/ec2.go
@@ -14,6 +14,7 @@ import (
 	"fmt"
 	"math/big"
 	"strconv"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -45,7 +46,7 @@ type ec2InstanceSetConfig struct {
 	SecretAccessKey         string
 	Region                  string
 	SecurityGroupIDs        arvados.StringSet
-	SubnetID                string
+	SubnetID                sliceOrSingleString
 	AdminUsername           string
 	EBSVolumeType           string
 	EBSPrice                float64
@@ -53,6 +54,39 @@ type ec2InstanceSetConfig struct {
 	SpotPriceUpdateInterval arvados.Duration
 }
 
+type sliceOrSingleString []string
+
+// UnmarshalJSON unmarshals an array of strings, and also accepts ""
+// as [], and "foo" as ["foo"].
+func (ss *sliceOrSingleString) UnmarshalJSON(data []byte) error {
+	if len(data) == 0 {
+		*ss = nil
+	} else if data[0] == '[' {
+		var slice []string
+		err := json.Unmarshal(data, &slice)
+		if err != nil {
+			return err
+		}
+		if len(slice) == 0 {
+			*ss = nil
+		} else {
+			*ss = slice
+		}
+	} else {
+		var str string
+		err := json.Unmarshal(data, &str)
+		if err != nil {
+			return err
+		}
+		if str == "" {
+			*ss = nil
+		} else {
+			*ss = []string{str}
+		}
+	}
+	return nil
+}
+
 type ec2Interface interface {
 	DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error)
 	ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error)
@@ -66,6 +100,7 @@ type ec2Interface interface {
 
 type ec2InstanceSet struct {
 	ec2config              ec2InstanceSetConfig
+	currentSubnetIDIndex   int32
 	instanceSetID          cloud.InstanceSetID
 	logger                 logrus.FieldLogger
 	client                 ec2Interface
@@ -174,7 +209,6 @@ func (instanceSet *ec2InstanceSet) Create(
 				DeleteOnTermination:      aws.Bool(true),
 				DeviceIndex:              aws.Int64(0),
 				Groups:                   aws.StringSlice(groups),
-				SubnetId:                 &instanceSet.ec2config.SubnetID,
 			}},
 		DisableApiTermination:             aws.Bool(false),
 		InstanceInitiatedShutdownBehavior: aws.String("terminate"),
@@ -219,7 +253,36 @@ func (instanceSet *ec2InstanceSet) Create(
 		}
 	}
 
-	rsv, err := instanceSet.client.RunInstances(&rii)
+	var rsv *ec2.Reservation
+	var err error
+	subnets := instanceSet.ec2config.SubnetID
+	currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex))
+	for tryOffset := 0; ; tryOffset++ {
+		tryIndex := 0
+		if len(subnets) > 0 {
+			tryIndex = (currentSubnetIDIndex + tryOffset) % len(subnets)
+			rii.NetworkInterfaces[0].SubnetId = aws.String(subnets[tryIndex])
+		}
+		rsv, err = instanceSet.client.RunInstances(&rii)
+		if isErrorSubnetSpecific(err) &&
+			tryOffset < len(subnets)-1 {
+			instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]).
+				Warn("RunInstances failed, trying next subnet")
+			continue
+		}
+		// Succeeded, or exhausted all subnets, or got a
+		// non-subnet-related error.
+		//
+		// We intentionally update currentSubnetIDIndex even
+		// in the non-retryable-failure case here to avoid a
+		// situation where successive calls to Create() keep
+		// returning errors for the same subnet (perhaps
+		// "subnet full") and never reveal the errors for the
+		// other configured subnets (perhaps "subnet ID
+		// invalid").
+		atomic.StoreInt32(&instanceSet.currentSubnetIDIndex, int32(tryIndex))
+		break
+	}
 	err = wrapError(err, &instanceSet.throttleDelayCreate)
 	if err != nil {
 		return nil, err
@@ -548,6 +611,8 @@ func (err rateLimitError) EarliestRetry() time.Time {
 }
 
 var isCodeCapacity = map[string]bool{
+	"InstanceLimitExceeded":             true,
+	"InsufficientAddressCapacity":       true,
 	"InsufficientFreeAddressesInSubnet": true,
 	"InsufficientInstanceCapacity":      true,
 	"InsufficientVolumeCapacity":        true,
@@ -566,6 +631,19 @@ func isErrorCapacity(err error) bool {
 	return false
 }
 
+// isErrorSubnetSpecific returns true if the problem encountered by
+// RunInstances might be avoided by trying a different subnet.
+func isErrorSubnetSpecific(err error) bool {
+	aerr, ok := err.(awserr.Error)
+	if !ok {
+		return false
+	}
+	code := aerr.Code()
+	return strings.Contains(code, "Subnet") ||
+		code == "InsufficientInstanceCapacity" ||
+		code == "InsufficientVolumeCapacity"
+}
+
 type ec2QuotaError struct {
 	error
 }
diff --git a/lib/cloud/ec2/ec2_test.go b/lib/cloud/ec2/ec2_test.go
index ede7f9de5..2f3d319e0 100644
--- a/lib/cloud/ec2/ec2_test.go
+++ b/lib/cloud/ec2/ec2_test.go
@@ -24,7 +24,9 @@ package ec2
 
 import (
 	"encoding/json"
+	"errors"
 	"flag"
+	"fmt"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -33,9 +35,11 @@ import (
 	"git.arvados.org/arvados.git/lib/dispatchcloud/test"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/config"
+	"git.arvados.org/arvados.git/sdk/go/ctxlog"
 	"github.com/aws/aws-sdk-go/aws"
 	"github.com/aws/aws-sdk-go/aws/awserr"
 	"github.com/aws/aws-sdk-go/service/ec2"
+	"github.com/ghodss/yaml"
 	"github.com/sirupsen/logrus"
 	check "gopkg.in/check.v1"
 )
@@ -47,6 +51,34 @@ func Test(t *testing.T) {
 	check.TestingT(t)
 }
 
+type sliceOrStringSuite struct{}
+
+var _ = check.Suite(&sliceOrStringSuite{})
+
+func (s *sliceOrStringSuite) TestUnmarshal(c *check.C) {
+	var conf ec2InstanceSetConfig
+	for _, trial := range []struct {
+		input  string
+		output sliceOrSingleString
+	}{
+		{``, nil},
+		{`""`, nil},
+		{`[]`, nil},
+		{`"foo"`, sliceOrSingleString{"foo"}},
+		{`["foo"]`, sliceOrSingleString{"foo"}},
+		{`[foo]`, sliceOrSingleString{"foo"}},
+		{`["foo", "bar"]`, sliceOrSingleString{"foo", "bar"}},
+		{`[foo-bar, baz]`, sliceOrSingleString{"foo-bar", "baz"}},
+	} {
+		c.Logf("trial: %+v", trial)
+		err := yaml.Unmarshal([]byte("SubnetID: "+trial.input+"\n"), &conf)
+		if !c.Check(err, check.IsNil) {
+			continue
+		}
+		c.Check(conf.SubnetID, check.DeepEquals, trial.output)
+	}
+}
+
 type EC2InstanceSetSuite struct{}
 
 var _ = check.Suite(&EC2InstanceSetSuite{})
@@ -61,6 +93,10 @@ type ec2stub struct {
 	reftime               time.Time
 	importKeyPairCalls    []*ec2.ImportKeyPairInput
 	describeKeyPairsCalls []*ec2.DescribeKeyPairsInput
+	runInstancesCalls     []*ec2.RunInstancesInput
+	// {subnetID => error}: RunInstances returns error if subnetID
+	// matches.
+	subnetErrorOnRunInstances map[string]error
 }
 
 func (e *ec2stub) ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error) {
@@ -74,6 +110,13 @@ func (e *ec2stub) DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.Descr
 }
 
 func (e *ec2stub) RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error) {
+	e.runInstancesCalls = append(e.runInstancesCalls, input)
+	if len(input.NetworkInterfaces) > 0 && input.NetworkInterfaces[0].SubnetId != nil {
+		err := e.subnetErrorOnRunInstances[*input.NetworkInterfaces[0].SubnetId]
+		if err != nil {
+			return nil, err
+		}
+	}
 	return &ec2.Reservation{Instances: []*ec2.Instance{{
 		InstanceId:   aws.String("i-123"),
 		InstanceType: aws.String("t2.micro"),
@@ -154,6 +197,19 @@ func (e *ec2stub) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.T
 	return nil, nil
 }
 
+type ec2stubError struct {
+	code    string
+	message string
+}
+
+func (err *ec2stubError) Code() string    { return err.code }
+func (err *ec2stubError) Message() string { return err.message }
+func (err *ec2stubError) Error() string   { return fmt.Sprintf("%s: %s", err.code, err.message) }
+func (err *ec2stubError) OrigErr() error  { return errors.New("stub OrigErr") }
+
+// Ensure ec2stubError satisfies the aws.Error interface
+var _ = awserr.Error(&ec2stubError{})
+
 func GetInstanceSet(c *check.C) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster) {
 	cluster := arvados.Cluster{
 		InstanceTypes: arvados.InstanceTypeMap(map[string]arvados.InstanceType{
@@ -196,7 +252,7 @@ func GetInstanceSet(c *check.C) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster
 	}
 	ap := ec2InstanceSet{
 		instanceSetID: "test123",
-		logger:        logrus.StandardLogger(),
+		logger:        ctxlog.TestLogger(c),
 		client:        &ec2stub{c: c, reftime: time.Now().UTC()},
 		keys:          make(map[string]string),
 	}
@@ -261,6 +317,61 @@ func (*EC2InstanceSetSuite) TestCreatePreemptible(c *check.C) {
 
 }
 
+func (*EC2InstanceSetSuite) TestCreateFailoverSecondSubnet(c *check.C) {
+	if *live != "" {
+		c.Skip("not applicable in live mode")
+		return
+	}
+
+	ap, img, cluster := GetInstanceSet(c)
+	ap.ec2config.SubnetID = sliceOrSingleString{"subnet-full", "subnet-good"}
+	ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
+		"subnet-full": &ec2stubError{
+			code:    "InsufficientFreeAddressesInSubnet",
+			message: "subnet is full",
+		},
+	}
+	inst, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+	c.Check(err, check.IsNil)
+	c.Check(inst, check.NotNil)
+	c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
+
+	// Next RunInstances call should try the working subnet first
+	inst, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+	c.Check(err, check.IsNil)
+	c.Check(inst, check.NotNil)
+	c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 3)
+}
+
+func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
+	if *live != "" {
+		c.Skip("not applicable in live mode")
+		return
+	}
+
+	ap, img, cluster := GetInstanceSet(c)
+	ap.ec2config.SubnetID = sliceOrSingleString{"subnet-full", "subnet-broken"}
+	ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
+		"subnet-full": &ec2stubError{
+			code:    "InsufficientFreeAddressesInSubnet",
+			message: "subnet is full",
+		},
+		"subnet-broken": &ec2stubError{
+			code:    "InvalidSubnetId.NotFound",
+			message: "bogus subnet id",
+		},
+	}
+	_, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+	c.Check(err, check.NotNil)
+	c.Check(err, check.ErrorMatches, `.*InvalidSubnetId\.NotFound.*`)
+	c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
+
+	_, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+	c.Check(err, check.NotNil)
+	c.Check(err, check.ErrorMatches, `.*InsufficientFreeAddressesInSubnet.*`)
+	c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 4)
+}
+
 func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
 	ap, _, _ := GetInstanceSet(c)
 	l, err := ap.Instances(nil)
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 723e64cea..d14ab4661 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -1531,10 +1531,23 @@ Clusters:
           SecretAccessKey: ""
 
           # (ec2) Instance configuration.
+
+          # (ec2) Region, like "us-east-1".
+          Region: ""
+
+          # (ec2) Security group IDs. Omit or use {} to use the
+          # default security group.
           SecurityGroupIDs:
             "SAMPLE": {}
+
+          # (ec2) One or more subnet IDs. Omit or leave empty to let
+          # AWS choose a default subnet from your default VPC. If
+          # multiple subnets are configured here (enclosed in brackets
+          # like [subnet-abc123, subnet-def456]) the cloud dispatcher
+          # will detect subnet-related errors and retry using a
+          # different subnet. Most sites specify one subnet.
           SubnetID: ""
-          Region: ""
+
           EBSVolumeType: gp2
           AdminUsername: debian
           # (ec2) name of the IAMInstanceProfile for instances started by

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list