[ARVADOS] created: 1.3.0-1100-g42966c194

Git user git at public.curoverse.com
Fri Jun 14 19:28:28 UTC 2019


        at  42966c194493f8e42e26e3d64880e5c93a9c3251 (commit)


commit 42966c194493f8e42e26e3d64880e5c93a9c3251
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Jun 14 15:27:30 2019 -0400

    15340: Add metrics for cloud ops/errors and instance disappearances.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index bc699d928..12c60ecb1 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -132,12 +132,12 @@ func (disp *dispatcher) initialize() {
 		disp.sshKey = key
 	}
 
-	instanceSet, err := newInstanceSet(disp.Cluster, disp.InstanceSetID, disp.logger)
+	disp.reg = prometheus.NewRegistry()
+	instanceSet, err := newInstanceSet(disp.Cluster, disp.InstanceSetID, disp.logger, disp.reg)
 	if err != nil {
 		disp.logger.Fatalf("error initializing driver: %s", err)
 	}
 	disp.instanceSet = instanceSet
-	disp.reg = prometheus.NewRegistry()
 	disp.pool = worker.NewPool(disp.logger, disp.ArvClient, disp.reg, disp.InstanceSetID, disp.instanceSet, disp.newExecutor, disp.sshKey.PublicKey(), disp.Cluster)
 	disp.queue = container.NewQueue(disp.logger, disp.reg, disp.typeChooser, disp.ArvClient)
 
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 012621f12..6b73e71cc 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -49,6 +49,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 	}
 
 	s.cluster = &arvados.Cluster{
+		ManagementToken: "test-management-token",
 		Containers: arvados.ContainersConfig{
 			DispatchPrivateKey: string(dispatchprivraw),
 			StaleLockTimeout:   arvados.Duration(5 * time.Millisecond),
@@ -193,6 +194,18 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			c.Fatalf("timed out with %d containers (%v), %d instances (%+v)", len(ents), ents, len(insts), insts)
 		}
 	}
+
+	req := httptest.NewRequest("GET", "/metrics", nil)
+	req.Header.Set("Authorization", "Bearer "+s.cluster.ManagementToken)
+	resp := httptest.NewRecorder()
+	s.disp.ServeHTTP(resp, req)
+	c.Check(resp.Code, check.Equals, http.StatusOK)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="0",operation="Create"} [^0].*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="0",operation="List"} [^0].*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="Create"} [^0].*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="List"} 0\n.*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="shutdown"} [^0].*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="unknown"} 0\n.*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/driver.go b/lib/dispatchcloud/driver.go
index b67b5d054..a8f3d5b5e 100644
--- a/lib/dispatchcloud/driver.go
+++ b/lib/dispatchcloud/driver.go
@@ -12,6 +12,7 @@ import (
 	"git.curoverse.com/arvados.git/lib/cloud/azure"
 	"git.curoverse.com/arvados.git/lib/cloud/ec2"
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/crypto/ssh"
 )
@@ -21,13 +22,14 @@ var drivers = map[string]cloud.Driver{
 	"ec2":   ec2.Driver,
 }
 
-func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID, logger logrus.FieldLogger) (cloud.InstanceSet, error) {
+func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID, logger logrus.FieldLogger, reg *prometheus.Registry) (cloud.InstanceSet, error) {
 	driver, ok := drivers[cluster.Containers.CloudVMs.Driver]
 	if !ok {
 		return nil, fmt.Errorf("unsupported cloud driver %q", cluster.Containers.CloudVMs.Driver)
 	}
 	sharedResourceTags := cloud.SharedResourceTags(cluster.Containers.CloudVMs.ResourceTags)
 	is, err := driver.InstanceSet(cluster.Containers.CloudVMs.DriverParameters, setID, sharedResourceTags, logger)
+	is = newInstrumentedInstanceSet(is, reg)
 	if maxops := cluster.Containers.CloudVMs.MaxCloudOpsPerSecond; maxops > 0 {
 		is = rateLimitedInstanceSet{
 			InstanceSet: is,
@@ -113,3 +115,102 @@ nextInstance:
 	}).WithError(err).Debugf("filteringInstanceSet returning instances")
 	return returning, err
 }
+
+// newInstrumentedInstanceSet returns an InstanceSet that wraps is and
+// counts each Create, List (Instances), Destroy, and SetTags call in
+// a "driver_operations" counter vector labeled by operation and by
+// error ("1" if the call returned an error, "0" otherwise). The
+// counter vector is registered with reg.
+func newInstrumentedInstanceSet(is cloud.InstanceSet, reg *prometheus.Registry) cloud.InstanceSet {
+	cv := prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "driver_operations",
+		Help:      "Number of instance-create/destroy/list operations performed via cloud driver.",
+	}, []string{"operation", "error"})
+
+	// Create all counters, so they are reported with zero values
+	// (instead of being missing) until they are incremented.
+	for _, op := range []string{"Create", "List", "Destroy", "SetTags"} {
+		// Named errLabel rather than "error" to avoid shadowing
+		// the predeclared error type.
+		for _, errLabel := range []string{"0", "1"} {
+			cv.WithLabelValues(op, errLabel).Add(0)
+		}
+	}
+
+	reg.MustRegister(cv)
+	return instrumentedInstanceSet{is, cv}
+}
+
+// instrumentedInstanceSet is a cloud.InstanceSet wrapper that records
+// Create and Instances calls in cv. Other InstanceSet methods are
+// passed through to the embedded InstanceSet.
+type instrumentedInstanceSet struct {
+	cloud.InstanceSet
+	cv *prometheus.CounterVec
+}
+
+// Create calls the wrapped InstanceSet's Create method, counts the
+// call, and returns the new instance wrapped so its Destroy and
+// SetTags calls are counted too.
+func (is instrumentedInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID, tags cloud.InstanceTags, init cloud.InitCommand, pk ssh.PublicKey) (cloud.Instance, error) {
+	inst, err := is.InstanceSet.Create(it, image, tags, init, pk)
+	is.cv.WithLabelValues("Create", boolLabelValue(err != nil)).Inc()
+	if inst == nil {
+		// Don't hide a nil instance inside a non-nil wrapper:
+		// callers would see inst != nil and then panic on the
+		// first method call.
+		return nil, err
+	}
+	return instrumentedInstance{inst, is.cv}, err
+}
+
+// Instances calls the wrapped InstanceSet's Instances method, counts
+// the call as a "List" operation, and returns the listed instances
+// wrapped so their Destroy and SetTags calls are counted too.
+func (is instrumentedInstanceSet) Instances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
+	instances, err := is.InstanceSet.Instances(tags)
+	is.cv.WithLabelValues("List", boolLabelValue(err != nil)).Inc()
+	if len(instances) == 0 {
+		return instances, err
+	}
+	wrapped := make([]cloud.Instance, len(instances))
+	for i, inst := range instances {
+		wrapped[i] = instrumentedInstance{inst, is.cv}
+	}
+	return wrapped, err
+}
+
+// instrumentedInstance is a cloud.Instance wrapper that records
+// Destroy and SetTags calls in cv. Other Instance methods are passed
+// through to the embedded Instance.
+type instrumentedInstance struct {
+	cloud.Instance
+	cv *prometheus.CounterVec
+}
+
+// Destroy calls the wrapped Instance's Destroy method and counts the
+// call.
+func (inst instrumentedInstance) Destroy() error {
+	err := inst.Instance.Destroy()
+	inst.cv.WithLabelValues("Destroy", boolLabelValue(err != nil)).Inc()
+	return err
+}
+
+// SetTags calls the wrapped Instance's SetTags method and counts the
+// call.
+func (inst instrumentedInstance) SetTags(tags cloud.InstanceTags) error {
+	err := inst.Instance.SetTags(tags)
+	inst.cv.WithLabelValues("SetTags", boolLabelValue(err != nil)).Inc()
+	return err
+}
+
+// boolLabelValue returns "1" if v is true, "0" otherwise, for use as
+// a metric label value.
+func boolLabelValue(v bool) string {
+	if v {
+		return "1"
+	}
+	return "0"
+}
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 0ee36a96f..8616d6e9a 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -169,6 +169,7 @@ type Pool struct {
 	mInstancesPrice    *prometheus.GaugeVec
 	mVCPUs             *prometheus.GaugeVec
 	mMemory            *prometheus.GaugeVec
+	mDisappearances    *prometheus.CounterVec
 }
 
 type createCall struct {
@@ -556,6 +557,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 		Help:      "Total memory on all cloud VMs.",
 	}, []string{"category"})
 	reg.MustRegister(wp.mMemory)
+	wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "instances_disappeared",
+		Help:      "Number of occurrences of an instance disappearing from the cloud provider's list of instances.",
+	}, []string{"state"})
+	for _, v := range stateString {
+		wp.mDisappearances.WithLabelValues(v).Add(0)
+	}
+	reg.MustRegister(wp.mDisappearances)
 }
 
 func (wp *Pool) runMetrics() {
@@ -778,6 +789,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 			"WorkerState": wkr.state,
 		})
 		logger.Info("instance disappeared in cloud")
+		wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
 		delete(wp.workers, id)
 		go wkr.Close()
 		notify = true

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list