[ARVADOS] created: 1.3.0-1100-g42966c194
Git user
git at public.curoverse.com
Fri Jun 14 19:28:28 UTC 2019
at 42966c194493f8e42e26e3d64880e5c93a9c3251 (commit)
commit 42966c194493f8e42e26e3d64880e5c93a9c3251
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Fri Jun 14 15:27:30 2019 -0400
15340: Add metrics for cloud ops/errors and instance disappearances.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index bc699d928..12c60ecb1 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -132,12 +132,12 @@ func (disp *dispatcher) initialize() {
disp.sshKey = key
}
- instanceSet, err := newInstanceSet(disp.Cluster, disp.InstanceSetID, disp.logger)
+ disp.reg = prometheus.NewRegistry()
+ instanceSet, err := newInstanceSet(disp.Cluster, disp.InstanceSetID, disp.logger, disp.reg)
if err != nil {
disp.logger.Fatalf("error initializing driver: %s", err)
}
disp.instanceSet = instanceSet
- disp.reg = prometheus.NewRegistry()
disp.pool = worker.NewPool(disp.logger, disp.ArvClient, disp.reg, disp.InstanceSetID, disp.instanceSet, disp.newExecutor, disp.sshKey.PublicKey(), disp.Cluster)
disp.queue = container.NewQueue(disp.logger, disp.reg, disp.typeChooser, disp.ArvClient)
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 012621f12..6b73e71cc 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -49,6 +49,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
}
s.cluster = &arvados.Cluster{
+ ManagementToken: "test-management-token",
Containers: arvados.ContainersConfig{
DispatchPrivateKey: string(dispatchprivraw),
StaleLockTimeout: arvados.Duration(5 * time.Millisecond),
@@ -193,6 +194,18 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
c.Fatalf("timed out with %d containers (%v), %d instances (%+v)", len(ents), ents, len(insts), insts)
}
}
+
+ req := httptest.NewRequest("GET", "/metrics", nil)
+ req.Header.Set("Authorization", "Bearer "+s.cluster.ManagementToken)
+ resp := httptest.NewRecorder()
+ s.disp.ServeHTTP(resp, req)
+ c.Check(resp.Code, check.Equals, http.StatusOK)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="0",operation="Create"} [^0].*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="0",operation="List"} [^0].*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="Create"} [^0].*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="List"} 0\n.*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="shutdown"} [^0].*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="unknown"} 0\n.*`)
}
func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/driver.go b/lib/dispatchcloud/driver.go
index b67b5d054..a8f3d5b5e 100644
--- a/lib/dispatchcloud/driver.go
+++ b/lib/dispatchcloud/driver.go
@@ -12,6 +12,7 @@ import (
"git.curoverse.com/arvados.git/lib/cloud/azure"
"git.curoverse.com/arvados.git/lib/cloud/ec2"
"git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
"golang.org/x/crypto/ssh"
)
@@ -21,13 +22,14 @@ var drivers = map[string]cloud.Driver{
"ec2": ec2.Driver,
}
-func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID, logger logrus.FieldLogger) (cloud.InstanceSet, error) {
+func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID, logger logrus.FieldLogger, reg *prometheus.Registry) (cloud.InstanceSet, error) {
driver, ok := drivers[cluster.Containers.CloudVMs.Driver]
if !ok {
return nil, fmt.Errorf("unsupported cloud driver %q", cluster.Containers.CloudVMs.Driver)
}
sharedResourceTags := cloud.SharedResourceTags(cluster.Containers.CloudVMs.ResourceTags)
is, err := driver.InstanceSet(cluster.Containers.CloudVMs.DriverParameters, setID, sharedResourceTags, logger)
+ is = newInstrumentedInstanceSet(is, reg)
if maxops := cluster.Containers.CloudVMs.MaxCloudOpsPerSecond; maxops > 0 {
is = rateLimitedInstanceSet{
InstanceSet: is,
@@ -113,3 +115,65 @@ nextInstance:
}).WithError(err).Debugf("filteringInstanceSet returning instances")
return returning, err
}
+
+// newInstrumentedInstanceSet returns a cloud.InstanceSet that wraps
+// is and counts driver operations in a "driver_operations" counter
+// vector (namespace "arvados", subsystem "dispatchcloud") labeled by
+// operation and error. The counter vector is created here and
+// registered with reg via MustRegister.
+func newInstrumentedInstanceSet(is cloud.InstanceSet, reg *prometheus.Registry) cloud.InstanceSet {
+	opts := prometheus.CounterOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "driver_operations",
+		Help:      "Number of instance-create/destroy/list operations performed via cloud driver.",
+	}
+	counters := prometheus.NewCounterVec(opts, []string{"operation", "error"})
+
+	// Touch every operation/outcome combination once with a zero
+	// increment, so each series is exported with value 0 from the
+	// start rather than being absent until its first real event.
+	operations := []string{"Create", "List", "Destroy", "SetTags"}
+	outcomes := []string{"0", "1"}
+	for _, op := range operations {
+		for _, outcome := range outcomes {
+			counters.WithLabelValues(op, outcome).Add(0)
+		}
+	}
+
+	reg.MustRegister(counters)
+	return instrumentedInstanceSet{InstanceSet: is, cv: counters}
+}
+
+// instrumentedInstanceSet embeds a cloud.InstanceSet and carries the
+// counter vector used to count driver operations. Its Create and
+// Instances methods override the embedded ones in order to increment
+// that counter; any other InstanceSet methods are promoted from the
+// embedded value as-is.
+type instrumentedInstanceSet struct {
+ // The wrapped instance set that actually performs the operations.
+ cloud.InstanceSet
+ // Counter vector incremented with (operation, error) label values.
+ cv *prometheus.CounterVec
+}
+
+// Create calls the wrapped InstanceSet's Create with the same
+// arguments and increments the "Create" operation counter, labeled
+// error="1" if the call returned an error and error="0" otherwise.
+//
+// A returned instance is wrapped in an instrumentedInstance so that
+// its Destroy and SetTags calls are counted too. If the driver
+// returned no instance, nil is returned rather than a wrapper around
+// a nil Instance: such a wrapper would compare non-nil in callers'
+// nil checks and panic on the first promoted method call.
+func (is instrumentedInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID, tags cloud.InstanceTags, init cloud.InitCommand, pk ssh.PublicKey) (cloud.Instance, error) {
+	inst, err := is.InstanceSet.Create(it, image, tags, init, pk)
+	is.cv.WithLabelValues("Create", boolLabelValue(err != nil)).Inc()
+	if inst == nil {
+		return nil, err
+	}
+	return instrumentedInstance{Instance: inst, cv: is.cv}, err
+}
+
+// Instances calls the wrapped InstanceSet's Instances with the same
+// tags and increments the "List" operation counter, labeled
+// error="1" if the call returned an error and error="0" otherwise.
+//
+// Every listed instance is wrapped in an instrumentedInstance, the
+// same way Create wraps the instance it returns. Without this,
+// Destroy and SetTags calls made on instances obtained by listing
+// would bypass the counters entirely. The driver's own slice is left
+// untouched; an empty or nil result is passed through unchanged.
+func (is instrumentedInstanceSet) Instances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
+	instances, err := is.InstanceSet.Instances(tags)
+	is.cv.WithLabelValues("List", boolLabelValue(err != nil)).Inc()
+	if len(instances) == 0 {
+		return instances, err
+	}
+	wrapped := make([]cloud.Instance, 0, len(instances))
+	for _, inst := range instances {
+		wrapped = append(wrapped, instrumentedInstance{Instance: inst, cv: is.cv})
+	}
+	return wrapped, err
+}
+
+// instrumentedInstance embeds a cloud.Instance and carries the
+// counter vector shared with the instrumentedInstanceSet that
+// created it. Its Destroy and SetTags methods override the embedded
+// ones in order to increment that counter; any other Instance
+// methods are promoted from the embedded value as-is.
+type instrumentedInstance struct {
+ // The wrapped instance that actually performs the operations.
+ cloud.Instance
+ // Counter vector incremented with (operation, error) label values.
+ cv *prometheus.CounterVec
+}
+
+// Destroy forwards to the wrapped Instance's Destroy, records the
+// outcome under the "Destroy" operation label (error="1" on failure,
+// error="0" on success), and hands the driver's result back to the
+// caller unchanged.
+func (inst instrumentedInstance) Destroy() error {
+	result := inst.Instance.Destroy()
+	errLabel := boolLabelValue(result != nil)
+	inst.cv.WithLabelValues("Destroy", errLabel).Inc()
+	return result
+}
+
+// SetTags forwards tags to the wrapped Instance's SetTags, records
+// the outcome under the "SetTags" operation label (error="1" on
+// failure, error="0" on success), and hands the driver's result back
+// to the caller unchanged.
+func (inst instrumentedInstance) SetTags(tags cloud.InstanceTags) error {
+	result := inst.Instance.SetTags(tags)
+	errLabel := boolLabelValue(result != nil)
+	inst.cv.WithLabelValues("SetTags", errLabel).Inc()
+	return result
+}
+
+// boolLabelValue converts a boolean to the string used as a metric
+// label value: "1" for true, "0" for false.
+func boolLabelValue(v bool) string {
+	if v {
+		return "1"
+	}
+	return "0"
+}
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 0ee36a96f..8616d6e9a 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -169,6 +169,7 @@ type Pool struct {
mInstancesPrice *prometheus.GaugeVec
mVCPUs *prometheus.GaugeVec
mMemory *prometheus.GaugeVec
+ mDisappearances *prometheus.CounterVec
}
type createCall struct {
@@ -556,6 +557,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
Help: "Total memory on all cloud VMs.",
}, []string{"category"})
reg.MustRegister(wp.mMemory)
+ wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "instances_disappeared",
+ Help: "Number of occurrences of an instance disappearing from the cloud provider's list of instances.",
+ }, []string{"state"})
+ for _, v := range stateString {
+ wp.mDisappearances.WithLabelValues(v).Add(0)
+ }
+ reg.MustRegister(wp.mDisappearances)
}
func (wp *Pool) runMetrics() {
@@ -778,6 +789,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
"WorkerState": wkr.state,
})
logger.Info("instance disappeared in cloud")
+ wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
delete(wp.workers, id)
go wkr.Close()
notify = true
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list