[ARVADOS] created: 1.3.0-2833-g6036c55e1
Git user
git at public.arvados.org
Fri Jul 31 21:37:30 UTC 2020
at 6036c55e1239281746152e85dfabbc9ed3cb6864 (commit)
commit 6036c55e1239281746152e85dfabbc9ed3cb6864
Author: Ward Vandewege <ward at curii.com>
Date: Fri Jul 31 17:37:12 2020 -0400
16636: add boot outcome metrics.
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 79af5a0cb..bcf35e285 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -176,6 +176,7 @@ type Pool struct {
mInstancesPrice *prometheus.GaugeVec
mVCPUs *prometheus.GaugeVec
mMemory *prometheus.GaugeVec
+ mBootOutcomes *prometheus.CounterVec
mDisappearances *prometheus.CounterVec
}
@@ -593,6 +594,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
Help: "Total memory on all cloud VMs.",
}, []string{"category"})
reg.MustRegister(wp.mMemory)
+ wp.mBootOutcomes = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "boot_outcomes",
+ Help: "Boot outcomes by type.",
+ }, []string{"state"})
+ for k := range validBootOutcomes {
+ wp.mBootOutcomes.WithLabelValues(string(k)).Add(0)
+ }
+ reg.MustRegister(wp.mBootOutcomes)
wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "arvados",
Subsystem: "dispatchcloud",
@@ -867,6 +878,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
"WorkerState": wkr.state,
})
logger.Info("instance disappeared in cloud")
+ wkr.reportBootOutcome(BootOutcomeDisappeared)
if wp.mDisappearances != nil {
wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 357ac20a0..6878bb065 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -43,6 +43,33 @@ var stateString = map[State]string{
StateShutdown: "shutdown",
}
+// BootOutcome is the result of a worker boot. Its string value is used as the "state" label value of the boot_outcomes counter; reportBootOutcome counts at most one outcome per worker.
+type BootOutcome string
+
+const (
+ BootOutcomeFailed BootOutcome = "failure" // reported by probeAndUpdate when a probe reports a broken instance, or on its not-booted path
+ BootOutcomeSucceeded BootOutcome = "success" // reported by probeAndUpdate when booted and the initial state was unknown or booting
+ BootOutcomeIdleShutdown BootOutcome = "idle shutdown" // reported by shutdownIfIdle right before it calls shutdown
+ BootOutcomeDisappeared BootOutcome = "disappeared" // reported by Pool.sync when the instance disappeared in cloud
+)
+
+var validBootOutcomes = map[BootOutcome]bool{ // every known outcome; registerMetrics ranges over the keys and adds 0 to each label's counter
+ BootOutcomeFailed: true,
+ BootOutcomeSucceeded: true,
+ BootOutcomeIdleShutdown: true,
+ BootOutcomeDisappeared: true,
+}
+
+// reportBootOutcome counts the given outcome for this worker in the
+// pool's boot outcome metric. Only the first call per worker has any
+// effect: later calls return immediately. The worker is marked as
+// reported even when the pool has no metric (mBootOutcomes is nil).
+//
+// NOTE(review): bootOutcomeReported is read and written without any
+// locking here; presumably callers hold wkr.mtx -- confirm.
+func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
+	if wkr.bootOutcomeReported {
+		// An outcome was already recorded for this worker.
+		return
+	}
+	if counter := wkr.wp.mBootOutcomes; counter != nil {
+		counter.WithLabelValues(string(outcome)).Inc()
+	}
+	wkr.bootOutcomeReported = true
+}
+
// String implements fmt.Stringer.
func (s State) String() string {
return stateString[s]
@@ -74,22 +101,23 @@ type worker struct {
executor Executor
wp *Pool
- mtx sync.Locker // must be wp's Locker.
- state State
- idleBehavior IdleBehavior
- instance cloud.Instance
- instType arvados.InstanceType
- vcpus int64
- memory int64
- appeared time.Time
- probed time.Time
- updated time.Time
- busy time.Time
- destroyed time.Time
- lastUUID string
- running map[string]*remoteRunner // remember to update state idle<->running when this changes
- starting map[string]*remoteRunner // remember to update state idle<->running when this changes
- probing chan struct{}
+ mtx sync.Locker // must be wp's Locker.
+ state State
+ idleBehavior IdleBehavior
+ instance cloud.Instance
+ instType arvados.InstanceType
+ vcpus int64
+ memory int64
+ appeared time.Time
+ probed time.Time
+ updated time.Time
+ busy time.Time
+ destroyed time.Time
+ lastUUID string
+ running map[string]*remoteRunner // remember to update state idle<->running when this changes
+ starting map[string]*remoteRunner // remember to update state idle<->running when this changes
+ probing chan struct{}
+ bootOutcomeReported bool
}
func (wkr *worker) onUnkillable(uuid string) {
@@ -224,6 +252,7 @@ func (wkr *worker) probeAndUpdate() {
defer wkr.mtx.Unlock()
if reportedBroken && wkr.idleBehavior == IdleBehaviorRun {
logger.Info("probe reported broken instance")
+ wkr.reportBootOutcome(BootOutcomeFailed)
wkr.setIdleBehavior(IdleBehaviorDrain)
}
if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
@@ -247,6 +276,7 @@ func (wkr *worker) probeAndUpdate() {
// some evidence about why the node never
// booted, even in non-debug mode.
if !booted {
+ wkr.reportBootOutcome(BootOutcomeFailed)
logger.WithFields(logrus.Fields{
"Duration": dur,
"stderr": string(stderr),
@@ -311,6 +341,7 @@ func (wkr *worker) probeAndUpdate() {
}
wkr.updated = updateTime
if booted && (initialState == StateUnknown || initialState == StateBooting) {
+ wkr.reportBootOutcome(BootOutcomeSucceeded)
logger.WithFields(logrus.Fields{
"RunningContainers": len(wkr.running),
"State": wkr.state,
@@ -468,6 +499,7 @@ func (wkr *worker) shutdownIfIdle() bool {
"IdleDuration": stats.Duration(time.Since(wkr.busy)),
"IdleBehavior": wkr.idleBehavior,
}).Info("shutdown worker")
+ wkr.reportBootOutcome(BootOutcomeIdleShutdown)
wkr.shutdown()
return true
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list