[ARVADOS] created: 1.3.0-2833-g6036c55e1

Git user <git@public.arvados.org>
Fri Jul 31 21:37:30 UTC 2020


        at  6036c55e1239281746152e85dfabbc9ed3cb6864 (commit)


commit 6036c55e1239281746152e85dfabbc9ed3cb6864
Author: Ward Vandewege <ward@curii.com>
Date:   Fri Jul 31 17:37:12 2020 -0400

    16636: add boot outcome metrics.
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>
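
For reference, once this change is deployed the new counter should appear in the
dispatcher's Prometheus /metrics output along the lines below. The metric and
label names follow from the CounterOpts added in pool.go; the sample values are
purely illustrative, not taken from a real run:

    # HELP arvados_dispatchcloud_boot_outcomes Boot outcomes by type.
    # TYPE arvados_dispatchcloud_boot_outcomes counter
    arvados_dispatchcloud_boot_outcomes{state="disappeared"} 0
    arvados_dispatchcloud_boot_outcomes{state="failure"} 1
    arvados_dispatchcloud_boot_outcomes{state="idle shutdown"} 2
    arvados_dispatchcloud_boot_outcomes{state="success"} 12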

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 79af5a0cb..bcf35e285 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -176,6 +176,7 @@ type Pool struct {
 	mInstancesPrice    *prometheus.GaugeVec
 	mVCPUs             *prometheus.GaugeVec
 	mMemory            *prometheus.GaugeVec
+	mBootOutcomes      *prometheus.CounterVec
 	mDisappearances    *prometheus.CounterVec
 }
 
@@ -593,6 +594,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 		Help:      "Total memory on all cloud VMs.",
 	}, []string{"category"})
 	reg.MustRegister(wp.mMemory)
+	wp.mBootOutcomes = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "boot_outcomes",
+		Help:      "Boot outcomes by type.",
+	}, []string{"state"})
+	for k := range validBootOutcomes {
+		wp.mBootOutcomes.WithLabelValues(string(k)).Add(0)
+	}
+	reg.MustRegister(wp.mBootOutcomes)
 	wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: "arvados",
 		Subsystem: "dispatchcloud",
@@ -867,6 +878,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 			"WorkerState": wkr.state,
 		})
 		logger.Info("instance disappeared in cloud")
+		wkr.reportBootOutcome(BootOutcomeDisappeared)
 		if wp.mDisappearances != nil {
 			wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
 		}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 357ac20a0..6878bb065 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -43,6 +43,33 @@ var stateString = map[State]string{
 	StateShutdown: "shutdown",
 }
 
+// BootOutcome is the result of a worker boot. It is used as a label in a metric.
+type BootOutcome string
+
+const (
+	BootOutcomeFailed       BootOutcome = "failure"
+	BootOutcomeSucceeded    BootOutcome = "success"
+	BootOutcomeIdleShutdown BootOutcome = "idle shutdown"
+	BootOutcomeDisappeared  BootOutcome = "disappeared"
+)
+
+var validBootOutcomes = map[BootOutcome]bool{
+	BootOutcomeFailed:       true,
+	BootOutcomeSucceeded:    true,
+	BootOutcomeIdleShutdown: true,
+	BootOutcomeDisappeared:  true,
+}
+
+func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
+	if wkr.bootOutcomeReported {
+		return
+	}
+	if wkr.wp.mBootOutcomes != nil {
+		wkr.wp.mBootOutcomes.WithLabelValues(string(outcome)).Inc()
+	}
+	wkr.bootOutcomeReported = true
+}
+
 // String implements fmt.Stringer.
 func (s State) String() string {
 	return stateString[s]
@@ -74,22 +101,23 @@ type worker struct {
 	executor Executor
 	wp       *Pool
 
-	mtx          sync.Locker // must be wp's Locker.
-	state        State
-	idleBehavior IdleBehavior
-	instance     cloud.Instance
-	instType     arvados.InstanceType
-	vcpus        int64
-	memory       int64
-	appeared     time.Time
-	probed       time.Time
-	updated      time.Time
-	busy         time.Time
-	destroyed    time.Time
-	lastUUID     string
-	running      map[string]*remoteRunner // remember to update state idle<->running when this changes
-	starting     map[string]*remoteRunner // remember to update state idle<->running when this changes
-	probing      chan struct{}
+	mtx                 sync.Locker // must be wp's Locker.
+	state               State
+	idleBehavior        IdleBehavior
+	instance            cloud.Instance
+	instType            arvados.InstanceType
+	vcpus               int64
+	memory              int64
+	appeared            time.Time
+	probed              time.Time
+	updated             time.Time
+	busy                time.Time
+	destroyed           time.Time
+	lastUUID            string
+	running             map[string]*remoteRunner // remember to update state idle<->running when this changes
+	starting            map[string]*remoteRunner // remember to update state idle<->running when this changes
+	probing             chan struct{}
+	bootOutcomeReported bool
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -224,6 +252,7 @@ func (wkr *worker) probeAndUpdate() {
 	defer wkr.mtx.Unlock()
 	if reportedBroken && wkr.idleBehavior == IdleBehaviorRun {
 		logger.Info("probe reported broken instance")
+		wkr.reportBootOutcome(BootOutcomeFailed)
 		wkr.setIdleBehavior(IdleBehaviorDrain)
 	}
 	if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
@@ -247,6 +276,7 @@ func (wkr *worker) probeAndUpdate() {
 			// some evidence about why the node never
 			// booted, even in non-debug mode.
 			if !booted {
+				wkr.reportBootOutcome(BootOutcomeFailed)
 				logger.WithFields(logrus.Fields{
 					"Duration": dur,
 					"stderr":   string(stderr),
@@ -311,6 +341,7 @@ func (wkr *worker) probeAndUpdate() {
 	}
 	wkr.updated = updateTime
 	if booted && (initialState == StateUnknown || initialState == StateBooting) {
+		wkr.reportBootOutcome(BootOutcomeSucceeded)
 		logger.WithFields(logrus.Fields{
 			"RunningContainers": len(wkr.running),
 			"State":             wkr.state,
@@ -468,6 +499,7 @@ func (wkr *worker) shutdownIfIdle() bool {
 		"IdleDuration": stats.Duration(time.Since(wkr.busy)),
 		"IdleBehavior": wkr.idleBehavior,
 	}).Info("shutdown worker")
+	wkr.reportBootOutcome(BootOutcomeIdleShutdown)
 	wkr.shutdown()
 	return true
 }
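
A minimal standalone sketch of the same pattern, useful for sanity-checking the
counter outside the dispatcher (this program is hypothetical and not part of the
commit; it only reuses the metric options and label values added in pool.go and
worker.go, together with prometheus/client_golang's testutil helper):

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/client_golang/prometheus/testutil"
    )

    func main() {
        // Same CounterOpts as the new mBootOutcomes metric in pool.go.
        bootOutcomes := prometheus.NewCounterVec(prometheus.CounterOpts{
            Namespace: "arvados",
            Subsystem: "dispatchcloud",
            Name:      "boot_outcomes",
            Help:      "Boot outcomes by type.",
        }, []string{"state"})

        // Pre-populate every valid outcome with 0, as registerMetrics() does,
        // so all time series exist before the first boot completes.
        for _, outcome := range []string{"failure", "success", "idle shutdown", "disappeared"} {
            bootOutcomes.WithLabelValues(outcome).Add(0)
        }

        // Report one successful boot, as reportBootOutcome() would.
        bootOutcomes.WithLabelValues("success").Inc()

        fmt.Println(testutil.ToFloat64(bootOutcomes.WithLabelValues("success"))) // 1
        fmt.Println(testutil.ToFloat64(bootOutcomes.WithLabelValues("failure"))) // 0
    }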

-----------------------------------------------------------------------


hooks/post-receive