[arvados] created: 2.7.0-5671-g442183cf2d

git repository hosting git at public.arvados.org
Thu Jan 4 21:36:44 UTC 2024


        at  442183cf2d80b50454eb27d09596b8cea5b4303d (commit)


commit 442183cf2d80b50454eb27d09596b8cea5b4303d
Author: Tom Clegg <tom at curii.com>
Date:   Thu Jan 4 16:12:18 2024 -0500

    21036: Report remaining balancerStats as prometheus metrics.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index f66194e2a2..7f6deb1e54 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -556,6 +556,10 @@ func (s *runSuite) TestDryRun(c *check.C) {
 	c.Check(bal.stats.trashesDeferred, check.Not(check.Equals), 0)
 	c.Check(bal.stats.underrep.replicas, check.Not(check.Equals), 0)
 	c.Check(bal.stats.overrep.replicas, check.Not(check.Equals), 0)
+
+	metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_trash_entries_deferred_count [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_pull_entries_deferred_count [1-9].*`)
 }
 
 func (s *runSuite) TestCommit(c *check.C) {
@@ -593,6 +597,19 @@ func (s *runSuite) TestCommit(c *check.C) {
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
+
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="0"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="1"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="9"} 0\n.*`)
+
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_replicas{status="needed",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_blocks{status="needed",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="needed",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unneeded",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unachievable",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="pulling",storage_class="default"} [1-9].*`)
+
+	c.Logf("%s", metrics)
 }
 
 func (s *runSuite) TestChunkPrefix(c *check.C) {
diff --git a/services/keep-balance/metrics.go b/services/keep-balance/metrics.go
index 4683b67b98..02cee3955f 100644
--- a/services/keep-balance/metrics.go
+++ b/services/keep-balance/metrics.go
@@ -7,6 +7,7 @@ package keepbalance
 import (
 	"fmt"
 	"net/http"
+	"strconv"
 	"sync"
 
 	"github.com/prometheus/client_golang/prometheus"
@@ -17,18 +18,20 @@ type observer interface{ Observe(float64) }
 type setter interface{ Set(float64) }
 
 type metrics struct {
-	reg         *prometheus.Registry
-	statsGauges map[string]setter
-	observers   map[string]observer
-	setupOnce   sync.Once
-	mtx         sync.Mutex
+	reg            *prometheus.Registry
+	statsGauges    map[string]setter
+	statsGaugeVecs map[string]*prometheus.GaugeVec
+	observers      map[string]observer
+	setupOnce      sync.Once
+	mtx            sync.Mutex
 }
 
 func newMetrics(registry *prometheus.Registry) *metrics {
 	return &metrics{
-		reg:         registry,
-		statsGauges: map[string]setter{},
-		observers:   map[string]observer{},
+		reg:            registry,
+		statsGauges:    map[string]setter{},
+		statsGaugeVecs: map[string]*prometheus.GaugeVec{},
+		observers:      map[string]observer{},
 	}
 }
 
@@ -63,9 +66,24 @@ func (m *metrics) UpdateStats(s balancerStats) {
 		"transient":         {s.unref, "transient (unreferenced, new)"},
 		"overreplicated":    {s.overrep, "overreplicated"},
 		"underreplicated":   {s.underrep, "underreplicated"},
+		"unachievable":      {s.unachievable, "unachievable"},
+		"balanced":          {s.justright, "optimally balanced"},
+		"desired":           {s.desired, "desired"},
 		"lost":              {s.lost, "lost"},
 		"dedup_byte_ratio":  {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"},
 		"dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"},
+		"collection_bytes":  {s.collectionBytes, "total apparent size of all collections"},
+		"referenced_bytes":  {s.collectionBlockBytes, "total size of unique referenced blocks"},
+		"reference_count":   {s.collectionBlockRefs, "block references in all collections"},
+		"referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"},
+
+		"pull_entries_sent_count":      {s.pulls, "total entries sent in pull lists"},
+		"pull_entries_deferred_count":  {s.pullsDeferred, "total entries deferred (not sent) in pull lists"},
+		"trash_entries_sent_count":     {s.trashes, "total entries sent in trash lists"},
+		"trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"},
+
+		"replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"},
+		"usage":                  {s.classStats, "stored in indicated storage class"},
 	}
 	m.setupOnce.Do(func() {
 		// Register gauge(s) for each balancerStats field.
@@ -87,6 +105,29 @@ func (m *metrics) UpdateStats(s balancerStats) {
 				}
 			case int, int64, float64:
 				addGauge(name, gauge.Help)
+			case []int:
+				// replHistogram
+				gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+					Namespace: "arvados",
+					Name:      name,
+					Subsystem: "keep",
+					Help:      gauge.Help,
+				}, []string{"replicas"})
+				m.reg.MustRegister(gv)
+				m.statsGaugeVecs[name] = gv
+			case map[string]replicationStats:
+				// classStats
+				for _, sub := range []string{"blocks", "bytes", "replicas"} {
+					name := name + "_" + sub
+					gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+						Namespace: "arvados",
+						Name:      name,
+						Subsystem: "keep",
+						Help:      gauge.Help,
+					}, []string{"storage_class", "status"})
+					m.reg.MustRegister(gv)
+					m.statsGaugeVecs[name] = gv
+				}
 			default:
 				panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
 			}
@@ -105,6 +146,38 @@ func (m *metrics) UpdateStats(s balancerStats) {
 			m.statsGauges[name].Set(float64(val))
 		case float64:
 			m.statsGauges[name].Set(float64(val))
+		case []int:
+			// replHistogram
+			for r, n := range val {
+				m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n))
+			}
+			// Record zero for higher-than-max-replication
+			// metrics, so we don't incorrectly continue
+			// to report stale metrics.
+			//
+			// For example, if we previously reported n=1
+			// for repl=6, but have since restarted
+			// keep-balance and the most replicated block
+			// now has repl=5, then the repl=6 gauge will
+			// still say n=1 until we clear it explicitly
+			// here.
+			for r := len(val); r < len(val)+4 || r < len(val)*2; r++ {
+				m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0)
+			}
+		case map[string]replicationStats:
+			// classStats
+			for class, cs := range val {
+				for label, val := range map[string]blocksNBytes{
+					"needed":       cs.needed,
+					"unneeded":     cs.unneeded,
+					"pulling":      cs.pulling,
+					"unachievable": cs.unachievable,
+				} {
+					m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks))
+					m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes))
+					m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas))
+				}
+			}
 		default:
 			panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
 		}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list