[arvados] created: 2.7.0-5671-g442183cf2d
git repository hosting
git at public.arvados.org
Thu Jan 4 21:36:44 UTC 2024
at 442183cf2d80b50454eb27d09596b8cea5b4303d (commit)
commit 442183cf2d80b50454eb27d09596b8cea5b4303d
Author: Tom Clegg <tom at curii.com>
Date: Thu Jan 4 16:12:18 2024 -0500
21036: Report remaining balancerStats as prometheus metrics.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index f66194e2a2..7f6deb1e54 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -556,6 +556,10 @@ func (s *runSuite) TestDryRun(c *check.C) {
c.Check(bal.stats.trashesDeferred, check.Not(check.Equals), 0)
c.Check(bal.stats.underrep.replicas, check.Not(check.Equals), 0)
c.Check(bal.stats.overrep.replicas, check.Not(check.Equals), 0)
+
+ metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_trash_entries_deferred_count [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_pull_entries_deferred_count [1-9].*`)
}
func (s *runSuite) TestCommit(c *check.C) {
@@ -593,6 +597,19 @@ func (s *runSuite) TestCommit(c *check.C) {
c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
+
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="0"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="1"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="9"} 0\n.*`)
+
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_replicas{status="needed",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_blocks{status="needed",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="needed",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unneeded",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unachievable",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="pulling",storage_class="default"} [1-9].*`)
+
+ c.Logf("%s", metrics)
}
func (s *runSuite) TestChunkPrefix(c *check.C) {
diff --git a/services/keep-balance/metrics.go b/services/keep-balance/metrics.go
index 4683b67b98..02cee3955f 100644
--- a/services/keep-balance/metrics.go
+++ b/services/keep-balance/metrics.go
@@ -7,6 +7,7 @@ package keepbalance
import (
"fmt"
"net/http"
+ "strconv"
"sync"
"github.com/prometheus/client_golang/prometheus"
@@ -17,18 +18,20 @@ type observer interface{ Observe(float64) }
type setter interface{ Set(float64) }
type metrics struct {
- reg *prometheus.Registry
- statsGauges map[string]setter
- observers map[string]observer
- setupOnce sync.Once
- mtx sync.Mutex
+ reg *prometheus.Registry
+ statsGauges map[string]setter
+ statsGaugeVecs map[string]*prometheus.GaugeVec
+ observers map[string]observer
+ setupOnce sync.Once
+ mtx sync.Mutex
}
func newMetrics(registry *prometheus.Registry) *metrics {
return &metrics{
- reg: registry,
- statsGauges: map[string]setter{},
- observers: map[string]observer{},
+ reg: registry,
+ statsGauges: map[string]setter{},
+ statsGaugeVecs: map[string]*prometheus.GaugeVec{},
+ observers: map[string]observer{},
}
}
@@ -63,9 +66,24 @@ func (m *metrics) UpdateStats(s balancerStats) {
"transient": {s.unref, "transient (unreferenced, new)"},
"overreplicated": {s.overrep, "overreplicated"},
"underreplicated": {s.underrep, "underreplicated"},
+ "unachievable": {s.unachievable, "unachievable"},
+ "balanced": {s.justright, "optimally balanced"},
+ "desired": {s.desired, "desired"},
"lost": {s.lost, "lost"},
"dedup_byte_ratio": {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"},
"dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"},
+ "collection_bytes": {s.collectionBytes, "total apparent size of all collections"},
+ "referenced_bytes": {s.collectionBlockBytes, "total size of unique referenced blocks"},
+ "reference_count": {s.collectionBlockRefs, "block references in all collections"},
+ "referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"},
+
+ "pull_entries_sent_count": {s.pulls, "total entries sent in pull lists"},
+ "pull_entries_deferred_count": {s.pullsDeferred, "total entries deferred (not sent) in pull lists"},
+ "trash_entries_sent_count": {s.trashes, "total entries sent in trash lists"},
+ "trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"},
+
+ "replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"},
+ "usage": {s.classStats, "stored in indicated storage class"},
}
m.setupOnce.Do(func() {
// Register gauge(s) for each balancerStats field.
@@ -87,6 +105,29 @@ func (m *metrics) UpdateStats(s balancerStats) {
}
case int, int64, float64:
addGauge(name, gauge.Help)
+ case []int:
+ // replHistogram
+ gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Name: name,
+ Subsystem: "keep",
+ Help: gauge.Help,
+ }, []string{"replicas"})
+ m.reg.MustRegister(gv)
+ m.statsGaugeVecs[name] = gv
+ case map[string]replicationStats:
+ // classStats
+ for _, sub := range []string{"blocks", "bytes", "replicas"} {
+ name := name + "_" + sub
+ gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Name: name,
+ Subsystem: "keep",
+ Help: gauge.Help,
+ }, []string{"storage_class", "status"})
+ m.reg.MustRegister(gv)
+ m.statsGaugeVecs[name] = gv
+ }
default:
panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
}
@@ -105,6 +146,38 @@ func (m *metrics) UpdateStats(s balancerStats) {
m.statsGauges[name].Set(float64(val))
case float64:
m.statsGauges[name].Set(float64(val))
+ case []int:
+ // replHistogram
+ for r, n := range val {
+ m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n))
+ }
+ // Record zero for higher-than-max-replication
+ // metrics, so we don't incorrectly continue
+ // to report stale metrics.
+ //
+ // For example, if we previously reported n=1
+ // for repl=6, but have since restarted
+ // keep-balance and the most replicated block
+ // now has repl=5, then the repl=6 gauge will
+ // still say n=1 until we clear it explicitly
+ // here.
+ for r := len(val); r < len(val)+4 || r < len(val)*2; r++ {
+ m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0)
+ }
+ case map[string]replicationStats:
+ // classStats
+ for class, cs := range val {
+ for label, val := range map[string]blocksNBytes{
+ "needed": cs.needed,
+ "unneeded": cs.unneeded,
+ "pulling": cs.pulling,
+ "unachievable": cs.unachievable,
+ } {
+ m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks))
+ m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes))
+ m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas))
+ }
+ }
default:
panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list