[arvados] updated: 2.7.0-5805-g999ef5f71d
git repository hosting
git at public.arvados.org
Tue Jan 9 22:29:14 UTC 2024
Summary of changes:
doc/user/cwl/cwl-extensions.html.textile.liquid | 6 +-
docker/jobs/Dockerfile | 24 ++-----
docker/jobs/apt.arvados.org-dev.list | 2 +-
docker/jobs/apt.arvados.org-stable.list | 2 +-
docker/jobs/apt.arvados.org-testing.list | 2 +-
lib/dispatchcloud/dispatcher_test.go | 17 +++--
lib/dispatchcloud/scheduler/run_queue.go | 4 +-
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml | 4 +-
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml | 4 +-
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml | 4 +-
sdk/cwl/arvados_cwl/arvcontainer.py | 2 +
sdk/cwl/tests/arvados-tests.yml | 2 +-
sdk/dev-jobs.dockerfile | 2 +-
services/keep-balance/balance_run_test.go | 34 ++++++++++
services/keep-balance/metrics.go | 89 ++++++++++++++++++++++---
15 files changed, 150 insertions(+), 48 deletions(-)
via 999ef5f71d522283465b54b6468bae6badf28ca4 (commit)
via 3eeff2311c09fcc8b0ee74f22dfb543a02aece92 (commit)
via f7ecc6662e387292fb6eead32667b49d05a1d544 (commit)
via 6b90e8d82fba8bba70c0b17d2094f8aa7c1800a5 (commit)
via 6cc306e481edc8ede97a9a11bdb73cd06056b4fe (commit)
via 6781014ea568cf45a0589ebd970ca155973bbddc (commit)
via b20db672f17bc0ed6b0970982d421fb30d6ed77c (commit)
via 553e9826223a13a90c4e1f53a05561532786cc8c (commit)
via 9d03e8f43ac22534ddceee34bff5660ac0acd925 (commit)
via b58d0232799aaf3051487666f8c20b293527e3f4 (commit)
via acfde1e0010da59cbe48cb1990cf528796c22389 (commit)
from 798a896a1a5f411d0d9675a513e1ddeccc3c818d (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 999ef5f71d522283465b54b6468bae6badf28ca4
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Tue Jan 9 17:22:32 2024 -0500
21216: Make memoryRetryMultiplier optional and set a default value
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/doc/user/cwl/cwl-extensions.html.textile.liquid b/doc/user/cwl/cwl-extensions.html.textile.liquid
index e05072ddf6..3c8366721d 100644
--- a/doc/user/cwl/cwl-extensions.html.textile.liquid
+++ b/doc/user/cwl/cwl-extensions.html.textile.liquid
@@ -73,7 +73,7 @@ hints:
usePreemptible: true
arv:OutOfMemoryRetry:
- memoryRetryMultipler: 2
+ memoryRetryMultiplier: 2
memoryErrorRegex: "custom memory error"
{% endcodeblock %}
@@ -195,7 +195,7 @@ table(table table-bordered table-condensed).
h2(#OutOfMemoryRetry). arv:OutOfMemoryRetry
-Specify that when a workflow step appears to have failed because it did not request enough RAM, it should be re-submitted with more RAM. Out of memory conditions are detected either by the container being unexpectedly killed (exit code 137) or by matching a pattern in the container's output (see @memoryErrorRegex@). Retrying will increase the base RAM request by the value of @memoryRetryMultipler@. For example, if the original RAM request was 10 GiB and the multiplier is 1.5, then it will re-submit with 15 GiB.
+Specify that when a workflow step appears to have failed because it did not request enough RAM, it should be re-submitted with more RAM. Out of memory conditions are detected either by the container being unexpectedly killed (exit code 137) or by matching a pattern in the container's output (see @memoryErrorRegex@). Retrying will increase the base RAM request by the value of @memoryRetryMultiplier@. For example, if the original RAM request was 10 GiB and the multiplier is 1.5, then it will re-submit with 15 GiB.
Containers are only re-submitted once. If it fails a second time after increasing RAM, then the worklow step will still fail.
@@ -203,7 +203,7 @@ Also note that expressions that use @$(runtime.ram)@ (such as dynamic command li
table(table table-bordered table-condensed).
|_. Field |_. Type |_. Description |
-|memoryRetryMultipler|float|Required, the retry will multiply the base memory request by this factor to get the retry memory request.|
+|memoryRetryMultiplier|float|Optional, default value is 2. The retry will multiply the base memory request by this factor to get the retry memory request.|
|memoryErrorRegex|string|Optional, a custom regex that, if found in the stdout, stderr or crunch-run logging of a program, will trigger a retry with greater RAM. If not provided, the default pattern matches "out of memory" (with or without spaces), "memory error" (with or without spaces), "bad_alloc" and "container using over 90% of memory".|
h2. arv:dockerCollectionPDH
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
index 450864df30..aeb41db568 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
@@ -479,7 +479,7 @@ $graph:
should be retried with more RAM. By default, searches for the
substrings 'bad_alloc' and 'OutOfMemory'.
- name: memoryRetryMultiplier
- type: float
+ type: float?
doc: |
If the container failed on its first run, re-submit the
container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
index f33b94e69d..0e51d50080 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
@@ -422,7 +422,7 @@ $graph:
should be retried with more RAM. By default, searches for the
substrings 'bad_alloc' and 'OutOfMemory'.
- name: memoryRetryMultiplier
- type: float
+ type: float?
doc: |
If the container failed on its first run, re-submit the
container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
index 0c6035c56f..a753579c9a 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
@@ -425,7 +425,7 @@ $graph:
should be retried with more RAM. By default, searches for the
substrings 'bad_alloc' and 'OutOfMemory'.
- name: memoryRetryMultiplier
- type: float
+ type: float?
doc: |
If the container failed on its first run, re-submit the
container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 1f379ccec6..584ca1713a 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -375,6 +375,8 @@ class ArvadosContainer(JobBase):
ram_multiplier.append(oom_retry_req.get('memoryRetryMultiplier'))
elif oom_retry_req.get('memoryRetryMultipler'):
ram_multiplier.append(oom_retry_req.get('memoryRetryMultipler'))
+ else:
+ ram_multiplier.append(2)
if runtimeContext.runnerjob.startswith("arvwf:"):
wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]
commit 3eeff2311c09fcc8b0ee74f22dfb543a02aece92
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Tue Jan 9 17:12:55 2024 -0500
21216: Make the misspelled version optional
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
index b91564bdd1..450864df30 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
@@ -484,7 +484,7 @@ $graph:
If the container failed on its first run, re-submit the
container with the RAM request multiplied by this factor.
- name: memoryRetryMultipler
- type: float
+ type: float?
doc: |
Deprecated misspelling of "memoryRetryMultiplier". Kept only
for backwards compatability, don't use this.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
index ebec3f334e..f33b94e69d 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
@@ -427,7 +427,7 @@ $graph:
If the container failed on its first run, re-submit the
container with the RAM request multiplied by this factor.
- name: memoryRetryMultipler
- type: float
+ type: float?
doc: |
Deprecated misspelling of "memoryRetryMultiplier". Kept only
for backwards compatability, don't use this.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
index 7b70a00dab..0c6035c56f 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
@@ -430,7 +430,7 @@ $graph:
If the container failed on its first run, re-submit the
container with the RAM request multiplied by this factor.
- name: memoryRetryMultipler
- type: float
+ type: float?
doc: |
Deprecated misspelling of "memoryRetryMultiplier". Kept only
for backwards compatability, don't use this.
diff --git a/sdk/cwl/tests/arvados-tests.yml b/sdk/cwl/tests/arvados-tests.yml
index 7ca8ca0950..cb4a151f0e 100644
--- a/sdk/cwl/tests/arvados-tests.yml
+++ b/sdk/cwl/tests/arvados-tests.yml
@@ -487,7 +487,7 @@
- job: oom/fakeoom.yml
output: {}
- tool: oom/19975-oom-misspelled.cwl
+ tool: oom/19975-oom-mispelled.cwl
doc: "Test feature 19975 - retry on exit 137, old misspelled version"
- job: oom/fakeoom2.yml
commit f7ecc6662e387292fb6eead32667b49d05a1d544
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Tue Jan 9 16:51:53 2024 -0500
21367: Use regular bullseye instead of bullseye-slim
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/sdk/dev-jobs.dockerfile b/sdk/dev-jobs.dockerfile
index 95b039eba9..656572eb4f 100644
--- a/sdk/dev-jobs.dockerfile
+++ b/sdk/dev-jobs.dockerfile
@@ -13,7 +13,7 @@
# (This dockerfile file must be located in the arvados/sdk/ directory because
# of the docker build root.)
-FROM debian:buster-slim
+FROM debian:bullseye
MAINTAINER Arvados Package Maintainers <packaging at arvados.org>
ENV DEBIAN_FRONTEND noninteractive
commit 6b90e8d82fba8bba70c0b17d2094f8aa7c1800a5
Author: Brett Smith <brett.smith at curii.com>
Date: Tue Jan 9 15:38:33 2024 -0500
21367: Add arvados virtualenv to $PATH
This is both more user-friendly (now a-c-r itself is in $PATH) and a
cleaner implementation than the previous version.
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
diff --git a/docker/jobs/Dockerfile b/docker/jobs/Dockerfile
index 563cd41e92..371b9cc984 100644
--- a/docker/jobs/Dockerfile
+++ b/docker/jobs/Dockerfile
@@ -14,13 +14,10 @@ ADD 1078ECD7.key /etc/apt/trusted.gpg.d/arvados.asc
RUN apt-get update -q
RUN DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends python3-arvados-cwl-runner=$cwl_runner_version
-# use the Python executable from the python-arvados-cwl-runner package
-RUN PYTHON=`ls /usr/share/python3*/dist/python3-arvados-cwl-runner/bin/python|head -n1` && rm -f /usr/bin/python && ln -s $PYTHON /usr/bin/python
-RUN PYTHON3=`ls /usr/share/python3*/dist/python3-arvados-cwl-runner/bin/python3|head -n1` && rm -f /usr/bin/python3 && ln -s $PYTHON3 /usr/bin/python3
-
# Install dependencies and set up system.
RUN /usr/sbin/adduser --disabled-password \
--gecos 'Crunch execution user' crunch && \
/usr/bin/install --directory --owner=crunch --group=crunch --mode=0700 /keep /tmp/crunch-src /tmp/crunch-job
USER crunch
+ENV PATH=/usr/share/python3.9/dist/python3-arvados-cwl-runner/bin:/usr/local/bin:/usr/bin:/bin
commit 6cc306e481edc8ede97a9a11bdb73cd06056b4fe
Author: Brett Smith <brett.smith at curii.com>
Date: Tue Jan 9 15:34:29 2024 -0500
21367: Modernize apt key handling in arvados/jobs Dockerfile
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
diff --git a/docker/jobs/Dockerfile b/docker/jobs/Dockerfile
index 6404574117..563cd41e92 100644
--- a/docker/jobs/Dockerfile
+++ b/docker/jobs/Dockerfile
@@ -6,22 +6,13 @@
FROM debian:bullseye-slim
MAINTAINER Arvados Package Maintainers <packaging at arvados.org>
-ENV DEBIAN_FRONTEND noninteractive
-
-RUN apt-get update -q
-RUN apt-get install -yq --no-install-recommends gnupg
-
ARG repo_version
-ADD apt.arvados.org-$repo_version.list /etc/apt/sources.list.d/
-
-ADD 1078ECD7.key /tmp/
-RUN cat /tmp/1078ECD7.key | apt-key add -
-
-ARG python_sdk_version
ARG cwl_runner_version
+ADD apt.arvados.org-$repo_version.list /etc/apt/sources.list.d/
+ADD 1078ECD7.key /etc/apt/trusted.gpg.d/arvados.asc
RUN apt-get update -q
-RUN apt-get install -yq --no-install-recommends python3-arvados-cwl-runner=$cwl_runner_version
+RUN DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends python3-arvados-cwl-runner=$cwl_runner_version
# use the Python executable from the python-arvados-cwl-runner package
RUN PYTHON=`ls /usr/share/python3*/dist/python3-arvados-cwl-runner/bin/python|head -n1` && rm -f /usr/bin/python && ln -s $PYTHON /usr/bin/python
commit 6781014ea568cf45a0589ebd970ca155973bbddc
Author: Brett Smith <brett.smith at curii.com>
Date: Tue Jan 9 15:26:06 2024 -0500
21367: Remove debug prints
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
diff --git a/docker/jobs/Dockerfile b/docker/jobs/Dockerfile
index 0dfd9b444e..6404574117 100644
--- a/docker/jobs/Dockerfile
+++ b/docker/jobs/Dockerfile
@@ -12,7 +12,6 @@ RUN apt-get update -q
RUN apt-get install -yq --no-install-recommends gnupg
ARG repo_version
-RUN echo repo_version $repo_version
ADD apt.arvados.org-$repo_version.list /etc/apt/sources.list.d/
ADD 1078ECD7.key /tmp/
@@ -20,7 +19,6 @@ RUN cat /tmp/1078ECD7.key | apt-key add -
ARG python_sdk_version
ARG cwl_runner_version
-RUN echo cwl_runner_version $cwl_runner_version python_sdk_version $python_sdk_version
RUN apt-get update -q
RUN apt-get install -yq --no-install-recommends python3-arvados-cwl-runner=$cwl_runner_version
commit b20db672f17bc0ed6b0970982d421fb30d6ed77c
Author: Brett Smith <brett.smith at curii.com>
Date: Tue Jan 9 15:24:24 2024 -0500
21367: Update arvados/jobs image to bullseye
Required for Python 3.7 deprecation.
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
diff --git a/docker/jobs/Dockerfile b/docker/jobs/Dockerfile
index 1b75e13420..0dfd9b444e 100644
--- a/docker/jobs/Dockerfile
+++ b/docker/jobs/Dockerfile
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
# Based on Debian
-FROM debian:buster-slim
+FROM debian:bullseye-slim
MAINTAINER Arvados Package Maintainers <packaging at arvados.org>
ENV DEBIAN_FRONTEND noninteractive
diff --git a/docker/jobs/apt.arvados.org-dev.list b/docker/jobs/apt.arvados.org-dev.list
index 210f5d5511..155244ba9f 100644
--- a/docker/jobs/apt.arvados.org-dev.list
+++ b/docker/jobs/apt.arvados.org-dev.list
@@ -1,2 +1,2 @@
# apt.arvados.org
-deb http://apt.arvados.org/buster buster-dev main
+deb http://apt.arvados.org/bullseye bullseye-dev main
diff --git a/docker/jobs/apt.arvados.org-stable.list b/docker/jobs/apt.arvados.org-stable.list
index 153e729805..5a4b8c91c8 100644
--- a/docker/jobs/apt.arvados.org-stable.list
+++ b/docker/jobs/apt.arvados.org-stable.list
@@ -1,2 +1,2 @@
# apt.arvados.org
-deb http://apt.arvados.org/buster buster main
+deb http://apt.arvados.org/bullseye bullseye main
diff --git a/docker/jobs/apt.arvados.org-testing.list b/docker/jobs/apt.arvados.org-testing.list
index d5f4581685..302862ca64 100644
--- a/docker/jobs/apt.arvados.org-testing.list
+++ b/docker/jobs/apt.arvados.org-testing.list
@@ -1,2 +1,2 @@
# apt.arvados.org
-deb http://apt.arvados.org/buster buster-testing main
+deb http://apt.arvados.org/bullseye bullseye-testing main
commit 553e9826223a13a90c4e1f53a05561532786cc8c
Author: Tom Clegg <tom at curii.com>
Date: Thu Jan 4 17:51:33 2024 -0500
21258: Ensure at least one boot failure.
With the previous approach, it was possible for all containers needing
a type4 instance to finish, and a different instance type to report a
quota error and cause the scheduler to shut down the now-unneeded
instance, all before the "guaranteed broken" node reached
TimeoutBooting. In such a case it would not be counted as a boot
failure.
To avoid this, the new approach induces boot failures on *all* type4
instances until 2x TimeoutBooting intervals have passed.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 51c2c3d6a3..20185554b8 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -207,6 +207,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
finishContainer(ctr)
return int(rand.Uint32() & 0x3)
}
+ var type4BrokenUntil time.Time
var countCapacityErrors int64
vmCount := int32(0)
s.stubDriver.Queue = queue
@@ -224,6 +225,17 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
stubvm.CrashRunningContainer = finishContainer
stubvm.ExtraCrunchRunArgs = "'--runtime-engine=stub' '--foo' '--extra='\\''args'\\'''"
switch {
+ case stubvm.Instance().ProviderType() == test.InstanceType(4).ProviderType &&
+ (type4BrokenUntil.IsZero() || time.Now().Before(type4BrokenUntil)):
+ // Initially (at least 2*TimeoutBooting), all
+ // instances of this type are completely
+ // broken. This ensures the
+ // boot_outcomes{outcome="failure"} metric is
+ // not zero.
+ stubvm.Broken = time.Now()
+ if type4BrokenUntil.IsZero() {
+ type4BrokenUntil = time.Now().Add(2 * s.cluster.Containers.CloudVMs.TimeoutBooting.Duration())
+ }
case n%7 == 0:
// some instances start out OK but then stop
// running any commands
@@ -235,11 +247,6 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
// some instances start out OK but then start
// reporting themselves as broken
stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond)
- case n == 3:
- // 1 instance is completely broken, ensuring
- // the boot_outcomes{outcome="failure"} metric
- // is not zero
- stubvm.Broken = time.Now()
default:
stubvm.CrunchRunCrashRate = 0.1
stubvm.ArvMountDeadlockRate = 0.1
commit 9d03e8f43ac22534ddceee34bff5660ac0acd925
Author: Tom Clegg <tom at curii.com>
Date: Thu Jan 4 10:19:22 2024 -0500
21258: Fix log message.
Reverts an unintentional logging change in
1875af9bcf4a1afe435176e952e63341a9ae9c03 from
msg="creating new instance" InstanceType="type8" ...
to
msg="creating new instance" InstanceType="{type8 providertype8 8 8589934592 0 0 0 0.984 false { 0}}" ...
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 2f1f175890..03fa592777 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -239,7 +239,7 @@ tryrun:
// so mark it as allocated, and try to
// start the container.
unalloc[unallocType]--
- logger = logger.WithField("InstanceType", unallocType)
+ logger = logger.WithField("InstanceType", unallocType.Name)
if dontstart[unallocType] {
// We already tried & failed to start
// a higher-priority container on the
@@ -282,7 +282,7 @@ tryrun:
logger.Trace("all eligible types at capacity")
continue
}
- logger = logger.WithField("InstanceType", availableType)
+ logger = logger.WithField("InstanceType", availableType.Name)
if !sch.pool.Create(availableType) {
// Failed despite not being at quota,
// e.g., cloud ops throttled.
commit b58d0232799aaf3051487666f8c20b293527e3f4
Author: Tom Clegg <tom at curii.com>
Date: Mon Jan 8 11:29:20 2024 -0500
21036: Test all balancerStats metrics.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index 7f6deb1e54..81e4c7b867 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -598,17 +598,34 @@ func (s *runSuite) TestCommit(c *check.C) {
c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
+ for _, cat := range []string{
+ "dedup_byte_ratio", "dedup_block_ratio", "collection_bytes",
+ "referenced_bytes", "referenced_blocks", "reference_count",
+ "pull_entries_sent_count",
+ "trash_entries_sent_count",
+ } {
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_`+cat+` [1-9].*`)
+ }
+
+ for _, cat := range []string{
+ "pull_entries_deferred_count",
+ "trash_entries_deferred_count",
+ } {
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_`+cat+` 0\n.*`)
+ }
+
c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="0"} [1-9].*`)
c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="1"} [1-9].*`)
c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="9"} 0\n.*`)
- c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_replicas{status="needed",storage_class="default"} [1-9].*`)
- c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_blocks{status="needed",storage_class="default"} [1-9].*`)
- c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="needed",storage_class="default"} [1-9].*`)
- c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unneeded",storage_class="default"} [1-9].*`)
- c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unachievable",storage_class="default"} [1-9].*`)
- c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="pulling",storage_class="default"} [1-9].*`)
-
+ for _, sub := range []string{"replicas", "blocks", "bytes"} {
+ for _, cat := range []string{"needed", "unneeded", "unachievable", "pulling"} {
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_`+sub+`{status="`+cat+`",storage_class="default"} [1-9].*`)
+ }
+ for _, cat := range []string{"total", "garbage", "transient", "overreplicated", "underreplicated", "unachievable", "balanced", "desired", "lost"} {
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_`+cat+`_`+sub+` [0-9].*`)
+ }
+ }
c.Logf("%s", metrics)
}
commit acfde1e0010da59cbe48cb1990cf528796c22389
Author: Tom Clegg <tom at curii.com>
Date: Thu Jan 4 16:12:18 2024 -0500
21036: Report remaining balancerStats as prometheus metrics.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index f66194e2a2..7f6deb1e54 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -556,6 +556,10 @@ func (s *runSuite) TestDryRun(c *check.C) {
c.Check(bal.stats.trashesDeferred, check.Not(check.Equals), 0)
c.Check(bal.stats.underrep.replicas, check.Not(check.Equals), 0)
c.Check(bal.stats.overrep.replicas, check.Not(check.Equals), 0)
+
+ metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_trash_entries_deferred_count [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_pull_entries_deferred_count [1-9].*`)
}
func (s *runSuite) TestCommit(c *check.C) {
@@ -593,6 +597,19 @@ func (s *runSuite) TestCommit(c *check.C) {
c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
+
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="0"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="1"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="9"} 0\n.*`)
+
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_replicas{status="needed",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_blocks{status="needed",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="needed",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unneeded",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unachievable",storage_class="default"} [1-9].*`)
+ c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="pulling",storage_class="default"} [1-9].*`)
+
+ c.Logf("%s", metrics)
}
func (s *runSuite) TestChunkPrefix(c *check.C) {
diff --git a/services/keep-balance/metrics.go b/services/keep-balance/metrics.go
index 4683b67b98..02cee3955f 100644
--- a/services/keep-balance/metrics.go
+++ b/services/keep-balance/metrics.go
@@ -7,6 +7,7 @@ package keepbalance
import (
"fmt"
"net/http"
+ "strconv"
"sync"
"github.com/prometheus/client_golang/prometheus"
@@ -17,18 +18,20 @@ type observer interface{ Observe(float64) }
type setter interface{ Set(float64) }
type metrics struct {
- reg *prometheus.Registry
- statsGauges map[string]setter
- observers map[string]observer
- setupOnce sync.Once
- mtx sync.Mutex
+ reg *prometheus.Registry
+ statsGauges map[string]setter
+ statsGaugeVecs map[string]*prometheus.GaugeVec
+ observers map[string]observer
+ setupOnce sync.Once
+ mtx sync.Mutex
}
func newMetrics(registry *prometheus.Registry) *metrics {
return &metrics{
- reg: registry,
- statsGauges: map[string]setter{},
- observers: map[string]observer{},
+ reg: registry,
+ statsGauges: map[string]setter{},
+ statsGaugeVecs: map[string]*prometheus.GaugeVec{},
+ observers: map[string]observer{},
}
}
@@ -63,9 +66,24 @@ func (m *metrics) UpdateStats(s balancerStats) {
"transient": {s.unref, "transient (unreferenced, new)"},
"overreplicated": {s.overrep, "overreplicated"},
"underreplicated": {s.underrep, "underreplicated"},
+ "unachievable": {s.unachievable, "unachievable"},
+ "balanced": {s.justright, "optimally balanced"},
+ "desired": {s.desired, "desired"},
"lost": {s.lost, "lost"},
"dedup_byte_ratio": {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"},
"dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"},
+ "collection_bytes": {s.collectionBytes, "total apparent size of all collections"},
+ "referenced_bytes": {s.collectionBlockBytes, "total size of unique referenced blocks"},
+ "reference_count": {s.collectionBlockRefs, "block references in all collections"},
+ "referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"},
+
+ "pull_entries_sent_count": {s.pulls, "total entries sent in pull lists"},
+ "pull_entries_deferred_count": {s.pullsDeferred, "total entries deferred (not sent) in pull lists"},
+ "trash_entries_sent_count": {s.trashes, "total entries sent in trash lists"},
+ "trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"},
+
+ "replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"},
+ "usage": {s.classStats, "stored in indicated storage class"},
}
m.setupOnce.Do(func() {
// Register gauge(s) for each balancerStats field.
@@ -87,6 +105,29 @@ func (m *metrics) UpdateStats(s balancerStats) {
}
case int, int64, float64:
addGauge(name, gauge.Help)
+ case []int:
+ // replHistogram
+ gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Name: name,
+ Subsystem: "keep",
+ Help: gauge.Help,
+ }, []string{"replicas"})
+ m.reg.MustRegister(gv)
+ m.statsGaugeVecs[name] = gv
+ case map[string]replicationStats:
+ // classStats
+ for _, sub := range []string{"blocks", "bytes", "replicas"} {
+ name := name + "_" + sub
+ gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Name: name,
+ Subsystem: "keep",
+ Help: gauge.Help,
+ }, []string{"storage_class", "status"})
+ m.reg.MustRegister(gv)
+ m.statsGaugeVecs[name] = gv
+ }
default:
panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
}
@@ -105,6 +146,38 @@ func (m *metrics) UpdateStats(s balancerStats) {
m.statsGauges[name].Set(float64(val))
case float64:
m.statsGauges[name].Set(float64(val))
+ case []int:
+ // replHistogram
+ for r, n := range val {
+ m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n))
+ }
+ // Record zero for higher-than-max-replication
+ // metrics, so we don't incorrectly continue
+ // to report stale metrics.
+ //
+ // For example, if we previously reported n=1
+ // for repl=6, but have since restarted
+ // keep-balance and the most replicated block
+ // now has repl=5, then the repl=6 gauge will
+ // still say n=1 until we clear it explicitly
+ // here.
+ for r := len(val); r < len(val)+4 || r < len(val)*2; r++ {
+ m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0)
+ }
+ case map[string]replicationStats:
+ // classStats
+ for class, cs := range val {
+ for label, val := range map[string]blocksNBytes{
+ "needed": cs.needed,
+ "unneeded": cs.unneeded,
+ "pulling": cs.pulling,
+ "unachievable": cs.unachievable,
+ } {
+ m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks))
+ m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes))
+ m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas))
+ }
+ }
default:
panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list