[arvados] updated: 2.7.0-5805-g999ef5f71d

git repository hosting git at public.arvados.org
Tue Jan 9 22:29:14 UTC 2024


Summary of changes:
 doc/user/cwl/cwl-extensions.html.textile.liquid |  6 +-
 docker/jobs/Dockerfile                          | 24 ++-----
 docker/jobs/apt.arvados.org-dev.list            |  2 +-
 docker/jobs/apt.arvados.org-stable.list         |  2 +-
 docker/jobs/apt.arvados.org-testing.list        |  2 +-
 lib/dispatchcloud/dispatcher_test.go            | 17 +++--
 lib/dispatchcloud/scheduler/run_queue.go        |  4 +-
 sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml     |  4 +-
 sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml     |  4 +-
 sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml     |  4 +-
 sdk/cwl/arvados_cwl/arvcontainer.py             |  2 +
 sdk/cwl/tests/arvados-tests.yml                 |  2 +-
 sdk/dev-jobs.dockerfile                         |  2 +-
 services/keep-balance/balance_run_test.go       | 34 ++++++++++
 services/keep-balance/metrics.go                | 89 ++++++++++++++++++++++---
 15 files changed, 150 insertions(+), 48 deletions(-)

       via  999ef5f71d522283465b54b6468bae6badf28ca4 (commit)
       via  3eeff2311c09fcc8b0ee74f22dfb543a02aece92 (commit)
       via  f7ecc6662e387292fb6eead32667b49d05a1d544 (commit)
       via  6b90e8d82fba8bba70c0b17d2094f8aa7c1800a5 (commit)
       via  6cc306e481edc8ede97a9a11bdb73cd06056b4fe (commit)
       via  6781014ea568cf45a0589ebd970ca155973bbddc (commit)
       via  b20db672f17bc0ed6b0970982d421fb30d6ed77c (commit)
       via  553e9826223a13a90c4e1f53a05561532786cc8c (commit)
       via  9d03e8f43ac22534ddceee34bff5660ac0acd925 (commit)
       via  b58d0232799aaf3051487666f8c20b293527e3f4 (commit)
       via  acfde1e0010da59cbe48cb1990cf528796c22389 (commit)
      from  798a896a1a5f411d0d9675a513e1ddeccc3c818d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 999ef5f71d522283465b54b6468bae6badf28ca4
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Jan 9 17:22:32 2024 -0500

    21216: Make memoryRetryMultiplier optional and set a default value
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/doc/user/cwl/cwl-extensions.html.textile.liquid b/doc/user/cwl/cwl-extensions.html.textile.liquid
index e05072ddf6..3c8366721d 100644
--- a/doc/user/cwl/cwl-extensions.html.textile.liquid
+++ b/doc/user/cwl/cwl-extensions.html.textile.liquid
@@ -73,7 +73,7 @@ hints:
     usePreemptible: true
 
   arv:OutOfMemoryRetry:
-    memoryRetryMultipler: 2
+    memoryRetryMultiplier: 2
     memoryErrorRegex: "custom memory error"
 {% endcodeblock %}
 
@@ -195,7 +195,7 @@ table(table table-bordered table-condensed).
 
 h2(#OutOfMemoryRetry). arv:OutOfMemoryRetry
 
-Specify that when a workflow step appears to have failed because it did not request enough RAM, it should be re-submitted with more RAM.  Out of memory conditions are detected either by the container being unexpectedly killed (exit code 137) or by matching a pattern in the container's output (see @memoryErrorRegex@).  Retrying will increase the base RAM request by the value of @memoryRetryMultipler@.  For example, if the original RAM request was 10 GiB and the multiplier is 1.5, then it will re-submit with 15 GiB.
+Specify that when a workflow step appears to have failed because it did not request enough RAM, it should be re-submitted with more RAM.  Out of memory conditions are detected either by the container being unexpectedly killed (exit code 137) or by matching a pattern in the container's output (see @memoryErrorRegex@).  Retrying will increase the base RAM request by the value of @memoryRetryMultiplier@.  For example, if the original RAM request was 10 GiB and the multiplier is 1.5, then it will re-submit with 15 GiB.
 
 Containers are only re-submitted once.  If it fails a second time after increasing RAM, then the worklow step will still fail.
 
@@ -203,7 +203,7 @@ Also note that expressions that use @$(runtime.ram)@ (such as dynamic command li
 
 table(table table-bordered table-condensed).
 |_. Field |_. Type |_. Description |
-|memoryRetryMultipler|float|Required, the retry will multiply the base memory request by this factor to get the retry memory request.|
+|memoryRetryMultiplier|float|Optional, default value is 2.  The retry will multiply the base memory request by this factor to get the retry memory request.|
 |memoryErrorRegex|string|Optional, a custom regex that, if found in the stdout, stderr or crunch-run logging of a program, will trigger a retry with greater RAM.  If not provided, the default pattern matches "out of memory" (with or without spaces), "memory error" (with or without spaces), "bad_alloc" and "container using over 90% of memory".|
 
 h2. arv:dockerCollectionPDH
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
index 450864df30..aeb41db568 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
@@ -479,7 +479,7 @@ $graph:
         should be retried with more RAM.  By default, searches for the
         substrings 'bad_alloc' and 'OutOfMemory'.
     - name: memoryRetryMultiplier
-      type: float
+      type: float?
       doc: |
         If the container failed on its first run, re-submit the
         container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
index f33b94e69d..0e51d50080 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
@@ -422,7 +422,7 @@ $graph:
         should be retried with more RAM.  By default, searches for the
         substrings 'bad_alloc' and 'OutOfMemory'.
     - name: memoryRetryMultiplier
-      type: float
+      type: float?
       doc: |
         If the container failed on its first run, re-submit the
         container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
index 0c6035c56f..a753579c9a 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
@@ -425,7 +425,7 @@ $graph:
         should be retried with more RAM.  By default, searches for the
         substrings 'bad_alloc' and 'OutOfMemory'.
     - name: memoryRetryMultiplier
-      type: float
+      type: float?
       doc: |
         If the container failed on its first run, re-submit the
         container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 1f379ccec6..584ca1713a 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -375,6 +375,8 @@ class ArvadosContainer(JobBase):
                 ram_multiplier.append(oom_retry_req.get('memoryRetryMultiplier'))
             elif oom_retry_req.get('memoryRetryMultipler'):
                 ram_multiplier.append(oom_retry_req.get('memoryRetryMultipler'))
+            else:
+                ram_multiplier.append(2)
 
         if runtimeContext.runnerjob.startswith("arvwf:"):
             wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]

commit 3eeff2311c09fcc8b0ee74f22dfb543a02aece92
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Jan 9 17:12:55 2024 -0500

    21216: Make the misspelled version optional
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
index b91564bdd1..450864df30 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
@@ -484,7 +484,7 @@ $graph:
         If the container failed on its first run, re-submit the
         container with the RAM request multiplied by this factor.
     - name: memoryRetryMultipler
-      type: float
+      type: float?
       doc: |
         Deprecated misspelling of "memoryRetryMultiplier".  Kept only
         for backwards compatability, don't use this.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
index ebec3f334e..f33b94e69d 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
@@ -427,7 +427,7 @@ $graph:
         If the container failed on its first run, re-submit the
         container with the RAM request multiplied by this factor.
     - name: memoryRetryMultipler
-      type: float
+      type: float?
       doc: |
         Deprecated misspelling of "memoryRetryMultiplier".  Kept only
         for backwards compatability, don't use this.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
index 7b70a00dab..0c6035c56f 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
@@ -430,7 +430,7 @@ $graph:
         If the container failed on its first run, re-submit the
         container with the RAM request multiplied by this factor.
     - name: memoryRetryMultipler
-      type: float
+      type: float?
       doc: |
         Deprecated misspelling of "memoryRetryMultiplier".  Kept only
         for backwards compatability, don't use this.
diff --git a/sdk/cwl/tests/arvados-tests.yml b/sdk/cwl/tests/arvados-tests.yml
index 7ca8ca0950..cb4a151f0e 100644
--- a/sdk/cwl/tests/arvados-tests.yml
+++ b/sdk/cwl/tests/arvados-tests.yml
@@ -487,7 +487,7 @@
 
 - job: oom/fakeoom.yml
   output: {}
-  tool: oom/19975-oom-misspelled.cwl
+  tool: oom/19975-oom-mispelled.cwl
   doc: "Test feature 19975 - retry on exit 137, old misspelled version"
 
 - job: oom/fakeoom2.yml

commit f7ecc6662e387292fb6eead32667b49d05a1d544
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Jan 9 16:51:53 2024 -0500

    21367: Use regular bullseye instead of bullseye-slim
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/dev-jobs.dockerfile b/sdk/dev-jobs.dockerfile
index 95b039eba9..656572eb4f 100644
--- a/sdk/dev-jobs.dockerfile
+++ b/sdk/dev-jobs.dockerfile
@@ -13,7 +13,7 @@
 # (This dockerfile file must be located in the arvados/sdk/ directory because
 #  of the docker build root.)
 
-FROM debian:buster-slim
+FROM debian:bullseye
 MAINTAINER Arvados Package Maintainers <packaging at arvados.org>
 
 ENV DEBIAN_FRONTEND noninteractive

commit 6b90e8d82fba8bba70c0b17d2094f8aa7c1800a5
Author: Brett Smith <brett.smith at curii.com>
Date:   Tue Jan 9 15:38:33 2024 -0500

    21367: Add arvados virtualenv to $PATH
    
    This is both more user-friendly (now a-c-r itself is in $PATH) and a
    cleaner implementation than the previous version.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/docker/jobs/Dockerfile b/docker/jobs/Dockerfile
index 563cd41e92..371b9cc984 100644
--- a/docker/jobs/Dockerfile
+++ b/docker/jobs/Dockerfile
@@ -14,13 +14,10 @@ ADD 1078ECD7.key /etc/apt/trusted.gpg.d/arvados.asc
 RUN apt-get update -q
 RUN DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends python3-arvados-cwl-runner=$cwl_runner_version
 
-# use the Python executable from the python-arvados-cwl-runner package
-RUN PYTHON=`ls /usr/share/python3*/dist/python3-arvados-cwl-runner/bin/python|head -n1` && rm -f /usr/bin/python && ln -s $PYTHON /usr/bin/python
-RUN PYTHON3=`ls /usr/share/python3*/dist/python3-arvados-cwl-runner/bin/python3|head -n1` && rm -f /usr/bin/python3 && ln -s $PYTHON3 /usr/bin/python3
-
 # Install dependencies and set up system.
 RUN /usr/sbin/adduser --disabled-password \
       --gecos 'Crunch execution user' crunch && \
     /usr/bin/install --directory --owner=crunch --group=crunch --mode=0700 /keep /tmp/crunch-src /tmp/crunch-job
 
 USER crunch
+ENV PATH=/usr/share/python3.9/dist/python3-arvados-cwl-runner/bin:/usr/local/bin:/usr/bin:/bin

commit 6cc306e481edc8ede97a9a11bdb73cd06056b4fe
Author: Brett Smith <brett.smith at curii.com>
Date:   Tue Jan 9 15:34:29 2024 -0500

    21367: Modernize apt key handling in arvados/jobs Dockerfile
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/docker/jobs/Dockerfile b/docker/jobs/Dockerfile
index 6404574117..563cd41e92 100644
--- a/docker/jobs/Dockerfile
+++ b/docker/jobs/Dockerfile
@@ -6,22 +6,13 @@
 FROM debian:bullseye-slim
 MAINTAINER Arvados Package Maintainers <packaging at arvados.org>
 
-ENV DEBIAN_FRONTEND noninteractive
-
-RUN apt-get update -q
-RUN apt-get install -yq --no-install-recommends gnupg
-
 ARG repo_version
-ADD apt.arvados.org-$repo_version.list /etc/apt/sources.list.d/
-
-ADD 1078ECD7.key /tmp/
-RUN cat /tmp/1078ECD7.key | apt-key add -
-
-ARG python_sdk_version
 ARG cwl_runner_version
 
+ADD apt.arvados.org-$repo_version.list /etc/apt/sources.list.d/
+ADD 1078ECD7.key /etc/apt/trusted.gpg.d/arvados.asc
 RUN apt-get update -q
-RUN apt-get install -yq --no-install-recommends python3-arvados-cwl-runner=$cwl_runner_version
+RUN DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends python3-arvados-cwl-runner=$cwl_runner_version
 
 # use the Python executable from the python-arvados-cwl-runner package
 RUN PYTHON=`ls /usr/share/python3*/dist/python3-arvados-cwl-runner/bin/python|head -n1` && rm -f /usr/bin/python && ln -s $PYTHON /usr/bin/python

commit 6781014ea568cf45a0589ebd970ca155973bbddc
Author: Brett Smith <brett.smith at curii.com>
Date:   Tue Jan 9 15:26:06 2024 -0500

    21367: Remove debug prints
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/docker/jobs/Dockerfile b/docker/jobs/Dockerfile
index 0dfd9b444e..6404574117 100644
--- a/docker/jobs/Dockerfile
+++ b/docker/jobs/Dockerfile
@@ -12,7 +12,6 @@ RUN apt-get update -q
 RUN apt-get install -yq --no-install-recommends gnupg
 
 ARG repo_version
-RUN echo repo_version $repo_version
 ADD apt.arvados.org-$repo_version.list /etc/apt/sources.list.d/
 
 ADD 1078ECD7.key /tmp/
@@ -20,7 +19,6 @@ RUN cat /tmp/1078ECD7.key | apt-key add -
 
 ARG python_sdk_version
 ARG cwl_runner_version
-RUN echo cwl_runner_version $cwl_runner_version python_sdk_version $python_sdk_version
 
 RUN apt-get update -q
 RUN apt-get install -yq --no-install-recommends python3-arvados-cwl-runner=$cwl_runner_version

commit b20db672f17bc0ed6b0970982d421fb30d6ed77c
Author: Brett Smith <brett.smith at curii.com>
Date:   Tue Jan 9 15:24:24 2024 -0500

    21367: Update arvados/jobs image to bullseye
    
    Required for Python 3.7 deprecation.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/docker/jobs/Dockerfile b/docker/jobs/Dockerfile
index 1b75e13420..0dfd9b444e 100644
--- a/docker/jobs/Dockerfile
+++ b/docker/jobs/Dockerfile
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Based on Debian
-FROM debian:buster-slim
+FROM debian:bullseye-slim
 MAINTAINER Arvados Package Maintainers <packaging at arvados.org>
 
 ENV DEBIAN_FRONTEND noninteractive
diff --git a/docker/jobs/apt.arvados.org-dev.list b/docker/jobs/apt.arvados.org-dev.list
index 210f5d5511..155244ba9f 100644
--- a/docker/jobs/apt.arvados.org-dev.list
+++ b/docker/jobs/apt.arvados.org-dev.list
@@ -1,2 +1,2 @@
 # apt.arvados.org
-deb http://apt.arvados.org/buster buster-dev main
+deb http://apt.arvados.org/bullseye bullseye-dev main
diff --git a/docker/jobs/apt.arvados.org-stable.list b/docker/jobs/apt.arvados.org-stable.list
index 153e729805..5a4b8c91c8 100644
--- a/docker/jobs/apt.arvados.org-stable.list
+++ b/docker/jobs/apt.arvados.org-stable.list
@@ -1,2 +1,2 @@
 # apt.arvados.org
-deb http://apt.arvados.org/buster buster main
+deb http://apt.arvados.org/bullseye bullseye main
diff --git a/docker/jobs/apt.arvados.org-testing.list b/docker/jobs/apt.arvados.org-testing.list
index d5f4581685..302862ca64 100644
--- a/docker/jobs/apt.arvados.org-testing.list
+++ b/docker/jobs/apt.arvados.org-testing.list
@@ -1,2 +1,2 @@
 # apt.arvados.org
-deb http://apt.arvados.org/buster buster-testing main
+deb http://apt.arvados.org/bullseye bullseye-testing main

commit 553e9826223a13a90c4e1f53a05561532786cc8c
Author: Tom Clegg <tom at curii.com>
Date:   Thu Jan 4 17:51:33 2024 -0500

    21258: Ensure at least one boot failure.
    
    With the previous approach, it was possible for all containers needing
    a type4 instance to finish, and a different instance type to report a
    quota error and cause the scheduler to shut down the now-unneeded
    instance, all before the "guaranteed broken" node reached
    TimeoutBooting. In such a case it would not be counted as a boot
    failure.
    
    To avoid this, the new approach induces boot failures on *all* type4
    instances until 2x TimeoutBooting intervals have passed.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 51c2c3d6a3..20185554b8 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -207,6 +207,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		finishContainer(ctr)
 		return int(rand.Uint32() & 0x3)
 	}
+	var type4BrokenUntil time.Time
 	var countCapacityErrors int64
 	vmCount := int32(0)
 	s.stubDriver.Queue = queue
@@ -224,6 +225,17 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		stubvm.CrashRunningContainer = finishContainer
 		stubvm.ExtraCrunchRunArgs = "'--runtime-engine=stub' '--foo' '--extra='\\''args'\\'''"
 		switch {
+		case stubvm.Instance().ProviderType() == test.InstanceType(4).ProviderType &&
+			(type4BrokenUntil.IsZero() || time.Now().Before(type4BrokenUntil)):
+			// Initially (at least 2*TimeoutBooting), all
+			// instances of this type are completely
+			// broken. This ensures the
+			// boot_outcomes{outcome="failure"} metric is
+			// not zero.
+			stubvm.Broken = time.Now()
+			if type4BrokenUntil.IsZero() {
+				type4BrokenUntil = time.Now().Add(2 * s.cluster.Containers.CloudVMs.TimeoutBooting.Duration())
+			}
 		case n%7 == 0:
 			// some instances start out OK but then stop
 			// running any commands
@@ -235,11 +247,6 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			// some instances start out OK but then start
 			// reporting themselves as broken
 			stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond)
-		case n == 3:
-			// 1 instance is completely broken, ensuring
-			// the boot_outcomes{outcome="failure"} metric
-			// is not zero
-			stubvm.Broken = time.Now()
 		default:
 			stubvm.CrunchRunCrashRate = 0.1
 			stubvm.ArvMountDeadlockRate = 0.1

commit 9d03e8f43ac22534ddceee34bff5660ac0acd925
Author: Tom Clegg <tom at curii.com>
Date:   Thu Jan 4 10:19:22 2024 -0500

    21258: Fix log message.
    
    Reverts an unintentional logging change in
    1875af9bcf4a1afe435176e952e63341a9ae9c03 from
    
    msg="creating new instance" InstanceType="type8" ...
    
    to
    
    msg="creating new instance" InstanceType="{type8 providertype8 8 8589934592 0 0 0 0.984 false {  0}}" ...
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 2f1f175890..03fa592777 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -239,7 +239,7 @@ tryrun:
 				// so mark it as allocated, and try to
 				// start the container.
 				unalloc[unallocType]--
-				logger = logger.WithField("InstanceType", unallocType)
+				logger = logger.WithField("InstanceType", unallocType.Name)
 				if dontstart[unallocType] {
 					// We already tried & failed to start
 					// a higher-priority container on the
@@ -282,7 +282,7 @@ tryrun:
 				logger.Trace("all eligible types at capacity")
 				continue
 			}
-			logger = logger.WithField("InstanceType", availableType)
+			logger = logger.WithField("InstanceType", availableType.Name)
 			if !sch.pool.Create(availableType) {
 				// Failed despite not being at quota,
 				// e.g., cloud ops throttled.

commit b58d0232799aaf3051487666f8c20b293527e3f4
Author: Tom Clegg <tom at curii.com>
Date:   Mon Jan 8 11:29:20 2024 -0500

    21036: Test all balancerStats metrics.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index 7f6deb1e54..81e4c7b867 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -598,17 +598,34 @@ func (s *runSuite) TestCommit(c *check.C) {
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
 
+	for _, cat := range []string{
+		"dedup_byte_ratio", "dedup_block_ratio", "collection_bytes",
+		"referenced_bytes", "referenced_blocks", "reference_count",
+		"pull_entries_sent_count",
+		"trash_entries_sent_count",
+	} {
+		c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_`+cat+` [1-9].*`)
+	}
+
+	for _, cat := range []string{
+		"pull_entries_deferred_count",
+		"trash_entries_deferred_count",
+	} {
+		c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_`+cat+` 0\n.*`)
+	}
+
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="0"} [1-9].*`)
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="1"} [1-9].*`)
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="9"} 0\n.*`)
 
-	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_replicas{status="needed",storage_class="default"} [1-9].*`)
-	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_blocks{status="needed",storage_class="default"} [1-9].*`)
-	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="needed",storage_class="default"} [1-9].*`)
-	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unneeded",storage_class="default"} [1-9].*`)
-	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unachievable",storage_class="default"} [1-9].*`)
-	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="pulling",storage_class="default"} [1-9].*`)
-
+	for _, sub := range []string{"replicas", "blocks", "bytes"} {
+		for _, cat := range []string{"needed", "unneeded", "unachievable", "pulling"} {
+			c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_`+sub+`{status="`+cat+`",storage_class="default"} [1-9].*`)
+		}
+		for _, cat := range []string{"total", "garbage", "transient", "overreplicated", "underreplicated", "unachievable", "balanced", "desired", "lost"} {
+			c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_`+cat+`_`+sub+` [0-9].*`)
+		}
+	}
 	c.Logf("%s", metrics)
 }
 

commit acfde1e0010da59cbe48cb1990cf528796c22389
Author: Tom Clegg <tom at curii.com>
Date:   Thu Jan 4 16:12:18 2024 -0500

    21036: Report remaining balancerStats as prometheus metrics.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index f66194e2a2..7f6deb1e54 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -556,6 +556,10 @@ func (s *runSuite) TestDryRun(c *check.C) {
 	c.Check(bal.stats.trashesDeferred, check.Not(check.Equals), 0)
 	c.Check(bal.stats.underrep.replicas, check.Not(check.Equals), 0)
 	c.Check(bal.stats.overrep.replicas, check.Not(check.Equals), 0)
+
+	metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_trash_entries_deferred_count [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_pull_entries_deferred_count [1-9].*`)
 }
 
 func (s *runSuite) TestCommit(c *check.C) {
@@ -593,6 +597,19 @@ func (s *runSuite) TestCommit(c *check.C) {
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
+
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="0"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="1"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="9"} 0\n.*`)
+
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_replicas{status="needed",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_blocks{status="needed",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="needed",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unneeded",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unachievable",storage_class="default"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="pulling",storage_class="default"} [1-9].*`)
+
+	c.Logf("%s", metrics)
 }
 
 func (s *runSuite) TestChunkPrefix(c *check.C) {
diff --git a/services/keep-balance/metrics.go b/services/keep-balance/metrics.go
index 4683b67b98..02cee3955f 100644
--- a/services/keep-balance/metrics.go
+++ b/services/keep-balance/metrics.go
@@ -7,6 +7,7 @@ package keepbalance
 import (
 	"fmt"
 	"net/http"
+	"strconv"
 	"sync"
 
 	"github.com/prometheus/client_golang/prometheus"
@@ -17,18 +18,20 @@ type observer interface{ Observe(float64) }
 type setter interface{ Set(float64) }
 
 type metrics struct {
-	reg         *prometheus.Registry
-	statsGauges map[string]setter
-	observers   map[string]observer
-	setupOnce   sync.Once
-	mtx         sync.Mutex
+	reg            *prometheus.Registry
+	statsGauges    map[string]setter
+	statsGaugeVecs map[string]*prometheus.GaugeVec
+	observers      map[string]observer
+	setupOnce      sync.Once
+	mtx            sync.Mutex
 }
 
 func newMetrics(registry *prometheus.Registry) *metrics {
 	return &metrics{
-		reg:         registry,
-		statsGauges: map[string]setter{},
-		observers:   map[string]observer{},
+		reg:            registry,
+		statsGauges:    map[string]setter{},
+		statsGaugeVecs: map[string]*prometheus.GaugeVec{},
+		observers:      map[string]observer{},
 	}
 }
 
@@ -63,9 +66,24 @@ func (m *metrics) UpdateStats(s balancerStats) {
 		"transient":         {s.unref, "transient (unreferenced, new)"},
 		"overreplicated":    {s.overrep, "overreplicated"},
 		"underreplicated":   {s.underrep, "underreplicated"},
+		"unachievable":      {s.unachievable, "unachievable"},
+		"balanced":          {s.justright, "optimally balanced"},
+		"desired":           {s.desired, "desired"},
 		"lost":              {s.lost, "lost"},
 		"dedup_byte_ratio":  {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"},
 		"dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"},
+		"collection_bytes":  {s.collectionBytes, "total apparent size of all collections"},
+		"referenced_bytes":  {s.collectionBlockBytes, "total size of unique referenced blocks"},
+		"reference_count":   {s.collectionBlockRefs, "block references in all collections"},
+		"referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"},
+
+		"pull_entries_sent_count":      {s.pulls, "total entries sent in pull lists"},
+		"pull_entries_deferred_count":  {s.pullsDeferred, "total entries deferred (not sent) in pull lists"},
+		"trash_entries_sent_count":     {s.trashes, "total entries sent in trash lists"},
+		"trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"},
+
+		"replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"},
+		"usage":                  {s.classStats, "stored in indicated storage class"},
 	}
 	m.setupOnce.Do(func() {
 		// Register gauge(s) for each balancerStats field.
@@ -87,6 +105,29 @@ func (m *metrics) UpdateStats(s balancerStats) {
 				}
 			case int, int64, float64:
 				addGauge(name, gauge.Help)
+			case []int:
+				// replHistogram
+				gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+					Namespace: "arvados",
+					Name:      name,
+					Subsystem: "keep",
+					Help:      gauge.Help,
+				}, []string{"replicas"})
+				m.reg.MustRegister(gv)
+				m.statsGaugeVecs[name] = gv
+			case map[string]replicationStats:
+				// classStats
+				for _, sub := range []string{"blocks", "bytes", "replicas"} {
+					name := name + "_" + sub
+					gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+						Namespace: "arvados",
+						Name:      name,
+						Subsystem: "keep",
+						Help:      gauge.Help,
+					}, []string{"storage_class", "status"})
+					m.reg.MustRegister(gv)
+					m.statsGaugeVecs[name] = gv
+				}
 			default:
 				panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
 			}
@@ -105,6 +146,38 @@ func (m *metrics) UpdateStats(s balancerStats) {
 			m.statsGauges[name].Set(float64(val))
 		case float64:
 			m.statsGauges[name].Set(float64(val))
+		case []int:
+			// replHistogram
+			for r, n := range val {
+				m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n))
+			}
+			// Record zero for higher-than-max-replication
+			// metrics, so we don't incorrectly continue
+			// to report stale metrics.
+			//
+			// For example, if we previously reported n=1
+			// for repl=6, but have since restarted
+			// keep-balance and the most replicated block
+			// now has repl=5, then the repl=6 gauge will
+			// still say n=1 until we clear it explicitly
+			// here.
+			for r := len(val); r < len(val)+4 || r < len(val)*2; r++ {
+				m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0)
+			}
+		case map[string]replicationStats:
+			// classStats
+			for class, cs := range val {
+				for label, val := range map[string]blocksNBytes{
+					"needed":       cs.needed,
+					"unneeded":     cs.unneeded,
+					"pulling":      cs.pulling,
+					"unachievable": cs.unachievable,
+				} {
+					m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks))
+					m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes))
+					m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas))
+				}
+			}
 		default:
 			panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
 		}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list