[ARVADOS] created: 2.1.0-2249-g2359fecc1

Git user git at public.arvados.org
Wed Apr 13 19:15:36 UTC 2022


        at  2359fecc1866212534b2148e22a0b8803e421455 (commit)


commit 2359fecc1866212534b2148e22a0b8803e421455
Author: Tom Clegg <tom at curii.com>
Date:   Wed Apr 13 15:11:56 2022 -0400

    18794: Avoid failing health check on incomplete-but-working config.
    
    It is typical to have some InternalURLs that are only reachable by
    from the same node (not necessarily where health-check runs) or are
    missing entirely because only Nginx needs to know where they are.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go
index 864ef592b..6e2c08e4c 100644
--- a/sdk/go/health/aggregator.go
+++ b/sdk/go/health/aggregator.go
@@ -6,12 +6,14 @@ package health
 
 import (
 	"bufio"
+	"bytes"
 	"context"
 	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
 	"io"
+	"net"
 	"net/http"
 	"net/url"
 	"regexp"
@@ -124,7 +126,7 @@ type Metrics struct {
 }
 
 type ServiceHealth struct {
-	Health string `json:"health"`
+	Health string `json:"health"` // "OK", "ERROR", or "SKIP"
 	N      int    `json:"n"`
 }
 
@@ -142,7 +144,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 		// Ensure svc is listed in resp.Services.
 		mtx.Lock()
 		if _, ok := resp.Services[svcName]; !ok {
-			resp.Services[svcName] = ServiceHealth{Health: "ERROR"}
+			resp.Services[svcName] = ServiceHealth{Health: "NONE"}
 		}
 		mtx.Unlock()
 
@@ -159,11 +161,13 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 					}
 				} else {
 					result = agg.ping(pingURL)
-					m, err := agg.metrics(pingURL)
-					if err != nil {
-						result.Error = "metrics: " + err.Error()
+					if result.Health != "SKIP" {
+						m, err := agg.metrics(pingURL)
+						if err != nil && result.Error == "" {
+							result.Error = "metrics: " + err.Error()
+						}
+						result.Metrics = m
 					}
-					result.Metrics = m
 				}
 
 				mtx.Lock()
@@ -174,7 +178,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 					h.N++
 					h.Health = "OK"
 					resp.Services[svcName] = h
-				} else {
+				} else if result.Health != "SKIP" {
 					resp.Health = "ERROR"
 				}
 			}(svcName, addr)
@@ -184,10 +188,20 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 
 	// Report ERROR if a needed service didn't fail any checks
 	// merely because it isn't configured to run anywhere.
-	for _, sh := range resp.Services {
-		if sh.Health != "OK" {
-			resp.Health = "ERROR"
-			break
+	for svcName, sh := range resp.Services {
+		switch svcName {
+		case arvados.ServiceNameDispatchCloud,
+			arvados.ServiceNameDispatchLSF:
+			// ok to not run any given dispatcher
+		case arvados.ServiceNameHealth,
+			arvados.ServiceNameWorkbench1,
+			arvados.ServiceNameWorkbench2:
+			// typically doesn't have InternalURLs in config
+		default:
+			if sh.Health != "OK" && sh.Health != "SKIP" {
+				resp.Health = "ERROR"
+				continue
+			}
 		}
 	}
 
@@ -244,6 +258,16 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
 	req.Header.Set("X-Forwarded-Proto", "https")
 
 	resp, err := agg.httpClient.Do(req)
+	if urlerr, ok := err.(*url.Error); ok {
+		if neterr, ok := urlerr.Err.(*net.OpError); ok && isLocalHost(target.Hostname()) {
+			result = CheckResult{
+				Health: "SKIP",
+				Error:  neterr.Error(),
+			}
+			err = nil
+			return
+		}
+	}
 	if err != nil {
 		return
 	}
@@ -307,6 +331,13 @@ func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) {
 	return
 }
 
+// Test whether host is an easily recognizable loopback address:
+// 0.0.0.0, 127.x.x.x, ::1, or localhost.
+func isLocalHost(host string) bool {
+	ip := net.ParseIP(host)
+	return ip.IsLoopback() || bytes.Equal(ip.To4(), []byte{0, 0, 0, 0}) || strings.EqualFold(host, "localhost")
+}
+
 func (agg *Aggregator) checkAuth(req *http.Request) bool {
 	creds := auth.CredentialsFromRequest(req)
 	for _, token := range creds.Tokens {
diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go
index 5d76c19f2..050b2c0fc 100644
--- a/sdk/go/health/aggregator_test.go
+++ b/sdk/go/health/aggregator_test.go
@@ -123,6 +123,44 @@ func (s *AggregatorSuite) TestHealthyAndUnhealthy(c *check.C) {
 	c.Logf("%#v", ep)
 }
 
+// If an InternalURL host is 0.0.0.0, localhost, 127/8, or ::1 and
+// nothing is listening there, don't fail the health check -- instead,
+// assume the relevant component just isn't installed/enabled on this
+// node, but does work when contacted through ExternalURL.
+func (s *AggregatorSuite) TestUnreachableLoopbackPort(c *check.C) {
+	srvH, listenH := s.stubServer(&healthyHandler{})
+	defer srvH.Close()
+	s.setAllServiceURLs(listenH)
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepproxy, "http://localhost:9/")
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.Workbench1, "http://0.0.0.0:9/")
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepbalance, "http://127.0.0.127:9/")
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV, "http://[::1]:9/")
+	s.handler.ServeHTTP(s.resp, s.req)
+	s.checkOK(c)
+
+	// If a non-loopback address is unreachable, that's still a
+	// fail.
+	s.resp = httptest.NewRecorder()
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV, "http://172.31.255.254:9/")
+	s.handler.ServeHTTP(s.resp, s.req)
+	s.checkUnhealthy(c)
+}
+
+func (s *AggregatorSuite) TestIsLocalHost(c *check.C) {
+	c.Check(isLocalHost("Localhost"), check.Equals, true)
+	c.Check(isLocalHost("localhost"), check.Equals, true)
+	c.Check(isLocalHost("127.0.0.1"), check.Equals, true)
+	c.Check(isLocalHost("127.0.0.127"), check.Equals, true)
+	c.Check(isLocalHost("127.1.2.7"), check.Equals, true)
+	c.Check(isLocalHost("0.0.0.0"), check.Equals, true)
+	c.Check(isLocalHost("::1"), check.Equals, true)
+	c.Check(isLocalHost("1.2.3.4"), check.Equals, false)
+	c.Check(isLocalHost("1::1"), check.Equals, false)
+	c.Check(isLocalHost("example.com"), check.Equals, false)
+	c.Check(isLocalHost("127.0.0"), check.Equals, false)
+	c.Check(isLocalHost(""), check.Equals, false)
+}
+
 func (s *AggregatorSuite) TestConfigMismatch(c *check.C) {
 	// time1/hash1: current config
 	time1 := time.Now().Add(time.Second - time.Minute - time.Hour)

commit 6b4e10efa050158c16ff81cf3303f2faf8984337
Author: Tom Clegg <tom at curii.com>
Date:   Mon Apr 11 14:35:03 2022 -0400

    18794: Add /metrics endpoint to RailsAPI.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/api/app/controllers/arvados/v1/healthcheck_controller.rb b/services/api/app/controllers/arvados/v1/management_controller.rb
similarity index 54%
rename from services/api/app/controllers/arvados/v1/healthcheck_controller.rb
rename to services/api/app/controllers/arvados/v1/management_controller.rb
index c56208207..55a00d346 100644
--- a/services/api/app/controllers/arvados/v1/healthcheck_controller.rb
+++ b/services/api/app/controllers/arvados/v1/management_controller.rb
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-class Arvados::V1::HealthcheckController < ApplicationController
+class Arvados::V1::ManagementController < ApplicationController
   skip_before_action :catch_redirect_hint
   skip_before_action :find_objects_for_index
   skip_before_action :find_object_by_uuid
@@ -29,8 +29,24 @@ class Arvados::V1::HealthcheckController < ApplicationController
     end
   end
 
-  def ping
-    resp = {"health" => "OK"}
-    send_json resp
+  def metrics
+    render content_type: 'text/plain', plain: <<~EOF
+# HELP arvados_config_load_timestamp_seconds Time when config file was loaded.
+# TYPE arvados_config_load_timestamp_seconds gauge
+arvados_config_load_timestamp_seconds{sha256="#{Rails.configuration.SourceSHA256}"} #{Rails.configuration.LoadTimestamp.to_f}
+# HELP arvados_config_source_timestamp_seconds Timestamp of config file when it was loaded.
+# TYPE arvados_config_source_timestamp_seconds gauge
+arvados_config_source_timestamp_seconds{sha256="#{Rails.configuration.SourceSHA256}"} #{Rails.configuration.SourceTimestamp.to_f}
+EOF
+  end
+
+  def health
+    case params[:check]
+    when 'ping'
+      resp = {"health" => "OK"}
+      send_json resp
+    else
+      send_json ({"errors" => "not found"}), status: 404
+    end
   end
 end
diff --git a/services/api/config/arvados_config.rb b/services/api/config/arvados_config.rb
index 8a96c432a..c0f7ee174 100644
--- a/services/api/config/arvados_config.rb
+++ b/services/api/config/arvados_config.rb
@@ -30,6 +30,7 @@ end
 
 # Load the defaults, used by config:migrate and fallback loading
 # legacy application.yml
+load_time = Time.now.utc
 defaultYAML, stderr, status = Open3.capture3("arvados-server", "config-dump", "-config=-", "-skip-legacy", stdin_data: "Clusters: {xxxxx: {}}")
 if !status.success?
   puts stderr
@@ -39,6 +40,8 @@ confs = YAML.load(defaultYAML, deserialize_symbols: false)
 clusterID, clusterConfig = confs["Clusters"].first
 $arvados_config_defaults = clusterConfig
 $arvados_config_defaults["ClusterID"] = clusterID
+$arvados_config_defaults["SourceTimestamp"] = Time.rfc3339(confs["SourceTimestamp"])
+$arvados_config_defaults["SourceSHA256"] = confs["SourceSHA256"]
 
 if ENV["ARVADOS_CONFIG"] == "none"
   # Don't load config. This magic value is set by packaging scripts so
@@ -54,6 +57,8 @@ else
       clusterID, clusterConfig = confs["Clusters"].first
       $arvados_config_global = clusterConfig
       $arvados_config_global["ClusterID"] = clusterID
+      $arvados_config_global["SourceTimestamp"] = Time.rfc3339(confs["SourceTimestamp"])
+      $arvados_config_global["SourceSHA256"] = confs["SourceSHA256"]
     else
       # config-dump failed, assume we will be loading from legacy
       # application.yml, initialize with defaults.
@@ -64,6 +69,7 @@ end
 
 # Now make a copy
 $arvados_config = $arvados_config_global.deep_dup
+$arvados_config["LoadTimestamp"] = load_time
 
 def arrayToHash cfg, k, v
   val = {}
diff --git a/services/api/config/routes.rb b/services/api/config/routes.rb
index 98f5788d6..9c7bfc3a7 100644
--- a/services/api/config/routes.rb
+++ b/services/api/config/routes.rb
@@ -112,7 +112,8 @@ Rails.application.routes.draw do
 
   match '/static/login_failure', to: 'static#login_failure', as: :login_failure, via: [:get, :post]
 
-  match '/_health/ping', to: 'arvados/v1/healthcheck#ping', via: [:get]
+  match '/_health/:check', to: 'arvados/v1/management#health', via: [:get]
+  match '/metrics', to: 'arvados/v1/management#metrics', via: [:get]
 
   # Send unroutable requests to an arbitrary controller
   # (ends up at ApplicationController#render_not_found)
diff --git a/services/api/test/functional/arvados/v1/healthcheck_controller_test.rb b/services/api/test/functional/arvados/v1/healthcheck_controller_test.rb
deleted file mode 100644
index 76fdb0426..000000000
--- a/services/api/test/functional/arvados/v1/healthcheck_controller_test.rb
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-require 'test_helper'
-
-class Arvados::V1::HealthcheckControllerTest < ActionController::TestCase
-  [
-    [false, nil, 404, 'disabled'],
-    [true, nil, 401, 'authorization required'],
-    [true, 'badformatwithnoBearer', 403, 'authorization error'],
-    [true, 'Bearer wrongtoken', 403, 'authorization error'],
-    [true, 'Bearer configuredmanagementtoken', 200, '{"health":"OK"}'],
-  ].each do |enabled, header, error_code, error_msg|
-    test "ping when #{if enabled then 'enabled' else 'disabled' end} with header '#{header}'" do
-      if enabled
-        Rails.configuration.ManagementToken = 'configuredmanagementtoken'
-      else
-        Rails.configuration.ManagementToken = ""
-      end
-
-      @request.headers['Authorization'] = header
-      get :ping
-      assert_response error_code
-
-      resp = JSON.parse(@response.body)
-      if error_code == 200
-        assert_equal(JSON.load('{"health":"OK"}'), resp)
-      else
-        assert_equal(error_msg, resp['errors'])
-      end
-    end
-  end
-end
diff --git a/services/api/test/functional/arvados/v1/management_controller_test.rb b/services/api/test/functional/arvados/v1/management_controller_test.rb
new file mode 100644
index 000000000..5b34f9fef
--- /dev/null
+++ b/services/api/test/functional/arvados/v1/management_controller_test.rb
@@ -0,0 +1,71 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+require 'test_helper'
+
+class Arvados::V1::ManagementControllerTest < ActionController::TestCase
+  [
+    [false, nil, 404, 'disabled'],
+    [true, nil, 401, 'authorization required'],
+    [true, 'badformatwithnoBearer', 403, 'authorization error'],
+    [true, 'Bearer wrongtoken', 403, 'authorization error'],
+    [true, 'Bearer configuredmanagementtoken', 200, '{"health":"OK"}'],
+  ].each do |enabled, header, error_code, error_msg|
+    test "_health/ping when #{if enabled then 'enabled' else 'disabled' end} with header '#{header}'" do
+      if enabled
+        Rails.configuration.ManagementToken = 'configuredmanagementtoken'
+      else
+        Rails.configuration.ManagementToken = ""
+      end
+
+      @request.headers['Authorization'] = header
+      get :health, params: {check: 'ping'}
+      assert_response error_code
+
+      resp = JSON.parse(@response.body)
+      if error_code == 200
+        assert_equal(JSON.load('{"health":"OK"}'), resp)
+      else
+        assert_equal(error_msg, resp['errors'])
+      end
+    end
+  end
+
+  test "metrics" do
+    mtime = File.mtime(ENV["ARVADOS_CONFIG"])
+    hash = Digest::SHA256.hexdigest(File.read(ENV["ARVADOS_CONFIG"]))
+    Rails.configuration.ManagementToken = "configuredmanagementtoken"
+    @request.headers['Authorization'] = "Bearer configuredmanagementtoken"
+    get :metrics
+    assert_response :success
+    assert_equal 'text/plain', @response.content_type
+
+    assert_match /\narvados_config_source_timestamp_seconds{sha256="#{hash}"} #{Regexp.escape mtime.utc.to_f.to_s}\n/, @response.body
+
+    # Expect mtime < loadtime < now
+    m = @response.body.match(/\narvados_config_load_timestamp_seconds{sha256="#{hash}"} (.*?)\n/)
+    assert_operator m[1].to_f, :>, mtime.utc.to_f
+    assert_operator m[1].to_f, :<, Time.now.utc.to_f
+  end
+
+  test "metrics disabled" do
+    Rails.configuration.ManagementToken = ""
+    @request.headers['Authorization'] = "Bearer configuredmanagementtoken"
+    get :metrics
+    assert_response 404
+  end
+
+  test "metrics bad token" do
+    Rails.configuration.ManagementToken = "configuredmanagementtoken"
+    @request.headers['Authorization'] = "Bearer asdf"
+    get :metrics
+    assert_response 403
+  end
+
+  test "metrics unauthorized" do
+    Rails.configuration.ManagementToken = "configuredmanagementtoken"
+    get :metrics
+    assert_response 401
+  end
+end
diff --git a/services/api/test/integration/errors_test.rb b/services/api/test/integration/errors_test.rb
index a2a1545ce..a5359278e 100644
--- a/services/api/test/integration/errors_test.rb
+++ b/services/api/test/integration/errors_test.rb
@@ -24,7 +24,7 @@ class ErrorsTest < ActionDispatch::IntegrationTest
       # Generally, new routes should appear under /arvados/v1/. If
       # they appear elsewhere, that might have been caused by default
       # rails generator behavior that we don't want.
-      assert_match(/^\/(|\*a|arvados\/v1\/.*|auth\/.*|login|logout|database\/reset|discovery\/.*|static\/.*|sys\/trash_sweep|themes\/.*|assets|_health\/.*)(\(\.:format\))?$/,
+      assert_match(/^\/(|\*a|arvados\/v1\/.*|auth\/.*|login|logout|database\/reset|discovery\/.*|static\/.*|sys\/trash_sweep|themes\/.*|assets|_health|metrics\/.*)(\(\.:format\))?$/,
                    route.path.spec.to_s,
                    "Unexpected new route: #{route.path.spec}")
     end

commit 72f222dfd8982bfd1b19804ba8ba42b80708134b
Author: Tom Clegg <tom at curii.com>
Date:   Mon Apr 11 11:18:50 2022 -0400

    18794: Fail health check on mismatched configs.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go
index 4175bc544..864ef592b 100644
--- a/sdk/go/health/aggregator.go
+++ b/sdk/go/health/aggregator.go
@@ -5,6 +5,7 @@
 package health
 
 import (
+	"bufio"
 	"context"
 	"encoding/json"
 	"errors"
@@ -13,6 +14,9 @@ import (
 	"io"
 	"net/http"
 	"net/url"
+	"regexp"
+	"strconv"
+	"strings"
 	"sync"
 	"time"
 
@@ -100,6 +104,8 @@ type ClusterHealthResponse struct {
 	// "service S is needed, but isn't configured to run
 	// anywhere."
 	Services map[arvados.ServiceName]ServiceHealth `json:"services"`
+
+	Errors []string `json:"errors"`
 }
 
 type CheckResult struct {
@@ -109,6 +115,12 @@ type CheckResult struct {
 	HTTPStatusText string                 `json:",omitempty"`
 	Response       map[string]interface{} `json:"response"`
 	ResponseTime   json.Number            `json:"responseTime"`
+	Metrics        Metrics                `json:"-"`
+}
+
+type Metrics struct {
+	ConfigSourceTimestamp time.Time
+	ConfigSourceSHA256    string
 }
 
 type ServiceHealth struct {
@@ -147,6 +159,11 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 					}
 				} else {
 					result = agg.ping(pingURL)
+					m, err := agg.metrics(pingURL)
+					if err != nil {
+						result.Error = "metrics: " + err.Error()
+					}
+					result.Metrics = m
 				}
 
 				mtx.Lock()
@@ -173,6 +190,27 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 			break
 		}
 	}
+
+	var newest Metrics
+	for _, result := range resp.Checks {
+		if result.Metrics.ConfigSourceTimestamp.After(newest.ConfigSourceTimestamp) {
+			newest = result.Metrics
+		}
+	}
+	var mismatches []string
+	for target, result := range resp.Checks {
+		if hash := result.Metrics.ConfigSourceSHA256; hash != "" && hash != newest.ConfigSourceSHA256 {
+			mismatches = append(mismatches, target)
+		}
+	}
+	for _, target := range mismatches {
+		msg := fmt.Sprintf("outdated config: %s: config file (sha256 %s) does not match latest version with timestamp %s",
+			strings.TrimSuffix(target, "/_health/ping"),
+			resp.Checks[target].Metrics.ConfigSourceSHA256,
+			newest.ConfigSourceTimestamp.Format(time.RFC3339))
+		resp.Errors = append(resp.Errors, msg)
+		resp.Health = "ERROR"
+	}
 	return resp
 }
 
@@ -194,7 +232,9 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
 		}
 	}()
 
-	req, err := http.NewRequest("GET", target.String(), nil)
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(agg.timeout))
+	defer cancel()
+	req, err := http.NewRequestWithContext(ctx, "GET", target.String(), nil)
 	if err != nil {
 		return
 	}
@@ -203,9 +243,6 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
 	// Avoid workbench1's redirect-http-to-https feature
 	req.Header.Set("X-Forwarded-Proto", "https")
 
-	ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
-	defer cancel()
-	req = req.WithContext(ctx)
 	resp, err := agg.httpClient.Do(req)
 	if err != nil {
 		return
@@ -227,6 +264,49 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
 	return
 }
 
+var reMetric = regexp.MustCompile(`([a-z_]+){sha256="([0-9a-f]+)"} (\d[\d\.e\+]+)`)
+
+func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) {
+	metricsURL, err := pingURL.Parse("/metrics")
+	if err != nil {
+		return
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(agg.timeout))
+	defer cancel()
+	req, err := http.NewRequestWithContext(ctx, "GET", metricsURL.String(), nil)
+	if err != nil {
+		return
+	}
+	req.Header.Set("Authorization", "Bearer "+agg.Cluster.ManagementToken)
+
+	// Avoid workbench1's redirect-http-to-https feature
+	req.Header.Set("X-Forwarded-Proto", "https")
+
+	resp, err := agg.httpClient.Do(req)
+	if err != nil {
+		return
+	} else if resp.StatusCode != http.StatusOK {
+		err = fmt.Errorf("%s: HTTP %d %s", metricsURL.String(), resp.StatusCode, resp.Status)
+		return
+	}
+
+	scanner := bufio.NewScanner(resp.Body)
+	for scanner.Scan() {
+		m := reMetric.FindSubmatch(scanner.Bytes())
+		if len(m) != 4 || string(m[1]) != "arvados_config_source_timestamp_seconds" {
+			continue
+		}
+		result.ConfigSourceSHA256 = string(m[2])
+		unixtime, _ := strconv.ParseFloat(string(m[3]), 64)
+		result.ConfigSourceTimestamp = time.UnixMicro(int64(unixtime * 1e6))
+	}
+	if err = scanner.Err(); err != nil {
+		err = fmt.Errorf("error parsing response from %s: %w", metricsURL.String(), err)
+		return
+	}
+	return
+}
+
 func (agg *Aggregator) checkAuth(req *http.Request) bool {
 	creds := auth.CredentialsFromRequest(req)
 	for _, token := range creds.Tokens {
diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go
index 62eca894b..5d76c19f2 100644
--- a/sdk/go/health/aggregator_test.go
+++ b/sdk/go/health/aggregator_test.go
@@ -6,7 +6,9 @@ package health
 
 import (
 	"bytes"
+	"crypto/sha256"
 	"encoding/json"
+	"fmt"
 	"io/ioutil"
 	"net/http"
 	"net/http/httptest"
@@ -121,6 +123,65 @@ func (s *AggregatorSuite) TestHealthyAndUnhealthy(c *check.C) {
 	c.Logf("%#v", ep)
 }
 
+func (s *AggregatorSuite) TestConfigMismatch(c *check.C) {
+	// time1/hash1: current config
+	time1 := time.Now().Add(time.Second - time.Minute - time.Hour)
+	hash1 := fmt.Sprintf("%x", sha256.Sum256([]byte(`Clusters: {zzzzz: {SystemRootToken: xyzzy}}`)))
+	// time2/hash2: old config
+	time2 := time1.Add(-time.Hour)
+	hash2 := fmt.Sprintf("%x", sha256.Sum256([]byte(`Clusters: {zzzzz: {SystemRootToken: old-token}}`)))
+
+	// srv1: current file
+	handler1 := healthyHandler{configHash: hash1, configTime: time1}
+	srv1, listen1 := s.stubServer(&handler1)
+	defer srv1.Close()
+	// srv2: old file, current content
+	handler2 := healthyHandler{configHash: hash1, configTime: time2}
+	srv2, listen2 := s.stubServer(&handler2)
+	defer srv2.Close()
+	// srv3: old file, old content
+	handler3 := healthyHandler{configHash: hash2, configTime: time2}
+	srv3, listen3 := s.stubServer(&handler3)
+	defer srv3.Close()
+	// srv4: no metrics handler
+	handler4 := healthyHandler{}
+	srv4, listen4 := s.stubServer(&handler4)
+	defer srv4.Close()
+
+	s.setAllServiceURLs(listen1)
+
+	// listen2 => old timestamp, same content => no problem
+	s.resp = httptest.NewRecorder()
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.DispatchCloud,
+		"http://localhost"+listen2+"/")
+	s.handler.ServeHTTP(s.resp, s.req)
+	resp := s.checkOK(c)
+
+	// listen4 => no metrics on some services => no problem
+	s.resp = httptest.NewRecorder()
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV,
+		"http://localhost"+listen4+"/")
+	s.handler.ServeHTTP(s.resp, s.req)
+	resp = s.checkOK(c)
+
+	// listen3 => old timestamp, old content => report discrepancy
+	s.resp = httptest.NewRecorder()
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepstore,
+		"http://localhost"+listen1+"/",
+		"http://localhost"+listen3+"/")
+	s.handler.ServeHTTP(s.resp, s.req)
+	resp = s.checkUnhealthy(c)
+	if c.Check(len(resp.Errors) > 0, check.Equals, true) {
+		c.Check(resp.Errors[0], check.Matches, `outdated config: \Qkeepstore+http://localhost`+listen3+`\E: config file \(sha256 .*\) does not match latest version with timestamp .*`)
+	}
+
+	// no services report config time (migrating to current version) => no problem
+	s.resp = httptest.NewRecorder()
+	s.setAllServiceURLs(listen4)
+	s.handler.ServeHTTP(s.resp, s.req)
+	s.checkOK(c)
+}
+
 func (s *AggregatorSuite) TestPingTimeout(c *check.C) {
 	s.handler.timeout = arvados.Duration(100 * time.Millisecond)
 	srv, listen := s.stubServer(&slowHandler{})
@@ -143,7 +204,7 @@ func (s *AggregatorSuite) TestCheckCommand(c *check.C) {
 	tmpdir := c.MkDir()
 	confdata, err := yaml.Marshal(arvados.Config{Clusters: map[string]arvados.Cluster{s.handler.Cluster.ClusterID: *s.handler.Cluster}})
 	c.Assert(err, check.IsNil)
-	confdata = regexp.MustCompile(`SourceTimestamp: [^\n]+\n`).ReplaceAll(confdata, []byte{})
+	confdata = regexp.MustCompile(`Source(Timestamp|SHA256): [^\n]+\n`).ReplaceAll(confdata, []byte{})
 	err = ioutil.WriteFile(tmpdir+"/config.yml", confdata, 0777)
 	c.Assert(err, check.IsNil)
 	var stdout, stderr bytes.Buffer
@@ -209,11 +270,37 @@ func (*unhealthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request)
 	}
 }
 
-type healthyHandler struct{}
+type healthyHandler struct {
+	configHash string
+	configTime time.Time
+}
 
-func (*healthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+func (h *healthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+	authOK := req.Header.Get("Authorization") == "Bearer "+arvadostest.ManagementToken
 	if req.URL.Path == "/_health/ping" {
+		if !authOK {
+			http.Error(resp, "unauthorized", http.StatusUnauthorized)
+			return
+		}
 		resp.Write([]byte(`{"health":"OK"}`))
+	} else if req.URL.Path == "/metrics" {
+		if !authOK {
+			http.Error(resp, "unauthorized", http.StatusUnauthorized)
+			return
+		}
+		t := h.configTime
+		if t.IsZero() {
+			t = time.Now()
+		}
+		fmt.Fprintf(resp, `# HELP arvados_config_load_timestamp_seconds Time when config file was loaded.
+# TYPE arvados_config_load_timestamp_seconds gauge
+arvados_config_load_timestamp_seconds{sha256="%s"} %g
+# HELP arvados_config_source_timestamp_seconds Timestamp of config file when it was loaded.
+# TYPE arvados_config_source_timestamp_seconds gauge
+arvados_config_source_timestamp_seconds{sha256="%s"} %g
+`,
+			h.configHash, float64(time.Now().UnixNano())/1e9,
+			h.configHash, float64(t.UnixNano())/1e9)
 	} else {
 		http.Error(resp, "not found", http.StatusNotFound)
 	}

commit a92e041a642687ca27d8bacf298faddcd442f3e1
Author: Tom Clegg <tom at curii.com>
Date:   Fri Apr 8 17:48:21 2022 -0400

    18794: Export config load time as prometheus metric.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/config/load.go b/lib/config/load.go
index d03ef5388..1ad31402c 100644
--- a/lib/config/load.go
+++ b/lib/config/load.go
@@ -53,6 +53,8 @@ type Loader struct {
 	// read configdata from, or the time when we read configdata
 	// from a pipe.
 	sourceTimestamp time.Time
+	// UTC time when configdata was read.
+	loadTimestamp time.Time
 }
 
 // NewLoader returns a new Loader with Stdin and Logger set to the
@@ -173,32 +175,36 @@ func (ldr *Loader) MungeLegacyConfigArgs(lgr logrus.FieldLogger, args []string,
 	return munged
 }
 
-func (ldr *Loader) loadBytes(path string) ([]byte, time.Time, error) {
+func (ldr *Loader) loadBytes(path string) (buf []byte, sourceTime, loadTime time.Time, err error) {
+	loadTime = time.Now().UTC()
 	if path == "-" {
-		buf, err := ioutil.ReadAll(ldr.Stdin)
-		return buf, time.Now().UTC(), err
+		buf, err = ioutil.ReadAll(ldr.Stdin)
+		sourceTime = loadTime
+		return
 	}
 	f, err := os.Open(path)
 	if err != nil {
-		return nil, time.Time{}, err
+		return
 	}
 	defer f.Close()
 	fi, err := f.Stat()
 	if err != nil {
-		return nil, time.Time{}, err
+		return
 	}
-	buf, err := ioutil.ReadAll(f)
-	return buf, fi.ModTime().UTC(), err
+	sourceTime = fi.ModTime().UTC()
+	buf, err = ioutil.ReadAll(f)
+	return
 }
 
 func (ldr *Loader) Load() (*arvados.Config, error) {
 	if ldr.configdata == nil {
-		buf, t, err := ldr.loadBytes(ldr.Path)
+		buf, sourceTime, loadTime, err := ldr.loadBytes(ldr.Path)
 		if err != nil {
 			return nil, err
 		}
 		ldr.configdata = buf
-		ldr.sourceTimestamp = t
+		ldr.sourceTimestamp = sourceTime
+		ldr.loadTimestamp = loadTime
 	}
 
 	// FIXME: We should reject YAML if the same key is used twice
@@ -579,12 +585,22 @@ func (ldr *Loader) autofillPreemptible(label string, cc *arvados.Cluster) {
 // called before Load(). Metrics are not updated by subsequent calls
 // to Load().
 func (ldr *Loader) RegisterMetrics(reg *prometheus.Registry) {
+	hash := fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))
 	vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: "arvados",
 		Subsystem: "config",
 		Name:      "source_timestamp_seconds",
 		Help:      "Timestamp of config file when it was loaded.",
 	}, []string{"sha256"})
-	vec.WithLabelValues(fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))).Set(float64(ldr.sourceTimestamp.UnixNano()) / 1e9)
+	vec.WithLabelValues(hash).Set(float64(ldr.sourceTimestamp.UnixNano()) / 1e9)
+	reg.MustRegister(vec)
+
+	vec = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "config",
+		Name:      "load_timestamp_seconds",
+		Help:      "Time when config file was loaded.",
+	}, []string{"sha256"})
+	vec.WithLabelValues(hash).Set(float64(ldr.loadTimestamp.UnixNano()) / 1e9)
 	reg.MustRegister(vec)
 }
diff --git a/lib/config/load_test.go b/lib/config/load_test.go
index 48e2d33a4..3daa66fc9 100644
--- a/lib/config/load_test.go
+++ b/lib/config/load_test.go
@@ -803,7 +803,9 @@ func (s *LoadSuite) TestSourceTimestamp(c *check.C) {
 		cfg, err := ldr.Load()
 		c.Assert(err, check.IsNil)
 		c.Check(cfg.SourceTimestamp, check.Equals, cfg.SourceTimestamp.UTC())
+		c.Check(cfg.SourceTimestamp, check.Equals, ldr.sourceTimestamp)
 		c.Check(int(cfg.SourceTimestamp.Sub(trial.expectTime).Seconds()), check.Equals, 0)
+		c.Check(int(ldr.loadTimestamp.Sub(time.Now()).Seconds()), check.Equals, 0)
 
 		var buf bytes.Buffer
 		reg := prometheus.NewRegistry()
@@ -813,6 +815,12 @@ func (s *LoadSuite) TestSourceTimestamp(c *check.C) {
 		for _, mf := range got {
 			enc.Encode(mf)
 		}
-		c.Check(buf.String(), check.Matches, `# HELP .*\n# TYPE .*\narvados_config_source_timestamp_seconds{sha256="83aea5d82eb1d53372cd65c936c60acc1c6ef946e61977bbca7cfea709d201a8"} \Q`+fmt.Sprintf("%g", float64(cfg.SourceTimestamp.UnixNano())/1e9)+`\E\n`)
+		c.Check(buf.String(), check.Matches, `# HELP .*
+# TYPE .*
+arvados_config_load_timestamp_seconds{sha256="83aea5d82eb1d53372cd65c936c60acc1c6ef946e61977bbca7cfea709d201a8"} \Q`+fmt.Sprintf("%g", float64(ldr.loadTimestamp.UnixNano())/1e9)+`\E
+# HELP .*
+# TYPE .*
+arvados_config_source_timestamp_seconds{sha256="83aea5d82eb1d53372cd65c936c60acc1c6ef946e61977bbca7cfea709d201a8"} \Q`+fmt.Sprintf("%g", float64(cfg.SourceTimestamp.UnixNano())/1e9)+`\E
+`)
 	}
 }

commit 4e10563689b0a874612d133f489ff8af69da6063
Author: Tom Clegg <tom at curii.com>
Date:   Fri Apr 8 16:05:26 2022 -0400

    18794: Export config timestamp/hash as prometheus metric.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/config/load.go b/lib/config/load.go
index 28f300e3b..d03ef5388 100644
--- a/lib/config/load.go
+++ b/lib/config/load.go
@@ -6,6 +6,7 @@ package config
 
 import (
 	"bytes"
+	"crypto/sha256"
 	_ "embed"
 	"encoding/json"
 	"errors"
@@ -22,6 +23,7 @@ import (
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"github.com/ghodss/yaml"
 	"github.com/imdario/mergo"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 )
 
@@ -569,3 +571,20 @@ func (ldr *Loader) autofillPreemptible(label string, cc *arvados.Cluster) {
 	}
 
 }
+
+// RegisterMetrics registers metrics showing the timestamp and content
+// hash of the currently loaded config.
+//
+// Must not be called more than once for a given registry. Must not be
+// called before Load(). Metrics are not updated by subsequent calls
+// to Load().
+func (ldr *Loader) RegisterMetrics(reg *prometheus.Registry) {
+	vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "config",
+		Name:      "source_timestamp_seconds",
+		Help:      "Timestamp of config file when it was loaded.",
+	}, []string{"sha256"})
+	vec.WithLabelValues(fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))).Set(float64(ldr.sourceTimestamp.UnixNano()) / 1e9)
+	reg.MustRegister(vec)
+}
diff --git a/lib/config/load_test.go b/lib/config/load_test.go
index 97830b4e1..48e2d33a4 100644
--- a/lib/config/load_test.go
+++ b/lib/config/load_test.go
@@ -20,6 +20,8 @@ import (
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/ctxlog"
 	"github.com/ghodss/yaml"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/common/expfmt"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 	check "gopkg.in/check.v1"
@@ -802,5 +804,15 @@ func (s *LoadSuite) TestSourceTimestamp(c *check.C) {
 		c.Assert(err, check.IsNil)
 		c.Check(cfg.SourceTimestamp, check.Equals, cfg.SourceTimestamp.UTC())
 		c.Check(int(cfg.SourceTimestamp.Sub(trial.expectTime).Seconds()), check.Equals, 0)
+
+		var buf bytes.Buffer
+		reg := prometheus.NewRegistry()
+		ldr.RegisterMetrics(reg)
+		enc := expfmt.NewEncoder(&buf, expfmt.FmtText)
+		got, _ := reg.Gather()
+		for _, mf := range got {
+			enc.Encode(mf)
+		}
+		c.Check(buf.String(), check.Matches, `# HELP .*\n# TYPE .*\narvados_config_source_timestamp_seconds{sha256="83aea5d82eb1d53372cd65c936c60acc1c6ef946e61977bbca7cfea709d201a8"} \Q`+fmt.Sprintf("%g", float64(cfg.SourceTimestamp.UnixNano())/1e9)+`\E\n`)
 	}
 }
diff --git a/lib/service/cmd.go b/lib/service/cmd.go
index dbafc89fe..d11e4a74d 100644
--- a/lib/service/cmd.go
+++ b/lib/service/cmd.go
@@ -117,6 +117,8 @@ func (c *command) RunCommand(prog string, args []string, stdin io.Reader, stdout
 	ctx = context.WithValue(ctx, contextKeyURL{}, listenURL)
 
 	reg := prometheus.NewRegistry()
+	loader.RegisterMetrics(reg)
+
 	handler := c.newHandler(ctx, cluster, cluster.SystemRootToken, reg)
 	if err = handler.CheckHealth(); err != nil {
 		return 1

commit 03cd0561b3816fb2448ec355be6771cc49074ef9
Author: Tom Clegg <tom at curii.com>
Date:   Fri Apr 8 11:08:14 2022 -0400

    18794: Add SourceTimestamp and SourceSHA256 to config-dump output.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/config/load.go b/lib/config/load.go
index 5afb51c5a..28f300e3b 100644
--- a/lib/config/load.go
+++ b/lib/config/load.go
@@ -17,6 +17,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"time"
 
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"github.com/ghodss/yaml"
@@ -46,6 +47,10 @@ type Loader struct {
 	KeepBalancePath         string
 
 	configdata []byte
+	// UTC time for configdata: either the modtime of the file we
+	// read configdata from, or the time when we read configdata
+	// from a pipe.
+	sourceTimestamp time.Time
 }
 
 // NewLoader returns a new Loader with Stdin and Logger set to the
@@ -166,25 +171,32 @@ func (ldr *Loader) MungeLegacyConfigArgs(lgr logrus.FieldLogger, args []string,
 	return munged
 }
 
-func (ldr *Loader) loadBytes(path string) ([]byte, error) {
+func (ldr *Loader) loadBytes(path string) ([]byte, time.Time, error) {
 	if path == "-" {
-		return ioutil.ReadAll(ldr.Stdin)
+		buf, err := ioutil.ReadAll(ldr.Stdin)
+		return buf, time.Now().UTC(), err
 	}
 	f, err := os.Open(path)
 	if err != nil {
-		return nil, err
+		return nil, time.Time{}, err
 	}
 	defer f.Close()
-	return ioutil.ReadAll(f)
+	fi, err := f.Stat()
+	if err != nil {
+		return nil, time.Time{}, err
+	}
+	buf, err := ioutil.ReadAll(f)
+	return buf, fi.ModTime().UTC(), err
 }
 
 func (ldr *Loader) Load() (*arvados.Config, error) {
 	if ldr.configdata == nil {
-		buf, err := ldr.loadBytes(ldr.Path)
+		buf, t, err := ldr.loadBytes(ldr.Path)
 		if err != nil {
 			return nil, err
 		}
 		ldr.configdata = buf
+		ldr.sourceTimestamp = t
 	}
 
 	// FIXME: We should reject YAML if the same key is used twice
@@ -330,6 +342,8 @@ func (ldr *Loader) Load() (*arvados.Config, error) {
 			}
 		}
 	}
+	cfg.SourceTimestamp = ldr.sourceTimestamp
+	cfg.SourceSHA256 = fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))
 	return &cfg, nil
 }
 
diff --git a/lib/config/load_test.go b/lib/config/load_test.go
index 5270dcccc..97830b4e1 100644
--- a/lib/config/load_test.go
+++ b/lib/config/load_test.go
@@ -12,13 +12,16 @@ import (
 	"os"
 	"os/exec"
 	"reflect"
+	"regexp"
 	"strings"
 	"testing"
+	"time"
 
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/ctxlog"
 	"github.com/ghodss/yaml"
 	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
 	check "gopkg.in/check.v1"
 )
 
@@ -315,8 +318,16 @@ Clusters:
 	c.Assert(err, check.IsNil)
 	yaml, err := yaml.Marshal(cfg)
 	c.Assert(err, check.IsNil)
+	// Well, *nearly* no warnings. SourceTimestamp and
+	// SourceSHA256 are included in a config-dump, but not
+	// expected in a real config file.
+	yaml = regexp.MustCompile(`(^|\n)(Source(Timestamp|SHA256): .*?\n)+`).ReplaceAll(yaml, []byte("$1"))
 	cfgDumped, err := testLoader(c, string(yaml), &logbuf).Load()
 	c.Assert(err, check.IsNil)
+	// SourceTimestamp and SourceSHA256 aren't expected to be
+	// preserved through dump+load
+	cfgDumped.SourceTimestamp = cfg.SourceTimestamp
+	cfgDumped.SourceSHA256 = cfg.SourceSHA256
 	c.Check(cfg, check.DeepEquals, cfgDumped)
 	c.Check(logbuf.String(), check.Equals, "")
 }
@@ -503,6 +514,12 @@ func checkEquivalentLoaders(c *check.C, gotldr, expectedldr *Loader) {
 	c.Assert(err, check.IsNil)
 	expected, err := expectedldr.Load()
 	c.Assert(err, check.IsNil)
+	// The inputs generally aren't even files, so SourceTimestamp
+	// can't be expected to match.
+	got.SourceTimestamp = expected.SourceTimestamp
+	// Obviously the content isn't identical -- otherwise we
+	// wouldn't need to check that it's equivalent.
+	got.SourceSHA256 = expected.SourceSHA256
 	checkEqualYAML(c, got, expected)
 }
 
@@ -762,3 +779,28 @@ Clusters:
 	c.Check(logbuf.String(), check.Not(check.Matches), `(?ms).*Type2\.preemptible.*`)
 	c.Check(logbuf.String(), check.Not(check.Matches), `(?ms).*(z1111|z2222)[^\n]*InstanceTypes.*`)
 }
+
+func (s *LoadSuite) TestSourceTimestamp(c *check.C) {
+	conftime, err := time.Parse(time.RFC3339, "2022-03-04T05:06:07-08:00")
+	c.Assert(err, check.IsNil)
+	confdata := `Clusters: {zzzzz: {}}`
+	conffile := c.MkDir() + "/config.yml"
+	ioutil.WriteFile(conffile, []byte(confdata), 0777)
+	tv := unix.NsecToTimeval(conftime.UnixNano())
+	unix.Lutimes(conffile, []unix.Timeval{tv, tv})
+	for _, trial := range []struct {
+		configarg  string
+		expectTime time.Time
+	}{
+		{"-", time.Now()},
+		{conffile, conftime},
+	} {
+		c.Logf("trial: %+v", trial)
+		ldr := NewLoader(strings.NewReader(confdata), ctxlog.TestLogger(c))
+		ldr.Path = trial.configarg
+		cfg, err := ldr.Load()
+		c.Assert(err, check.IsNil)
+		c.Check(cfg.SourceTimestamp, check.Equals, cfg.SourceTimestamp.UTC())
+		c.Check(int(cfg.SourceTimestamp.Sub(trial.expectTime).Seconds()), check.Equals, 0)
+	}
+}
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 6c9324e47..b508a3f05 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -10,6 +10,7 @@ import (
 	"fmt"
 	"net/url"
 	"os"
+	"time"
 
 	"git.arvados.org/arvados.git/sdk/go/config"
 )
@@ -24,6 +25,8 @@ var DefaultConfigFile = func() string {
 type Config struct {
 	Clusters         map[string]Cluster
 	AutoReloadConfig bool
+	SourceTimestamp  time.Time
+	SourceSHA256     string
 }
 
 // GetConfig returns the current system config, loading it from
diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go
index 05f0bdd31..62eca894b 100644
--- a/sdk/go/health/aggregator_test.go
+++ b/sdk/go/health/aggregator_test.go
@@ -10,6 +10,7 @@ import (
 	"io/ioutil"
 	"net/http"
 	"net/http/httptest"
+	"regexp"
 	"strings"
 	"time"
 
@@ -142,6 +143,7 @@ func (s *AggregatorSuite) TestCheckCommand(c *check.C) {
 	tmpdir := c.MkDir()
 	confdata, err := yaml.Marshal(arvados.Config{Clusters: map[string]arvados.Cluster{s.handler.Cluster.ClusterID: *s.handler.Cluster}})
 	c.Assert(err, check.IsNil)
+	confdata = regexp.MustCompile(`SourceTimestamp: [^\n]+\n`).ReplaceAll(confdata, []byte{})
 	err = ioutil.WriteFile(tmpdir+"/config.yml", confdata, 0777)
 	c.Assert(err, check.IsNil)
 	var stdout, stderr bytes.Buffer

commit 541c300ca6abb15449e917f648ae5ffd68087ff9
Author: Tom Clegg <tom at curii.com>
Date:   Thu Apr 7 16:11:55 2022 -0400

    18794: Add "arvados-server check" subcommand.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/cmd/arvados-server/cmd.go b/cmd/arvados-server/cmd.go
index c8b945bea..4b48301eb 100644
--- a/cmd/arvados-server/cmd.go
+++ b/cmd/arvados-server/cmd.go
@@ -17,6 +17,7 @@ import (
 	"git.arvados.org/arvados.git/lib/install"
 	"git.arvados.org/arvados.git/lib/lsf"
 	"git.arvados.org/arvados.git/lib/recovercollection"
+	"git.arvados.org/arvados.git/sdk/go/health"
 	"git.arvados.org/arvados.git/services/keepstore"
 	"git.arvados.org/arvados.git/services/ws"
 )
@@ -28,6 +29,7 @@ var (
 		"--version": cmd.Version,
 
 		"boot":               boot.Command,
+		"check":              health.CheckCommand,
 		"cloudtest":          cloudtest.Command,
 		"config-check":       config.CheckCommand,
 		"config-defaults":    config.DumpDefaultsCommand,
diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go
index a666ef8ec..4175bc544 100644
--- a/sdk/go/health/aggregator.go
+++ b/sdk/go/health/aggregator.go
@@ -8,19 +8,26 @@ import (
 	"context"
 	"encoding/json"
 	"errors"
+	"flag"
 	"fmt"
+	"io"
 	"net/http"
 	"net/url"
 	"sync"
 	"time"
 
+	"git.arvados.org/arvados.git/lib/cmd"
+	"git.arvados.org/arvados.git/lib/config"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/auth"
+	"git.arvados.org/arvados.git/sdk/go/ctxlog"
+	"github.com/ghodss/yaml"
+	"github.com/sirupsen/logrus"
 )
 
 const defaultTimeout = arvados.Duration(2 * time.Second)
 
-// Aggregator implements http.Handler. It handles "GET /_health/all"
+// Aggregator implements service.Handler. It handles "GET /_health/all"
 // by checking the health of all configured services on the cluster
 // and responding 200 if everything is healthy.
 type Aggregator struct {
@@ -229,3 +236,61 @@ func (agg *Aggregator) checkAuth(req *http.Request) bool {
 	}
 	return false
 }
+
+var errSilent = errors.New("")
+
+var CheckCommand cmd.Handler = checkCommand{}
+
+type checkCommand struct{}
+
+func (ccmd checkCommand) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+	logger := ctxlog.New(stderr, "json", "info")
+	ctx := ctxlog.Context(context.Background(), logger)
+	err := ccmd.run(ctx, prog, args, stdin, stdout, stderr)
+	if err != nil {
+		if err != errSilent {
+			fmt.Fprintln(stdout, err.Error())
+		}
+		return 1
+	}
+	return 0
+}
+
+func (ccmd checkCommand) run(ctx context.Context, prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) error {
+	flags := flag.NewFlagSet("", flag.ContinueOnError)
+	flags.SetOutput(stderr)
+	loader := config.NewLoader(stdin, ctxlog.New(stderr, "text", "info"))
+	loader.SetupFlags(flags)
+	versionFlag := flags.Bool("version", false, "Write version information to stdout and exit 0")
+	timeout := flags.Duration("timeout", defaultTimeout.Duration(), "Maximum time to wait for health responses")
+	if ok, _ := cmd.ParseFlags(flags, prog, args, "", stderr); !ok {
+		// cmd.ParseFlags already reported the error
+		return errSilent
+	} else if *versionFlag {
+		cmd.Version.RunCommand(prog, args, stdin, stdout, stderr)
+		return nil
+	}
+	cfg, err := loader.Load()
+	if err != nil {
+		return err
+	}
+	cluster, err := cfg.GetCluster("")
+	if err != nil {
+		return err
+	}
+	logger := ctxlog.New(stderr, cluster.SystemLogs.Format, cluster.SystemLogs.LogLevel).WithFields(logrus.Fields{
+		"ClusterID": cluster.ClusterID,
+	})
+	ctx = ctxlog.Context(ctx, logger)
+	agg := Aggregator{Cluster: cluster, timeout: arvados.Duration(*timeout)}
+	resp := agg.ClusterHealth()
+	buf, err := yaml.Marshal(resp)
+	if err != nil {
+		return err
+	}
+	stdout.Write(buf)
+	if resp.Health != "OK" {
+		return fmt.Errorf("health check failed")
+	}
+	return nil
+}
diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go
index 04106caa4..05f0bdd31 100644
--- a/sdk/go/health/aggregator_test.go
+++ b/sdk/go/health/aggregator_test.go
@@ -5,14 +5,19 @@
 package health
 
 import (
+	"bytes"
 	"encoding/json"
+	"io/ioutil"
 	"net/http"
 	"net/http/httptest"
 	"strings"
 	"time"
 
+	"git.arvados.org/arvados.git/lib/config"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/arvadostest"
+	"git.arvados.org/arvados.git/sdk/go/ctxlog"
+	"github.com/ghodss/yaml"
 	"gopkg.in/check.v1"
 )
 
@@ -30,9 +35,17 @@ func (s *AggregatorSuite) TestInterface(c *check.C) {
 }
 
 func (s *AggregatorSuite) SetUpTest(c *check.C) {
-	s.handler = &Aggregator{Cluster: &arvados.Cluster{
-		ManagementToken: arvadostest.ManagementToken,
-	}}
+	ldr := config.NewLoader(bytes.NewBufferString(`Clusters: {zzzzz: {}}`), ctxlog.TestLogger(c))
+	ldr.Path = "-"
+	cfg, err := ldr.Load()
+	c.Assert(err, check.IsNil)
+	cluster, err := cfg.GetCluster("")
+	c.Assert(err, check.IsNil)
+	cluster.ManagementToken = arvadostest.ManagementToken
+	cluster.SystemRootToken = arvadostest.SystemRootToken
+	cluster.Collections.BlobSigningKey = arvadostest.BlobSigningKey
+	cluster.Volumes["z"] = arvados.Volume{StorageClasses: map[string]bool{"default": true}}
+	s.handler = &Aggregator{Cluster: cluster}
 	s.req = httptest.NewRequest("GET", "/_health/all", nil)
 	s.req.Header.Set("Authorization", "Bearer "+arvadostest.ManagementToken)
 	s.resp = httptest.NewRecorder()
@@ -122,6 +135,22 @@ func (s *AggregatorSuite) TestPingTimeout(c *check.C) {
 	c.Check(rt > 0.005, check.Equals, true)
 }
 
+func (s *AggregatorSuite) TestCheckCommand(c *check.C) {
+	srv, listen := s.stubServer(&healthyHandler{})
+	defer srv.Close()
+	s.setAllServiceURLs(listen)
+	tmpdir := c.MkDir()
+	confdata, err := yaml.Marshal(arvados.Config{Clusters: map[string]arvados.Cluster{s.handler.Cluster.ClusterID: *s.handler.Cluster}})
+	c.Assert(err, check.IsNil)
+	err = ioutil.WriteFile(tmpdir+"/config.yml", confdata, 0777)
+	c.Assert(err, check.IsNil)
+	var stdout, stderr bytes.Buffer
+	exitcode := CheckCommand.RunCommand("check", []string{"-config=" + tmpdir + "/config.yml"}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 0)
+	c.Check(stderr.String(), check.Equals, "")
+	c.Check(stdout.String(), check.Matches, `(?ms).*(\n|^)health: OK\n.*`)
+}
+
 func (s *AggregatorSuite) checkError(c *check.C) {
 	c.Check(s.resp.Code, check.Not(check.Equals), http.StatusOK)
 	var resp ClusterHealthResponse

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list