[ARVADOS] created: 2.1.0-2249-g2359fecc1
Git user
git at public.arvados.org
Wed Apr 13 19:15:36 UTC 2022
at 2359fecc1866212534b2148e22a0b8803e421455 (commit)
commit 2359fecc1866212534b2148e22a0b8803e421455
Author: Tom Clegg <tom at curii.com>
Date: Wed Apr 13 15:11:56 2022 -0400
18794: Avoid failing health check on incomplete-but-working config.
It is typical to have some InternalURLs that are only reachable
from the same node (not necessarily where health-check runs) or are
missing entirely because only Nginx needs to know where they are.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go
index 864ef592b..6e2c08e4c 100644
--- a/sdk/go/health/aggregator.go
+++ b/sdk/go/health/aggregator.go
@@ -6,12 +6,14 @@ package health
import (
"bufio"
+ "bytes"
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
+ "net"
"net/http"
"net/url"
"regexp"
@@ -124,7 +126,7 @@ type Metrics struct {
}
type ServiceHealth struct {
- Health string `json:"health"`
+ Health string `json:"health"` // "OK", "ERROR", or "SKIP"
N int `json:"n"`
}
@@ -142,7 +144,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
// Ensure svc is listed in resp.Services.
mtx.Lock()
if _, ok := resp.Services[svcName]; !ok {
- resp.Services[svcName] = ServiceHealth{Health: "ERROR"}
+ resp.Services[svcName] = ServiceHealth{Health: "NONE"}
}
mtx.Unlock()
@@ -159,11 +161,13 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
}
} else {
result = agg.ping(pingURL)
- m, err := agg.metrics(pingURL)
- if err != nil {
- result.Error = "metrics: " + err.Error()
+ if result.Health != "SKIP" {
+ m, err := agg.metrics(pingURL)
+ if err != nil && result.Error == "" {
+ result.Error = "metrics: " + err.Error()
+ }
+ result.Metrics = m
}
- result.Metrics = m
}
mtx.Lock()
@@ -174,7 +178,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
h.N++
h.Health = "OK"
resp.Services[svcName] = h
- } else {
+ } else if result.Health != "SKIP" {
resp.Health = "ERROR"
}
}(svcName, addr)
@@ -184,10 +188,20 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
// Report ERROR if a needed service didn't fail any checks
// merely because it isn't configured to run anywhere.
- for _, sh := range resp.Services {
- if sh.Health != "OK" {
- resp.Health = "ERROR"
- break
+ for svcName, sh := range resp.Services {
+ switch svcName {
+ case arvados.ServiceNameDispatchCloud,
+ arvados.ServiceNameDispatchLSF:
+ // ok to not run any given dispatcher
+ case arvados.ServiceNameHealth,
+ arvados.ServiceNameWorkbench1,
+ arvados.ServiceNameWorkbench2:
+ // typically doesn't have InternalURLs in config
+ default:
+ if sh.Health != "OK" && sh.Health != "SKIP" {
+ resp.Health = "ERROR"
+ continue
+ }
}
}
@@ -244,6 +258,16 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
req.Header.Set("X-Forwarded-Proto", "https")
resp, err := agg.httpClient.Do(req)
+ if urlerr, ok := err.(*url.Error); ok {
+ if neterr, ok := urlerr.Err.(*net.OpError); ok && isLocalHost(target.Hostname()) {
+ result = CheckResult{
+ Health: "SKIP",
+ Error: neterr.Error(),
+ }
+ err = nil
+ return
+ }
+ }
if err != nil {
return
}
@@ -307,6 +331,13 @@ func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) {
return
}
+// Test whether host is an easily recognizable loopback address:
+// 0.0.0.0, 127.x.x.x, ::1, or localhost.
+func isLocalHost(host string) bool {
+ ip := net.ParseIP(host)
+ return ip.IsLoopback() || bytes.Equal(ip.To4(), []byte{0, 0, 0, 0}) || strings.EqualFold(host, "localhost")
+}
+
func (agg *Aggregator) checkAuth(req *http.Request) bool {
creds := auth.CredentialsFromRequest(req)
for _, token := range creds.Tokens {
diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go
index 5d76c19f2..050b2c0fc 100644
--- a/sdk/go/health/aggregator_test.go
+++ b/sdk/go/health/aggregator_test.go
@@ -123,6 +123,44 @@ func (s *AggregatorSuite) TestHealthyAndUnhealthy(c *check.C) {
c.Logf("%#v", ep)
}
+// If an InternalURL host is 0.0.0.0, localhost, 127/8, or ::1 and
+// nothing is listening there, don't fail the health check -- instead,
+// assume the relevant component just isn't installed/enabled on this
+// node, but does work when contacted through ExternalURL.
+func (s *AggregatorSuite) TestUnreachableLoopbackPort(c *check.C) {
+ srvH, listenH := s.stubServer(&healthyHandler{})
+ defer srvH.Close()
+ s.setAllServiceURLs(listenH)
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepproxy, "http://localhost:9/")
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.Workbench1, "http://0.0.0.0:9/")
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepbalance, "http://127.0.0.127:9/")
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV, "http://[::1]:9/")
+ s.handler.ServeHTTP(s.resp, s.req)
+ s.checkOK(c)
+
+ // If a non-loopback address is unreachable, that's still a
+ // fail.
+ s.resp = httptest.NewRecorder()
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV, "http://172.31.255.254:9/")
+ s.handler.ServeHTTP(s.resp, s.req)
+ s.checkUnhealthy(c)
+}
+
+func (s *AggregatorSuite) TestIsLocalHost(c *check.C) {
+ c.Check(isLocalHost("Localhost"), check.Equals, true)
+ c.Check(isLocalHost("localhost"), check.Equals, true)
+ c.Check(isLocalHost("127.0.0.1"), check.Equals, true)
+ c.Check(isLocalHost("127.0.0.127"), check.Equals, true)
+ c.Check(isLocalHost("127.1.2.7"), check.Equals, true)
+ c.Check(isLocalHost("0.0.0.0"), check.Equals, true)
+ c.Check(isLocalHost("::1"), check.Equals, true)
+ c.Check(isLocalHost("1.2.3.4"), check.Equals, false)
+ c.Check(isLocalHost("1::1"), check.Equals, false)
+ c.Check(isLocalHost("example.com"), check.Equals, false)
+ c.Check(isLocalHost("127.0.0"), check.Equals, false)
+ c.Check(isLocalHost(""), check.Equals, false)
+}
+
func (s *AggregatorSuite) TestConfigMismatch(c *check.C) {
// time1/hash1: current config
time1 := time.Now().Add(time.Second - time.Minute - time.Hour)
commit 6b4e10efa050158c16ff81cf3303f2faf8984337
Author: Tom Clegg <tom at curii.com>
Date: Mon Apr 11 14:35:03 2022 -0400
18794: Add /metrics endpoint to RailsAPI.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/services/api/app/controllers/arvados/v1/healthcheck_controller.rb b/services/api/app/controllers/arvados/v1/management_controller.rb
similarity index 54%
rename from services/api/app/controllers/arvados/v1/healthcheck_controller.rb
rename to services/api/app/controllers/arvados/v1/management_controller.rb
index c56208207..55a00d346 100644
--- a/services/api/app/controllers/arvados/v1/healthcheck_controller.rb
+++ b/services/api/app/controllers/arvados/v1/management_controller.rb
@@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: AGPL-3.0
-class Arvados::V1::HealthcheckController < ApplicationController
+class Arvados::V1::ManagementController < ApplicationController
skip_before_action :catch_redirect_hint
skip_before_action :find_objects_for_index
skip_before_action :find_object_by_uuid
@@ -29,8 +29,24 @@ class Arvados::V1::HealthcheckController < ApplicationController
end
end
- def ping
- resp = {"health" => "OK"}
- send_json resp
+ def metrics
+ render content_type: 'text/plain', plain: <<~EOF
+# HELP arvados_config_load_timestamp_seconds Time when config file was loaded.
+# TYPE arvados_config_load_timestamp_seconds gauge
+arvados_config_load_timestamp_seconds{sha256="#{Rails.configuration.SourceSHA256}"} #{Rails.configuration.LoadTimestamp.to_f}
+# HELP arvados_config_source_timestamp_seconds Timestamp of config file when it was loaded.
+# TYPE arvados_config_source_timestamp_seconds gauge
+arvados_config_source_timestamp_seconds{sha256="#{Rails.configuration.SourceSHA256}"} #{Rails.configuration.SourceTimestamp.to_f}
+EOF
+ end
+
+ def health
+ case params[:check]
+ when 'ping'
+ resp = {"health" => "OK"}
+ send_json resp
+ else
+ send_json ({"errors" => "not found"}), status: 404
+ end
end
end
diff --git a/services/api/config/arvados_config.rb b/services/api/config/arvados_config.rb
index 8a96c432a..c0f7ee174 100644
--- a/services/api/config/arvados_config.rb
+++ b/services/api/config/arvados_config.rb
@@ -30,6 +30,7 @@ end
# Load the defaults, used by config:migrate and fallback loading
# legacy application.yml
+load_time = Time.now.utc
defaultYAML, stderr, status = Open3.capture3("arvados-server", "config-dump", "-config=-", "-skip-legacy", stdin_data: "Clusters: {xxxxx: {}}")
if !status.success?
puts stderr
@@ -39,6 +40,8 @@ confs = YAML.load(defaultYAML, deserialize_symbols: false)
clusterID, clusterConfig = confs["Clusters"].first
$arvados_config_defaults = clusterConfig
$arvados_config_defaults["ClusterID"] = clusterID
+$arvados_config_defaults["SourceTimestamp"] = Time.rfc3339(confs["SourceTimestamp"])
+$arvados_config_defaults["SourceSHA256"] = confs["SourceSHA256"]
if ENV["ARVADOS_CONFIG"] == "none"
# Don't load config. This magic value is set by packaging scripts so
@@ -54,6 +57,8 @@ else
clusterID, clusterConfig = confs["Clusters"].first
$arvados_config_global = clusterConfig
$arvados_config_global["ClusterID"] = clusterID
+ $arvados_config_global["SourceTimestamp"] = Time.rfc3339(confs["SourceTimestamp"])
+ $arvados_config_global["SourceSHA256"] = confs["SourceSHA256"]
else
# config-dump failed, assume we will be loading from legacy
# application.yml, initialize with defaults.
@@ -64,6 +69,7 @@ end
# Now make a copy
$arvados_config = $arvados_config_global.deep_dup
+$arvados_config["LoadTimestamp"] = load_time
def arrayToHash cfg, k, v
val = {}
diff --git a/services/api/config/routes.rb b/services/api/config/routes.rb
index 98f5788d6..9c7bfc3a7 100644
--- a/services/api/config/routes.rb
+++ b/services/api/config/routes.rb
@@ -112,7 +112,8 @@ Rails.application.routes.draw do
match '/static/login_failure', to: 'static#login_failure', as: :login_failure, via: [:get, :post]
- match '/_health/ping', to: 'arvados/v1/healthcheck#ping', via: [:get]
+ match '/_health/:check', to: 'arvados/v1/management#health', via: [:get]
+ match '/metrics', to: 'arvados/v1/management#metrics', via: [:get]
# Send unroutable requests to an arbitrary controller
# (ends up at ApplicationController#render_not_found)
diff --git a/services/api/test/functional/arvados/v1/healthcheck_controller_test.rb b/services/api/test/functional/arvados/v1/healthcheck_controller_test.rb
deleted file mode 100644
index 76fdb0426..000000000
--- a/services/api/test/functional/arvados/v1/healthcheck_controller_test.rb
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-require 'test_helper'
-
-class Arvados::V1::HealthcheckControllerTest < ActionController::TestCase
- [
- [false, nil, 404, 'disabled'],
- [true, nil, 401, 'authorization required'],
- [true, 'badformatwithnoBearer', 403, 'authorization error'],
- [true, 'Bearer wrongtoken', 403, 'authorization error'],
- [true, 'Bearer configuredmanagementtoken', 200, '{"health":"OK"}'],
- ].each do |enabled, header, error_code, error_msg|
- test "ping when #{if enabled then 'enabled' else 'disabled' end} with header '#{header}'" do
- if enabled
- Rails.configuration.ManagementToken = 'configuredmanagementtoken'
- else
- Rails.configuration.ManagementToken = ""
- end
-
- @request.headers['Authorization'] = header
- get :ping
- assert_response error_code
-
- resp = JSON.parse(@response.body)
- if error_code == 200
- assert_equal(JSON.load('{"health":"OK"}'), resp)
- else
- assert_equal(error_msg, resp['errors'])
- end
- end
- end
-end
diff --git a/services/api/test/functional/arvados/v1/management_controller_test.rb b/services/api/test/functional/arvados/v1/management_controller_test.rb
new file mode 100644
index 000000000..5b34f9fef
--- /dev/null
+++ b/services/api/test/functional/arvados/v1/management_controller_test.rb
@@ -0,0 +1,71 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+require 'test_helper'
+
+class Arvados::V1::ManagementControllerTest < ActionController::TestCase
+ [
+ [false, nil, 404, 'disabled'],
+ [true, nil, 401, 'authorization required'],
+ [true, 'badformatwithnoBearer', 403, 'authorization error'],
+ [true, 'Bearer wrongtoken', 403, 'authorization error'],
+ [true, 'Bearer configuredmanagementtoken', 200, '{"health":"OK"}'],
+ ].each do |enabled, header, error_code, error_msg|
+ test "_health/ping when #{if enabled then 'enabled' else 'disabled' end} with header '#{header}'" do
+ if enabled
+ Rails.configuration.ManagementToken = 'configuredmanagementtoken'
+ else
+ Rails.configuration.ManagementToken = ""
+ end
+
+ @request.headers['Authorization'] = header
+ get :health, params: {check: 'ping'}
+ assert_response error_code
+
+ resp = JSON.parse(@response.body)
+ if error_code == 200
+ assert_equal(JSON.load('{"health":"OK"}'), resp)
+ else
+ assert_equal(error_msg, resp['errors'])
+ end
+ end
+ end
+
+ test "metrics" do
+ mtime = File.mtime(ENV["ARVADOS_CONFIG"])
+ hash = Digest::SHA256.hexdigest(File.read(ENV["ARVADOS_CONFIG"]))
+ Rails.configuration.ManagementToken = "configuredmanagementtoken"
+ @request.headers['Authorization'] = "Bearer configuredmanagementtoken"
+ get :metrics
+ assert_response :success
+ assert_equal 'text/plain', @response.content_type
+
+ assert_match /\narvados_config_source_timestamp_seconds{sha256="#{hash}"} #{Regexp.escape mtime.utc.to_f.to_s}\n/, @response.body
+
+ # Expect mtime < loadtime < now
+ m = @response.body.match(/\narvados_config_load_timestamp_seconds{sha256="#{hash}"} (.*?)\n/)
+ assert_operator m[1].to_f, :>, mtime.utc.to_f
+ assert_operator m[1].to_f, :<, Time.now.utc.to_f
+ end
+
+ test "metrics disabled" do
+ Rails.configuration.ManagementToken = ""
+ @request.headers['Authorization'] = "Bearer configuredmanagementtoken"
+ get :metrics
+ assert_response 404
+ end
+
+ test "metrics bad token" do
+ Rails.configuration.ManagementToken = "configuredmanagementtoken"
+ @request.headers['Authorization'] = "Bearer asdf"
+ get :metrics
+ assert_response 403
+ end
+
+ test "metrics unauthorized" do
+ Rails.configuration.ManagementToken = "configuredmanagementtoken"
+ get :metrics
+ assert_response 401
+ end
+end
diff --git a/services/api/test/integration/errors_test.rb b/services/api/test/integration/errors_test.rb
index a2a1545ce..a5359278e 100644
--- a/services/api/test/integration/errors_test.rb
+++ b/services/api/test/integration/errors_test.rb
@@ -24,7 +24,7 @@ class ErrorsTest < ActionDispatch::IntegrationTest
# Generally, new routes should appear under /arvados/v1/. If
# they appear elsewhere, that might have been caused by default
# rails generator behavior that we don't want.
- assert_match(/^\/(|\*a|arvados\/v1\/.*|auth\/.*|login|logout|database\/reset|discovery\/.*|static\/.*|sys\/trash_sweep|themes\/.*|assets|_health\/.*)(\(\.:format\))?$/,
+ assert_match(/^\/(|\*a|arvados\/v1\/.*|auth\/.*|login|logout|database\/reset|discovery\/.*|static\/.*|sys\/trash_sweep|themes\/.*|assets|_health|metrics\/.*)(\(\.:format\))?$/,
route.path.spec.to_s,
"Unexpected new route: #{route.path.spec}")
end
commit 72f222dfd8982bfd1b19804ba8ba42b80708134b
Author: Tom Clegg <tom at curii.com>
Date: Mon Apr 11 11:18:50 2022 -0400
18794: Fail health check on mismatched configs.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go
index 4175bc544..864ef592b 100644
--- a/sdk/go/health/aggregator.go
+++ b/sdk/go/health/aggregator.go
@@ -5,6 +5,7 @@
package health
import (
+ "bufio"
"context"
"encoding/json"
"errors"
@@ -13,6 +14,9 @@ import (
"io"
"net/http"
"net/url"
+ "regexp"
+ "strconv"
+ "strings"
"sync"
"time"
@@ -100,6 +104,8 @@ type ClusterHealthResponse struct {
// "service S is needed, but isn't configured to run
// anywhere."
Services map[arvados.ServiceName]ServiceHealth `json:"services"`
+
+ Errors []string `json:"errors"`
}
type CheckResult struct {
@@ -109,6 +115,12 @@ type CheckResult struct {
HTTPStatusText string `json:",omitempty"`
Response map[string]interface{} `json:"response"`
ResponseTime json.Number `json:"responseTime"`
+ Metrics Metrics `json:"-"`
+}
+
+type Metrics struct {
+ ConfigSourceTimestamp time.Time
+ ConfigSourceSHA256 string
}
type ServiceHealth struct {
@@ -147,6 +159,11 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
}
} else {
result = agg.ping(pingURL)
+ m, err := agg.metrics(pingURL)
+ if err != nil {
+ result.Error = "metrics: " + err.Error()
+ }
+ result.Metrics = m
}
mtx.Lock()
@@ -173,6 +190,27 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
break
}
}
+
+ var newest Metrics
+ for _, result := range resp.Checks {
+ if result.Metrics.ConfigSourceTimestamp.After(newest.ConfigSourceTimestamp) {
+ newest = result.Metrics
+ }
+ }
+ var mismatches []string
+ for target, result := range resp.Checks {
+ if hash := result.Metrics.ConfigSourceSHA256; hash != "" && hash != newest.ConfigSourceSHA256 {
+ mismatches = append(mismatches, target)
+ }
+ }
+ for _, target := range mismatches {
+ msg := fmt.Sprintf("outdated config: %s: config file (sha256 %s) does not match latest version with timestamp %s",
+ strings.TrimSuffix(target, "/_health/ping"),
+ resp.Checks[target].Metrics.ConfigSourceSHA256,
+ newest.ConfigSourceTimestamp.Format(time.RFC3339))
+ resp.Errors = append(resp.Errors, msg)
+ resp.Health = "ERROR"
+ }
return resp
}
@@ -194,7 +232,9 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
}
}()
- req, err := http.NewRequest("GET", target.String(), nil)
+ ctx, cancel := context.WithTimeout(context.Background(), time.Duration(agg.timeout))
+ defer cancel()
+ req, err := http.NewRequestWithContext(ctx, "GET", target.String(), nil)
if err != nil {
return
}
@@ -203,9 +243,6 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
// Avoid workbench1's redirect-http-to-https feature
req.Header.Set("X-Forwarded-Proto", "https")
- ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
- defer cancel()
- req = req.WithContext(ctx)
resp, err := agg.httpClient.Do(req)
if err != nil {
return
@@ -227,6 +264,49 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
return
}
+var reMetric = regexp.MustCompile(`([a-z_]+){sha256="([0-9a-f]+)"} (\d[\d\.e\+]+)`)
+
+func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) {
+ metricsURL, err := pingURL.Parse("/metrics")
+ if err != nil {
+ return
+ }
+ ctx, cancel := context.WithTimeout(context.Background(), time.Duration(agg.timeout))
+ defer cancel()
+ req, err := http.NewRequestWithContext(ctx, "GET", metricsURL.String(), nil)
+ if err != nil {
+ return
+ }
+ req.Header.Set("Authorization", "Bearer "+agg.Cluster.ManagementToken)
+
+ // Avoid workbench1's redirect-http-to-https feature
+ req.Header.Set("X-Forwarded-Proto", "https")
+
+ resp, err := agg.httpClient.Do(req)
+ if err != nil {
+ return
+ } else if resp.StatusCode != http.StatusOK {
+ err = fmt.Errorf("%s: HTTP %d %s", metricsURL.String(), resp.StatusCode, resp.Status)
+ return
+ }
+
+ scanner := bufio.NewScanner(resp.Body)
+ for scanner.Scan() {
+ m := reMetric.FindSubmatch(scanner.Bytes())
+ if len(m) != 4 || string(m[1]) != "arvados_config_source_timestamp_seconds" {
+ continue
+ }
+ result.ConfigSourceSHA256 = string(m[2])
+ unixtime, _ := strconv.ParseFloat(string(m[3]), 64)
+ result.ConfigSourceTimestamp = time.UnixMicro(int64(unixtime * 1e6))
+ }
+ if err = scanner.Err(); err != nil {
+ err = fmt.Errorf("error parsing response from %s: %w", metricsURL.String(), err)
+ return
+ }
+ return
+}
+
func (agg *Aggregator) checkAuth(req *http.Request) bool {
creds := auth.CredentialsFromRequest(req)
for _, token := range creds.Tokens {
diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go
index 62eca894b..5d76c19f2 100644
--- a/sdk/go/health/aggregator_test.go
+++ b/sdk/go/health/aggregator_test.go
@@ -6,7 +6,9 @@ package health
import (
"bytes"
+ "crypto/sha256"
"encoding/json"
+ "fmt"
"io/ioutil"
"net/http"
"net/http/httptest"
@@ -121,6 +123,65 @@ func (s *AggregatorSuite) TestHealthyAndUnhealthy(c *check.C) {
c.Logf("%#v", ep)
}
+func (s *AggregatorSuite) TestConfigMismatch(c *check.C) {
+ // time1/hash1: current config
+ time1 := time.Now().Add(time.Second - time.Minute - time.Hour)
+ hash1 := fmt.Sprintf("%x", sha256.Sum256([]byte(`Clusters: {zzzzz: {SystemRootToken: xyzzy}}`)))
+ // time2/hash2: old config
+ time2 := time1.Add(-time.Hour)
+ hash2 := fmt.Sprintf("%x", sha256.Sum256([]byte(`Clusters: {zzzzz: {SystemRootToken: old-token}}`)))
+
+ // srv1: current file
+ handler1 := healthyHandler{configHash: hash1, configTime: time1}
+ srv1, listen1 := s.stubServer(&handler1)
+ defer srv1.Close()
+ // srv2: old file, current content
+ handler2 := healthyHandler{configHash: hash1, configTime: time2}
+ srv2, listen2 := s.stubServer(&handler2)
+ defer srv2.Close()
+ // srv3: old file, old content
+ handler3 := healthyHandler{configHash: hash2, configTime: time2}
+ srv3, listen3 := s.stubServer(&handler3)
+ defer srv3.Close()
+ // srv4: no metrics handler
+ handler4 := healthyHandler{}
+ srv4, listen4 := s.stubServer(&handler4)
+ defer srv4.Close()
+
+ s.setAllServiceURLs(listen1)
+
+ // listen2 => old timestamp, same content => no problem
+ s.resp = httptest.NewRecorder()
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.DispatchCloud,
+ "http://localhost"+listen2+"/")
+ s.handler.ServeHTTP(s.resp, s.req)
+ resp := s.checkOK(c)
+
+ // listen4 => no metrics on some services => no problem
+ s.resp = httptest.NewRecorder()
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV,
+ "http://localhost"+listen4+"/")
+ s.handler.ServeHTTP(s.resp, s.req)
+ resp = s.checkOK(c)
+
+ // listen3 => old timestamp, old content => report discrepancy
+ s.resp = httptest.NewRecorder()
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepstore,
+ "http://localhost"+listen1+"/",
+ "http://localhost"+listen3+"/")
+ s.handler.ServeHTTP(s.resp, s.req)
+ resp = s.checkUnhealthy(c)
+ if c.Check(len(resp.Errors) > 0, check.Equals, true) {
+ c.Check(resp.Errors[0], check.Matches, `outdated config: \Qkeepstore+http://localhost`+listen3+`\E: config file \(sha256 .*\) does not match latest version with timestamp .*`)
+ }
+
+ // no services report config time (migrating to current version) => no problem
+ s.resp = httptest.NewRecorder()
+ s.setAllServiceURLs(listen4)
+ s.handler.ServeHTTP(s.resp, s.req)
+ s.checkOK(c)
+}
+
func (s *AggregatorSuite) TestPingTimeout(c *check.C) {
s.handler.timeout = arvados.Duration(100 * time.Millisecond)
srv, listen := s.stubServer(&slowHandler{})
@@ -143,7 +204,7 @@ func (s *AggregatorSuite) TestCheckCommand(c *check.C) {
tmpdir := c.MkDir()
confdata, err := yaml.Marshal(arvados.Config{Clusters: map[string]arvados.Cluster{s.handler.Cluster.ClusterID: *s.handler.Cluster}})
c.Assert(err, check.IsNil)
- confdata = regexp.MustCompile(`SourceTimestamp: [^\n]+\n`).ReplaceAll(confdata, []byte{})
+ confdata = regexp.MustCompile(`Source(Timestamp|SHA256): [^\n]+\n`).ReplaceAll(confdata, []byte{})
err = ioutil.WriteFile(tmpdir+"/config.yml", confdata, 0777)
c.Assert(err, check.IsNil)
var stdout, stderr bytes.Buffer
@@ -209,11 +270,37 @@ func (*unhealthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request)
}
}
-type healthyHandler struct{}
+type healthyHandler struct {
+ configHash string
+ configTime time.Time
+}
-func (*healthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+func (h *healthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+ authOK := req.Header.Get("Authorization") == "Bearer "+arvadostest.ManagementToken
if req.URL.Path == "/_health/ping" {
+ if !authOK {
+ http.Error(resp, "unauthorized", http.StatusUnauthorized)
+ return
+ }
resp.Write([]byte(`{"health":"OK"}`))
+ } else if req.URL.Path == "/metrics" {
+ if !authOK {
+ http.Error(resp, "unauthorized", http.StatusUnauthorized)
+ return
+ }
+ t := h.configTime
+ if t.IsZero() {
+ t = time.Now()
+ }
+ fmt.Fprintf(resp, `# HELP arvados_config_load_timestamp_seconds Time when config file was loaded.
+# TYPE arvados_config_load_timestamp_seconds gauge
+arvados_config_load_timestamp_seconds{sha256="%s"} %g
+# HELP arvados_config_source_timestamp_seconds Timestamp of config file when it was loaded.
+# TYPE arvados_config_source_timestamp_seconds gauge
+arvados_config_source_timestamp_seconds{sha256="%s"} %g
+`,
+ h.configHash, float64(time.Now().UnixNano())/1e9,
+ h.configHash, float64(t.UnixNano())/1e9)
} else {
http.Error(resp, "not found", http.StatusNotFound)
}
commit a92e041a642687ca27d8bacf298faddcd442f3e1
Author: Tom Clegg <tom at curii.com>
Date: Fri Apr 8 17:48:21 2022 -0400
18794: Export config load time as prometheus metric.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/config/load.go b/lib/config/load.go
index d03ef5388..1ad31402c 100644
--- a/lib/config/load.go
+++ b/lib/config/load.go
@@ -53,6 +53,8 @@ type Loader struct {
// read configdata from, or the time when we read configdata
// from a pipe.
sourceTimestamp time.Time
+ // UTC time when configdata was read.
+ loadTimestamp time.Time
}
// NewLoader returns a new Loader with Stdin and Logger set to the
@@ -173,32 +175,36 @@ func (ldr *Loader) MungeLegacyConfigArgs(lgr logrus.FieldLogger, args []string,
return munged
}
-func (ldr *Loader) loadBytes(path string) ([]byte, time.Time, error) {
+func (ldr *Loader) loadBytes(path string) (buf []byte, sourceTime, loadTime time.Time, err error) {
+ loadTime = time.Now().UTC()
if path == "-" {
- buf, err := ioutil.ReadAll(ldr.Stdin)
- return buf, time.Now().UTC(), err
+ buf, err = ioutil.ReadAll(ldr.Stdin)
+ sourceTime = loadTime
+ return
}
f, err := os.Open(path)
if err != nil {
- return nil, time.Time{}, err
+ return
}
defer f.Close()
fi, err := f.Stat()
if err != nil {
- return nil, time.Time{}, err
+ return
}
- buf, err := ioutil.ReadAll(f)
- return buf, fi.ModTime().UTC(), err
+ sourceTime = fi.ModTime().UTC()
+ buf, err = ioutil.ReadAll(f)
+ return
}
func (ldr *Loader) Load() (*arvados.Config, error) {
if ldr.configdata == nil {
- buf, t, err := ldr.loadBytes(ldr.Path)
+ buf, sourceTime, loadTime, err := ldr.loadBytes(ldr.Path)
if err != nil {
return nil, err
}
ldr.configdata = buf
- ldr.sourceTimestamp = t
+ ldr.sourceTimestamp = sourceTime
+ ldr.loadTimestamp = loadTime
}
// FIXME: We should reject YAML if the same key is used twice
@@ -579,12 +585,22 @@ func (ldr *Loader) autofillPreemptible(label string, cc *arvados.Cluster) {
// called before Load(). Metrics are not updated by subsequent calls
// to Load().
func (ldr *Loader) RegisterMetrics(reg *prometheus.Registry) {
+ hash := fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))
vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "arvados",
Subsystem: "config",
Name: "source_timestamp_seconds",
Help: "Timestamp of config file when it was loaded.",
}, []string{"sha256"})
- vec.WithLabelValues(fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))).Set(float64(ldr.sourceTimestamp.UnixNano()) / 1e9)
+ vec.WithLabelValues(hash).Set(float64(ldr.sourceTimestamp.UnixNano()) / 1e9)
+ reg.MustRegister(vec)
+
+ vec = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "config",
+ Name: "load_timestamp_seconds",
+ Help: "Time when config file was loaded.",
+ }, []string{"sha256"})
+ vec.WithLabelValues(hash).Set(float64(ldr.loadTimestamp.UnixNano()) / 1e9)
reg.MustRegister(vec)
}
diff --git a/lib/config/load_test.go b/lib/config/load_test.go
index 48e2d33a4..3daa66fc9 100644
--- a/lib/config/load_test.go
+++ b/lib/config/load_test.go
@@ -803,7 +803,9 @@ func (s *LoadSuite) TestSourceTimestamp(c *check.C) {
cfg, err := ldr.Load()
c.Assert(err, check.IsNil)
c.Check(cfg.SourceTimestamp, check.Equals, cfg.SourceTimestamp.UTC())
+ c.Check(cfg.SourceTimestamp, check.Equals, ldr.sourceTimestamp)
c.Check(int(cfg.SourceTimestamp.Sub(trial.expectTime).Seconds()), check.Equals, 0)
+ c.Check(int(ldr.loadTimestamp.Sub(time.Now()).Seconds()), check.Equals, 0)
var buf bytes.Buffer
reg := prometheus.NewRegistry()
@@ -813,6 +815,12 @@ func (s *LoadSuite) TestSourceTimestamp(c *check.C) {
for _, mf := range got {
enc.Encode(mf)
}
- c.Check(buf.String(), check.Matches, `# HELP .*\n# TYPE .*\narvados_config_source_timestamp_seconds{sha256="83aea5d82eb1d53372cd65c936c60acc1c6ef946e61977bbca7cfea709d201a8"} \Q`+fmt.Sprintf("%g", float64(cfg.SourceTimestamp.UnixNano())/1e9)+`\E\n`)
+ c.Check(buf.String(), check.Matches, `# HELP .*
+# TYPE .*
+arvados_config_load_timestamp_seconds{sha256="83aea5d82eb1d53372cd65c936c60acc1c6ef946e61977bbca7cfea709d201a8"} \Q`+fmt.Sprintf("%g", float64(ldr.loadTimestamp.UnixNano())/1e9)+`\E
+# HELP .*
+# TYPE .*
+arvados_config_source_timestamp_seconds{sha256="83aea5d82eb1d53372cd65c936c60acc1c6ef946e61977bbca7cfea709d201a8"} \Q`+fmt.Sprintf("%g", float64(cfg.SourceTimestamp.UnixNano())/1e9)+`\E
+`)
}
}
commit 4e10563689b0a874612d133f489ff8af69da6063
Author: Tom Clegg <tom at curii.com>
Date: Fri Apr 8 16:05:26 2022 -0400
18794: Export config timestamp/hash as prometheus metric.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/config/load.go b/lib/config/load.go
index 28f300e3b..d03ef5388 100644
--- a/lib/config/load.go
+++ b/lib/config/load.go
@@ -6,6 +6,7 @@ package config
import (
"bytes"
+ "crypto/sha256"
_ "embed"
"encoding/json"
"errors"
@@ -22,6 +23,7 @@ import (
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/ghodss/yaml"
"github.com/imdario/mergo"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
)
@@ -569,3 +571,20 @@ func (ldr *Loader) autofillPreemptible(label string, cc *arvados.Cluster) {
}
}
+
+// RegisterMetrics registers metrics showing the timestamp and content
+// hash of the currently loaded config.
+//
+// Must not be called more than once for a given registry. Must not be
+// called before Load(). Metrics are not updated by subsequent calls
+// to Load().
+func (ldr *Loader) RegisterMetrics(reg *prometheus.Registry) {
+ vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "config",
+ Name: "source_timestamp_seconds",
+ Help: "Timestamp of config file when it was loaded.",
+ }, []string{"sha256"})
+ vec.WithLabelValues(fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))).Set(float64(ldr.sourceTimestamp.UnixNano()) / 1e9)
+ reg.MustRegister(vec)
+}
diff --git a/lib/config/load_test.go b/lib/config/load_test.go
index 97830b4e1..48e2d33a4 100644
--- a/lib/config/load_test.go
+++ b/lib/config/load_test.go
@@ -20,6 +20,8 @@ import (
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
"github.com/ghodss/yaml"
+ "github.com/prometheus/client_golang/prometheus"
+ "github.com/prometheus/common/expfmt"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
check "gopkg.in/check.v1"
@@ -802,5 +804,15 @@ func (s *LoadSuite) TestSourceTimestamp(c *check.C) {
c.Assert(err, check.IsNil)
c.Check(cfg.SourceTimestamp, check.Equals, cfg.SourceTimestamp.UTC())
c.Check(int(cfg.SourceTimestamp.Sub(trial.expectTime).Seconds()), check.Equals, 0)
+
+ var buf bytes.Buffer
+ reg := prometheus.NewRegistry()
+ ldr.RegisterMetrics(reg)
+ enc := expfmt.NewEncoder(&buf, expfmt.FmtText)
+ got, _ := reg.Gather()
+ for _, mf := range got {
+ enc.Encode(mf)
+ }
+ c.Check(buf.String(), check.Matches, `# HELP .*\n# TYPE .*\narvados_config_source_timestamp_seconds{sha256="83aea5d82eb1d53372cd65c936c60acc1c6ef946e61977bbca7cfea709d201a8"} \Q`+fmt.Sprintf("%g", float64(cfg.SourceTimestamp.UnixNano())/1e9)+`\E\n`)
}
}
diff --git a/lib/service/cmd.go b/lib/service/cmd.go
index dbafc89fe..d11e4a74d 100644
--- a/lib/service/cmd.go
+++ b/lib/service/cmd.go
@@ -117,6 +117,8 @@ func (c *command) RunCommand(prog string, args []string, stdin io.Reader, stdout
ctx = context.WithValue(ctx, contextKeyURL{}, listenURL)
reg := prometheus.NewRegistry()
+ loader.RegisterMetrics(reg)
+
handler := c.newHandler(ctx, cluster, cluster.SystemRootToken, reg)
if err = handler.CheckHealth(); err != nil {
return 1
commit 03cd0561b3816fb2448ec355be6771cc49074ef9
Author: Tom Clegg <tom at curii.com>
Date: Fri Apr 8 11:08:14 2022 -0400
18794: Add SourceTimestamp and SourceSHA256 to config-dump output.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/config/load.go b/lib/config/load.go
index 5afb51c5a..28f300e3b 100644
--- a/lib/config/load.go
+++ b/lib/config/load.go
@@ -17,6 +17,7 @@ import (
"regexp"
"strconv"
"strings"
+ "time"
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/ghodss/yaml"
@@ -46,6 +47,10 @@ type Loader struct {
KeepBalancePath string
configdata []byte
+ // UTC time for configdata: either the modtime of the file we
+ // read configdata from, or the time when we read configdata
+ // from a pipe.
+ sourceTimestamp time.Time
}
// NewLoader returns a new Loader with Stdin and Logger set to the
@@ -166,25 +171,32 @@ func (ldr *Loader) MungeLegacyConfigArgs(lgr logrus.FieldLogger, args []string,
return munged
}
-func (ldr *Loader) loadBytes(path string) ([]byte, error) {
+func (ldr *Loader) loadBytes(path string) ([]byte, time.Time, error) {
if path == "-" {
- return ioutil.ReadAll(ldr.Stdin)
+ buf, err := ioutil.ReadAll(ldr.Stdin)
+ return buf, time.Now().UTC(), err
}
f, err := os.Open(path)
if err != nil {
- return nil, err
+ return nil, time.Time{}, err
}
defer f.Close()
- return ioutil.ReadAll(f)
+ fi, err := f.Stat()
+ if err != nil {
+ return nil, time.Time{}, err
+ }
+ buf, err := ioutil.ReadAll(f)
+ return buf, fi.ModTime().UTC(), err
}
func (ldr *Loader) Load() (*arvados.Config, error) {
if ldr.configdata == nil {
- buf, err := ldr.loadBytes(ldr.Path)
+ buf, t, err := ldr.loadBytes(ldr.Path)
if err != nil {
return nil, err
}
ldr.configdata = buf
+ ldr.sourceTimestamp = t
}
// FIXME: We should reject YAML if the same key is used twice
@@ -330,6 +342,8 @@ func (ldr *Loader) Load() (*arvados.Config, error) {
}
}
}
+ cfg.SourceTimestamp = ldr.sourceTimestamp
+ cfg.SourceSHA256 = fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))
return &cfg, nil
}
diff --git a/lib/config/load_test.go b/lib/config/load_test.go
index 5270dcccc..97830b4e1 100644
--- a/lib/config/load_test.go
+++ b/lib/config/load_test.go
@@ -12,13 +12,16 @@ import (
"os"
"os/exec"
"reflect"
+ "regexp"
"strings"
"testing"
+ "time"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
"github.com/ghodss/yaml"
"github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix"
check "gopkg.in/check.v1"
)
@@ -315,8 +318,16 @@ Clusters:
c.Assert(err, check.IsNil)
yaml, err := yaml.Marshal(cfg)
c.Assert(err, check.IsNil)
+ // Well, *nearly* no warnings. SourceTimestamp and
+ // SourceSHA256 are included in a config-dump, but not
+ // expected in a real config file.
+ yaml = regexp.MustCompile(`(^|\n)(Source(Timestamp|SHA256): .*?\n)+`).ReplaceAll(yaml, []byte("$1"))
cfgDumped, err := testLoader(c, string(yaml), &logbuf).Load()
c.Assert(err, check.IsNil)
+ // SourceTimestamp and SourceSHA256 aren't expected to be
+ // preserved through dump+load
+ cfgDumped.SourceTimestamp = cfg.SourceTimestamp
+ cfgDumped.SourceSHA256 = cfg.SourceSHA256
c.Check(cfg, check.DeepEquals, cfgDumped)
c.Check(logbuf.String(), check.Equals, "")
}
@@ -503,6 +514,12 @@ func checkEquivalentLoaders(c *check.C, gotldr, expectedldr *Loader) {
c.Assert(err, check.IsNil)
expected, err := expectedldr.Load()
c.Assert(err, check.IsNil)
+ // The inputs generally aren't even files, so SourceTimestamp
+ // can't be expected to match.
+ got.SourceTimestamp = expected.SourceTimestamp
+ // Obviously the content isn't identical -- otherwise we
+ // wouldn't need to check that it's equivalent.
+ got.SourceSHA256 = expected.SourceSHA256
checkEqualYAML(c, got, expected)
}
@@ -762,3 +779,28 @@ Clusters:
c.Check(logbuf.String(), check.Not(check.Matches), `(?ms).*Type2\.preemptible.*`)
c.Check(logbuf.String(), check.Not(check.Matches), `(?ms).*(z1111|z2222)[^\n]*InstanceTypes.*`)
}
+
+func (s *LoadSuite) TestSourceTimestamp(c *check.C) {
+ conftime, err := time.Parse(time.RFC3339, "2022-03-04T05:06:07-08:00")
+ c.Assert(err, check.IsNil)
+ confdata := `Clusters: {zzzzz: {}}`
+ conffile := c.MkDir() + "/config.yml"
+ ioutil.WriteFile(conffile, []byte(confdata), 0777)
+ tv := unix.NsecToTimeval(conftime.UnixNano())
+ unix.Lutimes(conffile, []unix.Timeval{tv, tv})
+ for _, trial := range []struct {
+ configarg string
+ expectTime time.Time
+ }{
+ {"-", time.Now()},
+ {conffile, conftime},
+ } {
+ c.Logf("trial: %+v", trial)
+ ldr := NewLoader(strings.NewReader(confdata), ctxlog.TestLogger(c))
+ ldr.Path = trial.configarg
+ cfg, err := ldr.Load()
+ c.Assert(err, check.IsNil)
+ c.Check(cfg.SourceTimestamp, check.Equals, cfg.SourceTimestamp.UTC())
+ c.Check(int(cfg.SourceTimestamp.Sub(trial.expectTime).Seconds()), check.Equals, 0)
+ }
+}
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 6c9324e47..b508a3f05 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -10,6 +10,7 @@ import (
"fmt"
"net/url"
"os"
+ "time"
"git.arvados.org/arvados.git/sdk/go/config"
)
@@ -24,6 +25,8 @@ var DefaultConfigFile = func() string {
type Config struct {
Clusters map[string]Cluster
AutoReloadConfig bool
+ SourceTimestamp time.Time
+ SourceSHA256 string
}
// GetConfig returns the current system config, loading it from
diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go
index 05f0bdd31..62eca894b 100644
--- a/sdk/go/health/aggregator_test.go
+++ b/sdk/go/health/aggregator_test.go
@@ -10,6 +10,7 @@ import (
"io/ioutil"
"net/http"
"net/http/httptest"
+ "regexp"
"strings"
"time"
@@ -142,6 +143,7 @@ func (s *AggregatorSuite) TestCheckCommand(c *check.C) {
tmpdir := c.MkDir()
confdata, err := yaml.Marshal(arvados.Config{Clusters: map[string]arvados.Cluster{s.handler.Cluster.ClusterID: *s.handler.Cluster}})
c.Assert(err, check.IsNil)
+ confdata = regexp.MustCompile(`SourceTimestamp: [^\n]+\n`).ReplaceAll(confdata, []byte{})
err = ioutil.WriteFile(tmpdir+"/config.yml", confdata, 0777)
c.Assert(err, check.IsNil)
var stdout, stderr bytes.Buffer
commit 541c300ca6abb15449e917f648ae5ffd68087ff9
Author: Tom Clegg <tom at curii.com>
Date: Thu Apr 7 16:11:55 2022 -0400
18794: Add "arvados-server check" subcommand.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/cmd/arvados-server/cmd.go b/cmd/arvados-server/cmd.go
index c8b945bea..4b48301eb 100644
--- a/cmd/arvados-server/cmd.go
+++ b/cmd/arvados-server/cmd.go
@@ -17,6 +17,7 @@ import (
"git.arvados.org/arvados.git/lib/install"
"git.arvados.org/arvados.git/lib/lsf"
"git.arvados.org/arvados.git/lib/recovercollection"
+ "git.arvados.org/arvados.git/sdk/go/health"
"git.arvados.org/arvados.git/services/keepstore"
"git.arvados.org/arvados.git/services/ws"
)
@@ -28,6 +29,7 @@ var (
"--version": cmd.Version,
"boot": boot.Command,
+ "check": health.CheckCommand,
"cloudtest": cloudtest.Command,
"config-check": config.CheckCommand,
"config-defaults": config.DumpDefaultsCommand,
diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go
index a666ef8ec..4175bc544 100644
--- a/sdk/go/health/aggregator.go
+++ b/sdk/go/health/aggregator.go
@@ -8,19 +8,26 @@ import (
"context"
"encoding/json"
"errors"
+ "flag"
"fmt"
+ "io"
"net/http"
"net/url"
"sync"
"time"
+ "git.arvados.org/arvados.git/lib/cmd"
+ "git.arvados.org/arvados.git/lib/config"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/auth"
+ "git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "github.com/ghodss/yaml"
+ "github.com/sirupsen/logrus"
)
const defaultTimeout = arvados.Duration(2 * time.Second)
-// Aggregator implements http.Handler. It handles "GET /_health/all"
+// Aggregator implements service.Handler. It handles "GET /_health/all"
// by checking the health of all configured services on the cluster
// and responding 200 if everything is healthy.
type Aggregator struct {
@@ -229,3 +236,61 @@ func (agg *Aggregator) checkAuth(req *http.Request) bool {
}
return false
}
+
+var errSilent = errors.New("")
+
+var CheckCommand cmd.Handler = checkCommand{}
+
+type checkCommand struct{}
+
+func (ccmd checkCommand) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+ logger := ctxlog.New(stderr, "json", "info")
+ ctx := ctxlog.Context(context.Background(), logger)
+ err := ccmd.run(ctx, prog, args, stdin, stdout, stderr)
+ if err != nil {
+ if err != errSilent {
+ fmt.Fprintln(stdout, err.Error())
+ }
+ return 1
+ }
+ return 0
+}
+
+func (ccmd checkCommand) run(ctx context.Context, prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) error {
+ flags := flag.NewFlagSet("", flag.ContinueOnError)
+ flags.SetOutput(stderr)
+ loader := config.NewLoader(stdin, ctxlog.New(stderr, "text", "info"))
+ loader.SetupFlags(flags)
+ versionFlag := flags.Bool("version", false, "Write version information to stdout and exit 0")
+ timeout := flags.Duration("timeout", defaultTimeout.Duration(), "Maximum time to wait for health responses")
+ if ok, _ := cmd.ParseFlags(flags, prog, args, "", stderr); !ok {
+ // cmd.ParseFlags already reported the error
+ return errSilent
+ } else if *versionFlag {
+ cmd.Version.RunCommand(prog, args, stdin, stdout, stderr)
+ return nil
+ }
+ cfg, err := loader.Load()
+ if err != nil {
+ return err
+ }
+ cluster, err := cfg.GetCluster("")
+ if err != nil {
+ return err
+ }
+ logger := ctxlog.New(stderr, cluster.SystemLogs.Format, cluster.SystemLogs.LogLevel).WithFields(logrus.Fields{
+ "ClusterID": cluster.ClusterID,
+ })
+ ctx = ctxlog.Context(ctx, logger)
+ agg := Aggregator{Cluster: cluster, timeout: arvados.Duration(*timeout)}
+ resp := agg.ClusterHealth()
+ buf, err := yaml.Marshal(resp)
+ if err != nil {
+ return err
+ }
+ stdout.Write(buf)
+ if resp.Health != "OK" {
+ return fmt.Errorf("health check failed")
+ }
+ return nil
+}
diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go
index 04106caa4..05f0bdd31 100644
--- a/sdk/go/health/aggregator_test.go
+++ b/sdk/go/health/aggregator_test.go
@@ -5,14 +5,19 @@
package health
import (
+ "bytes"
"encoding/json"
+ "io/ioutil"
"net/http"
"net/http/httptest"
"strings"
"time"
+ "git.arvados.org/arvados.git/lib/config"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/arvadostest"
+ "git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "github.com/ghodss/yaml"
"gopkg.in/check.v1"
)
@@ -30,9 +35,17 @@ func (s *AggregatorSuite) TestInterface(c *check.C) {
}
func (s *AggregatorSuite) SetUpTest(c *check.C) {
- s.handler = &Aggregator{Cluster: &arvados.Cluster{
- ManagementToken: arvadostest.ManagementToken,
- }}
+ ldr := config.NewLoader(bytes.NewBufferString(`Clusters: {zzzzz: {}}`), ctxlog.TestLogger(c))
+ ldr.Path = "-"
+ cfg, err := ldr.Load()
+ c.Assert(err, check.IsNil)
+ cluster, err := cfg.GetCluster("")
+ c.Assert(err, check.IsNil)
+ cluster.ManagementToken = arvadostest.ManagementToken
+ cluster.SystemRootToken = arvadostest.SystemRootToken
+ cluster.Collections.BlobSigningKey = arvadostest.BlobSigningKey
+ cluster.Volumes["z"] = arvados.Volume{StorageClasses: map[string]bool{"default": true}}
+ s.handler = &Aggregator{Cluster: cluster}
s.req = httptest.NewRequest("GET", "/_health/all", nil)
s.req.Header.Set("Authorization", "Bearer "+arvadostest.ManagementToken)
s.resp = httptest.NewRecorder()
@@ -122,6 +135,22 @@ func (s *AggregatorSuite) TestPingTimeout(c *check.C) {
c.Check(rt > 0.005, check.Equals, true)
}
+func (s *AggregatorSuite) TestCheckCommand(c *check.C) {
+ srv, listen := s.stubServer(&healthyHandler{})
+ defer srv.Close()
+ s.setAllServiceURLs(listen)
+ tmpdir := c.MkDir()
+ confdata, err := yaml.Marshal(arvados.Config{Clusters: map[string]arvados.Cluster{s.handler.Cluster.ClusterID: *s.handler.Cluster}})
+ c.Assert(err, check.IsNil)
+ err = ioutil.WriteFile(tmpdir+"/config.yml", confdata, 0777)
+ c.Assert(err, check.IsNil)
+ var stdout, stderr bytes.Buffer
+ exitcode := CheckCommand.RunCommand("check", []string{"-config=" + tmpdir + "/config.yml"}, &bytes.Buffer{}, &stdout, &stderr)
+ c.Check(exitcode, check.Equals, 0)
+ c.Check(stderr.String(), check.Equals, "")
+ c.Check(stdout.String(), check.Matches, `(?ms).*(\n|^)health: OK\n.*`)
+}
+
func (s *AggregatorSuite) checkError(c *check.C) {
c.Check(s.resp.Code, check.Not(check.Equals), http.StatusOK)
var resp ClusterHealthResponse
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list