[ARVADOS] created: 2.1.0-2505-g7f9de270a

Git user git at public.arvados.org
Thu May 19 19:19:40 UTC 2022


        at  7f9de270aa34467c1b1668be9333ec28d14b10a1 (commit)


commit 7f9de270aa34467c1b1668be9333ec28d14b10a1
Author: Tom Clegg <tom at curii.com>
Date:   Thu May 19 15:19:23 2022 -0400

    16345: Fail health check on server version mismatch.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go
index f473eff35..b5301dffe 100644
--- a/sdk/go/health/aggregator.go
+++ b/sdk/go/health/aggregator.go
@@ -108,46 +108,46 @@ func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
 
 type ClusterHealthResponse struct {
 	// "OK" if all needed services are OK, otherwise "ERROR".
-	Health string `json:"health"`
+	Health string
 
 	// An entry for each known health check of each known instance
 	// of each needed component: "instance of service S on node N
 	// reports health-check C is OK."
-	Checks map[string]CheckResult `json:"checks"`
+	Checks map[string]CheckResult
 
 	// An entry for each service type: "service S is OK." This
 	// exposes problems that can't be expressed in Checks, like
 	// "service S is needed, but isn't configured to run
 	// anywhere."
-	Services map[arvados.ServiceName]ServiceHealth `json:"services"`
+	Services map[arvados.ServiceName]ServiceHealth
 
 	// Difference between min/max timestamps in individual
 	// health-check responses.
 	ClockSkew arvados.Duration
 
-	Errors []string `json:"errors"`
+	Errors []string
 }
 
 type CheckResult struct {
-	Health         string                 `json:"health"`
-	Error          string                 `json:"error,omitempty"`
+	Health         string
+	Error          string                 `json:",omitempty"`
 	HTTPStatusCode int                    `json:",omitempty"`
-	HTTPStatusText string                 `json:",omitempty"`
-	Response       map[string]interface{} `json:"response"`
-	ResponseTime   json.Number            `json:"responseTime"`
-	ClockTime      time.Time              `json:"clockTime"`
-	Metrics        Metrics                `json:"-"`
-	respTime       time.Duration
+	Response       map[string]interface{} `json:",omitempty"`
+	ResponseTime   json.Number
+	ClockTime      time.Time
+	Metrics
+	respTime time.Duration
 }
 
 type Metrics struct {
 	ConfigSourceTimestamp time.Time
 	ConfigSourceSHA256    string
+	Version               string
 }
 
 type ServiceHealth struct {
-	Health string `json:"health"` // "OK", "ERROR", or "SKIP"
-	N      int    `json:"n"`
+	Health string // "OK", "ERROR", or "SKIP"
+	N      int
 }
 
 func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
@@ -238,6 +238,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 		}
 	}
 
+	// Check for clock skew between hosts
 	var maxResponseTime time.Duration
 	var clockMin, clockMax time.Time
 	for _, result := range resp.Checks {
@@ -265,6 +266,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 		agg.MetricClockSkew.Set(skew.Seconds())
 	}
 
+	// Check for mismatched config files
 	var newest Metrics
 	for _, result := range resp.Checks {
 		if result.Metrics.ConfigSourceTimestamp.After(newest.ConfigSourceTimestamp) {
@@ -285,6 +287,18 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 		resp.Errors = append(resp.Errors, msg)
 		resp.Health = "ERROR"
 	}
+
+	// Check for services running a different version than we are.
+	for target, result := range resp.Checks {
+		if result.Metrics.Version != "" && !sameVersion(result.Metrics.Version, cmd.Version.String()) {
+			msg := fmt.Sprintf("version mismatch: %s is running %s -- expected %s",
+				strings.TrimSuffix(target, "/_health/ping"),
+				result.Metrics.Version,
+				cmd.Version.String())
+			resp.Errors = append(resp.Errors, msg)
+			resp.Health = "ERROR"
+		}
+	}
 	return resp
 }
 
@@ -329,7 +343,6 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
 		return
 	}
 	result.HTTPStatusCode = resp.StatusCode
-	result.HTTPStatusText = resp.Status
 	err = json.NewDecoder(resp.Body).Decode(&result.Response)
 	if err != nil {
 		result.Error = fmt.Sprintf("cannot decode response: %s", err)
@@ -349,7 +362,10 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
 	return
 }
 
-var reMetric = regexp.MustCompile(`([a-z_]+){sha256="([0-9a-f]+)"} (\d[\d\.e\+]+)`)
+var (
+	reConfigMetric  = regexp.MustCompile(`arvados_config_source_timestamp_seconds{sha256="([0-9a-f]+)"} (\d[\d\.e\+]+)`)
+	reVersionMetric = regexp.MustCompile(`arvados_version_running{version="([^"]+)"} 1`)
+)
 
 func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) {
 	metricsURL, err := pingURL.Parse("/metrics")
@@ -377,13 +393,13 @@ func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) {
 
 	scanner := bufio.NewScanner(resp.Body)
 	for scanner.Scan() {
-		m := reMetric.FindSubmatch(scanner.Bytes())
-		if len(m) != 4 || string(m[1]) != "arvados_config_source_timestamp_seconds" {
-			continue
+		if m := reConfigMetric.FindSubmatch(scanner.Bytes()); len(m) == 3 && len(m[1]) > 0 {
+			result.ConfigSourceSHA256 = string(m[1])
+			unixtime, _ := strconv.ParseFloat(string(m[2]), 64)
+			result.ConfigSourceTimestamp = time.UnixMicro(int64(unixtime * 1e6))
+		} else if m = reVersionMetric.FindSubmatch(scanner.Bytes()); len(m) == 2 && len(m[1]) > 0 {
+			result.Version = string(m[1])
 		}
-		result.ConfigSourceSHA256 = string(m[2])
-		unixtime, _ := strconv.ParseFloat(string(m[3]), 64)
-		result.ConfigSourceTimestamp = time.UnixMicro(int64(unixtime * 1e6))
 	}
 	if err = scanner.Err(); err != nil {
 		err = fmt.Errorf("error parsing response from %s: %w", metricsURL.String(), err)
@@ -477,3 +493,26 @@ func (ccmd checkCommand) run(ctx context.Context, prog string, args []string, st
 	}
 	return nil
 }
+
+var reGoVersion = regexp.MustCompile(` \(go\d+([\d.])*\)$`)
+
+// sameVersion reports whether a==b, or a and b differ only in that
+// exactly one of them carries a trailing " (go1.2.3)"-style suffix.
+//
+// This allows us to recognize a non-Go (rails) service as the same
+// version as a Go service.
+func sameVersion(a, b string) bool {
+	if a == b {
+		return true
+	}
+	anogo := reGoVersion.ReplaceAllLiteralString(a, "")
+	bnogo := reGoVersion.ReplaceAllLiteralString(b, "")
+	if (anogo == a) != (bnogo == b) {
+		// exactly one of a/b had a (go1.2.3) suffix removed, so
+		// compare the strings with that part stripped
+		return anogo == bnogo
+	}
+	// either both or neither had a (go1.2.3) suffix, and the
+	// a==b case was already handled above, so they differ
+	return false
+}
diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go
index 481054c4d..daad208e0 100644
--- a/sdk/go/health/aggregator_test.go
+++ b/sdk/go/health/aggregator_test.go
@@ -13,9 +13,11 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"regexp"
+	"runtime"
 	"strings"
 	"time"
 
+	"git.arvados.org/arvados.git/lib/cmd"
 	"git.arvados.org/arvados.git/lib/config"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/arvadostest"
@@ -254,6 +256,40 @@ func (s *AggregatorSuite) TestClockSkew(c *check.C) {
 	}
 }
 
+func (s *AggregatorSuite) TestVersionSkew(c *check.C) {
+	// srv1: reports exactly cmd.Version.String()
+	handler1 := healthyHandler{version: cmd.Version.String()}
+	srv1, listen1 := s.stubServer(&handler1)
+	defer srv1.Close()
+	// srv2: reports only the first field of our version, i.e. without the " (go1.2.3)" part
+	handler2 := healthyHandler{version: strings.Fields(cmd.Version.String())[0]}
+	srv2, listen2 := s.stubServer(&handler2)
+	defer srv2.Close()
+	// srv3: reports a different version number, with a go version suffix
+	handler3 := healthyHandler{version: "1.2.3~4 (" + runtime.Version() + ")"}
+	srv3, listen3 := s.stubServer(&handler3)
+	defer srv3.Close()
+
+	s.setAllServiceURLs(listen1)
+
+	// RailsAPI points at srv2 (same version minus go1.2.3 part) => OK
+	s.resp = httptest.NewRecorder()
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.RailsAPI,
+		"http://localhost"+listen2+"/")
+	s.handler.ServeHTTP(s.resp, s.req)
+	s.checkOK(c)
+
+	// WebDAV points at srv3 (different version) => error
+	s.resp = httptest.NewRecorder()
+	arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV,
+		"http://localhost"+listen3+"/")
+	s.handler.ServeHTTP(s.resp, s.req)
+	resp := s.checkUnhealthy(c)
+	if c.Check(len(resp.Errors) > 0, check.Equals, true) {
+		c.Check(resp.Errors[0], check.Matches, `version mismatch: \Qkeep-web+http://localhost`+listen3+`\E is running 1.2.3~4 (.*) -- expected \Q`+cmd.Version.String()+`\E`)
+	}
+}
+
 func (s *AggregatorSuite) TestPingTimeout(c *check.C) {
 	s.handler.timeout = arvados.Duration(100 * time.Millisecond)
 	srv, listen := s.stubServer(&slowHandler{})
@@ -292,7 +328,7 @@ func (s *AggregatorSuite) TestCheckCommand(c *check.C) {
 	exitcode = CheckCommand.RunCommand("check", []string{"-config=" + tmpdir + "/config.yml", "-yaml"}, &bytes.Buffer{}, &stdout, &stderr)
 	c.Check(exitcode, check.Equals, 0)
 	c.Check(stderr.String(), check.Equals, "")
-	c.Check(stdout.String(), check.Matches, `(?ms).*(\n|^)health: OK\n.*`)
+	c.Check(stdout.String(), check.Matches, `(?ms).*(\n|^)Health: OK\n.*`)
 }
 
 func (s *AggregatorSuite) checkError(c *check.C) {
@@ -354,6 +390,7 @@ func (*unhealthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request)
 }
 
 type healthyHandler struct {
+	version    string
 	configHash string
 	configTime time.Time
 	headerDate time.Time
@@ -385,9 +422,13 @@ arvados_config_load_timestamp_seconds{sha256="%s"} %g
 # HELP arvados_config_source_timestamp_seconds Timestamp of config file when it was loaded.
 # TYPE arvados_config_source_timestamp_seconds gauge
 arvados_config_source_timestamp_seconds{sha256="%s"} %g
+# HELP arvados_version_running Indicated version is running.
+# TYPE arvados_version_running gauge
+arvados_version_running{version="%s"} 1
 `,
 			h.configHash, float64(time.Now().UnixNano())/1e9,
-			h.configHash, float64(t.UnixNano())/1e9)
+			h.configHash, float64(t.UnixNano())/1e9,
+			h.version)
 	} else {
 		http.Error(resp, "not found", http.StatusNotFound)
 	}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list