[arvados] created: 2.7.0-5886-g14e5ee9b52

git repository hosting git at public.arvados.org
Thu Jan 18 20:24:24 UTC 2024


        at  14e5ee9b520680090f8e9acd46c5c7698054de69 (commit)


commit 14e5ee9b520680090f8e9acd46c5c7698054de69
Author: Tom Clegg <tom at curii.com>
Date:   Thu Jan 18 15:23:34 2024 -0500

    21023: Add exponential-backoff delay between keepclient retries.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/keepclient/keepclient.go b/sdk/go/keepclient/keepclient.go
index 2bd7996b59..4b27f6eea6 100644
--- a/sdk/go/keepclient/keepclient.go
+++ b/sdk/go/keepclient/keepclient.go
@@ -269,6 +269,7 @@ func (kc *KeepClient) getOrHead(method string, locator string, header http.Heade
 
 	var errs []string
 
+	retryDelay := initialRetryDelay
 	triesRemaining := 1 + kc.Retries
 
 	serversToTry := kc.getSortedRoots(locator)
@@ -348,6 +349,10 @@ func (kc *KeepClient) getOrHead(method string, locator string, header http.Heade
 			return nil, expectLength, url, resp.Header, nil
 		}
 		serversToTry = retryList
+		if len(serversToTry) > 0 {
+			time.Sleep(retryDelay)
+			retryDelay = 2 * retryDelay
+		}
 	}
 	DebugPrintf("DEBUG: %s %s failed: %v", method, locator, errs)
 
diff --git a/sdk/go/keepclient/keepclient_test.go b/sdk/go/keepclient/keepclient_test.go
index fe133fe2cb..3af8027118 100644
--- a/sdk/go/keepclient/keepclient_test.go
+++ b/sdk/go/keepclient/keepclient_test.go
@@ -26,8 +26,8 @@ import (
 	. "gopkg.in/check.v1"
 )
 
-// Gocheck boilerplate
 func Test(t *testing.T) {
+	initialRetryDelay = 50 * time.Millisecond
 	TestingT(t)
 }
 
@@ -804,6 +804,9 @@ func (s *StandaloneSuite) TestGetFail(c *C) {
 }
 
 func (s *StandaloneSuite) TestGetFailRetry(c *C) {
+	defer func(orig time.Duration) { initialRetryDelay = orig }(initialRetryDelay)
+	initialRetryDelay = time.Second
+
 	hash := fmt.Sprintf("%x+3", md5.Sum([]byte("foo")))
 
 	st := &FailThenSucceedHandler{
@@ -824,9 +827,12 @@ func (s *StandaloneSuite) TestGetFailRetry(c *C) {
 	arv.ApiToken = "abc123"
 	kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
 
+	t0 := time.Now()
 	r, n, _, err := kc.Get(hash)
 	c.Assert(err, IsNil)
 	c.Check(n, Equals, int64(3))
+	elapsed := time.Since(t0)
+	c.Check(elapsed > initialRetryDelay, Equals, true, Commentf("elapsed %v <= initialRetryDelay %v", elapsed, initialRetryDelay))
 
 	content, err := ioutil.ReadAll(r)
 	c.Check(err, IsNil)
@@ -1484,6 +1490,9 @@ func (s *StandaloneSuite) TestGetIndexWithNoSuchPrefix(c *C) {
 }
 
 func (s *StandaloneSuite) TestPutBRetry(c *C) {
+	defer func(orig time.Duration) { initialRetryDelay = orig }(initialRetryDelay)
+	initialRetryDelay = time.Second
+
 	st := &FailThenSucceedHandler{
 		handled: make(chan string, 1),
 		successhandler: &StubPutHandler{
@@ -1515,11 +1524,14 @@ func (s *StandaloneSuite) TestPutBRetry(c *C) {
 
 	kc.SetServiceRoots(localRoots, writableLocalRoots, nil)
 
+	t0 := time.Now()
 	hash, replicas, err := kc.PutB([]byte("foo"))
 
 	c.Check(err, IsNil)
 	c.Check(hash, Equals, "")
 	c.Check(replicas, Equals, 2)
+	elapsed := time.Since(t0)
+	c.Check(elapsed > initialRetryDelay, Equals, true, Commentf("elapsed %v <= initialRetryDelay %v", elapsed, initialRetryDelay))
 }
 
 func (s *ServerRequiredSuite) TestMakeKeepClientWithNonDiskTypeService(c *C) {
diff --git a/sdk/go/keepclient/support.go b/sdk/go/keepclient/support.go
index 6acaf64baa..924b61cdda 100644
--- a/sdk/go/keepclient/support.go
+++ b/sdk/go/keepclient/support.go
@@ -17,6 +17,7 @@ import (
 	"os"
 	"strconv"
 	"strings"
+	"time"
 
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/arvadosclient"
@@ -28,6 +29,8 @@ import (
 // log.Printf to DebugPrintf.
 var DebugPrintf = func(string, ...interface{}) {}
 
+var initialRetryDelay = 5 * time.Second
+
 func init() {
 	if arvadosclient.StringBool(os.Getenv("ARVADOS_DEBUG")) {
 		DebugPrintf = log.Printf
@@ -218,6 +221,7 @@ func (kc *KeepClient) httpBlockWrite(ctx context.Context, req arvados.BlockWrite
 		replicasPerThread = req.Replicas
 	}
 
+	retryDelay := initialRetryDelay
 	retriesRemaining := req.Attempts
 	var retryServers []string
 
@@ -306,14 +310,18 @@ func (kc *KeepClient) httpBlockWrite(ctx context.Context, req arvados.BlockWrite
 			}
 
 			if status.statusCode == 0 || status.statusCode == 408 || status.statusCode == 429 ||
-				(status.statusCode >= 500 && status.statusCode != 503) {
+				(status.statusCode >= 500 && status.statusCode != http.StatusInsufficientStorage) {
 				// Timeout, too many requests, or other server side failure
-				// Do not retry when status code is 503, which means the keep server is full
+				// (do not auto-retry status 507 "full")
 				retryServers = append(retryServers, status.url[0:strings.LastIndex(status.url, "/")])
 			}
 		}
 
 		sv = retryServers
+		if len(sv) > 0 {
+			time.Sleep(retryDelay)
+			retryDelay = 2 * retryDelay
+		}
 	}
 
 	return resp, nil

commit b63cf2561b5d40fadb987f7e8491017e2be9054f
Author: Tom Clegg <tom at curii.com>
Date:   Thu Jan 18 14:04:26 2024 -0500

    21023: Avoid overloading 503 status.
    
    When all volumes are full, return 507 (cf WebDAV) indicating the
    client should not auto-retry without additional user interaction.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/keepstore/handler_test.go b/services/keepstore/handler_test.go
index 5bdafb77c2..4352d9a54a 100644
--- a/services/keepstore/handler_test.go
+++ b/services/keepstore/handler_test.go
@@ -198,7 +198,7 @@ func (s *HandlerSuite) TestGetHandler(c *check.C) {
 	// should make the service return a 503 so that clients can retry.
 	ExpectStatusCode(c,
 		"Volume backend busy",
-		503, response)
+		http.StatusServiceUnavailable, response)
 }
 
 // Test PutBlockHandler on the following situations:
@@ -1270,7 +1270,7 @@ func (s *HandlerSuite) TestPutStorageClasses(c *check.C) {
 		c.Logf("failure case %#v", trial)
 		rt.storageClasses = trial.ask
 		resp := IssueRequest(s.handler, &rt)
-		c.Check(resp.Code, check.Equals, http.StatusServiceUnavailable)
+		c.Check(resp.Code, check.Equals, http.StatusInsufficientStorage)
 	}
 }
 
diff --git a/services/keepstore/handlers.go b/services/keepstore/handlers.go
index abeb20fe86..089ebb46da 100644
--- a/services/keepstore/handlers.go
+++ b/services/keepstore/handlers.go
@@ -853,7 +853,7 @@ func newPutProgress(classes []string) putProgress {
 //
 //	The MD5 hash of the BLOCK does not match the argument HASH.
 //
-// 503 Full
+// 507 Full
 //
 //	There was not enough space left in any Keep volume to store
 //	the object.
diff --git a/services/keepstore/keepstore.go b/services/keepstore/keepstore.go
index 953aa047cb..46b00339b5 100644
--- a/services/keepstore/keepstore.go
+++ b/services/keepstore/keepstore.go
@@ -5,6 +5,7 @@
 package keepstore
 
 import (
+	"net/http"
 	"time"
 )
 
@@ -23,22 +24,22 @@ type KeepError struct {
 }
 
 var (
-	BadRequestError     = &KeepError{400, "Bad Request"}
-	UnauthorizedError   = &KeepError{401, "Unauthorized"}
-	CollisionError      = &KeepError{500, "Collision"}
-	RequestHashError    = &KeepError{422, "Hash mismatch in request"}
-	PermissionError     = &KeepError{403, "Forbidden"}
-	DiskHashError       = &KeepError{500, "Hash mismatch in stored data"}
-	ExpiredError        = &KeepError{401, "Expired permission signature"}
-	NotFoundError       = &KeepError{404, "Not Found"}
-	VolumeBusyError     = &KeepError{503, "Volume backend busy"}
-	GenericError        = &KeepError{500, "Fail"}
-	FullError           = &KeepError{503, "Full"}
-	SizeRequiredError   = &KeepError{411, "Missing Content-Length"}
-	TooLongError        = &KeepError{413, "Block is too large"}
-	MethodDisabledError = &KeepError{405, "Method disabled"}
-	ErrNotImplemented   = &KeepError{500, "Unsupported configuration"}
-	ErrClientDisconnect = &KeepError{503, "Client disconnected"}
+	BadRequestError     = &KeepError{http.StatusBadRequest, "Bad Request"}
+	UnauthorizedError   = &KeepError{http.StatusUnauthorized, "Unauthorized"}
+	CollisionError      = &KeepError{http.StatusInternalServerError, "Collision"}
+	RequestHashError    = &KeepError{http.StatusUnprocessableEntity, "Hash mismatch in request"}
+	PermissionError     = &KeepError{http.StatusForbidden, "Forbidden"}
+	DiskHashError       = &KeepError{http.StatusInternalServerError, "Hash mismatch in stored data"}
+	ExpiredError        = &KeepError{http.StatusUnauthorized, "Expired permission signature"}
+	NotFoundError       = &KeepError{http.StatusNotFound, "Not Found"}
+	VolumeBusyError     = &KeepError{http.StatusServiceUnavailable, "Volume backend busy"}
+	GenericError        = &KeepError{http.StatusInternalServerError, "Fail"}
+	FullError           = &KeepError{http.StatusInsufficientStorage, "Full"}
+	SizeRequiredError   = &KeepError{http.StatusLengthRequired, "Missing Content-Length"}
+	TooLongError        = &KeepError{http.StatusRequestEntityTooLarge, "Block is too large"}
+	MethodDisabledError = &KeepError{http.StatusMethodNotAllowed, "Method disabled"}
+	ErrNotImplemented   = &KeepError{http.StatusInternalServerError, "Unsupported configuration"}
+	ErrClientDisconnect = &KeepError{499, "Client disconnected"} // non-RFC Nginx status code
 )
 
 func (e *KeepError) Error() string {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list