[arvados] created: 2.7.0-5886-g14e5ee9b52
git repository hosting
git at public.arvados.org
Thu Jan 18 20:24:24 UTC 2024
at 14e5ee9b520680090f8e9acd46c5c7698054de69 (commit)
commit 14e5ee9b520680090f8e9acd46c5c7698054de69
Author: Tom Clegg <tom at curii.com>
Date: Thu Jan 18 15:23:34 2024 -0500
21023: Add exponential-backoff delay between keepclient retries.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/sdk/go/keepclient/keepclient.go b/sdk/go/keepclient/keepclient.go
index 2bd7996b59..4b27f6eea6 100644
--- a/sdk/go/keepclient/keepclient.go
+++ b/sdk/go/keepclient/keepclient.go
@@ -269,6 +269,7 @@ func (kc *KeepClient) getOrHead(method string, locator string, header http.Heade
var errs []string
+ retryDelay := initialRetryDelay
triesRemaining := 1 + kc.Retries
serversToTry := kc.getSortedRoots(locator)
@@ -348,6 +349,10 @@ func (kc *KeepClient) getOrHead(method string, locator string, header http.Heade
return nil, expectLength, url, resp.Header, nil
}
serversToTry = retryList
+ if len(serversToTry) > 0 {
+ time.Sleep(retryDelay)
+ retryDelay = 2 * retryDelay
+ }
}
DebugPrintf("DEBUG: %s %s failed: %v", method, locator, errs)
diff --git a/sdk/go/keepclient/keepclient_test.go b/sdk/go/keepclient/keepclient_test.go
index fe133fe2cb..3af8027118 100644
--- a/sdk/go/keepclient/keepclient_test.go
+++ b/sdk/go/keepclient/keepclient_test.go
@@ -26,8 +26,8 @@ import (
. "gopkg.in/check.v1"
)
-// Gocheck boilerplate
func Test(t *testing.T) {
+ initialRetryDelay = 50 * time.Millisecond
TestingT(t)
}
@@ -804,6 +804,9 @@ func (s *StandaloneSuite) TestGetFail(c *C) {
}
func (s *StandaloneSuite) TestGetFailRetry(c *C) {
+ defer func(orig time.Duration) { initialRetryDelay = orig }(initialRetryDelay)
+ initialRetryDelay = time.Second
+
hash := fmt.Sprintf("%x+3", md5.Sum([]byte("foo")))
st := &FailThenSucceedHandler{
@@ -824,9 +827,12 @@ func (s *StandaloneSuite) TestGetFailRetry(c *C) {
arv.ApiToken = "abc123"
kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
+ t0 := time.Now()
r, n, _, err := kc.Get(hash)
c.Assert(err, IsNil)
c.Check(n, Equals, int64(3))
+ elapsed := time.Since(t0)
+ c.Check(elapsed > initialRetryDelay, Equals, true, Commentf("elapsed %v <= initialRetryDelay %v", elapsed, initialRetryDelay))
content, err := ioutil.ReadAll(r)
c.Check(err, IsNil)
@@ -1484,6 +1490,9 @@ func (s *StandaloneSuite) TestGetIndexWithNoSuchPrefix(c *C) {
}
func (s *StandaloneSuite) TestPutBRetry(c *C) {
+ defer func(orig time.Duration) { initialRetryDelay = orig }(initialRetryDelay)
+ initialRetryDelay = time.Second
+
st := &FailThenSucceedHandler{
handled: make(chan string, 1),
successhandler: &StubPutHandler{
@@ -1515,11 +1524,14 @@ func (s *StandaloneSuite) TestPutBRetry(c *C) {
kc.SetServiceRoots(localRoots, writableLocalRoots, nil)
+ t0 := time.Now()
hash, replicas, err := kc.PutB([]byte("foo"))
c.Check(err, IsNil)
c.Check(hash, Equals, "")
c.Check(replicas, Equals, 2)
+ elapsed := time.Since(t0)
+ c.Check(elapsed > initialRetryDelay, Equals, true, Commentf("elapsed %v <= initialRetryDelay %v", elapsed, initialRetryDelay))
}
func (s *ServerRequiredSuite) TestMakeKeepClientWithNonDiskTypeService(c *C) {
diff --git a/sdk/go/keepclient/support.go b/sdk/go/keepclient/support.go
index 6acaf64baa..924b61cdda 100644
--- a/sdk/go/keepclient/support.go
+++ b/sdk/go/keepclient/support.go
@@ -17,6 +17,7 @@ import (
"os"
"strconv"
"strings"
+ "time"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/arvadosclient"
@@ -28,6 +29,8 @@ import (
// log.Printf to DebugPrintf.
var DebugPrintf = func(string, ...interface{}) {}
+var initialRetryDelay = 5 * time.Second
+
func init() {
if arvadosclient.StringBool(os.Getenv("ARVADOS_DEBUG")) {
DebugPrintf = log.Printf
@@ -218,6 +221,7 @@ func (kc *KeepClient) httpBlockWrite(ctx context.Context, req arvados.BlockWrite
replicasPerThread = req.Replicas
}
+ retryDelay := initialRetryDelay
retriesRemaining := req.Attempts
var retryServers []string
@@ -306,14 +310,18 @@ func (kc *KeepClient) httpBlockWrite(ctx context.Context, req arvados.BlockWrite
}
if status.statusCode == 0 || status.statusCode == 408 || status.statusCode == 429 ||
- (status.statusCode >= 500 && status.statusCode != 503) {
+ (status.statusCode >= 500 && status.statusCode != http.StatusInsufficientStorage) {
// Timeout, too many requests, or other server side failure
- // Do not retry when status code is 503, which means the keep server is full
+ // (do not auto-retry status 507 "full")
retryServers = append(retryServers, status.url[0:strings.LastIndex(status.url, "/")])
}
}
sv = retryServers
+ if len(sv) > 0 {
+ time.Sleep(retryDelay)
+ retryDelay = 2 * retryDelay
+ }
}
return resp, nil
commit b63cf2561b5d40fadb987f7e8491017e2be9054f
Author: Tom Clegg <tom at curii.com>
Date: Thu Jan 18 14:04:26 2024 -0500
21023: Avoid overloading 503 status.
When all volumes are full, return 507 (cf WebDAV) indicating the
client should not auto-retry without additional user interaction.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/services/keepstore/handler_test.go b/services/keepstore/handler_test.go
index 5bdafb77c2..4352d9a54a 100644
--- a/services/keepstore/handler_test.go
+++ b/services/keepstore/handler_test.go
@@ -198,7 +198,7 @@ func (s *HandlerSuite) TestGetHandler(c *check.C) {
// should make the service return a 503 so that clients can retry.
ExpectStatusCode(c,
"Volume backend busy",
- 503, response)
+ http.StatusServiceUnavailable, response)
}
// Test PutBlockHandler on the following situations:
@@ -1270,7 +1270,7 @@ func (s *HandlerSuite) TestPutStorageClasses(c *check.C) {
c.Logf("failure case %#v", trial)
rt.storageClasses = trial.ask
resp := IssueRequest(s.handler, &rt)
- c.Check(resp.Code, check.Equals, http.StatusServiceUnavailable)
+ c.Check(resp.Code, check.Equals, http.StatusInsufficientStorage)
}
}
diff --git a/services/keepstore/handlers.go b/services/keepstore/handlers.go
index abeb20fe86..089ebb46da 100644
--- a/services/keepstore/handlers.go
+++ b/services/keepstore/handlers.go
@@ -853,7 +853,7 @@ func newPutProgress(classes []string) putProgress {
//
// The MD5 hash of the BLOCK does not match the argument HASH.
//
-// 503 Full
+// 507 Full
//
// There was not enough space left in any Keep volume to store
// the object.
diff --git a/services/keepstore/keepstore.go b/services/keepstore/keepstore.go
index 953aa047cb..46b00339b5 100644
--- a/services/keepstore/keepstore.go
+++ b/services/keepstore/keepstore.go
@@ -5,6 +5,7 @@
package keepstore
import (
+ "net/http"
"time"
)
@@ -23,22 +24,22 @@ type KeepError struct {
}
var (
- BadRequestError = &KeepError{400, "Bad Request"}
- UnauthorizedError = &KeepError{401, "Unauthorized"}
- CollisionError = &KeepError{500, "Collision"}
- RequestHashError = &KeepError{422, "Hash mismatch in request"}
- PermissionError = &KeepError{403, "Forbidden"}
- DiskHashError = &KeepError{500, "Hash mismatch in stored data"}
- ExpiredError = &KeepError{401, "Expired permission signature"}
- NotFoundError = &KeepError{404, "Not Found"}
- VolumeBusyError = &KeepError{503, "Volume backend busy"}
- GenericError = &KeepError{500, "Fail"}
- FullError = &KeepError{503, "Full"}
- SizeRequiredError = &KeepError{411, "Missing Content-Length"}
- TooLongError = &KeepError{413, "Block is too large"}
- MethodDisabledError = &KeepError{405, "Method disabled"}
- ErrNotImplemented = &KeepError{500, "Unsupported configuration"}
- ErrClientDisconnect = &KeepError{503, "Client disconnected"}
+ BadRequestError = &KeepError{http.StatusBadRequest, "Bad Request"}
+ UnauthorizedError = &KeepError{http.StatusUnauthorized, "Unauthorized"}
+ CollisionError = &KeepError{http.StatusInternalServerError, "Collision"}
+ RequestHashError = &KeepError{http.StatusUnprocessableEntity, "Hash mismatch in request"}
+ PermissionError = &KeepError{http.StatusForbidden, "Forbidden"}
+ DiskHashError = &KeepError{http.StatusInternalServerError, "Hash mismatch in stored data"}
+ ExpiredError = &KeepError{http.StatusUnauthorized, "Expired permission signature"}
+ NotFoundError = &KeepError{http.StatusNotFound, "Not Found"}
+ VolumeBusyError = &KeepError{http.StatusServiceUnavailable, "Volume backend busy"}
+ GenericError = &KeepError{http.StatusInternalServerError, "Fail"}
+ FullError = &KeepError{http.StatusInsufficientStorage, "Full"}
+ SizeRequiredError = &KeepError{http.StatusLengthRequired, "Missing Content-Length"}
+ TooLongError = &KeepError{http.StatusRequestEntityTooLarge, "Block is too large"}
+ MethodDisabledError = &KeepError{http.StatusMethodNotAllowed, "Method disabled"}
+ ErrNotImplemented = &KeepError{http.StatusInternalServerError, "Unsupported configuration"}
+ ErrClientDisconnect = &KeepError{499, "Client disconnected"} // non-RFC Nginx status code
)
func (e *KeepError) Error() string {
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list