[arvados] created: 2.7.0-6017-g570d9570f6
git repository hosting
git at public.arvados.org
Tue Feb 13 15:38:28 UTC 2024
at 570d9570f6655a6e9ca4365173406e67dd0e0e31 (commit)
commit 570d9570f6655a6e9ca4365173406e67dd0e0e31
Author: Tom Clegg <tom at curii.com>
Date: Tue Feb 13 10:36:58 2024 -0500
21023: Add exponential-backoff delay between keepclient retries.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/sdk/go/keepclient/keepclient.go b/sdk/go/keepclient/keepclient.go
index 2bd7996b59..0f500a4fbc 100644
--- a/sdk/go/keepclient/keepclient.go
+++ b/sdk/go/keepclient/keepclient.go
@@ -44,6 +44,8 @@ var (
DefaultProxyTLSHandshakeTimeout = 10 * time.Second
DefaultProxyKeepAlive = 120 * time.Second
+ DefaultRetryDelay = 2 * time.Second
+
rootCacheDir = "/var/cache/arvados/keep"
userCacheDir = ".cache/arvados/keep" // relative to HOME
)
@@ -113,6 +115,7 @@ type KeepClient struct {
lock sync.RWMutex
HTTPClient HTTPClient
Retries int
+ RetryDelay time.Duration // Delay after first attempt (increases on each attempt); if 0, use DefaultRetryDelay
RequestID string
StorageClasses []string
DefaultStorageClasses []string // Set by cluster's exported config
@@ -141,6 +144,7 @@ func (kc *KeepClient) Clone() *KeepClient {
gatewayRoots: kc.gatewayRoots,
HTTPClient: kc.HTTPClient,
Retries: kc.Retries,
+ RetryDelay: kc.RetryDelay,
RequestID: kc.RequestID,
StorageClasses: kc.StorageClasses,
DefaultStorageClasses: kc.DefaultStorageClasses,
@@ -190,6 +194,7 @@ func New(arv *arvadosclient.ArvadosClient) *KeepClient {
Arvados: arv,
Want_replicas: defaultReplicationLevel,
Retries: 2,
+ RetryDelay: DefaultRetryDelay,
}
err = kc.loadDefaultClasses()
if err != nil {
@@ -269,6 +274,10 @@ func (kc *KeepClient) getOrHead(method string, locator string, header http.Heade
var errs []string
+ retryDelay := kc.RetryDelay
+ if retryDelay < 1 {
+ retryDelay = DefaultRetryDelay
+ }
triesRemaining := 1 + kc.Retries
serversToTry := kc.getSortedRoots(locator)
@@ -348,6 +357,10 @@ func (kc *KeepClient) getOrHead(method string, locator string, header http.Heade
return nil, expectLength, url, resp.Header, nil
}
serversToTry = retryList
+ if len(serversToTry) > 0 && triesRemaining > 0 {
+ time.Sleep(retryDelay)
+ retryDelay = 2 * retryDelay
+ }
}
DebugPrintf("DEBUG: %s %s failed: %v", method, locator, errs)
diff --git a/sdk/go/keepclient/keepclient_test.go b/sdk/go/keepclient/keepclient_test.go
index 94591bd064..8b5f8dc8db 100644
--- a/sdk/go/keepclient/keepclient_test.go
+++ b/sdk/go/keepclient/keepclient_test.go
@@ -26,8 +26,8 @@ import (
. "gopkg.in/check.v1"
)
-// Gocheck boilerplate
func Test(t *testing.T) {
+ DefaultRetryDelay = 50 * time.Millisecond
TestingT(t)
}
@@ -560,14 +560,7 @@ func (s *StandaloneSuite) TestPutHR(c *C) {
kc.SetServiceRoots(localRoots, writableLocalRoots, nil)
- reader, writer := io.Pipe()
-
- go func() {
- writer.Write([]byte("foo"))
- writer.Close()
- }()
-
- kc.PutHR(hash, reader, 3)
+ kc.PutHR(hash, bytes.NewBuffer([]byte("foo")), 3)
shuff := NewRootSorter(kc.LocalRoots(), hash).GetSortedRoots()
@@ -804,6 +797,9 @@ func (s *StandaloneSuite) TestGetFail(c *C) {
}
func (s *StandaloneSuite) TestGetFailRetry(c *C) {
+ defer func(orig time.Duration) { DefaultRetryDelay = orig }(DefaultRetryDelay)
+ DefaultRetryDelay = time.Second
+
hash := fmt.Sprintf("%x+3", md5.Sum([]byte("foo")))
st := &FailThenSucceedHandler{
@@ -824,9 +820,12 @@ func (s *StandaloneSuite) TestGetFailRetry(c *C) {
arv.ApiToken = "abc123"
kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
+ t0 := time.Now()
r, n, _, err := kc.Get(hash)
c.Assert(err, IsNil)
c.Check(n, Equals, int64(3))
+ elapsed := time.Since(t0)
+ c.Check(elapsed > DefaultRetryDelay, Equals, true, Commentf("elapsed %v <= DefaultRetryDelay %v", elapsed, DefaultRetryDelay))
content, err := ioutil.ReadAll(r)
c.Check(err, IsNil)
@@ -1484,6 +1483,9 @@ func (s *StandaloneSuite) TestGetIndexWithNoSuchPrefix(c *C) {
}
func (s *StandaloneSuite) TestPutBRetry(c *C) {
+ defer func(orig time.Duration) { DefaultRetryDelay = orig }(DefaultRetryDelay)
+ DefaultRetryDelay = time.Second
+
st := &FailThenSucceedHandler{
handled: make(chan string, 1),
successhandler: &StubPutHandler{
@@ -1515,11 +1517,14 @@ func (s *StandaloneSuite) TestPutBRetry(c *C) {
kc.SetServiceRoots(localRoots, writableLocalRoots, nil)
+ t0 := time.Now()
hash, replicas, err := kc.PutB([]byte("foo"))
c.Check(err, IsNil)
c.Check(hash, Equals, "")
c.Check(replicas, Equals, 2)
+ elapsed := time.Since(t0)
+ c.Check(elapsed > DefaultRetryDelay, Equals, true, Commentf("elapsed %v <= DefaultRetryDelay %v", elapsed, DefaultRetryDelay))
}
func (s *ServerRequiredSuite) TestMakeKeepClientWithNonDiskTypeService(c *C) {
diff --git a/sdk/go/keepclient/support.go b/sdk/go/keepclient/support.go
index 6acaf64baa..9dc57267c0 100644
--- a/sdk/go/keepclient/support.go
+++ b/sdk/go/keepclient/support.go
@@ -17,6 +17,7 @@ import (
"os"
"strconv"
"strings"
+ "time"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/arvadosclient"
@@ -218,6 +219,10 @@ func (kc *KeepClient) httpBlockWrite(ctx context.Context, req arvados.BlockWrite
replicasPerThread = req.Replicas
}
+ retryDelay := kc.RetryDelay
+ if retryDelay < 1 {
+ retryDelay = DefaultRetryDelay
+ }
retriesRemaining := req.Attempts
var retryServers []string
@@ -306,14 +311,18 @@ func (kc *KeepClient) httpBlockWrite(ctx context.Context, req arvados.BlockWrite
}
if status.statusCode == 0 || status.statusCode == 408 || status.statusCode == 429 ||
- (status.statusCode >= 500 && status.statusCode != 503) {
+ (status.statusCode >= 500 && status.statusCode != http.StatusInsufficientStorage) {
// Timeout, too many requests, or other server side failure
- // Do not retry when status code is 503, which means the keep server is full
+ // (do not auto-retry status 507 "full")
retryServers = append(retryServers, status.url[0:strings.LastIndex(status.url, "/")])
}
}
sv = retryServers
+ if len(sv) > 0 {
+ time.Sleep(retryDelay)
+ retryDelay = 2 * retryDelay
+ }
}
return resp, nil
diff --git a/services/keepproxy/keepproxy.go b/services/keepproxy/keepproxy.go
index 39ffd45cbe..8afe20f911 100644
--- a/services/keepproxy/keepproxy.go
+++ b/services/keepproxy/keepproxy.go
@@ -320,6 +320,7 @@ func (h *proxyHandler) Get(resp http.ResponseWriter, req *http.Request) {
kc := h.makeKeepClient(req)
kc.DiskCacheSize = keepclient.DiskCacheDisabled
+ kc.RetryDelay = time.Second
var pass bool
var tok string
diff --git a/services/keepproxy/keepproxy_test.go b/services/keepproxy/keepproxy_test.go
index 7efba2348b..473be4d3a6 100644
--- a/services/keepproxy/keepproxy_test.go
+++ b/services/keepproxy/keepproxy_test.go
@@ -32,8 +32,8 @@ import (
. "gopkg.in/check.v1"
)
-// Gocheck boilerplate
func Test(t *testing.T) {
+ keepclient.DefaultRetryDelay = time.Millisecond
TestingT(t)
}
diff --git a/services/keepstore/command.go b/services/keepstore/command.go
index 48c8256a3c..7de9fa833e 100644
--- a/services/keepstore/command.go
+++ b/services/keepstore/command.go
@@ -206,6 +206,7 @@ func (h *handler) setup(ctx context.Context, cluster *arvados.Cluster, token str
Arvados: ac,
Want_replicas: 1,
DiskCacheSize: keepclient.DiskCacheDisabled,
+ Retries: 0,
}
h.keepClient.Arvados.ApiToken = fmt.Sprintf("%x", rand.Int63())
commit c300d42d1b07d74a054be8bb561b3152c537376c
Author: Tom Clegg <tom at curii.com>
Date: Thu Jan 18 14:04:26 2024 -0500
21023: Avoid overloading 503 status.
When all volumes are full, return 507 (cf WebDAV) indicating the
client should not auto-retry without additional user interaction.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/services/keepstore/handler_test.go b/services/keepstore/handler_test.go
index 5bdafb77c2..4352d9a54a 100644
--- a/services/keepstore/handler_test.go
+++ b/services/keepstore/handler_test.go
@@ -198,7 +198,7 @@ func (s *HandlerSuite) TestGetHandler(c *check.C) {
// should make the service return a 503 so that clients can retry.
ExpectStatusCode(c,
"Volume backend busy",
- 503, response)
+ http.StatusServiceUnavailable, response)
}
// Test PutBlockHandler on the following situations:
@@ -1270,7 +1270,7 @@ func (s *HandlerSuite) TestPutStorageClasses(c *check.C) {
c.Logf("failure case %#v", trial)
rt.storageClasses = trial.ask
resp := IssueRequest(s.handler, &rt)
- c.Check(resp.Code, check.Equals, http.StatusServiceUnavailable)
+ c.Check(resp.Code, check.Equals, http.StatusInsufficientStorage)
}
}
diff --git a/services/keepstore/handlers.go b/services/keepstore/handlers.go
index abeb20fe86..089ebb46da 100644
--- a/services/keepstore/handlers.go
+++ b/services/keepstore/handlers.go
@@ -853,7 +853,7 @@ func newPutProgress(classes []string) putProgress {
//
// The MD5 hash of the BLOCK does not match the argument HASH.
//
-// 503 Full
+// 507 Full
//
// There was not enough space left in any Keep volume to store
// the object.
diff --git a/services/keepstore/keepstore.go b/services/keepstore/keepstore.go
index 953aa047cb..46b00339b5 100644
--- a/services/keepstore/keepstore.go
+++ b/services/keepstore/keepstore.go
@@ -5,6 +5,7 @@
package keepstore
import (
+ "net/http"
"time"
)
@@ -23,22 +24,22 @@ type KeepError struct {
}
var (
- BadRequestError = &KeepError{400, "Bad Request"}
- UnauthorizedError = &KeepError{401, "Unauthorized"}
- CollisionError = &KeepError{500, "Collision"}
- RequestHashError = &KeepError{422, "Hash mismatch in request"}
- PermissionError = &KeepError{403, "Forbidden"}
- DiskHashError = &KeepError{500, "Hash mismatch in stored data"}
- ExpiredError = &KeepError{401, "Expired permission signature"}
- NotFoundError = &KeepError{404, "Not Found"}
- VolumeBusyError = &KeepError{503, "Volume backend busy"}
- GenericError = &KeepError{500, "Fail"}
- FullError = &KeepError{503, "Full"}
- SizeRequiredError = &KeepError{411, "Missing Content-Length"}
- TooLongError = &KeepError{413, "Block is too large"}
- MethodDisabledError = &KeepError{405, "Method disabled"}
- ErrNotImplemented = &KeepError{500, "Unsupported configuration"}
- ErrClientDisconnect = &KeepError{503, "Client disconnected"}
+ BadRequestError = &KeepError{http.StatusBadRequest, "Bad Request"}
+ UnauthorizedError = &KeepError{http.StatusUnauthorized, "Unauthorized"}
+ CollisionError = &KeepError{http.StatusInternalServerError, "Collision"}
+ RequestHashError = &KeepError{http.StatusUnprocessableEntity, "Hash mismatch in request"}
+ PermissionError = &KeepError{http.StatusForbidden, "Forbidden"}
+ DiskHashError = &KeepError{http.StatusInternalServerError, "Hash mismatch in stored data"}
+ ExpiredError = &KeepError{http.StatusUnauthorized, "Expired permission signature"}
+ NotFoundError = &KeepError{http.StatusNotFound, "Not Found"}
+ VolumeBusyError = &KeepError{http.StatusServiceUnavailable, "Volume backend busy"}
+ GenericError = &KeepError{http.StatusInternalServerError, "Fail"}
+ FullError = &KeepError{http.StatusInsufficientStorage, "Full"}
+ SizeRequiredError = &KeepError{http.StatusLengthRequired, "Missing Content-Length"}
+ TooLongError = &KeepError{http.StatusRequestEntityTooLarge, "Block is too large"}
+ MethodDisabledError = &KeepError{http.StatusMethodNotAllowed, "Method disabled"}
+ ErrNotImplemented = &KeepError{http.StatusInternalServerError, "Unsupported configuration"}
+ ErrClientDisconnect = &KeepError{499, "Client disconnected"} // non-RFC Nginx status code
)
func (e *KeepError) Error() string {
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list