[ARVADOS] created: 9b4000637a9315fd15d8b506381a713243bd1699

git at public.curoverse.com git at public.curoverse.com
Thu Jan 21 14:52:54 EST 2016


        at  9b4000637a9315fd15d8b506381a713243bd1699 (commit)


commit 9b4000637a9315fd15d8b506381a713243bd1699
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Jan 21 14:35:34 2016 -0500

    8281: Fix KeepClient retry bugs.
    
    get() and put() were both handling all Curl exceptions -- including
    timeouts -- by marking the keep service as unusable. For example, if a
    single proxy is the only service available, a single timeout was
    fatal. This is fixed by setting the retry loop status to None instead
    of False after curl exceptions.
    
    put() was repeating its retry loop until it achieved the desired
    number of replicas _in a single iteration_. For example, when trying
    to store 2 replicas, 6 loop iterations with a single success in each
    iteration would result in 6 copies being stored but put() declaring
    failure. This is fixed by checking against a cumulative "done" counter
    instead of the "copies done in this loop iteration" counter.

diff --git a/sdk/python/arvados/keep.py b/sdk/python/arvados/keep.py
index 4fa8a4f..6264b19 100644
--- a/sdk/python/arvados/keep.py
+++ b/sdk/python/arvados/keep.py
@@ -379,6 +379,7 @@ class KeepClient(object):
             url = self.root + str(locator)
             _logger.debug("Request: GET %s", url)
             curl = self._get_user_agent()
+            ok = None
             try:
                 with timer.Timer() as t:
                     self._headers = {}
@@ -410,7 +411,6 @@ class KeepClient(object):
                 self._result = {
                     'error': e,
                 }
-                ok = False
             self._usable = ok != False
             if self._result.get('status_code', None):
                 # The client worked well enough to get an HTTP status
@@ -445,6 +445,7 @@ class KeepClient(object):
             url = self.root + hash_s
             _logger.debug("Request: PUT %s", url)
             curl = self._get_user_agent()
+            ok = None
             try:
                 with timer.Timer() as t:
                     self._headers = {}
@@ -486,7 +487,6 @@ class KeepClient(object):
                 self._result = {
                     'error': e,
                 }
-                ok = False
             self._usable = ok != False # still usable if ok is True or None
             if self._result.get('status_code', None):
                 # Client is functional. See comment in get().
@@ -1018,6 +1018,7 @@ class KeepClient(object):
         roots_map = {}
         loop = retry.RetryLoop(num_retries, self._check_loop_result,
                                backoff_start=2)
+        done = 0
         for tries_left in loop:
             try:
                 sorted_roots = self.map_new_services(
@@ -1046,7 +1047,8 @@ class KeepClient(object):
                 threads.append(t)
             for t in threads:
                 t.join()
-            loop.save_result((thread_limiter.done() >= copies, len(threads)))
+            done += thread_limiter.done()
+            loop.save_result(done >= copies, len(threads)))
 
         if loop.success():
             return thread_limiter.response()
diff --git a/sdk/python/tests/test_keep_client.py b/sdk/python/tests/test_keep_client.py
index 2b380f0..9a0fe80 100644
--- a/sdk/python/tests/test_keep_client.py
+++ b/sdk/python/tests/test_keep_client.py
@@ -852,6 +852,10 @@ class KeepClientRetryTestMixin(object):
         with self.TEST_PATCHER(self.DEFAULT_EXPECT, 500, 200):
             self.check_success(num_retries=3)
 
+    def test_exception_then_success(self):
+        with self.TEST_PATCHER(self.DEFAULT_EXPECT, Exception('mock err'), 200):
+            self.check_success(num_retries=3)
+
     def test_no_default_retry(self):
         with self.TEST_PATCHER(self.DEFAULT_EXPECT, 500, 200):
             self.check_exception()
@@ -928,3 +932,38 @@ class KeepClientRetryPutTestCase(KeepClientRetryTestMixin, unittest.TestCase):
     def test_do_not_send_multiple_copies_to_same_server(self):
         with tutil.mock_keep_responses(self.DEFAULT_EXPECT, 200):
             self.check_exception(copies=2, num_retries=3)
+
+
+ at tutil.skip_sleep
+class RetryNeedsMultipleServices(unittest.TestCase, tutil.ApiClientMock):
+    # Test put()s that need two distinct servers to succeed, possibly
+    # requiring multiple passes through the retry loop.
+
+    def setUp(self):
+        self.api_client = self.mock_keep_services(count=2)
+        self.keep_client = arvados.KeepClient(api_client=self.api_client)
+
+    def test_success_after_exception(self):
+        with tutil.mock_keep_responses(
+                'acbd18db4cc2f85cedef654fccc4a4d8+3',
+                Exception('mock err'), 200, 200) as req_mock:
+            self.keep_client.put('foo', num_retries=1, copies=2)
+        self.assertTrue(3, req_mock.call_count)
+
+    def test_success_after_retryable_error(self):
+        with tutil.mock_keep_responses(
+                'acbd18db4cc2f85cedef654fccc4a4d8+3',
+                500, 200, 200) as req_mock:
+            self.keep_client.put('foo', num_retries=1, copies=2)
+        self.assertTrue(3, req_mock.call_count)
+
+    def test_fail_after_final_error(self):
+        # First retry loop gets a 200 (can't achieve replication by
+        # storing again on that server) and a 400 (can't retry that
+        # server at all), so we shouldn't try a third request.
+        with tutil.mock_keep_responses(
+                'acbd18db4cc2f85cedef654fccc4a4d8+3',
+                200, 400, 200) as req_mock:
+            with self.assertRaises(arvados.errors.KeepWriteError):
+                self.keep_client.put('foo', num_retries=1, copies=2)
+        self.assertTrue(2, req_mock.call_count)
diff --git a/services/fuse/tests/test_retry.py b/services/fuse/tests/test_retry.py
index f6c0807..b46ba78 100644
--- a/services/fuse/tests/test_retry.py
+++ b/services/fuse/tests/test_retry.py
@@ -1,11 +1,17 @@
 import arvados
 import arvados_fuse.command
+import json
 import mock
 import os
+import pycurl
+import Queue
 import run_test_server
 import tempfile
 import unittest
 
+from .integration_test import IntegrationTest
+
+
 class KeepClientRetry(unittest.TestCase):
     origKeepClient = arvados.keep.KeepClient
 
@@ -33,3 +39,22 @@ class KeepClientRetry(unittest.TestCase):
 
     def test_no_retry(self):
         self._test_retry(0, ['--retries=0'])
+
+class RetryPUT(IntegrationTest):
+    @mock.patch('time.sleep')
+    @IntegrationTest.mount(argv=['--read-write', '--mount-tmp=zzz'])
+    def test_retry_write(self, sleep):
+        mockedCurl = mock.Mock(spec=pycurl.Curl(), wraps=pycurl.Curl())
+        mockedCurl.perform.side_effect = Exception('mock error (ok)')
+        q = Queue.Queue()
+        q.put(mockedCurl)
+        q.put(pycurl.Curl())
+        q.put(pycurl.Curl())
+        with mock.patch('arvados.keep.KeepClient.KeepService._get_user_agent', side_effect=lambda: q.get(block=None)):
+            self.pool_test(os.path.join(self.mnt, 'zzz'))
+            self.assertTrue(mockedCurl.perform.called)
+    @staticmethod
+    def _test_retry_write(self, tmp):
+        with open(os.path.join(tmp, 'foo'), 'w') as f:
+            f.write('foo')
+        json.load(open(os.path.join(tmp, '.arvados#collection')))

commit 4ae033c520ef9d305ac6f1c633a8d98c9852d873
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Jan 21 04:01:16 2016 -0500

    8281: Fix arv-mount ignoring --retries argument when writing file data.
    
    "num_retries" arguments get passed around extensively in arvfile.py
    and collection.py in the Python SDK, but ultimately the writing of
    file data is done by a _BlockManager which doesn't have any way to
    accept that argument or pass it along to a KeepClient, so PUT requests
    always use the CollectionWriter's KeepClient's default num_retries.
    
    In arv-mount's case, we have been telling CollectionWriter the
    num_retries we want. When CollectionWriter creates a KeepClient,
    num_retries gets passed along -- normally this works around the fact
    that num_retries gets lost by the _BlockManager layer. However, we
    provided our own KeepClient to use instead of letting CollectionWriter
    create one, and we forgot to set num_retries on our own KeepClient, so
    we weren't retrying PUT requests.

diff --git a/services/fuse/arvados_fuse/command.py b/services/fuse/arvados_fuse/command.py
index cd81a53..71623a5 100644
--- a/services/fuse/arvados_fuse/command.py
+++ b/services/fuse/arvados_fuse/command.py
@@ -151,7 +151,8 @@ class Mount(object):
         self.api = arvados.safeapi.ThreadSafeApiCache(
             apiconfig=arvados.config.settings(),
             keep_params={
-                "block_cache": arvados.keep.KeepBlockCache(self.args.file_cache)
+                'block_cache': arvados.keep.KeepBlockCache(self.args.file_cache),
+                'num_retries': self.args.retries,
             })
         # Do a sanity check that we have a working arvados host + token.
         self.api.users().current().execute()
diff --git a/services/fuse/arvados_fuse/fusedir.py b/services/fuse/arvados_fuse/fusedir.py
index a2135b3..7e627ef 100644
--- a/services/fuse/arvados_fuse/fusedir.py
+++ b/services/fuse/arvados_fuse/fusedir.py
@@ -493,7 +493,8 @@ class TmpCollectionDirectory(CollectionDirectoryBase):
     def __init__(self, parent_inode, inodes, api_client, num_retries):
         collection = self.UnsaveableCollection(
             api_client=api_client,
-            keep_client=api_client.keep)
+            keep_client=api_client.keep,
+            num_retries=num_retries)
         super(TmpCollectionDirectory, self).__init__(
             parent_inode, inodes, collection)
         self.collection_record_file = None
diff --git a/services/fuse/tests/test_retry.py b/services/fuse/tests/test_retry.py
new file mode 100644
index 0000000..f6c0807
--- /dev/null
+++ b/services/fuse/tests/test_retry.py
@@ -0,0 +1,35 @@
+import arvados
+import arvados_fuse.command
+import mock
+import os
+import run_test_server
+import tempfile
+import unittest
+
+class KeepClientRetry(unittest.TestCase):
+    origKeepClient = arvados.keep.KeepClient
+
+    def setUp(self):
+        self.mnt = tempfile.mkdtemp()
+        run_test_server.authorize_with('active')
+
+    def tearDown(self):
+        os.rmdir(self.mnt)
+
+    @mock.patch('arvados_fuse.arvados.keep.KeepClient')
+    def _test_retry(self, num_retries, argv, kc):
+        kc.side_effect = lambda *args, **kw: self.origKeepClient(*args, **kw)
+        with arvados_fuse.command.Mount(
+                arvados_fuse.command.ArgumentParser().parse_args(
+                    argv+[self.mnt])):
+            pass
+        self.assertEqual(num_retries, kc.call_args[1].get('num_retries'))
+
+    def test_default_retry_3(self):
+        self._test_retry(3, [])
+
+    def test_retry_2(self):
+        self._test_retry(2, ['--retries=2'])
+
+    def test_no_retry(self):
+        self._test_retry(0, ['--retries=0'])

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list