[ARVADOS] created: 783690708b6681eeae29dedb32fad193f54db5ea

Git user git at public.curoverse.com
Wed Apr 6 14:28:12 EDT 2016


        at  783690708b6681eeae29dedb32fad193f54db5ea (commit)


commit 783690708b6681eeae29dedb32fad193f54db5ea
Author: Brett Smith <brett at curoverse.com>
Date:   Wed Apr 6 14:23:11 2016 -0400

    8872: Bugfix Node Manager's node search after node create failure.
    
    search_for raises ValueError if the thing isn't found.  create_node
    seems to be expecting it to return None instead.  Bring create_node in
    line with search_for's documented API.
    
    In order to get the tests to pass, I had to separate out the raw
    search code from the caching, and use that in create_node.  Otherwise,
    the cloud node from the "node found" test would be cached and returned
    in the "node not found" test.

diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 0576999..0282122 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -79,6 +79,21 @@ class BaseComputeNodeDriver(RetryMixin):
             key = NodeAuthSSHKey(ssh_file.read())
         return 'auth', key
 
+    def search_for_now(self, term, list_method, key=attrgetter('id'), **kwargs):
+        """Call a cloud list API method and return one matching item.
+
+        See search_now() for details of arguments and exceptions.
+        This method does not cache results.
+        """
+        items = getattr(self.real, list_method)(**kwargs)
+        results = [item for item in items
+                   if key(item) == term]
+        count = len(results)
+        if count != 1:
+            raise ValueError("{} returned {} results for '{}'".format(
+                    list_method, count, term))
+        return results[0]
+
     def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
         """Return one matching item from a list of cloud objects.
 
@@ -94,14 +109,8 @@ class BaseComputeNodeDriver(RetryMixin):
         """
         cache_key = (list_method, term)
         if cache_key not in self.SEARCH_CACHE:
-            items = getattr(self.real, list_method)(**kwargs)
-            results = [item for item in items
-                       if key(item) == term]
-            count = len(results)
-            if count != 1:
-                raise ValueError("{} returned {} results for '{}'".format(
-                        list_method, count, term))
-            self.SEARCH_CACHE[cache_key] = results[0]
+            self.SEARCH_CACHE[cache_key] = self.search_for_now(
+                term, list_method, key, **kwargs)
         return self.SEARCH_CACHE[cache_key]
 
     def list_nodes(self, **kwargs):
@@ -143,19 +152,17 @@ class BaseComputeNodeDriver(RetryMixin):
             kwargs.update(self.arvados_create_kwargs(size, arvados_node))
             kwargs['size'] = size
             return self.real.create_node(**kwargs)
-        except self.CLOUD_ERRORS:
+        except self.CLOUD_ERRORS as create_error:
             # Workaround for bug #6702: sometimes the create node request
             # succeeds but times out and raises an exception instead of
             # returning a result.  If this happens, we get stuck in a retry
             # loop forever because subsequent create_node attempts will fail
             # due to node name collision.  So check if the node we intended to
             # create shows up in the cloud node list and return it if found.
-            node = self.search_for(kwargs['name'], 'list_nodes', self._name_key)
-            if node:
-                return node
-            else:
-                # something else went wrong, re-raise the exception
-                raise
+            try:
+                return self.search_for_now(kwargs['name'], 'list_nodes', self._name_key)
+            except ValueError:
+                raise create_error
 
     def post_create_node(self, cloud_node):
         # ComputeNodeSetupActor calls this method after the cloud node is
diff --git a/services/nodemanager/tests/test_computenode_driver_azure.py b/services/nodemanager/tests/test_computenode_driver_azure.py
index 8e701b9..0b99e9c 100644
--- a/services/nodemanager/tests/test_computenode_driver_azure.py
+++ b/services/nodemanager/tests/test_computenode_driver_azure.py
@@ -19,6 +19,10 @@ class AzureComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase
         list_kwargs.setdefault("ex_resource_group", "TestResourceGroup")
         return super(AzureComputeNodeDriverTestCase, self).new_driver(auth_kwargs, list_kwargs, create_kwargs)
 
+    def cloud_name_from_arv_node(self, arv_node):
+        uuid_parts = arv_node['uuid'].split('-', 2)
+        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
+
     def test_driver_instantiation(self):
         kwargs = {'key': 'testkey'}
         driver = self.new_driver(auth_kwargs=kwargs)
diff --git a/services/nodemanager/tests/test_computenode_driver_ec2.py b/services/nodemanager/tests/test_computenode_driver_ec2.py
index a778cd5..7f1b38e 100644
--- a/services/nodemanager/tests/test_computenode_driver_ec2.py
+++ b/services/nodemanager/tests/test_computenode_driver_ec2.py
@@ -15,6 +15,9 @@ from . import testutil
 class EC2ComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
     TEST_CLASS = ec2.ComputeNodeDriver
 
+    def cloud_name_from_arv_node(self, arv_node):
+        return '{n[hostname]}.{n[domain]}'.format(n=arv_node)
+
     def test_driver_instantiation(self):
         kwargs = {'key': 'testkey'}
         driver = self.new_driver(auth_kwargs=kwargs)
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
index e8b2fa3..15d94a5 100644
--- a/services/nodemanager/tests/test_computenode_driver_gce.py
+++ b/services/nodemanager/tests/test_computenode_driver_gce.py
@@ -28,6 +28,10 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
         return super(GCEComputeNodeDriverTestCase, self).new_driver(
             auth_kwargs, list_kwargs, create_kwargs)
 
+    def cloud_name_from_arv_node(self, arv_node):
+        uuid_parts = arv_node['uuid'].split('-', 2)
+        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
+
     def test_driver_instantiation(self):
         kwargs = {'user_id': 'foo'}
         driver = self.new_driver(auth_kwargs=kwargs)
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index b9e7bea..3f9b87d 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -6,6 +6,7 @@ import datetime
 import threading
 import time
 
+import libcloud.common.types as cloud_types
 import mock
 import pykka
 
@@ -124,6 +125,9 @@ class DriverTestMixin(object):
         self.driver_mock = mock.MagicMock(name='driver_mock')
         super(DriverTestMixin, self).setUp()
 
+    def tearDown(self):
+        del self.driver_mock
+
     def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
         create_kwargs.setdefault('ping_host', '100::')
         return self.TEST_CLASS(
@@ -142,6 +146,30 @@ class DriverTestMixin(object):
             self.assertTrue(self.driver_mock.called)
             self.assertIs(driver.real, driver_mock2)
 
+    def test_create_can_find_node_after_timeout(self):
+        arv_node = arvados_node_mock()
+        cloud_node = cloud_node_mock()
+        cloud_node.name = self.cloud_name_from_arv_node(arv_node)
+        driver = self.new_driver()
+        create_method = self.driver_mock().create_node
+        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+        list_method = self.driver_mock().list_nodes
+        list_method.return_value = [cloud_node]
+        actual = driver.create_node(MockSize(1), arv_node)
+        self.assertIs(cloud_node, actual)
+
+    def test_create_can_raise_exception_after_timeout(self):
+        arv_node = arvados_node_mock()
+        driver = self.new_driver()
+        create_method = self.driver_mock().create_node
+        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+        list_method = self.driver_mock().list_nodes
+        list_method.return_value = []
+        with self.assertRaises(cloud_types.LibcloudError) as exc_test:
+            driver.create_node(MockSize(1), arv_node)
+        self.assertIs(create_method.side_effect, exc_test.exception)
+
+
 class RemotePollLoopActorTestMixin(ActorTestMixin):
     def build_monitor(self, *args, **kwargs):
         self.timer = mock.MagicMock(name='timer_mock')

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list