[ARVADOS] updated: 68d551b65a837664be72bf08aad55ab76d778d07

Git user git at public.curoverse.com
Thu Apr 7 17:31:05 EDT 2016


Summary of changes:
 .../arvnodeman/computenode/driver/__init__.py      | 51 +++++++++++++++-------
 .../arvnodeman/computenode/driver/azure.py         |  7 ++-
 .../arvnodeman/computenode/driver/ec2.py           |  4 +-
 .../arvnodeman/computenode/driver/gce.py           |  7 ++-
 services/nodemanager/tests/testutil.py             | 25 +++++++++++
 5 files changed, 73 insertions(+), 21 deletions(-)

       via  68d551b65a837664be72bf08aad55ab76d778d07 (commit)
       via  c244fbb0d880b45a44300e9ed650a32a954fd7d9 (commit)
      from  86996de865c2b4d395c2a3977b05f7278924f627 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 68d551b65a837664be72bf08aad55ab76d778d07
Merge: 86996de c244fbb
Author: Brett Smith <brett at curoverse.com>
Date:   Thu Apr 7 17:30:44 2016 -0400

    Merge branch '8872-node-manager-create-search-handling-wip'
    
    Closes #8872, #8900.


commit c244fbb0d880b45a44300e9ed650a32a954fd7d9
Author: Brett Smith <brett at curoverse.com>
Date:   Wed Apr 6 14:23:11 2016 -0400

    8872: Bugfix Node Manager's node search after node create failure.
    
    search_for raises ValueError if the thing isn't found.  create_node
    seems to be expecting it to return None instead.  Bring create_node in
    line with search_for's documented API.
    
    In order to get the tests to pass, I had to separate out the raw
    search code from the caching, and use that in create_node.  Otherwise,
    the cloud node from the "node found" test would be cached and returned
    in the "node not found" test.

diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 0576999..95b6fa8 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -79,7 +79,7 @@ class BaseComputeNodeDriver(RetryMixin):
             key = NodeAuthSSHKey(ssh_file.read())
         return 'auth', key
 
-    def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
+    def search_for_now(self, term, list_method, key=attrgetter('id'), **kwargs):
         """Return one matching item from a list of cloud objects.
 
         Raises ValueError if the number of matching objects is not exactly 1.
@@ -92,16 +92,25 @@ class BaseComputeNodeDriver(RetryMixin):
           value search for a `term` match on each item.  Returns the
           object's 'id' attribute by default.
         """
+        items = getattr(self.real, list_method)(**kwargs)
+        results = [item for item in items if key(item) == term]
+        count = len(results)
+        if count != 1:
+            raise ValueError("{} returned {} results for {!r}".format(
+                    list_method, count, term))
+        return results[0]
+
+    def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
+        """Return one cached matching item from a list of cloud objects.
+
+        See search_for_now() for details of arguments and exceptions.
+        This method caches results, so it's good to find static cloud objects
+        like node sizes, regions, etc.
+        """
         cache_key = (list_method, term)
         if cache_key not in self.SEARCH_CACHE:
-            items = getattr(self.real, list_method)(**kwargs)
-            results = [item for item in items
-                       if key(item) == term]
-            count = len(results)
-            if count != 1:
-                raise ValueError("{} returned {} results for '{}'".format(
-                        list_method, count, term))
-            self.SEARCH_CACHE[cache_key] = results[0]
+            self.SEARCH_CACHE[cache_key] = self.search_for_now(
+                term, list_method, key, **kwargs)
         return self.SEARCH_CACHE[cache_key]
 
     def list_nodes(self, **kwargs):
@@ -109,6 +118,18 @@ class BaseComputeNodeDriver(RetryMixin):
         l.update(kwargs)
         return self.real.list_nodes(**l)
 
+    def create_cloud_name(self, arvados_node):
+        """Return a cloud node name for the given Arvados node record.
+
+        Subclasses must override this method.  It should return a string
+        that can be used as the name for a newly-created cloud node,
+        based on identifying information in the Arvados node record.
+
+        Arguments:
+        * arvados_node: This Arvados node record to seed the new cloud node.
+        """
+        raise NotImplementedError("BaseComputeNodeDriver.create_cloud_name")
+
     def arvados_create_kwargs(self, size, arvados_node):
         """Return dynamic keyword arguments for create_node.
 
@@ -143,19 +164,17 @@ class BaseComputeNodeDriver(RetryMixin):
             kwargs.update(self.arvados_create_kwargs(size, arvados_node))
             kwargs['size'] = size
             return self.real.create_node(**kwargs)
-        except self.CLOUD_ERRORS:
+        except self.CLOUD_ERRORS as create_error:
             # Workaround for bug #6702: sometimes the create node request
             # succeeds but times out and raises an exception instead of
             # returning a result.  If this happens, we get stuck in a retry
             # loop forever because subsequent create_node attempts will fail
             # due to node name collision.  So check if the node we intended to
             # create shows up in the cloud node list and return it if found.
-            node = self.search_for(kwargs['name'], 'list_nodes', self._name_key)
-            if node:
-                return node
-            else:
-                # something else went wrong, re-raise the exception
-                raise
+            try:
+                return self.search_for_now(kwargs['name'], 'list_nodes', self._name_key)
+            except ValueError:
+                raise create_error
 
     def post_create_node(self, cloud_node):
         # ComputeNodeSetupActor calls this method after the cloud node is
diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
index 167d8b3..e293d1b 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/azure.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/azure.py
@@ -38,15 +38,18 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             auth_kwargs, list_kwargs, create_kwargs,
             driver_class)
 
+    def create_cloud_name(self, arvados_node):
+        uuid_parts = arvados_node['uuid'].split('-', 2)
+        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
+
     def arvados_create_kwargs(self, size, arvados_node):
-        cluster_id, _, node_id = arvados_node['uuid'].split('-')
-        name = 'compute-{}-{}'.format(node_id, cluster_id)
         tags = {
             'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
             'arv-ping-url': self._make_ping_url(arvados_node)
         }
         tags.update(self.tags)
 
+        name = self.create_cloud_name(arvados_node)
         customdata = """#!/bin/sh
 mkdir -p    /var/tmp/arv-node-data/meta-data
 echo %s > /var/tmp/arv-node-data/arv-ping-url
diff --git a/services/nodemanager/arvnodeman/computenode/driver/ec2.py b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
index d314d38..8deabbd 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/ec2.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
@@ -64,8 +64,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
     def _init_subnet_id(self, subnet_id):
         return 'ex_subnet', self.search_for(subnet_id, 'ex_list_subnets')
 
+    create_cloud_name = staticmethod(arvados_node_fqdn)
+
     def arvados_create_kwargs(self, size, arvados_node):
-        return {'name': arvados_node_fqdn(arvados_node),
+        return {'name': self.create_cloud_name(arvados_node),
                 'ex_userdata': self._make_ping_url(arvados_node)}
 
     def post_create_node(self, cloud_node):
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index be99883..b853f00 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -60,9 +60,12 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             self.create_kwargs['ex_metadata']['sshKeys'] = (
                 'root:' + ssh_file.read().strip())
 
+    def create_cloud_name(self, arvados_node):
+        uuid_parts = arvados_node['uuid'].split('-', 2)
+        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
+
     def arvados_create_kwargs(self, size, arvados_node):
-        cluster_id, _, node_id = arvados_node['uuid'].split('-')
-        name = 'compute-{}-{}'.format(node_id, cluster_id)
+        name = self.create_cloud_name(arvados_node)
         disks = [
             {'autoDelete': True,
              'boot': True,
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index b9e7bea..b376ca7 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -6,6 +6,7 @@ import datetime
 import threading
 import time
 
+import libcloud.common.types as cloud_types
 import mock
 import pykka
 
@@ -142,6 +143,30 @@ class DriverTestMixin(object):
             self.assertTrue(self.driver_mock.called)
             self.assertIs(driver.real, driver_mock2)
 
+    def test_create_can_find_node_after_timeout(self):
+        driver = self.new_driver()
+        arv_node = arvados_node_mock()
+        cloud_node = cloud_node_mock()
+        cloud_node.name = driver.create_cloud_name(arv_node)
+        create_method = self.driver_mock().create_node
+        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+        list_method = self.driver_mock().list_nodes
+        list_method.return_value = [cloud_node]
+        actual = driver.create_node(MockSize(1), arv_node)
+        self.assertIs(cloud_node, actual)
+
+    def test_create_can_raise_exception_after_timeout(self):
+        driver = self.new_driver()
+        arv_node = arvados_node_mock()
+        create_method = self.driver_mock().create_node
+        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+        list_method = self.driver_mock().list_nodes
+        list_method.return_value = []
+        with self.assertRaises(cloud_types.LibcloudError) as exc_test:
+            driver.create_node(MockSize(1), arv_node)
+        self.assertIs(create_method.side_effect, exc_test.exception)
+
+
 class RemotePollLoopActorTestMixin(ActorTestMixin):
     def build_monitor(self, *args, **kwargs):
         self.timer = mock.MagicMock(name='timer_mock')

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list