[ARVADOS] updated: b38fea181afac08137243a294a502e4605d679d8

git at public.curoverse.com git at public.curoverse.com
Wed Mar 18 08:57:36 EDT 2015


Summary of changes:
 .../arvnodeman/computenode/dispatch/__init__.py    | 15 +++++++++---
 .../arvnodeman/computenode/driver/__init__.py      |  6 +++++
 .../arvnodeman/computenode/driver/dummy.py         |  4 ++++
 .../arvnodeman/computenode/driver/ec2.py           |  4 ++++
 .../arvnodeman/computenode/driver/gce.py           | 10 ++++++++
 services/nodemanager/arvnodeman/daemon.py          |  1 +
 .../nodemanager/tests/test_computenode_dispatch.py | 27 ++++++++++++++++++----
 .../tests/test_computenode_driver_ec2.py           |  6 +++++
 .../tests/test_computenode_driver_gce.py           | 13 +++++++++++
 services/nodemanager/tests/testutil.py             |  5 ++++
 10 files changed, 84 insertions(+), 7 deletions(-)

       via  b38fea181afac08137243a294a502e4605d679d8 (commit)
       via  fff9d822fdf8857a7d8e22c024bc4f04b3b94129 (commit)
      from  18148ec24bf63ec3353a2cd38f6f07c460037db2 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit b38fea181afac08137243a294a502e4605d679d8
Merge: 18148ec fff9d82
Author: Brett Smith <brett at curoverse.com>
Date:   Wed Mar 18 08:56:54 2015 -0400

    Merge branch '5313-node-manager-node-naming-tag-wip'
    
    Refs #5313.  Closes #5434, #5467.


commit fff9d822fdf8857a7d8e22c024bc4f04b3b94129
Author: Brett Smith <brett at curoverse.com>
Date:   Fri Mar 13 11:11:35 2015 -0400

    5313: Node Manager has cloud-specific logic to get node FQDNs.
    
    On AWS, we put compute nodes' FQDN in the name field.  On GCE, we
    can't do that: it can't contain dots.  Add a node_fqdn classmethod to
    cloud drivers to get a node's FQDN from the right place, and use that
    method when deciding whether or not to sync a node.
    
    Syncing a node will occasionally raise an "Invalid fingerprint"
    exception on GCE.  This is nonfatal.  Add a comment explaining why.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 310e788..7081762 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -253,13 +253,14 @@ class ComputeNodeMonitorActor(config.actor_class):
     for shutdown.
     """
     def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
-                 timer_actor, update_actor, arvados_node=None,
+                 cloud_fqdn_func, timer_actor, update_actor, arvados_node=None,
                  poll_stale_after=600, node_stale_after=3600):
         super(ComputeNodeMonitorActor, self).__init__()
         self._later = self.actor_ref.proxy()
         self._logger = logging.getLogger('arvnodeman.computenode')
         self._last_log = None
         self._shutdowns = shutdown_timer
+        self._cloud_node_fqdn = cloud_fqdn_func
         self._timer = timer_actor
         self._update = update_actor
         self.cloud_node = cloud_node
@@ -339,9 +340,17 @@ class ComputeNodeMonitorActor(config.actor_class):
             self._later.consider_shutdown()
 
     def update_arvados_node(self, arvados_node):
+        # If the cloud node's FQDN doesn't match what's in the Arvados node
+        # record, make them match.
+        # This method is a little unusual in the way it just fires off the
+        # request without checking the result or retrying errors.  That's
+        # because this update happens every time we reload the Arvados node
+        # list: if a previous sync attempt failed, we'll see that the names
+        # are out of sync and just try again.  ComputeNodeUpdateActor has
+        # the logic to throttle those effective retries when there's trouble.
         if arvados_node is not None:
             self.arvados_node = arvados_node
-            new_hostname = arvados_node_fqdn(self.arvados_node)
-            if new_hostname != self.cloud_node.name:
+            if (self._cloud_node_fqdn(self.cloud_node) !=
+                  arvados_node_fqdn(self.arvados_node)):
                 self._update.sync_node(self.cloud_node, self.arvados_node)
             self._later.consider_shutdown()
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 369bb8f..b703e0d 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -79,6 +79,12 @@ class BaseComputeNodeDriver(object):
         raise NotImplementedError("BaseComputeNodeDriver.sync_node")
 
     @classmethod
+    def node_fqdn(cls, node):
+        # This method should return the FQDN of the node object argument.
+        # Different clouds store this in different places.
+        raise NotImplementedError("BaseComputeNodeDriver.node_fqdn")
+
+    @classmethod
     def node_start_time(cls, node):
         raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
 
diff --git a/services/nodemanager/arvnodeman/computenode/driver/dummy.py b/services/nodemanager/arvnodeman/computenode/driver/dummy.py
index 3a286bb..c164a25 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/dummy.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/dummy.py
@@ -49,5 +49,9 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
         cloud_node.name = arvados_node_fqdn(arvados_node)
 
     @classmethod
+    def node_fqdn(cls, node):
+        return node.name
+
+    @classmethod
     def node_start_time(cls, node):
         return cls.DUMMY_START_TIME
diff --git a/services/nodemanager/arvnodeman/computenode/driver/ec2.py b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
index 9db3d89..588ca51 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/ec2.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
@@ -81,6 +81,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
                                  {'Name': arvados_node_fqdn(arvados_node)})
 
     @classmethod
+    def node_fqdn(cls, node):
+        return node.name
+
+    @classmethod
     def node_start_time(cls, node):
         time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
         return time.mktime(time.strptime(
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index ccd1937..6380d0e 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -96,6 +96,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             raise
 
     def sync_node(self, cloud_node, arvados_node):
+        # We can't store the FQDN on the name attribute or anything like it,
+        # because (a) names are static throughout the node's life (so FQDN
+        # isn't available because we don't know it at node creation time) and
+        # (b) it can't contain dots.  Instead stash it in metadata.
         hostname = arvados_node_fqdn(arvados_node)
         metadata_req = cloud_node.extra['metadata'].copy()
         metadata_items = metadata_req.setdefault('items', [])
@@ -111,6 +115,12 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             raise Exception("setMetadata error: {}".format(response.error))
 
     @classmethod
+    def node_fqdn(cls, node):
+        # See sync_node comment.
+        return cls._get_metadata(node.extra['metadata'].get('items', []),
+                                 'hostname', '')
+
+    @classmethod
     def node_start_time(cls, node):
         try:
             return arvados_timestamp(cls._get_metadata(
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 0e48078..836b673 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -154,6 +154,7 @@ class NodeManagerDaemonActor(actor_class):
             cloud_node=cloud_node,
             cloud_node_start_time=start_time,
             shutdown_timer=shutdown_timer,
+            cloud_fqdn_func=self._cloud_driver.node_fqdn,
             update_actor=self._cloud_updater,
             timer_actor=self._timer,
             arvados_node=None,
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 8d69ea9..b8cf0ee 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -129,8 +129,9 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         if not hasattr(self, 'timer'):
             self.make_mocks()
         monitor_actor = dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_node, time.time(), self.shutdowns, self.timer,
-            self.updates, self.arvados_node)
+            self.cloud_node, time.time(), self.shutdowns,
+            testutil.cloud_node_fqdn, self.timer, self.updates,
+            self.arvados_node)
         self.shutdown_actor = self.ACTOR_CLASS.start(
             self.timer, self.cloud_client, monitor_actor, cancellable).proxy()
         self.monitor_actor = monitor_actor.proxy()
@@ -218,8 +219,9 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         if start_time is None:
             start_time = time.time()
         self.node_actor = dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_mock, start_time, self.shutdowns, self.timer,
-            self.updates, arv_node).proxy()
+            self.cloud_mock, start_time, self.shutdowns,
+            testutil.cloud_node_fqdn, self.timer, self.updates,
+            arv_node).proxy()
         self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
 
     def node_state(self, *states):
@@ -356,3 +358,20 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
         self.assertEqual(testutil.ip_address_mock(4),
                          current_arvados['ip_address'])
+
+    def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
+        self.make_mocks(5)
+        self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
+        self.make_actor()
+        arv_node = testutil.arvados_node_mock(5)
+        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
+        self.assertEqual(1, self.updates.sync_node.call_count)
+
+    def test_update_arvados_node_skips_sync_when_fqdn_match(self):
+        self.make_mocks(6)
+        arv_node = testutil.arvados_node_mock(6)
+        self.cloud_mock.extra['testname'] ='{n[hostname]}.{n[domain]}'.format(
+            n=arv_node)
+        self.make_actor()
+        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
+        self.assertEqual(0, self.updates.sync_node.call_count)
diff --git a/services/nodemanager/tests/test_computenode_driver_ec2.py b/services/nodemanager/tests/test_computenode_driver_ec2.py
index 8e52824..595f1f4 100644
--- a/services/nodemanager/tests/test_computenode_driver_ec2.py
+++ b/services/nodemanager/tests/test_computenode_driver_ec2.py
@@ -91,6 +91,12 @@ class EC2ComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
                                                    reftuple)}
         self.assertEqual(refsecs, ec2.ComputeNodeDriver.node_start_time(node))
 
+    def test_node_fqdn(self):
+        name = 'fqdntest.zzzzz.arvadosapi.com'
+        node = testutil.cloud_node_mock()
+        node.name = name
+        self.assertEqual(name, ec2.ComputeNodeDriver.node_fqdn(node))
+
     def test_cloud_exceptions(self):
         for error in [Exception("test exception"),
                       IOError("test exception"),
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
index 17114b6..465adc5 100644
--- a/services/nodemanager/tests/test_computenode_driver_gce.py
+++ b/services/nodemanager/tests/test_computenode_driver_gce.py
@@ -127,6 +127,19 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
         metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
         self.assertLessEqual(start_time, metadata.get('booted_at'))
 
+    def test_known_node_fqdn(self):
+        name = 'fqdntest.zzzzz.arvadosapi.com'
+        node = testutil.cloud_node_mock(metadata=self.build_gce_metadata(
+                {'hostname': name}))
+        self.assertEqual(name, gce.ComputeNodeDriver.node_fqdn(node))
+
+    def test_unknown_node_fqdn(self):
+        # Return an empty string.  This lets fqdn be safely compared
+        # against an expected value, and ComputeNodeMonitorActor
+        # should try to update it.
+        node = testutil.cloud_node_mock(metadata=self.build_gce_metadata({}))
+        self.assertEqual('', gce.ComputeNodeDriver.node_fqdn(node))
+
     def test_deliver_ssh_key_in_metadata(self):
         test_ssh_key = 'ssh-rsa-foo'
         arv_node = testutil.arvados_node_mock(1)
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index f0508e7..650a232 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -55,6 +55,11 @@ def cloud_node_mock(node_num=99, **extra):
     node.extra = extra
     return node
 
+def cloud_node_fqdn(node):
+    # We intentionally put the FQDN somewhere goofy to make sure tested code is
+    # using this function for lookups.
+    return node.extra.get('testname', 'NoTestName')
+
 def ip_address_mock(last_octet):
     return '10.20.30.{}'.format(last_octet)
 

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list