[ARVADOS] updated: 70fd9ea4dc6177c1774d90223d4f94edd5332c3a

git at public.curoverse.com git at public.curoverse.com
Tue Oct 7 15:25:19 EDT 2014


Summary of changes:
 services/nodemanager/arvnodeman/__init__.py        | 12 ----
 .../nodemanager/arvnodeman/computenode/__init__.py | 79 +++++++++++++++++++---
 .../nodemanager/arvnodeman/computenode/dummy.py    |  5 +-
 services/nodemanager/arvnodeman/computenode/ec2.py | 11 ++-
 services/nodemanager/arvnodeman/daemon.py          |  4 +-
 services/nodemanager/arvnodeman/launcher.py        | 13 +++-
 services/nodemanager/doc/local.example.cfg         |  2 +-
 services/nodemanager/tests/test_computenode.py     | 22 +++++-
 services/nodemanager/tests/test_computenode_ec2.py | 17 +++++
 services/nodemanager/tests/test_daemon.py          |  3 +-
 10 files changed, 137 insertions(+), 31 deletions(-)

       via  70fd9ea4dc6177c1774d90223d4f94edd5332c3a (commit)
       via  d42ec711c63c40dcef2b9e47d3017ea5dc31ba32 (commit)
       via  16e2f9f8127b345d047c77e589635f0e1d3aa487 (commit)
      from  02b41b35631329d9eff41f6f2e56456edb7f50a6 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 70fd9ea4dc6177c1774d90223d4f94edd5332c3a
Author: Brett Smith <brett at curoverse.com>
Date:   Tue Oct 7 15:19:52 2014 -0400

    2881: Node Manager tags cloud nodes with Arvados hostnames.

diff --git a/services/nodemanager/arvnodeman/__init__.py b/services/nodemanager/arvnodeman/__init__.py
index da3013c..a1ecac7 100644
--- a/services/nodemanager/arvnodeman/__init__.py
+++ b/services/nodemanager/arvnodeman/__init__.py
@@ -2,18 +2,6 @@
 
 from __future__ import absolute_import, print_function
 
-# First import and expose all the classes we want to export.
-from .computenode import \
-    ComputeNodeSetupActor, ComputeNodeShutdownActor, ComputeNodeActor, \
-    ShutdownTimer
-from .daemon import NodeManagerDaemonActor
-from .jobqueue import JobQueueMonitorActor, ServerCalculator
-from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
-from .timedcallback import TimedCallBackActor
-
-__all__ = [name for name in locals().keys() if name[0].isupper()]
-
-# We now return you to your regularly scheduled program.
 import _strptime  # See <http://bugs.python.org/issue7980#msg221094>.
 import logging
 
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index b67c5ef..e5efa3d 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -12,6 +12,10 @@ import pykka
 from ..clientactor import _notify_subscribers
 from .. import config
 
+def arvados_node_fqdn(arvados_node, default_hostname='dynamic.compute'):
+    hostname = arvados_node.get('hostname') or default_hostname
+    return '{}.{}'.format(hostname, arvados_node['domain'])
+
 def arvados_node_mtime(node):
     return time.mktime(time.strptime(node['modified_at'] + 'UTC',
                                      '%Y-%m-%dT%H:%M:%SZ%Z')) - time.timezone
@@ -55,9 +59,9 @@ class BaseComputeNodeDriver(object):
     are responsible for translating the node manager's cloud requests to a
     specific cloud's vocabulary.
 
-    Subclasses must implement arvados_create_kwargs (to update node creation
-    kwargs with information about the specific Arvados node record) and
-    node_start_time.
+    Subclasses must implement arvados_create_kwargs (to update node
+    creation kwargs with information about the specific Arvados node
+    record), sync_node, and node_start_time.
     """
     def __init__(self, auth_kwargs, list_kwargs, create_kwargs, driver_class):
         self.real = driver_class(**auth_kwargs)
@@ -96,6 +100,13 @@ class BaseComputeNodeDriver(object):
         kwargs['size'] = size
         return self.real.create_node(**kwargs)
 
+    def sync_node(self, cloud_node, arvados_node):
+        # When a compute node first pings the API server, the API server
+        # will automatically assign some attributes on the corresponding
+        # node record, like hostname.  This method should propagate that
+        # information back to the cloud node appropriately.
+        raise NotImplementedError("BaseComputeNodeDriver.sync_node")
+
     @classmethod
     def node_start_time(cls, node):
         raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
@@ -200,6 +211,52 @@ class ComputeNodeShutdownActor(config.actor_class):
         self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
 
 
+class ComputeNodeUpdateActor(config.actor_class):
+    """Actor to dispatch one-off cloud management requests.
+
+    This actor receives requests for small cloud updates, and dispatches them
+    to a real driver.  ComputeNodeActors use this to perform maintenance
+    tasks on themselves.  Having a dedicated actor for this gives us the
+    opportunity to control the flow of requests; e.g., by backing off when
+    errors occur.
+
+    This actor is most like a "traditional" Pykka actor: there's no
+    subscribing, but instead methods return real driver results.  If
+    you're interested in those results, you should get them from the
+    Future that the proxy method returns.  Be prepared to handle exceptions
+    from the cloud driver when you do.
+    """
+    def __init__(self, cloud_factory, max_retry_wait=180):
+        super(ComputeNodeUpdateActor, self).__init__()
+        self._cloud = cloud_factory()
+        self.max_retry_wait = max_retry_wait
+        self.error_streak = 0
+        self.next_request_time = time.time()
+
+    def _throttle_errors(orig_func):
+        @functools.wraps(orig_func)
+        def wrapper(self, *args, **kwargs):
+            throttle_time = self.next_request_time - time.time()
+            if throttle_time > 0:
+                time.sleep(throttle_time)
+            self.next_request_time = time.time()
+            try:
+                result = orig_func(self, *args, **kwargs)
+            except config.CLOUD_ERRORS:
+                self.error_streak += 1
+                self.next_request_time += min(2 ** self.error_streak,
+                                              self.max_retry_wait)
+                raise
+            else:
+                self.error_streak = 0
+                return result
+        return wrapper
+
+    @_throttle_errors
+    def sync_node(self, cloud_node, arvados_node):
+        return self._cloud.sync_node(cloud_node, arvados_node)
+
+
 class ShutdownTimer(object):
     """Keep track of a cloud node's shutdown windows.
 
@@ -250,7 +307,7 @@ class ComputeNodeActor(config.actor_class):
     for shutdown.
     """
     def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
-                 timer_actor, arvados_node=None,
+                 timer_actor, update_actor, arvados_node=None,
                  poll_stale_after=600, node_stale_after=3600):
         super(ComputeNodeActor, self).__init__()
         self._later = self.actor_ref.proxy()
@@ -258,12 +315,14 @@ class ComputeNodeActor(config.actor_class):
         self._last_log = None
         self._shutdowns = shutdown_timer
         self._timer = timer_actor
+        self._update = update_actor
         self.cloud_node = cloud_node
         self.cloud_node_start_time = cloud_node_start_time
-        self.arvados_node = arvados_node
         self.poll_stale_after = poll_stale_after
         self.node_stale_after = node_stale_after
         self.subscribers = set()
+        self.arvados_node = None
+        self._later.update_arvados_node(arvados_node)
         self.last_shutdown_opening = None
         self._later.consider_shutdown()
 
@@ -305,7 +364,7 @@ class ComputeNodeActor(config.actor_class):
         if self.arvados_node is not None:
             return None
         elif arvados_node['ip_address'] in self.cloud_node.private_ips:
-            self.arvados_node = arvados_node
+            self._later.update_arvados_node(arvados_node)
             return self.cloud_node.id
         else:
             return None
@@ -318,4 +377,7 @@ class ComputeNodeActor(config.actor_class):
     def update_arvados_node(self, arvados_node):
         if arvados_node is not None:
             self.arvados_node = arvados_node
+            new_hostname = arvados_node_fqdn(self.arvados_node)
+            if new_hostname != self.cloud_node.name:
+                self._update.sync_node(self.cloud_node, self.arvados_node)
             self._later.consider_shutdown()
diff --git a/services/nodemanager/arvnodeman/computenode/dummy.py b/services/nodemanager/arvnodeman/computenode/dummy.py
index e1a2da0..6c39fea 100644
--- a/services/nodemanager/arvnodeman/computenode/dummy.py
+++ b/services/nodemanager/arvnodeman/computenode/dummy.py
@@ -7,7 +7,7 @@ import time
 import libcloud.compute.providers as cloud_provider
 import libcloud.compute.types as cloud_types
 
-from . import BaseComputeNodeDriver
+from . import BaseComputeNodeDriver, arvados_node_fqdn
 
 class ComputeNodeDriver(BaseComputeNodeDriver):
     """Compute node driver wrapper for libcloud's dummy driver.
@@ -44,6 +44,9 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
         self._ensure_private_ip(node)
         return node
 
+    def sync_node(self, cloud_node, arvados_node):
+        cloud_node.name = arvados_node_fqdn(arvados_node)
+
     @classmethod
     def node_start_time(cls, node):
         return cls.DUMMY_START_TIME
diff --git a/services/nodemanager/arvnodeman/computenode/ec2.py b/services/nodemanager/arvnodeman/computenode/ec2.py
index 4269ed7..359bed4 100644
--- a/services/nodemanager/arvnodeman/computenode/ec2.py
+++ b/services/nodemanager/arvnodeman/computenode/ec2.py
@@ -9,7 +9,7 @@ import libcloud.compute.providers as cloud_provider
 import libcloud.compute.types as cloud_types
 from libcloud.compute.drivers import ec2 as cloud_ec2
 
-from . import BaseComputeNodeDriver
+from . import BaseComputeNodeDriver, arvados_node_fqdn
 
 ### Monkeypatch libcloud to support AWS' new SecurityGroup API.
 # These classes can be removed when libcloud support specifying
@@ -79,8 +79,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
 
     def arvados_create_kwargs(self, arvados_node):
         result = {'ex_metadata': self.tags.copy(),
-                  'name': '{}.{}'.format(arvados_node['hostname'],
-                                         arvados_node['domain'])}
+                  'name': arvados_node_fqdn(arvados_node)}
         ping_secret = arvados_node['info'].get('ping_secret')
         if ping_secret is not None:
             ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
@@ -89,6 +88,12 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             result['ex_userdata'] = ping_url
         return result
 
+    def sync_node(self, cloud_node, arvados_node):
+        metadata = self.arvados_create_kwargs(arvados_node)
+        tags = metadata['ex_metadata']
+        tags['Name'] = metadata['name']
+        self.real.ex_create_tags(cloud_node, tags)
+
     @classmethod
     def node_start_time(cls, node):
         time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 8219af6..1aed1c5 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -55,7 +55,7 @@ class NodeManagerDaemonActor(actor_class):
 
 
     def __init__(self, server_wishlist_actor, arvados_nodes_actor,
-                 cloud_nodes_actor, timer_actor,
+                 cloud_nodes_actor, cloud_update_actor, timer_actor,
                  arvados_factory, cloud_factory,
                  shutdown_windows, max_nodes,
                  poll_stale_after=600, node_stale_after=7200,
@@ -66,6 +66,7 @@ class NodeManagerDaemonActor(actor_class):
         self._node_setup = node_setup_class
         self._node_shutdown = node_shutdown_class
         self._node_actor = node_actor_class
+        self._cloud_updater = cloud_update_actor
         self._timer = timer_actor
         self._new_arvados = arvados_factory
         self._new_cloud = cloud_factory
@@ -115,6 +116,7 @@ class NodeManagerDaemonActor(actor_class):
             cloud_node=cloud_node,
             cloud_node_start_time=start_time,
             shutdown_timer=shutdown_timer,
+            update_actor=self._cloud_updater,
             timer_actor=self._timer,
             arvados_node=arvados_node,
             poll_stale_after=self.poll_stale_after,
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 477a2e4..84cb564 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -11,8 +11,14 @@ import time
 import daemon
 import pykka
 
-from . import *  # This imports all the Actor classes.
 from . import config as nmconfig
+from .computenode import \
+    ComputeNodeSetupActor, ComputeNodeShutdownActor, ComputeNodeUpdateActor, \
+    ComputeNodeActor, ShutdownTimer
+from .daemon import NodeManagerDaemonActor
+from .jobqueue import JobQueueMonitorActor, ServerCalculator
+from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
+from .timedcallback import TimedCallBackActor
 
 node_daemon = None
 
@@ -103,8 +109,11 @@ def main(args=None):
     setup_logging(config.get('Logging', 'file'), **config.log_levels())
     timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
         launch_pollers(config)
+    cloud_node_updater = ComputeNodeUpdateActor.start(
+        config.new_cloud_client).proxy()
     node_daemon = NodeManagerDaemonActor.start(
-        job_queue_poller, arvados_node_poller, cloud_node_poller, timer,
+        job_queue_poller, arvados_node_poller, cloud_node_poller,
+        cloud_node_updater, timer,
         config.new_arvados_client, config.new_cloud_client,
         config.shutdown_windows(), config.getint('Daemon', 'max_nodes'),
         config.getint('Daemon', 'poll_stale_after'),
diff --git a/services/nodemanager/tests/test_computenode.py b/services/nodemanager/tests/test_computenode.py
index f2454d3..59e40e7 100644
--- a/services/nodemanager/tests/test_computenode.py
+++ b/services/nodemanager/tests/test_computenode.py
@@ -108,6 +108,20 @@ class ComputeNodeShutdownActorTestCase(testutil.ActorTestMixin,
         self.wait_for_call(self.cloud_client.destroy_node)
 
 
+class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
+                                     unittest.TestCase):
+    def make_actor(self):
+        self.driver = mock.MagicMock(name='driver_mock')
+        self.updater = cnode.ComputeNodeUpdateActor.start(self.driver).proxy()
+
+    def test_node_sync(self):
+        self.make_actor()
+        cloud_node = testutil.cloud_node_mock()
+        arv_node = testutil.arvados_node_mock()
+        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
+
+
 @mock.patch('time.time', return_value=1)
 class ShutdownTimerTestCase(unittest.TestCase):
     def test_two_length_window(self, time_mock):
@@ -144,6 +158,7 @@ class ComputeNodeActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         self.shutdowns = self.MockShutdownTimer()
         self.shutdowns._set_state(False, 300)
         self.timer = mock.MagicMock(name='timer_mock')
+        self.updates = mock.MagicMock(name='update_mock')
         self.cloud_mock = testutil.cloud_node_mock(node_num)
         self.subscriber = mock.Mock(name='subscriber_mock')
 
@@ -155,7 +170,7 @@ class ComputeNodeActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         start_time = time.time()
         self.node_actor = cnode.ComputeNodeActor.start(
             self.cloud_mock, start_time, self.shutdowns, self.timer,
-            arv_node).proxy()
+            self.updates, arv_node).proxy()
         self.node_actor.subscribe(self.subscriber)
 
     def test_init_shutdown_scheduling(self):
@@ -211,9 +226,12 @@ class ComputeNodeActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
 
     def test_arvados_node_match(self):
         self.make_actor(2)
-        arv_node = testutil.arvados_node_mock(2)
+        arv_node = testutil.arvados_node_mock(
+            2, hostname='compute-two.zzzzz.arvadosapi.com')
         pair_future = self.node_actor.offer_arvados_pair(arv_node)
         self.assertEqual(self.cloud_mock.id, pair_future.get(self.TIMEOUT))
+        self.wait_for_call(self.updates.sync_node)
+        self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)
 
     def test_arvados_node_mismatch(self):
         self.make_actor(3)
diff --git a/services/nodemanager/tests/test_computenode_ec2.py b/services/nodemanager/tests/test_computenode_ec2.py
index ac9fda5..d1c9e43 100644
--- a/services/nodemanager/tests/test_computenode_ec2.py
+++ b/services/nodemanager/tests/test_computenode_ec2.py
@@ -63,6 +63,23 @@ class EC2ComputeNodeDriverTestCase(unittest.TestCase):
                           'name': 'compute8.zzzzz.arvadosapi.com'},
                          driver.arvados_create_kwargs(arv_node))
 
+    def test_tags_set_default_hostname_from_new_arvados_node(self):
+        arv_node = testutil.arvados_node_mock(hostname=None)
+        driver = self.new_driver()
+        actual = driver.arvados_create_kwargs(arv_node)
+        self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
+                         actual['name'])
+
+    def test_sync_node(self):
+        arv_node = testutil.arvados_node_mock(1)
+        cloud_node = testutil.cloud_node_mock(2)
+        driver = self.new_driver()
+        driver.sync_node(cloud_node, arv_node)
+        tag_mock = self.driver_mock().ex_create_tags
+        self.assertTrue(tag_mock.called)
+        self.assertEqual('compute1.zzzzz.arvadosapi.com',
+                         tag_mock.call_args[0][1].get('Name', 'no name'))
+
     def test_node_create_time(self):
         refsecs = int(time.time())
         reftuple = time.gmtime(refsecs)
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 3468ec0..8459fb4 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -18,13 +18,14 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.arv_factory = mock.MagicMock(name='arvados_mock')
         self.cloud_factory = mock.MagicMock(name='cloud_mock')
         self.cloud_factory().node_start_time.return_value = time.time()
+        self.cloud_updates = mock.MagicMock(name='updates_mock')
         self.timer = testutil.MockTimer()
         self.node_factory = mock.MagicMock(name='factory_mock')
         self.node_setup = mock.MagicMock(name='setup_mock')
         self.node_shutdown = mock.MagicMock(name='shutdown_mock')
         self.daemon = nmdaemon.NodeManagerDaemonActor.start(
             self.server_wishlist_poller, self.arvados_nodes_poller,
-            self.cloud_nodes_poller, self.timer,
+            self.cloud_nodes_poller, self.cloud_updates, self.timer,
             self.arv_factory, self.cloud_factory,
             [54, 5, 1], 8, 600, 3600,
             self.node_setup, self.node_shutdown, self.node_factory).proxy()

commit d42ec711c63c40dcef2b9e47d3017ea5dc31ba32
Author: Brett Smith <brett at curoverse.com>
Date:   Tue Oct 7 15:19:31 2014 -0400

    2881: Fix Node Manager dummy config to use Size id.

diff --git a/services/nodemanager/doc/local.example.cfg b/services/nodemanager/doc/local.example.cfg
index 33ca3d6..8a6e626 100644
--- a/services/nodemanager/doc/local.example.cfg
+++ b/services/nodemanager/doc/local.example.cfg
@@ -36,6 +36,6 @@ creds = dummycreds
 [Cloud List]
 [Cloud Create]
 
-[Size Medium]
+[Size 2]
 cores = 4
 scratch = 1234

commit 16e2f9f8127b345d047c77e589635f0e1d3aa487
Author: Brett Smith <brett at curoverse.com>
Date:   Tue Oct 7 13:41:18 2014 -0400

    2881: Clarify docstring about node setup procedure.

diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index 8bedf88..b67c5ef 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -108,8 +108,9 @@ class ComputeNodeSetupActor(config.actor_class):
 
     This actor prepares an Arvados node record for a new compute node
     (either creating one or cleaning one passed in), then boots the
-    actual compute node.  It notifies subscribers when the node finishes
-    booting.
+    actual compute node.  It notifies subscribers when the cloud node
+    is successfully created (the last step in the process for Node
+    Manager to handle).
     """
     def __init__(self, timer_actor, arvados_client, cloud_client,
                  cloud_size, arvados_node=None,

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list