[ARVADOS] updated: 70fd9ea4dc6177c1774d90223d4f94edd5332c3a
git at public.curoverse.com
git at public.curoverse.com
Tue Oct 7 15:25:19 EDT 2014
Summary of changes:
services/nodemanager/arvnodeman/__init__.py | 12 ----
.../nodemanager/arvnodeman/computenode/__init__.py | 79 +++++++++++++++++++---
.../nodemanager/arvnodeman/computenode/dummy.py | 5 +-
services/nodemanager/arvnodeman/computenode/ec2.py | 11 ++-
services/nodemanager/arvnodeman/daemon.py | 4 +-
services/nodemanager/arvnodeman/launcher.py | 13 +++-
services/nodemanager/doc/local.example.cfg | 2 +-
services/nodemanager/tests/test_computenode.py | 22 +++++-
services/nodemanager/tests/test_computenode_ec2.py | 17 +++++
services/nodemanager/tests/test_daemon.py | 3 +-
10 files changed, 137 insertions(+), 31 deletions(-)
via 70fd9ea4dc6177c1774d90223d4f94edd5332c3a (commit)
via d42ec711c63c40dcef2b9e47d3017ea5dc31ba32 (commit)
via 16e2f9f8127b345d047c77e589635f0e1d3aa487 (commit)
from 02b41b35631329d9eff41f6f2e56456edb7f50a6 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 70fd9ea4dc6177c1774d90223d4f94edd5332c3a
Author: Brett Smith <brett at curoverse.com>
Date: Tue Oct 7 15:19:52 2014 -0400
2881: Node Manager tags cloud nodes with Arvados hostnames.
diff --git a/services/nodemanager/arvnodeman/__init__.py b/services/nodemanager/arvnodeman/__init__.py
index da3013c..a1ecac7 100644
--- a/services/nodemanager/arvnodeman/__init__.py
+++ b/services/nodemanager/arvnodeman/__init__.py
@@ -2,18 +2,6 @@
from __future__ import absolute_import, print_function
-# First import and expose all the classes we want to export.
-from .computenode import \
- ComputeNodeSetupActor, ComputeNodeShutdownActor, ComputeNodeActor, \
- ShutdownTimer
-from .daemon import NodeManagerDaemonActor
-from .jobqueue import JobQueueMonitorActor, ServerCalculator
-from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
-from .timedcallback import TimedCallBackActor
-
-__all__ = [name for name in locals().keys() if name[0].isupper()]
-
-# We now return you to your regularly scheduled program.
import _strptime # See <http://bugs.python.org/issue7980#msg221094>.
import logging
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index b67c5ef..e5efa3d 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -12,6 +12,10 @@ import pykka
from ..clientactor import _notify_subscribers
from .. import config
+def arvados_node_fqdn(arvados_node, default_hostname='dynamic.compute'):
+ hostname = arvados_node.get('hostname') or default_hostname
+ return '{}.{}'.format(hostname, arvados_node['domain'])
+
def arvados_node_mtime(node):
return time.mktime(time.strptime(node['modified_at'] + 'UTC',
'%Y-%m-%dT%H:%M:%SZ%Z')) - time.timezone
@@ -55,9 +59,9 @@ class BaseComputeNodeDriver(object):
are responsible for translating the node manager's cloud requests to a
specific cloud's vocabulary.
- Subclasses must implement arvados_create_kwargs (to update node creation
- kwargs with information about the specific Arvados node record) and
- node_start_time.
+ Subclasses must implement arvados_create_kwargs (to update node
+ creation kwargs with information about the specific Arvados node
+ record), sync_node, and node_start_time.
"""
def __init__(self, auth_kwargs, list_kwargs, create_kwargs, driver_class):
self.real = driver_class(**auth_kwargs)
@@ -96,6 +100,13 @@ class BaseComputeNodeDriver(object):
kwargs['size'] = size
return self.real.create_node(**kwargs)
+ def sync_node(self, cloud_node, arvados_node):
+ # When a compute node first pings the API server, the API server
+ # will automatically assign some attributes on the corresponding
+ # node record, like hostname. This method should propagate that
+ # information back to the cloud node appropriately.
+ raise NotImplementedError("BaseComputeNodeDriver.sync_node")
+
@classmethod
def node_start_time(cls, node):
raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
@@ -200,6 +211,52 @@ class ComputeNodeShutdownActor(config.actor_class):
self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
+class ComputeNodeUpdateActor(config.actor_class):
+ """Actor to dispatch one-off cloud management requests.
+
+ This actor receives requests for small cloud updates, and dispatches them
+ to a real driver. ComputeNodeActors use this to perform maintenance
+ tasks on themselves. Having a dedicated actor for this gives us the
+ opportunity to control the flow of requests; e.g., by backing off when
+ errors occur.
+
+ This actor is most like a "traditional" Pykka actor: there's no
+ subscribing, but instead methods return real driver results. If
+ you're interested in those results, you should get them from the
+ Future that the proxy method returns. Be prepared to handle exceptions
+ from the cloud driver when you do.
+ """
+ def __init__(self, cloud_factory, max_retry_wait=180):
+ super(ComputeNodeUpdateActor, self).__init__()
+ self._cloud = cloud_factory()
+ self.max_retry_wait = max_retry_wait
+ self.error_streak = 0
+ self.next_request_time = time.time()
+
+ def _throttle_errors(orig_func):
+ @functools.wraps(orig_func)
+ def wrapper(self, *args, **kwargs):
+ throttle_time = self.next_request_time - time.time()
+ if throttle_time > 0:
+ time.sleep(throttle_time)
+ self.next_request_time = time.time()
+ try:
+ result = orig_func(self, *args, **kwargs)
+ except config.CLOUD_ERRORS:
+ self.error_streak += 1
+ self.next_request_time += min(2 ** self.error_streak,
+ self.max_retry_wait)
+ raise
+ else:
+ self.error_streak = 0
+ return result
+ return wrapper
+
+ @_throttle_errors
+ def sync_node(self, cloud_node, arvados_node):
+ return self._cloud.sync_node(cloud_node, arvados_node)
+
+
class ShutdownTimer(object):
"""Keep track of a cloud node's shutdown windows.
@@ -250,7 +307,7 @@ class ComputeNodeActor(config.actor_class):
for shutdown.
"""
def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
- timer_actor, arvados_node=None,
+ timer_actor, update_actor, arvados_node=None,
poll_stale_after=600, node_stale_after=3600):
super(ComputeNodeActor, self).__init__()
self._later = self.actor_ref.proxy()
@@ -258,12 +315,14 @@ class ComputeNodeActor(config.actor_class):
self._last_log = None
self._shutdowns = shutdown_timer
self._timer = timer_actor
+ self._update = update_actor
self.cloud_node = cloud_node
self.cloud_node_start_time = cloud_node_start_time
- self.arvados_node = arvados_node
self.poll_stale_after = poll_stale_after
self.node_stale_after = node_stale_after
self.subscribers = set()
+ self.arvados_node = None
+ self._later.update_arvados_node(arvados_node)
self.last_shutdown_opening = None
self._later.consider_shutdown()
@@ -305,7 +364,7 @@ class ComputeNodeActor(config.actor_class):
if self.arvados_node is not None:
return None
elif arvados_node['ip_address'] in self.cloud_node.private_ips:
- self.arvados_node = arvados_node
+ self._later.update_arvados_node(arvados_node)
return self.cloud_node.id
else:
return None
@@ -318,4 +377,7 @@ class ComputeNodeActor(config.actor_class):
def update_arvados_node(self, arvados_node):
if arvados_node is not None:
self.arvados_node = arvados_node
+ new_hostname = arvados_node_fqdn(self.arvados_node)
+ if new_hostname != self.cloud_node.name:
+ self._update.sync_node(self.cloud_node, self.arvados_node)
self._later.consider_shutdown()
diff --git a/services/nodemanager/arvnodeman/computenode/dummy.py b/services/nodemanager/arvnodeman/computenode/dummy.py
index e1a2da0..6c39fea 100644
--- a/services/nodemanager/arvnodeman/computenode/dummy.py
+++ b/services/nodemanager/arvnodeman/computenode/dummy.py
@@ -7,7 +7,7 @@ import time
import libcloud.compute.providers as cloud_provider
import libcloud.compute.types as cloud_types
-from . import BaseComputeNodeDriver
+from . import BaseComputeNodeDriver, arvados_node_fqdn
class ComputeNodeDriver(BaseComputeNodeDriver):
"""Compute node driver wrapper for libcloud's dummy driver.
@@ -44,6 +44,9 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
self._ensure_private_ip(node)
return node
+ def sync_node(self, cloud_node, arvados_node):
+ cloud_node.name = arvados_node_fqdn(arvados_node)
+
@classmethod
def node_start_time(cls, node):
return cls.DUMMY_START_TIME
diff --git a/services/nodemanager/arvnodeman/computenode/ec2.py b/services/nodemanager/arvnodeman/computenode/ec2.py
index 4269ed7..359bed4 100644
--- a/services/nodemanager/arvnodeman/computenode/ec2.py
+++ b/services/nodemanager/arvnodeman/computenode/ec2.py
@@ -9,7 +9,7 @@ import libcloud.compute.providers as cloud_provider
import libcloud.compute.types as cloud_types
from libcloud.compute.drivers import ec2 as cloud_ec2
-from . import BaseComputeNodeDriver
+from . import BaseComputeNodeDriver, arvados_node_fqdn
### Monkeypatch libcloud to support AWS' new SecurityGroup API.
# These classes can be removed when libcloud support specifying
@@ -79,8 +79,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
def arvados_create_kwargs(self, arvados_node):
result = {'ex_metadata': self.tags.copy(),
- 'name': '{}.{}'.format(arvados_node['hostname'],
- arvados_node['domain'])}
+ 'name': arvados_node_fqdn(arvados_node)}
ping_secret = arvados_node['info'].get('ping_secret')
if ping_secret is not None:
ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
@@ -89,6 +88,12 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
result['ex_userdata'] = ping_url
return result
+ def sync_node(self, cloud_node, arvados_node):
+ metadata = self.arvados_create_kwargs(arvados_node)
+ tags = metadata['ex_metadata']
+ tags['Name'] = metadata['name']
+ self.real.ex_create_tags(cloud_node, tags)
+
@classmethod
def node_start_time(cls, node):
time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 8219af6..1aed1c5 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -55,7 +55,7 @@ class NodeManagerDaemonActor(actor_class):
def __init__(self, server_wishlist_actor, arvados_nodes_actor,
- cloud_nodes_actor, timer_actor,
+ cloud_nodes_actor, cloud_update_actor, timer_actor,
arvados_factory, cloud_factory,
shutdown_windows, max_nodes,
poll_stale_after=600, node_stale_after=7200,
@@ -66,6 +66,7 @@ class NodeManagerDaemonActor(actor_class):
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
self._node_actor = node_actor_class
+ self._cloud_updater = cloud_update_actor
self._timer = timer_actor
self._new_arvados = arvados_factory
self._new_cloud = cloud_factory
@@ -115,6 +116,7 @@ class NodeManagerDaemonActor(actor_class):
cloud_node=cloud_node,
cloud_node_start_time=start_time,
shutdown_timer=shutdown_timer,
+ update_actor=self._cloud_updater,
timer_actor=self._timer,
arvados_node=arvados_node,
poll_stale_after=self.poll_stale_after,
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 477a2e4..84cb564 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -11,8 +11,14 @@ import time
import daemon
import pykka
-from . import * # This imports all the Actor classes.
from . import config as nmconfig
+from .computenode import \
+ ComputeNodeSetupActor, ComputeNodeShutdownActor, ComputeNodeUpdateActor, \
+ ComputeNodeActor, ShutdownTimer
+from .daemon import NodeManagerDaemonActor
+from .jobqueue import JobQueueMonitorActor, ServerCalculator
+from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
+from .timedcallback import TimedCallBackActor
node_daemon = None
@@ -103,8 +109,11 @@ def main(args=None):
setup_logging(config.get('Logging', 'file'), **config.log_levels())
timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
launch_pollers(config)
+ cloud_node_updater = ComputeNodeUpdateActor.start(
+ config.new_cloud_client).proxy()
node_daemon = NodeManagerDaemonActor.start(
- job_queue_poller, arvados_node_poller, cloud_node_poller, timer,
+ job_queue_poller, arvados_node_poller, cloud_node_poller,
+ cloud_node_updater, timer,
config.new_arvados_client, config.new_cloud_client,
config.shutdown_windows(), config.getint('Daemon', 'max_nodes'),
config.getint('Daemon', 'poll_stale_after'),
diff --git a/services/nodemanager/tests/test_computenode.py b/services/nodemanager/tests/test_computenode.py
index f2454d3..59e40e7 100644
--- a/services/nodemanager/tests/test_computenode.py
+++ b/services/nodemanager/tests/test_computenode.py
@@ -108,6 +108,20 @@ class ComputeNodeShutdownActorTestCase(testutil.ActorTestMixin,
self.wait_for_call(self.cloud_client.destroy_node)
+class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
+ unittest.TestCase):
+ def make_actor(self):
+ self.driver = mock.MagicMock(name='driver_mock')
+ self.updater = cnode.ComputeNodeUpdateActor.start(self.driver).proxy()
+
+ def test_node_sync(self):
+ self.make_actor()
+ cloud_node = testutil.cloud_node_mock()
+ arv_node = testutil.arvados_node_mock()
+ self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+ self.driver().sync_node.assert_called_with(cloud_node, arv_node)
+
+
@mock.patch('time.time', return_value=1)
class ShutdownTimerTestCase(unittest.TestCase):
def test_two_length_window(self, time_mock):
@@ -144,6 +158,7 @@ class ComputeNodeActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
self.shutdowns = self.MockShutdownTimer()
self.shutdowns._set_state(False, 300)
self.timer = mock.MagicMock(name='timer_mock')
+ self.updates = mock.MagicMock(name='update_mock')
self.cloud_mock = testutil.cloud_node_mock(node_num)
self.subscriber = mock.Mock(name='subscriber_mock')
@@ -155,7 +170,7 @@ class ComputeNodeActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
start_time = time.time()
self.node_actor = cnode.ComputeNodeActor.start(
self.cloud_mock, start_time, self.shutdowns, self.timer,
- arv_node).proxy()
+ self.updates, arv_node).proxy()
self.node_actor.subscribe(self.subscriber)
def test_init_shutdown_scheduling(self):
@@ -211,9 +226,12 @@ class ComputeNodeActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
def test_arvados_node_match(self):
self.make_actor(2)
- arv_node = testutil.arvados_node_mock(2)
+ arv_node = testutil.arvados_node_mock(
+ 2, hostname='compute-two.zzzzz.arvadosapi.com')
pair_future = self.node_actor.offer_arvados_pair(arv_node)
self.assertEqual(self.cloud_mock.id, pair_future.get(self.TIMEOUT))
+ self.wait_for_call(self.updates.sync_node)
+ self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)
def test_arvados_node_mismatch(self):
self.make_actor(3)
diff --git a/services/nodemanager/tests/test_computenode_ec2.py b/services/nodemanager/tests/test_computenode_ec2.py
index ac9fda5..d1c9e43 100644
--- a/services/nodemanager/tests/test_computenode_ec2.py
+++ b/services/nodemanager/tests/test_computenode_ec2.py
@@ -63,6 +63,23 @@ class EC2ComputeNodeDriverTestCase(unittest.TestCase):
'name': 'compute8.zzzzz.arvadosapi.com'},
driver.arvados_create_kwargs(arv_node))
+ def test_tags_set_default_hostname_from_new_arvados_node(self):
+ arv_node = testutil.arvados_node_mock(hostname=None)
+ driver = self.new_driver()
+ actual = driver.arvados_create_kwargs(arv_node)
+ self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
+ actual['name'])
+
+ def test_sync_node(self):
+ arv_node = testutil.arvados_node_mock(1)
+ cloud_node = testutil.cloud_node_mock(2)
+ driver = self.new_driver()
+ driver.sync_node(cloud_node, arv_node)
+ tag_mock = self.driver_mock().ex_create_tags
+ self.assertTrue(tag_mock.called)
+ self.assertEqual('compute1.zzzzz.arvadosapi.com',
+ tag_mock.call_args[0][1].get('Name', 'no name'))
+
def test_node_create_time(self):
refsecs = int(time.time())
reftuple = time.gmtime(refsecs)
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 3468ec0..8459fb4 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -18,13 +18,14 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.arv_factory = mock.MagicMock(name='arvados_mock')
self.cloud_factory = mock.MagicMock(name='cloud_mock')
self.cloud_factory().node_start_time.return_value = time.time()
+ self.cloud_updates = mock.MagicMock(name='updates_mock')
self.timer = testutil.MockTimer()
self.node_factory = mock.MagicMock(name='factory_mock')
self.node_setup = mock.MagicMock(name='setup_mock')
self.node_shutdown = mock.MagicMock(name='shutdown_mock')
self.daemon = nmdaemon.NodeManagerDaemonActor.start(
self.server_wishlist_poller, self.arvados_nodes_poller,
- self.cloud_nodes_poller, self.timer,
+ self.cloud_nodes_poller, self.cloud_updates, self.timer,
self.arv_factory, self.cloud_factory,
[54, 5, 1], 8, 600, 3600,
self.node_setup, self.node_shutdown, self.node_factory).proxy()
commit d42ec711c63c40dcef2b9e47d3017ea5dc31ba32
Author: Brett Smith <brett at curoverse.com>
Date: Tue Oct 7 15:19:31 2014 -0400
2881: Fix Node Manager dummy config to use Size id.
diff --git a/services/nodemanager/doc/local.example.cfg b/services/nodemanager/doc/local.example.cfg
index 33ca3d6..8a6e626 100644
--- a/services/nodemanager/doc/local.example.cfg
+++ b/services/nodemanager/doc/local.example.cfg
@@ -36,6 +36,6 @@ creds = dummycreds
[Cloud List]
[Cloud Create]
-[Size Medium]
+[Size 2]
cores = 4
scratch = 1234
commit 16e2f9f8127b345d047c77e589635f0e1d3aa487
Author: Brett Smith <brett at curoverse.com>
Date: Tue Oct 7 13:41:18 2014 -0400
2881: Clarify docstring about node setup procedure.
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index 8bedf88..b67c5ef 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -108,8 +108,9 @@ class ComputeNodeSetupActor(config.actor_class):
This actor prepares an Arvados node record for a new compute node
(either creating one or cleaning one passed in), then boots the
- actual compute node. It notifies subscribers when the node finishes
- booting.
+ actual compute node. It notifies subscribers when the cloud node
+ is successfully created (the last step in the process for Node
+ Manager to handle).
"""
def __init__(self, timer_actor, arvados_client, cloud_client,
cloud_size, arvados_node=None,
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list