[ARVADOS] updated: 2bf83ac52cf6af212044ec6f0d3c77b5713bb664
git at public.curoverse.com
git at public.curoverse.com
Thu Dec 18 16:20:35 EST 2014
Summary of changes:
services/nodemanager/arvnodeman/clientactor.py | 13 +++---
.../arvnodeman/computenode/dispatch/__init__.py | 52 +++++++++++++---------
.../arvnodeman/computenode/driver/__init__.py | 20 +++++++++
.../arvnodeman/computenode/driver/ec2.py | 12 ++---
services/nodemanager/arvnodeman/config.py | 2 -
services/nodemanager/arvnodeman/nodelist.py | 10 +++--
.../nodemanager/tests/test_computenode_dispatch.py | 23 +++++++++-
.../tests/test_computenode_driver_ec2.py | 52 ++++++++++++++++------
8 files changed, 131 insertions(+), 53 deletions(-)
via 2bf83ac52cf6af212044ec6f0d3c77b5713bb664 (commit)
via 5f401b4457cd085ce3ecc5b15c4dbaef5a3df749 (commit)
via 6ab7cf882cd9a268374b880b5e55b4c8946406b4 (commit)
from e86ad4d8172c24aae92ccd482ffb122ea01b55ab (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 2bf83ac52cf6af212044ec6f0d3c77b5713bb664
Merge: e86ad4d 5f401b4
Author: Brett Smith <brett at curoverse.com>
Date: Thu Dec 18 16:20:15 2014 -0500
Merge branch '4670-node-manager-robust-tags-wip'
Closes #4670, #4812.
commit 5f401b4457cd085ce3ecc5b15c4dbaef5a3df749
Author: Brett Smith <brett at curoverse.com>
Date: Fri Dec 12 16:16:39 2014 -0500
4670: Add a post-create hook to Node Manager for EC2 tagging.
The previous code was relying on the post-create tagging in libcloud's
EC2 driver. Unfortunately, that's not working out too well for us: if
it fails, you get no indication of that, and it doesn't get retried.
This moves the work up into Node Manager, where failures can be logged
and retried appropriately.
The retry support may be sufficient to resolve #4670. If it's not,
then the additional logging will help us track down the root cause.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index f50670e..48e8dcf 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -129,6 +129,12 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
self.cloud_node = self._cloud.create_node(self.cloud_size,
self.arvados_node)
self._logger.info("Cloud node %s created.", self.cloud_node.id)
+ self._later.post_create()
+
+ @ComputeNodeStateChangeBase._retry()
+ def post_create(self):
+ self._cloud.post_create_node(self.cloud_node)
+ self._logger.info("%s post-create work done.", self.cloud_node.id)
self._finished()
def stop_if_no_cloud_node(self):
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 99b419e..3a0c206 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -58,6 +58,12 @@ class BaseComputeNodeDriver(object):
kwargs['size'] = size
return self.real.create_node(**kwargs)
+ def post_create_node(self, cloud_node):
+ # ComputeNodeSetupActor calls this method after the cloud node is
+ # created. Any setup tasks that need to happen afterward (e.g.,
+ # tagging) should be done in this method.
+ pass
+
def sync_node(self, cloud_node, arvados_node):
# When a compute node first pings the API server, the API server
# will automatically assign some attributes on the corresponding
diff --git a/services/nodemanager/arvnodeman/computenode/driver/ec2.py b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
index c0992f7..255a948 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/ec2.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
@@ -79,8 +79,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
return 'auth', key
def arvados_create_kwargs(self, arvados_node):
- result = {'ex_metadata': self.tags.copy(),
- 'name': arvados_node_fqdn(arvados_node)}
+ result = {'name': arvados_node_fqdn(arvados_node)}
ping_secret = arvados_node['info'].get('ping_secret')
if ping_secret is not None:
ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
@@ -89,11 +88,12 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
result['ex_userdata'] = ping_url
return result
+ def post_create_node(self, cloud_node):
+ self.real.ex_create_tags(cloud_node, self.tags)
+
def sync_node(self, cloud_node, arvados_node):
- metadata = self.arvados_create_kwargs(arvados_node)
- tags = metadata['ex_metadata']
- tags['Name'] = metadata['name']
- self.real.ex_create_tags(cloud_node, tags)
+ self.real.ex_create_tags(cloud_node,
+ {'Name': arvados_node_fqdn(arvados_node)})
@classmethod
def node_start_time(cls, node):
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index c86dcfd..a1dfde3 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -14,7 +14,7 @@ import arvnodeman.computenode.dispatch as dispatch
from . import testutil
class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
- def make_mocks(self, arvados_effect=None, cloud_effect=None):
+ def make_mocks(self, arvados_effect=None):
if arvados_effect is None:
arvados_effect = [testutil.arvados_node_mock()]
self.arvados_effect = arvados_effect
@@ -48,14 +48,33 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
self.assertEqual(self.cloud_client.create_node(),
self.setup_actor.cloud_node.get(self.TIMEOUT))
- def test_failed_calls_retried(self):
+ def test_failed_arvados_calls_retried(self):
self.make_mocks([
arverror.ApiError(httplib2.Response({'status': '500'}), ""),
testutil.arvados_node_mock(),
])
self.make_actor()
+ self.wait_for_assignment(self.setup_actor, 'arvados_node')
+
+ def test_failed_cloud_calls_retried(self):
+ self.make_mocks()
+ self.cloud_client.create_node.side_effect = [
+ Exception("test cloud creation error"),
+ self.cloud_client.create_node.return_value,
+ ]
+ self.make_actor()
self.wait_for_assignment(self.setup_actor, 'cloud_node')
+ def test_failed_post_create_retried(self):
+ self.make_mocks()
+ self.cloud_client.post_create_node.side_effect = [
+ Exception("test cloud post-create error"), None]
+ self.make_actor()
+ done = self.FUTURE_CLASS()
+ self.setup_actor.subscribe(done.set)
+ done.get(self.TIMEOUT)
+ self.assertEqual(2, self.cloud_client.post_create_node.call_count)
+
def test_stop_when_no_cloud_node(self):
self.make_mocks(
arverror.ApiError(httplib2.Response({'status': '500'}), ""))
diff --git a/services/nodemanager/tests/test_computenode_driver_ec2.py b/services/nodemanager/tests/test_computenode_driver_ec2.py
index c765587..fae63a5 100644
--- a/services/nodemanager/tests/test_computenode_driver_ec2.py
+++ b/services/nodemanager/tests/test_computenode_driver_ec2.py
@@ -57,30 +57,41 @@ class EC2ComputeNodeDriverTestCase(unittest.TestCase):
create_method.call_args[1].get('ex_userdata',
'arg missing'))
- def test_tags_created_from_arvados_node(self):
+ def test_hostname_from_arvados_node(self):
arv_node = testutil.arvados_node_mock(8)
- cloud_node = testutil.cloud_node_mock(8)
- driver = self.new_driver(list_kwargs={'tag:list': 'test'})
- self.assertEqual({'ex_metadata': {'list': 'test'},
- 'name': 'compute8.zzzzz.arvadosapi.com'},
- driver.arvados_create_kwargs(arv_node))
+ driver = self.new_driver()
+ self.assertEqual('compute8.zzzzz.arvadosapi.com',
+ driver.arvados_create_kwargs(arv_node)['name'])
- def test_tags_set_default_hostname_from_new_arvados_node(self):
+ def test_default_hostname_from_new_arvados_node(self):
arv_node = testutil.arvados_node_mock(hostname=None)
driver = self.new_driver()
- actual = driver.arvados_create_kwargs(arv_node)
self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
- actual['name'])
+ driver.arvados_create_kwargs(arv_node)['name'])
+
+ def check_node_tagged(self, cloud_node, expected_tags):
+ tag_mock = self.driver_mock().ex_create_tags
+ self.assertTrue(tag_mock.called)
+ self.assertIs(cloud_node, tag_mock.call_args[0][0])
+ self.assertEqual(expected_tags, tag_mock.call_args[0][1])
+
+ def test_post_create_node_tags_from_list_kwargs(self):
+ expect_tags = {'key1': 'test value 1', 'key2': 'test value 2'}
+ list_kwargs = {('tag_' + key): value
+ for key, value in expect_tags.iteritems()}
+ list_kwargs['instance-state-name'] = 'running'
+ cloud_node = testutil.cloud_node_mock()
+ driver = self.new_driver(list_kwargs=list_kwargs)
+ driver.post_create_node(cloud_node)
+ self.check_node_tagged(cloud_node, expect_tags)
def test_sync_node(self):
arv_node = testutil.arvados_node_mock(1)
cloud_node = testutil.cloud_node_mock(2)
driver = self.new_driver()
driver.sync_node(cloud_node, arv_node)
- tag_mock = self.driver_mock().ex_create_tags
- self.assertTrue(tag_mock.called)
- self.assertEqual('compute1.zzzzz.arvadosapi.com',
- tag_mock.call_args[0][1].get('Name', 'no name'))
+ self.check_node_tagged(cloud_node,
+ {'Name': 'compute1.zzzzz.arvadosapi.com'})
def test_node_create_time(self):
refsecs = int(time.time())
commit 6ab7cf882cd9a268374b880b5e55b4c8946406b4
Author: Brett Smith <brett at curoverse.com>
Date: Fri Dec 12 13:18:51 2014 -0500
4670: Node Manager handles more libcloud exceptions.
libcloud compute drivers (at least EC2 and GCE) raise bare Exceptions
when there's some problem talking to the cloud service. The previous
code was expecting to see a LibcloudError, so it wouldn't handle these
errors as intended.
I didn't want to just catch errors with "except Exception" everywhere,
so I added an is_cloud_exception class method to our driver classes to
more accurately identify exceptions that represent trouble talking to
the cloud service. It recognizes exact Exceptions, plus the other
classes we were catching before.
While I was at this, I gave more specific names to the wrapper methods
in compute node actor decorators, as a debugging aid.
diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
index 46a103e..6319f4b 100644
--- a/services/nodemanager/arvnodeman/clientactor.py
+++ b/services/nodemanager/arvnodeman/clientactor.py
@@ -30,12 +30,10 @@ class RemotePollLoopActor(actor_class):
response to subscribers. It takes care of error handling, and retrying
requests with exponential backoff.
- To use this actor, define CLIENT_ERRORS and the _send_request method.
- If you also define an _item_key method, this class will support
- subscribing to a specific item by key in responses.
+ To use this actor, define the _send_request method. If you also
+ define an _item_key method, this class will support subscribing to
+ a specific item by key in responses.
"""
- CLIENT_ERRORS = ()
-
def __init__(self, client, timer_actor, poll_wait=60, max_poll_wait=180):
super(RemotePollLoopActor, self).__init__()
self._client = client
@@ -87,6 +85,9 @@ class RemotePollLoopActor(actor_class):
return "{} got error: {} - waiting {} seconds".format(
self.log_prefix, error, self.poll_wait)
+ def is_common_error(self, exception):
+ return False
+
def poll(self, scheduled_start=None):
self._logger.debug("%s sending poll", self.log_prefix)
start_time = time.time()
@@ -96,7 +97,7 @@ class RemotePollLoopActor(actor_class):
response = self._send_request()
except Exception as error:
errmsg = self._got_error(error)
- if isinstance(error, self.CLIENT_ERRORS):
+ if self.is_common_error(error):
self._logger.warning(errmsg)
else:
self._logger.exception(errmsg)
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index c79d8f9..f50670e 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -19,32 +19,38 @@ class ComputeNodeStateChangeBase(config.actor_class):
This base class takes care of retrying changes and notifying
subscribers when the change is finished.
"""
- def __init__(self, logger_name, timer_actor, retry_wait, max_retry_wait):
+ def __init__(self, logger_name, cloud_client, timer_actor,
+ retry_wait, max_retry_wait):
super(ComputeNodeStateChangeBase, self).__init__()
self._later = self.actor_ref.proxy()
- self._timer = timer_actor
self._logger = logging.getLogger(logger_name)
+ self._cloud = cloud_client
+ self._timer = timer_actor
self.min_retry_wait = retry_wait
self.max_retry_wait = max_retry_wait
self.retry_wait = retry_wait
self.subscribers = set()
@staticmethod
- def _retry(errors):
+ def _retry(errors=()):
"""Retry decorator for an actor method that makes remote requests.
Use this function to decorator an actor method, and pass in a
tuple of exceptions to catch. This decorator will schedule
retries of that method with exponential backoff if the
- original method raises any of the given errors.
+ original method raises a known cloud driver error, or any of the
+ given exception types.
"""
def decorator(orig_func):
@functools.wraps(orig_func)
- def wrapper(self, *args, **kwargs):
+ def retry_wrapper(self, *args, **kwargs):
start_time = time.time()
try:
orig_func(self, *args, **kwargs)
- except errors as error:
+ except Exception as error:
+ if not (isinstance(error, errors) or
+ self._cloud.is_cloud_exception(error)):
+ raise
self._logger.warning(
"Client error: %s - waiting %s seconds",
error, self.retry_wait)
@@ -56,7 +62,7 @@ class ComputeNodeStateChangeBase(config.actor_class):
self.max_retry_wait)
else:
self.retry_wait = self.min_retry_wait
- return wrapper
+ return retry_wrapper
return decorator
def _finished(self):
@@ -86,9 +92,9 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
cloud_size, arvados_node=None,
retry_wait=1, max_retry_wait=180):
super(ComputeNodeSetupActor, self).__init__(
- 'arvnodeman.nodeup', timer_actor, retry_wait, max_retry_wait)
+ 'arvnodeman.nodeup', cloud_client, timer_actor,
+ retry_wait, max_retry_wait)
self._arvados = arvados_client
- self._cloud = cloud_client
self.cloud_size = cloud_size
self.arvados_node = None
self.cloud_node = None
@@ -97,12 +103,12 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
else:
self._later.prepare_arvados_node(arvados_node)
- @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
+ @ComputeNodeStateChangeBase._retry()
def create_arvados_node(self):
self.arvados_node = self._arvados.nodes().create(body={}).execute()
self._later.create_cloud_node()
- @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
+ @ComputeNodeStateChangeBase._retry()
def prepare_arvados_node(self, node):
self.arvados_node = self._arvados.nodes().update(
uuid=node['uuid'],
@@ -116,7 +122,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
).execute()
self._later.create_cloud_node()
- @ComputeNodeStateChangeBase._retry(config.CLOUD_ERRORS)
+ @ComputeNodeStateChangeBase._retry()
def create_cloud_node(self):
self._logger.info("Creating cloud node with size %s.",
self.cloud_size.name)
@@ -143,8 +149,8 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
# eligible. Normal shutdowns based on job demand should be
# cancellable; shutdowns based on node misbehavior should not.
super(ComputeNodeShutdownActor, self).__init__(
- 'arvnodeman.nodedown', timer_actor, retry_wait, max_retry_wait)
- self._cloud = cloud_client
+ 'arvnodeman.nodedown', cloud_client, timer_actor,
+ retry_wait, max_retry_wait)
self._monitor = node_monitor.proxy()
self.cloud_node = self._monitor.cloud_node.get()
self.cancellable = cancellable
@@ -159,7 +165,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
def _stop_if_window_closed(orig_func):
@functools.wraps(orig_func)
- def wrapper(self, *args, **kwargs):
+ def stop_wrapper(self, *args, **kwargs):
if (self.cancellable and
(not self._monitor.shutdown_eligible().get())):
self._logger.info(
@@ -169,10 +175,10 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
return None
else:
return orig_func(self, *args, **kwargs)
- return wrapper
+ return stop_wrapper
@_stop_if_window_closed
- @ComputeNodeStateChangeBase._retry(config.CLOUD_ERRORS)
+ @ComputeNodeStateChangeBase._retry()
def shutdown_node(self):
if self._cloud.destroy_node(self.cloud_node):
self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
@@ -210,14 +216,14 @@ class ComputeNodeUpdateActor(config.actor_class):
def _throttle_errors(orig_func):
@functools.wraps(orig_func)
- def wrapper(self, *args, **kwargs):
+ def throttle_wrapper(self, *args, **kwargs):
throttle_time = self.next_request_time - time.time()
if throttle_time > 0:
time.sleep(throttle_time)
self.next_request_time = time.time()
try:
result = orig_func(self, *args, **kwargs)
- except config.CLOUD_ERRORS:
+ except Exception as error:
self.error_streak += 1
self.next_request_time += min(2 ** self.error_streak,
self.max_retry_wait)
@@ -225,7 +231,7 @@ class ComputeNodeUpdateActor(config.actor_class):
else:
self.error_streak = 0
return result
- return wrapper
+ return throttle_wrapper
@_throttle_errors
def sync_node(self, cloud_node, arvados_node):
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index a20cfde..99b419e 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -2,6 +2,10 @@
from __future__ import absolute_import, print_function
+import libcloud.common.types as cloud_types
+
+from ...config import NETWORK_ERRORS
+
class BaseComputeNodeDriver(object):
"""Abstract base class for compute node drivers.
@@ -15,6 +19,8 @@ class BaseComputeNodeDriver(object):
creation kwargs with information about the specific Arvados node
record), sync_node, and node_start_time.
"""
+ CLOUD_ERRORS = NETWORK_ERRORS + (cloud_types.LibcloudError,)
+
def __init__(self, auth_kwargs, list_kwargs, create_kwargs, driver_class):
self.real = driver_class(**auth_kwargs)
self.list_kwargs = list_kwargs
@@ -62,3 +68,11 @@ class BaseComputeNodeDriver(object):
@classmethod
def node_start_time(cls, node):
raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
+
+ @classmethod
+ def is_cloud_exception(cls, exception):
+ # libcloud compute drivers typically raise bare Exceptions to
+ # represent API errors. Return True for any exception that is
+ # exactly an Exception, or a better-known higher-level exception.
+ return (isinstance(exception, cls.CLOUD_ERRORS) or
+ getattr(exception, '__class__', None) is Exception)
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index f018015..b7ec1fc 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -10,7 +10,6 @@ import sys
import arvados
import httplib2
-import libcloud.common.types as cloud_types
import pykka
from apiclient import errors as apierror
@@ -19,7 +18,6 @@ from apiclient import errors as apierror
# it's low-level, but unlikely to catch code bugs.
NETWORK_ERRORS = (IOError, ssl.SSLError)
ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,)
-CLOUD_ERRORS = NETWORK_ERRORS + (cloud_types.LibcloudError,)
actor_class = pykka.ThreadingActor
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 7ddfb7c..83dd93f 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -11,10 +11,11 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
This actor regularly polls the list of Arvados node records, and
sends it to subscribers.
"""
-
- CLIENT_ERRORS = config.ARVADOS_ERRORS
LOGGER_NAME = 'arvnodeman.arvados_nodes'
+ def is_common_error(self, exception):
+ return isinstance(exception, config.ARVADOS_ERRORS)
+
def _item_key(self, node):
return node['uuid']
@@ -28,10 +29,11 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
This actor regularly polls the cloud to get a list of running compute
nodes, and sends it to subscribers.
"""
-
- CLIENT_ERRORS = config.CLOUD_ERRORS
LOGGER_NAME = 'arvnodeman.cloud_nodes'
+ def is_common_error(self, exception):
+ return self._client.is_cloud_exception(exception)
+
def _item_key(self, node):
return node.id
diff --git a/services/nodemanager/tests/test_computenode_driver_ec2.py b/services/nodemanager/tests/test_computenode_driver_ec2.py
index fde103e..c765587 100644
--- a/services/nodemanager/tests/test_computenode_driver_ec2.py
+++ b/services/nodemanager/tests/test_computenode_driver_ec2.py
@@ -2,9 +2,11 @@
from __future__ import absolute_import, print_function
+import ssl
import time
import unittest
+import libcloud.common.types as cloud_types
import mock
import arvnodeman.computenode.driver.ec2 as ec2
@@ -87,3 +89,16 @@ class EC2ComputeNodeDriverTestCase(unittest.TestCase):
node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
reftuple)}
self.assertEqual(refsecs, ec2.ComputeNodeDriver.node_start_time(node))
+
+ def test_cloud_exceptions(self):
+ for error in [Exception("test exception"),
+ IOError("test exception"),
+ ssl.SSLError("test exception"),
+ cloud_types.LibcloudError("test exception")]:
+ self.assertTrue(ec2.ComputeNodeDriver.is_cloud_exception(error),
+ "{} not flagged as cloud exception".format(error))
+
+ def test_noncloud_exceptions(self):
+ self.assertFalse(
+ ec2.ComputeNodeDriver.is_cloud_exception(ValueError("test error")),
+ "ValueError flagged as cloud exception")
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list