[ARVADOS] updated: b38fea181afac08137243a294a502e4605d679d8
git at public.curoverse.com
git at public.curoverse.com
Wed Mar 18 08:57:36 EDT 2015
Summary of changes:
.../arvnodeman/computenode/dispatch/__init__.py | 15 +++++++++---
.../arvnodeman/computenode/driver/__init__.py | 6 +++++
.../arvnodeman/computenode/driver/dummy.py | 4 ++++
.../arvnodeman/computenode/driver/ec2.py | 4 ++++
.../arvnodeman/computenode/driver/gce.py | 10 ++++++++
services/nodemanager/arvnodeman/daemon.py | 1 +
.../nodemanager/tests/test_computenode_dispatch.py | 27 ++++++++++++++++++----
.../tests/test_computenode_driver_ec2.py | 6 +++++
.../tests/test_computenode_driver_gce.py | 13 +++++++++++
services/nodemanager/tests/testutil.py | 5 ++++
10 files changed, 84 insertions(+), 7 deletions(-)
via b38fea181afac08137243a294a502e4605d679d8 (commit)
via fff9d822fdf8857a7d8e22c024bc4f04b3b94129 (commit)
from 18148ec24bf63ec3353a2cd38f6f07c460037db2 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit b38fea181afac08137243a294a502e4605d679d8
Merge: 18148ec fff9d82
Author: Brett Smith <brett at curoverse.com>
Date: Wed Mar 18 08:56:54 2015 -0400
Merge branch '5313-node-manager-node-naming-tag-wip'
Refs #5313. Closes #5434, #5467.
commit fff9d822fdf8857a7d8e22c024bc4f04b3b94129
Author: Brett Smith <brett at curoverse.com>
Date: Fri Mar 13 11:11:35 2015 -0400
5313: Node Manager has cloud-specific logic to get node FQDNs.
On AWS, we put compute nodes' FQDN in the name field. On GCE, we
can't do that: it can't contain dots. Add a node_fqdn classmethod to
cloud drivers to get a node's FQDN from the right place, and use that
method when deciding whether or not to sync a node.
Syncing a node will occasionally raise an "Invalid fingerprint"
exception on GCE. This is nonfatal. Add a comment explaining why.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 310e788..7081762 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -253,13 +253,14 @@ class ComputeNodeMonitorActor(config.actor_class):
for shutdown.
"""
def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
- timer_actor, update_actor, arvados_node=None,
+ cloud_fqdn_func, timer_actor, update_actor, arvados_node=None,
poll_stale_after=600, node_stale_after=3600):
super(ComputeNodeMonitorActor, self).__init__()
self._later = self.actor_ref.proxy()
self._logger = logging.getLogger('arvnodeman.computenode')
self._last_log = None
self._shutdowns = shutdown_timer
+ self._cloud_node_fqdn = cloud_fqdn_func
self._timer = timer_actor
self._update = update_actor
self.cloud_node = cloud_node
@@ -339,9 +340,17 @@ class ComputeNodeMonitorActor(config.actor_class):
self._later.consider_shutdown()
def update_arvados_node(self, arvados_node):
+ # If the cloud node's FQDN doesn't match what's in the Arvados node
+ # record, make them match.
+ # This method is a little unusual in the way it just fires off the
+ # request without checking the result or retrying errors. That's
+ # because this update happens every time we reload the Arvados node
+ # list: if a previous sync attempt failed, we'll see that the names
+ # are out of sync and just try again. ComputeNodeUpdateActor has
+ # the logic to throttle those effective retries when there's trouble.
if arvados_node is not None:
self.arvados_node = arvados_node
- new_hostname = arvados_node_fqdn(self.arvados_node)
- if new_hostname != self.cloud_node.name:
+ if (self._cloud_node_fqdn(self.cloud_node) !=
+ arvados_node_fqdn(self.arvados_node)):
self._update.sync_node(self.cloud_node, self.arvados_node)
self._later.consider_shutdown()
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 369bb8f..b703e0d 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -79,6 +79,12 @@ class BaseComputeNodeDriver(object):
raise NotImplementedError("BaseComputeNodeDriver.sync_node")
@classmethod
+ def node_fqdn(cls, node):
+ # This method should return the FQDN of the node object argument.
+ # Different clouds store this in different places.
+ raise NotImplementedError("BaseComputeNodeDriver.node_fqdn")
+
+ @classmethod
def node_start_time(cls, node):
raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
diff --git a/services/nodemanager/arvnodeman/computenode/driver/dummy.py b/services/nodemanager/arvnodeman/computenode/driver/dummy.py
index 3a286bb..c164a25 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/dummy.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/dummy.py
@@ -49,5 +49,9 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
cloud_node.name = arvados_node_fqdn(arvados_node)
@classmethod
+ def node_fqdn(cls, node):
+ return node.name
+
+ @classmethod
def node_start_time(cls, node):
return cls.DUMMY_START_TIME
diff --git a/services/nodemanager/arvnodeman/computenode/driver/ec2.py b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
index 9db3d89..588ca51 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/ec2.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
@@ -81,6 +81,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
{'Name': arvados_node_fqdn(arvados_node)})
@classmethod
+ def node_fqdn(cls, node):
+ return node.name
+
+ @classmethod
def node_start_time(cls, node):
time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
return time.mktime(time.strptime(
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index ccd1937..6380d0e 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -96,6 +96,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
raise
def sync_node(self, cloud_node, arvados_node):
+ # We can't store the FQDN on the name attribute or anything like it,
+ # because (a) names are static throughout the node's life (so FQDN
+ # isn't available because we don't know it at node creation time) and
+ # (b) it can't contain dots. Instead stash it in metadata.
hostname = arvados_node_fqdn(arvados_node)
metadata_req = cloud_node.extra['metadata'].copy()
metadata_items = metadata_req.setdefault('items', [])
@@ -111,6 +115,12 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
raise Exception("setMetadata error: {}".format(response.error))
@classmethod
+ def node_fqdn(cls, node):
+ # See sync_node comment.
+ return cls._get_metadata(node.extra['metadata'].get('items', []),
+ 'hostname', '')
+
+ @classmethod
def node_start_time(cls, node):
try:
return arvados_timestamp(cls._get_metadata(
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 0e48078..836b673 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -154,6 +154,7 @@ class NodeManagerDaemonActor(actor_class):
cloud_node=cloud_node,
cloud_node_start_time=start_time,
shutdown_timer=shutdown_timer,
+ cloud_fqdn_func=self._cloud_driver.node_fqdn,
update_actor=self._cloud_updater,
timer_actor=self._timer,
arvados_node=None,
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 8d69ea9..b8cf0ee 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -129,8 +129,9 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
if not hasattr(self, 'timer'):
self.make_mocks()
monitor_actor = dispatch.ComputeNodeMonitorActor.start(
- self.cloud_node, time.time(), self.shutdowns, self.timer,
- self.updates, self.arvados_node)
+ self.cloud_node, time.time(), self.shutdowns,
+ testutil.cloud_node_fqdn, self.timer, self.updates,
+ self.arvados_node)
self.shutdown_actor = self.ACTOR_CLASS.start(
self.timer, self.cloud_client, monitor_actor, cancellable).proxy()
self.monitor_actor = monitor_actor.proxy()
@@ -218,8 +219,9 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
if start_time is None:
start_time = time.time()
self.node_actor = dispatch.ComputeNodeMonitorActor.start(
- self.cloud_mock, start_time, self.shutdowns, self.timer,
- self.updates, arv_node).proxy()
+ self.cloud_mock, start_time, self.shutdowns,
+ testutil.cloud_node_fqdn, self.timer, self.updates,
+ arv_node).proxy()
self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
def node_state(self, *states):
@@ -356,3 +358,20 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
self.assertEqual(testutil.ip_address_mock(4),
current_arvados['ip_address'])
+
+ def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
+ self.make_mocks(5)
+ self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
+ self.make_actor()
+ arv_node = testutil.arvados_node_mock(5)
+ self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
+ self.assertEqual(1, self.updates.sync_node.call_count)
+
+ def test_update_arvados_node_skips_sync_when_fqdn_match(self):
+ self.make_mocks(6)
+ arv_node = testutil.arvados_node_mock(6)
+ self.cloud_mock.extra['testname'] ='{n[hostname]}.{n[domain]}'.format(
+ n=arv_node)
+ self.make_actor()
+ self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
+ self.assertEqual(0, self.updates.sync_node.call_count)
diff --git a/services/nodemanager/tests/test_computenode_driver_ec2.py b/services/nodemanager/tests/test_computenode_driver_ec2.py
index 8e52824..595f1f4 100644
--- a/services/nodemanager/tests/test_computenode_driver_ec2.py
+++ b/services/nodemanager/tests/test_computenode_driver_ec2.py
@@ -91,6 +91,12 @@ class EC2ComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
reftuple)}
self.assertEqual(refsecs, ec2.ComputeNodeDriver.node_start_time(node))
+ def test_node_fqdn(self):
+ name = 'fqdntest.zzzzz.arvadosapi.com'
+ node = testutil.cloud_node_mock()
+ node.name = name
+ self.assertEqual(name, ec2.ComputeNodeDriver.node_fqdn(node))
+
def test_cloud_exceptions(self):
for error in [Exception("test exception"),
IOError("test exception"),
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
index 17114b6..465adc5 100644
--- a/services/nodemanager/tests/test_computenode_driver_gce.py
+++ b/services/nodemanager/tests/test_computenode_driver_gce.py
@@ -127,6 +127,19 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
self.assertLessEqual(start_time, metadata.get('booted_at'))
+ def test_known_node_fqdn(self):
+ name = 'fqdntest.zzzzz.arvadosapi.com'
+ node = testutil.cloud_node_mock(metadata=self.build_gce_metadata(
+ {'hostname': name}))
+ self.assertEqual(name, gce.ComputeNodeDriver.node_fqdn(node))
+
+ def test_unknown_node_fqdn(self):
+ # Return an empty string. This lets fqdn be safely compared
+ # against an expected value, and ComputeNodeMonitorActor
+ # should try to update it.
+ node = testutil.cloud_node_mock(metadata=self.build_gce_metadata({}))
+ self.assertEqual('', gce.ComputeNodeDriver.node_fqdn(node))
+
def test_deliver_ssh_key_in_metadata(self):
test_ssh_key = 'ssh-rsa-foo'
arv_node = testutil.arvados_node_mock(1)
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index f0508e7..650a232 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -55,6 +55,11 @@ def cloud_node_mock(node_num=99, **extra):
node.extra = extra
return node
+def cloud_node_fqdn(node):
+ # We intentionally put the FQDN somewhere goofy to make sure tested code is
+ # using this function for lookups.
+ return node.extra.get('testname', 'NoTestName')
+
def ip_address_mock(last_octet):
return '10.20.30.{}'.format(last_octet)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list