[ARVADOS] created: 2475b1d9407ba6fa21a2619d6ad5dbc9a50da3df
git at public.curoverse.com
git at public.curoverse.com
Mon Mar 2 11:28:03 EST 2015
at 2475b1d9407ba6fa21a2619d6ad5dbc9a50da3df (commit)
commit 2475b1d9407ba6fa21a2619d6ad5dbc9a50da3df
Author: Brett Smith <brett at curoverse.com>
Date: Mon Mar 2 11:27:59 2015 -0500
4751: Node Manager considers ping times for stricter node pairing.
Because the pairing decision is currently based on IP address alone,
Node Manager will occasionally pair a cloud node with the wrong
Arvados node after an IP address is reused. Fix that by bringing the
node's first_ping_at into consideration: if it's older than the cloud
node, refuse to pair.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 1608b52..310e788 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -9,7 +9,8 @@ import time
import libcloud.common.types as cloud_types
import pykka
-from .. import arvados_node_fqdn, arvados_node_mtime, timestamp_fresh
+from .. import \
+ arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh
from ...clientactor import _notify_subscribers
from ... import config
@@ -322,9 +323,11 @@ class ComputeNodeMonitorActor(config.actor_class):
self.last_shutdown_opening = next_opening
def offer_arvados_pair(self, arvados_node):
- if self.arvados_node is not None:
+ first_ping_s = arvados_node.get('first_ping_at')
+ if (self.arvados_node is not None) or (not first_ping_s):
return None
- elif arvados_node['ip_address'] in self.cloud_node.private_ips:
+ elif ((arvados_node['ip_address'] in self.cloud_node.private_ips) and
+ (arvados_timestamp(first_ping_s) >= self.cloud_node_start_time)):
self._later.update_arvados_node(arvados_node)
return self.cloud_node.id
else:
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 4a72f47..8d69ea9 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -319,6 +319,13 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
self.assertIsNone(
self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
+ def test_arvados_node_mismatch_first_ping_too_early(self):
+ self.make_actor(4)
+ arv_node = testutil.arvados_node_mock(
+ 4, first_ping_at='1971-03-02T14:15:16.1717282Z')
+ self.assertIsNone(
+ self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
+
def test_update_cloud_node(self):
self.make_actor(1)
self.make_mocks(2)
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index bdba83a..dc8fdc3 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -94,8 +94,11 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.check_monitors_arvados_nodes(arv_node)
def test_arvados_node_un_and_re_paired(self):
+ # We need to create the Arvados node mock after spinning up the daemon
+ # to make sure it's new enough to pair with the cloud node.
+ self.make_daemon([testutil.cloud_node_mock(3)], arvados_nodes=None)
arv_node = testutil.arvados_node_mock(3)
- self.make_daemon([testutil.cloud_node_mock(3)], [arv_node])
+ self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
self.check_monitors_arvados_nodes(arv_node)
self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
self.assertEqual(0, self.alive_monitor_count())
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index 1c53c68..f0508e7 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -13,14 +13,17 @@ from . import pykka_timeout
no_sleep = mock.patch('time.sleep', lambda n: None)
-def arvados_node_mock(node_num=99, job_uuid=None, age=0, **kwargs):
+def arvados_node_mock(node_num=99, job_uuid=None, age=-1, **kwargs):
mod_time = datetime.datetime.utcnow() - datetime.timedelta(seconds=age)
+ mod_time_s = mod_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
if job_uuid is True:
job_uuid = 'zzzzz-jjjjj-jobjobjobjobjob'
crunch_worker_state = 'idle' if (job_uuid is None) else 'busy'
node = {'uuid': 'zzzzz-yyyyy-{:015x}'.format(node_num),
'created_at': '2014-01-01T01:02:03.04050607Z',
- 'modified_at': mod_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
+ 'modified_at': mod_time_s,
+ 'first_ping_at': kwargs.pop('first_ping_at', mod_time_s),
+ 'last_ping_at': mod_time_s,
'slot_number': node_num,
'hostname': 'compute{}'.format(node_num),
'domain': 'zzzzz.arvadosapi.com',
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list