[ARVADOS] created: 2475b1d9407ba6fa21a2619d6ad5dbc9a50da3df

git at public.curoverse.com git at public.curoverse.com
Mon Mar 2 11:28:03 EST 2015


        at  2475b1d9407ba6fa21a2619d6ad5dbc9a50da3df (commit)


commit 2475b1d9407ba6fa21a2619d6ad5dbc9a50da3df
Author: Brett Smith <brett at curoverse.com>
Date:   Mon Mar 2 11:27:59 2015 -0500

    4751: Node Manager considers ping times for stricter node pairing.
    
    Because the pairing decision is currently based on IP address alone,
    Node Manager will occasionally pair a cloud node with the wrong
    Arvados node after an IP address is reused.  Fix that by bringing the
    node's first_ping_at into consideration: if it's older than the cloud
    node, refuse to pair.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 1608b52..310e788 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -9,7 +9,8 @@ import time
 import libcloud.common.types as cloud_types
 import pykka
 
-from .. import arvados_node_fqdn, arvados_node_mtime, timestamp_fresh
+from .. import \
+    arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh
 from ...clientactor import _notify_subscribers
 from ... import config
 
@@ -322,9 +323,11 @@ class ComputeNodeMonitorActor(config.actor_class):
             self.last_shutdown_opening = next_opening
 
     def offer_arvados_pair(self, arvados_node):
-        if self.arvados_node is not None:
+        first_ping_s = arvados_node.get('first_ping_at')
+        if (self.arvados_node is not None) or (not first_ping_s):
             return None
-        elif arvados_node['ip_address'] in self.cloud_node.private_ips:
+        elif ((arvados_node['ip_address'] in self.cloud_node.private_ips) and
+              (arvados_timestamp(first_ping_s) >= self.cloud_node_start_time)):
             self._later.update_arvados_node(arvados_node)
             return self.cloud_node.id
         else:
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 4a72f47..8d69ea9 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -319,6 +319,13 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertIsNone(
             self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
 
+    def test_arvados_node_mismatch_first_ping_too_early(self):
+        self.make_actor(4)
+        arv_node = testutil.arvados_node_mock(
+            4, first_ping_at='1971-03-02T14:15:16.1717282Z')
+        self.assertIsNone(
+            self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
+
     def test_update_cloud_node(self):
         self.make_actor(1)
         self.make_mocks(2)
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index bdba83a..dc8fdc3 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -94,8 +94,11 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.check_monitors_arvados_nodes(arv_node)
 
     def test_arvados_node_un_and_re_paired(self):
+        # We need to create the Arvados node mock after spinning up the daemon
+        # to make sure it's new enough to pair with the cloud node.
+        self.make_daemon([testutil.cloud_node_mock(3)], arvados_nodes=None)
         arv_node = testutil.arvados_node_mock(3)
-        self.make_daemon([testutil.cloud_node_mock(3)], [arv_node])
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
         self.check_monitors_arvados_nodes(arv_node)
         self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
         self.assertEqual(0, self.alive_monitor_count())
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index 1c53c68..f0508e7 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -13,14 +13,17 @@ from . import pykka_timeout
 
 no_sleep = mock.patch('time.sleep', lambda n: None)
 
-def arvados_node_mock(node_num=99, job_uuid=None, age=0, **kwargs):
+def arvados_node_mock(node_num=99, job_uuid=None, age=-1, **kwargs):
     mod_time = datetime.datetime.utcnow() - datetime.timedelta(seconds=age)
+    mod_time_s = mod_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
     if job_uuid is True:
         job_uuid = 'zzzzz-jjjjj-jobjobjobjobjob'
     crunch_worker_state = 'idle' if (job_uuid is None) else 'busy'
     node = {'uuid': 'zzzzz-yyyyy-{:015x}'.format(node_num),
             'created_at': '2014-01-01T01:02:03.04050607Z',
-            'modified_at': mod_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
+            'modified_at': mod_time_s,
+            'first_ping_at': kwargs.pop('first_ping_at', mod_time_s),
+            'last_ping_at': mod_time_s,
             'slot_number': node_num,
             'hostname': 'compute{}'.format(node_num),
             'domain': 'zzzzz.arvadosapi.com',

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list