[ARVADOS] created: 0f1bac0cc06ffa93ddc67ff7d281a5218eaf93f5

Git user git at public.curoverse.com
Tue Sep 19 22:55:24 EDT 2017


        at  0f1bac0cc06ffa93ddc67ff7d281a5218eaf93f5 (commit)


commit 0f1bac0cc06ffa93ddc67ff7d281a5218eaf93f5
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Tue Sep 19 23:54:48 2017 -0300

    12073: Added tests
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index ec285d8..fcb4f2f 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -149,6 +149,36 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.busywait(lambda: len(arv_nodes) == len(self.monitored_arvados_nodes()))
         self.assertItemsEqual(arv_nodes, self.monitored_arvados_nodes())
 
+    def test_stale_arvados_node_cleanup(self):
+        """
+        When a node record is down, with a slot assigned, and also
+        stale, it should be cleaned.
+        """
+        cloud_node = testutil.cloud_node_mock(1)
+        arv_node = testutil.arvados_node_mock(
+            node_num=1,
+            age=3601, # > node_stale_after
+            crunch_worker_state='down',
+            slot_number=1
+        )
+        self.make_daemon([cloud_node], [arv_node])
+        self.assertTrue(self.node_record_cleaner.clean_node.called)
+
+    def test_fresh_arvados_node_cleanup(self):
+        """
+        When a node record is down, with a slot assigned, but not yet
+        stale, it shouldn't be cleaned
+        """
+        cloud_node = testutil.cloud_node_mock(1)
+        arv_node = testutil.arvados_node_mock(
+            node_num=1,
+            age=31, # < node_stale_after
+            crunch_worker_state='down',
+            slot_number=1
+        )
+        self.make_daemon([cloud_node], [arv_node])
+        self.assertFalse(self.node_record_cleaner.clean_node.called)
+
     def test_node_pairing(self):
         cloud_node = testutil.cloud_node_mock(1)
         arv_node = testutil.arvados_node_mock(1)
diff --git a/services/nodemanager/tests/test_nodelist.py b/services/nodemanager/tests/test_nodelist.py
index 11f41b8..aae4f93 100644
--- a/services/nodemanager/tests/test_nodelist.py
+++ b/services/nodemanager/tests/test_nodelist.py
@@ -89,5 +89,38 @@ class CloudNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
         self.subscriber.assert_called_with(node)
         self.assertEqual(testutil.MockSize(2), node.size)
 
+
+class ArvadosNodeCleanupActorTestCase(testutil.ActorTestMixin,
+                                      unittest.TestCase):
+    ACTOR_CLASS = nodelist.ArvadosNodeCleanupActor
+
+    def make_actor(self):
+        self.client = mock.MagicMock(name='client_mock')
+        self.cleaner = self.ACTOR_CLASS.start(self.client).proxy()
+
+    def test_api_update_request(self):
+        """
+        When clean_node() is called, an api record update should be
+        done to reset the requested node record.
+        """
+        node_uuid = 'node-record-uuid'
+        self.make_actor()
+        self.cleaner.clean_node(node_uuid).get(self.TIMEOUT)
+        self.client.nodes().update.assert_called_with(
+            uuid=node_uuid,
+            body={
+                'hostname': None,
+                'ip_address': None,
+                'slot_number': None,
+                'first_ping_at': None,
+                'last_ping_at': None,
+                'properties': {},
+                'info': {
+                    'ec2_instance_id': None,
+                    'last_action': ''
+                }
+            }
+        )
+
 if __name__ == '__main__':
     unittest.main()

commit 499884be3880a3643bc4557682a75e0a54f71cf2
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Mon Sep 18 22:30:02 2017 -0300

    12073: Added actor for dispatching node record updates to the api server.
    The Nodemanager daemon periodically checks for stale node records and
    issues update commands to clean them up so they don't occupy a slot.
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index e476e5e..65f55a5 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -13,7 +13,7 @@ import pykka
 
 from . import computenode as cnode
 from . import status
-from .computenode import dispatch
+from .computenode import dispatch, arvados_node_missing
 from .config import actor_class
 
 class _ComputeNodeRecord(object):
@@ -100,6 +100,7 @@ class NodeManagerDaemonActor(actor_class):
     """
     def __init__(self, server_wishlist_actor, arvados_nodes_actor,
                  cloud_nodes_actor, cloud_update_actor, timer_actor,
+                 arvados_node_cleaner_actor,
                  arvados_factory, cloud_factory,
                  shutdown_windows, server_calculator,
                  min_nodes, max_nodes,
@@ -116,6 +117,7 @@ class NodeManagerDaemonActor(actor_class):
         self._node_actor = node_actor_class
         self._cloud_updater = cloud_update_actor
         self._timer = timer_actor
+        self._arvados_node_cleaner = arvados_node_cleaner_actor
         self._new_arvados = arvados_factory
         self._new_cloud = cloud_factory
         self._cloud_driver = self._new_cloud()
@@ -230,6 +232,18 @@ class NodeManagerDaemonActor(actor_class):
 
     def update_arvados_nodes(self, nodelist):
         self._update_poll_time('arvados_nodes')
+        # Check for stale node records
+        for node in nodelist:
+            if node['crunch_worker_state'] == 'down' and \
+               node['slot_number'] and \
+               arvados_node_missing(node, self.node_stale_after):
+                self._logger.info(
+                    "Requesting cleanup for stale Arvados node record %s, "
+                    "slot number %s",
+                    node['uuid'], node['slot_number'])
+                self._arvados_node_cleaner.clean_node(
+                    node['uuid'],
+                    cleanup_reason='Stale record cleaned up by Node Manager')
         for key, node in self.arvados_nodes.update_from(nodelist):
             self._register_arvados_node(key, node)
         self.try_pairing()
@@ -526,6 +540,9 @@ class NodeManagerDaemonActor(actor_class):
         self._arvados_nodes_actor.stop()
         self._cloud_nodes_actor.stop()
 
+        # Shut down stale node record cleaner
+        self._arvados_node_cleaner.stop()
+
         # Clear cloud node list
         self.update_cloud_nodes([])
 
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index d85ef55..ffefeb0 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -20,7 +20,8 @@ from . import status
 from .baseactor import WatchdogActor
 from .daemon import NodeManagerDaemonActor
 from .jobqueue import JobQueueMonitorActor, ServerCalculator
-from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
+from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor, \
+                      ArvadosNodeCleanupActor
 from .timedcallback import TimedCallBackActor
 from ._version import __version__
 
@@ -132,9 +133,11 @@ def main(args=None):
         timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
             launch_pollers(config, server_calculator)
         cloud_node_updater = node_update.start(config.new_cloud_client, timer).tell_proxy()
+        node_record_cleaner = ArvadosNodeCleanupActor.start(
+            config.new_arvados_client()).tell_proxy()
         node_daemon = NodeManagerDaemonActor.start(
             job_queue_poller, arvados_node_poller, cloud_node_poller,
-            cloud_node_updater, timer,
+            cloud_node_updater, timer, node_record_cleaner,
             config.new_arvados_client, config.new_cloud_client,
             config.shutdown_windows(),
             server_calculator,
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index e06ec83..044b64d 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -12,6 +12,41 @@ from . import config
 
 import arvados.util
 
+
+class ArvadosNodeCleanupActor(config.actor_class):
+    """Actor to dispatch Arvados nodes records cleanup requests.
+
+    This actor receives requests from the NodeManagerDaemonActor,
+    and dispatches API Server updates to reset stale node records without
+    a cloud node assigned but occupying a slot.
+    """
+    def __init__(self, api_client):
+        super(ArvadosNodeCleanupActor, self).__init__()
+        self._client = api_client
+
+    def clean_node(self, node_uuid, cleanup_reason=''):
+        try:
+            self._client.nodes().update(
+                uuid=node_uuid,
+                body={
+                    'hostname': None,
+                    'ip_address': None,
+                    'slot_number': None,
+                    'first_ping_at': None,
+                    'last_ping_at': None,
+                    'properties': {},
+                    'info': {
+                        'ec2_instance_id': None,
+                        'last_action': cleanup_reason
+                    }
+                },
+            ).execute()
+        except Exception as error:
+            self._logger.error("Trying to clean node record '%s': %s",
+                               node_uuid,
+                               error)
+
+
 class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
     """Actor to poll the Arvados node list.
 
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 1efa1ff..ec285d8 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -77,14 +77,16 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
 
         self.arv_factory = mock.MagicMock(name='arvados_mock')
         api_client = mock.MagicMock(name='api_client')
-        api_client.nodes().create().execute.side_effect = [testutil.arvados_node_mock(1),
-                                                           testutil.arvados_node_mock(2)]
+        api_client.nodes().create().execute.side_effect = \
+            [testutil.arvados_node_mock(1),
+             testutil.arvados_node_mock(2)]
         self.arv_factory.return_value = api_client
 
         self.cloud_factory = mock.MagicMock(name='cloud_mock')
         self.cloud_factory().node_start_time.return_value = time.time()
         self.cloud_updates = mock.MagicMock(name='updates_mock')
         self.timer = testutil.MockTimer(deliver_immediately=False)
+        self.node_record_cleaner = mock.MagicMock(name='node_cleaner_mock')
         self.cloud_factory().node_id.side_effect = lambda node: node.id
         self.cloud_factory().broken.return_value = False
 
@@ -98,6 +100,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.daemon = nmdaemon.NodeManagerDaemonActor.start(
             self.server_wishlist_poller, self.arvados_nodes_poller,
             self.cloud_nodes_poller, self.cloud_updates, self.timer,
+            self.node_record_cleaner,
             self.arv_factory, self.cloud_factory,
             shutdown_windows, ServerCalculator(avail_sizes),
             min_nodes, max_nodes, 600, 1800, 3600,

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list