[ARVADOS] created: 0f1bac0cc06ffa93ddc67ff7d281a5218eaf93f5
Git user
git at public.curoverse.com
Tue Sep 19 22:55:24 EDT 2017
at 0f1bac0cc06ffa93ddc67ff7d281a5218eaf93f5 (commit)
commit 0f1bac0cc06ffa93ddc67ff7d281a5218eaf93f5
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Tue Sep 19 23:54:48 2017 -0300
12073: Added tests
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index ec285d8..fcb4f2f 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -149,6 +149,36 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.busywait(lambda: len(arv_nodes) == len(self.monitored_arvados_nodes()))
self.assertItemsEqual(arv_nodes, self.monitored_arvados_nodes())
+ def test_stale_arvados_node_cleanup(self):
+ """
+ When a node record is down, with a slot assigned, and also
+ stale, it should be cleaned.
+ """
+ cloud_node = testutil.cloud_node_mock(1)
+ arv_node = testutil.arvados_node_mock(
+ node_num=1,
+ age=3601, # > node_stale_after
+ crunch_worker_state='down',
+ slot_number=1
+ )
+ self.make_daemon([cloud_node], [arv_node])
+ self.assertTrue(self.node_record_cleaner.clean_node.called)
+
+ def test_fresh_arvados_node_cleanup(self):
+ """
+ When a node record is down, with a slot assigned, but not yet
+ stale, it shouldn't be cleaned
+ """
+ cloud_node = testutil.cloud_node_mock(1)
+ arv_node = testutil.arvados_node_mock(
+ node_num=1,
+ age=31, # < node_stale_after
+ crunch_worker_state='down',
+ slot_number=1
+ )
+ self.make_daemon([cloud_node], [arv_node])
+ self.assertFalse(self.node_record_cleaner.clean_node.called)
+
def test_node_pairing(self):
cloud_node = testutil.cloud_node_mock(1)
arv_node = testutil.arvados_node_mock(1)
diff --git a/services/nodemanager/tests/test_nodelist.py b/services/nodemanager/tests/test_nodelist.py
index 11f41b8..aae4f93 100644
--- a/services/nodemanager/tests/test_nodelist.py
+++ b/services/nodemanager/tests/test_nodelist.py
@@ -89,5 +89,38 @@ class CloudNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
self.subscriber.assert_called_with(node)
self.assertEqual(testutil.MockSize(2), node.size)
+
+class ArvadosNodeCleanupActorTestCase(testutil.ActorTestMixin,
+ unittest.TestCase):
+ ACTOR_CLASS = nodelist.ArvadosNodeCleanupActor
+
+ def make_actor(self):
+ self.client = mock.MagicMock(name='client_mock')
+ self.cleaner = self.ACTOR_CLASS.start(self.client).proxy()
+
+ def test_api_update_request(self):
+ """
+ When clean_node() is called, an api record update should be
+ done to reset the requested node record.
+ """
+ node_uuid = 'node-record-uuid'
+ self.make_actor()
+ self.cleaner.clean_node(node_uuid).get(self.TIMEOUT)
+ self.client.nodes().update.assert_called_with(
+ uuid=node_uuid,
+ body={
+ 'hostname': None,
+ 'ip_address': None,
+ 'slot_number': None,
+ 'first_ping_at': None,
+ 'last_ping_at': None,
+ 'properties': {},
+ 'info': {
+ 'ec2_instance_id': None,
+ 'last_action': ''
+ }
+ }
+ )
+
if __name__ == '__main__':
unittest.main()
commit 499884be3880a3643bc4557682a75e0a54f71cf2
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Mon Sep 18 22:30:02 2017 -0300
12073: Added actor for dispatching node record updates to the api server.
The Nodemanager daemon periodically checks for stale node records and
issues update commands to clean them up so they don't occupy a slot.
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index e476e5e..65f55a5 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -13,7 +13,7 @@ import pykka
from . import computenode as cnode
from . import status
-from .computenode import dispatch
+from .computenode import dispatch, arvados_node_missing
from .config import actor_class
class _ComputeNodeRecord(object):
@@ -100,6 +100,7 @@ class NodeManagerDaemonActor(actor_class):
"""
def __init__(self, server_wishlist_actor, arvados_nodes_actor,
cloud_nodes_actor, cloud_update_actor, timer_actor,
+ arvados_node_cleaner_actor,
arvados_factory, cloud_factory,
shutdown_windows, server_calculator,
min_nodes, max_nodes,
@@ -116,6 +117,7 @@ class NodeManagerDaemonActor(actor_class):
self._node_actor = node_actor_class
self._cloud_updater = cloud_update_actor
self._timer = timer_actor
+ self._arvados_node_cleaner = arvados_node_cleaner_actor
self._new_arvados = arvados_factory
self._new_cloud = cloud_factory
self._cloud_driver = self._new_cloud()
@@ -230,6 +232,18 @@ class NodeManagerDaemonActor(actor_class):
def update_arvados_nodes(self, nodelist):
self._update_poll_time('arvados_nodes')
+ # Check for stale node records
+ for node in nodelist:
+ if node['crunch_worker_state'] == 'down' and \
+ node['slot_number'] and \
+ arvados_node_missing(node, self.node_stale_after):
+ self._logger.info(
+ "Requesting cleanup for stale Arvados node record %s, "
+ "slot number %s",
+ node['uuid'], node['slot_number'])
+ self._arvados_node_cleaner.clean_node(
+ node['uuid'],
+ cleanup_reason='Stale record cleaned up by Node Manager')
for key, node in self.arvados_nodes.update_from(nodelist):
self._register_arvados_node(key, node)
self.try_pairing()
@@ -526,6 +540,9 @@ class NodeManagerDaemonActor(actor_class):
self._arvados_nodes_actor.stop()
self._cloud_nodes_actor.stop()
+ # Shut down stale node record cleaner
+ self._arvados_node_cleaner.stop()
+
# Clear cloud node list
self.update_cloud_nodes([])
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index d85ef55..ffefeb0 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -20,7 +20,8 @@ from . import status
from .baseactor import WatchdogActor
from .daemon import NodeManagerDaemonActor
from .jobqueue import JobQueueMonitorActor, ServerCalculator
-from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
+from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor, \
+ ArvadosNodeCleanupActor
from .timedcallback import TimedCallBackActor
from ._version import __version__
@@ -132,9 +133,11 @@ def main(args=None):
timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
launch_pollers(config, server_calculator)
cloud_node_updater = node_update.start(config.new_cloud_client, timer).tell_proxy()
+ node_record_cleaner = ArvadosNodeCleanupActor.start(
+ config.new_arvados_client()).tell_proxy()
node_daemon = NodeManagerDaemonActor.start(
job_queue_poller, arvados_node_poller, cloud_node_poller,
- cloud_node_updater, timer,
+ cloud_node_updater, timer, node_record_cleaner,
config.new_arvados_client, config.new_cloud_client,
config.shutdown_windows(),
server_calculator,
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index e06ec83..044b64d 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -12,6 +12,41 @@ from . import config
import arvados.util
+
+class ArvadosNodeCleanupActor(config.actor_class):
+ """Actor to dispatch Arvados nodes records cleanup requests.
+
+ This actor receives requests from the NodeManagerDaemonActor,
+ and dispatches API Server updates to reset stale node records without
+ a cloud node assigned but occupying a slot.
+ """
+ def __init__(self, api_client):
+ super(ArvadosNodeCleanupActor, self).__init__()
+ self._client = api_client
+
+ def clean_node(self, node_uuid, cleanup_reason=''):
+ try:
+ self._client.nodes().update(
+ uuid=node_uuid,
+ body={
+ 'hostname': None,
+ 'ip_address': None,
+ 'slot_number': None,
+ 'first_ping_at': None,
+ 'last_ping_at': None,
+ 'properties': {},
+ 'info': {
+ 'ec2_instance_id': None,
+ 'last_action': cleanup_reason
+ }
+ },
+ ).execute()
+ except Exception as error:
+ self._logger.error("Trying to clean node record '%s': %s",
+ node_uuid,
+ error)
+
+
class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
"""Actor to poll the Arvados node list.
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 1efa1ff..ec285d8 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -77,14 +77,16 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.arv_factory = mock.MagicMock(name='arvados_mock')
api_client = mock.MagicMock(name='api_client')
- api_client.nodes().create().execute.side_effect = [testutil.arvados_node_mock(1),
- testutil.arvados_node_mock(2)]
+ api_client.nodes().create().execute.side_effect = \
+ [testutil.arvados_node_mock(1),
+ testutil.arvados_node_mock(2)]
self.arv_factory.return_value = api_client
self.cloud_factory = mock.MagicMock(name='cloud_mock')
self.cloud_factory().node_start_time.return_value = time.time()
self.cloud_updates = mock.MagicMock(name='updates_mock')
self.timer = testutil.MockTimer(deliver_immediately=False)
+ self.node_record_cleaner = mock.MagicMock(name='node_cleaner_mock')
self.cloud_factory().node_id.side_effect = lambda node: node.id
self.cloud_factory().broken.return_value = False
@@ -98,6 +100,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.daemon = nmdaemon.NodeManagerDaemonActor.start(
self.server_wishlist_poller, self.arvados_nodes_poller,
self.cloud_nodes_poller, self.cloud_updates, self.timer,
+ self.node_record_cleaner,
self.arv_factory, self.cloud_factory,
shutdown_windows, ServerCalculator(avail_sizes),
min_nodes, max_nodes, 600, 1800, 3600,
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list