[ARVADOS] updated: f116df2ef0439328044ccc46a90d7fc86541ace8
git at public.curoverse.com
Mon May 11 17:50:53 EDT 2015
Summary of changes:
.../arvnodeman/computenode/dispatch/__init__.py | 62 ++++++++++++++--------
.../arvnodeman/computenode/dispatch/slurm.py | 2 +-
services/nodemanager/arvnodeman/daemon.py | 1 +
services/nodemanager/tests/__init__.py | 2 +-
.../nodemanager/tests/test_computenode_dispatch.py | 29 +++++++++-
.../tests/test_computenode_dispatch_slurm.py | 5 ++
6 files changed, 76 insertions(+), 25 deletions(-)
via f116df2ef0439328044ccc46a90d7fc86541ace8 (commit)
via b5d3c559df7250ab7106adc2ded7f38dc666cf9c (commit)
via 44684242aba028470311a4991dcd1456df5c84fd (commit)
from 7e80f468042fd4caa78f69fb66efce1b89035d45 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit f116df2ef0439328044ccc46a90d7fc86541ace8
Merge: 7e80f46 b5d3c55
Author: Brett Smith <brett at curoverse.com>
Date: Mon May 11 17:50:40 2015 -0400
Merge branch '5736-node-manager-easy-slot-cleanup-wip'
Refs #5736. Closes #5995.
commit b5d3c559df7250ab7106adc2ded7f38dc666cf9c
Author: Brett Smith <brett at curoverse.com>
Date: Mon May 11 16:51:29 2015 -0400
5736: Node Manager cleans node records after shutting down a paired node.
This is an easy case to handle, and helps avoid exhausting SLURM
slot numbers.
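For context, the change below centralizes this cleanup into a shared _clean_arvados_node helper on the dispatch base class. A minimal standalone sketch of the same Arvados API call follows; the clean_arvados_node function name and the client setup shown here are illustrative assumptions, not the Node Manager code itself:

    import arvados

    def clean_arvados_node(arv, arvados_node, explanation):
        # Clear the fields that tie an Arvados node record to a live compute
        # node, freeing its SLURM slot number for reuse.
        return arv.nodes().update(
            uuid=arvados_node['uuid'],
            body={'hostname': None,
                  'ip_address': None,
                  'slot_number': None,
                  'first_ping_at': None,
                  'last_ping_at': None,
                  'info': {'ec2_instance_id': None,
                           'last_action': explanation}},
        ).execute()

    # Hypothetical usage after a node has been destroyed in the cloud:
    # arv = arvados.api('v1')
    # clean_arvados_node(arv, node_record, "Shut down by Node Manager")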
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 0fab1b0..6d5c223 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -20,12 +20,13 @@ class ComputeNodeStateChangeBase(config.actor_class):
This base class takes care of retrying changes and notifying
subscribers when the change is finished.
"""
- def __init__(self, logger_name, cloud_client, timer_actor,
+ def __init__(self, logger_name, cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait):
super(ComputeNodeStateChangeBase, self).__init__()
self._later = self.actor_ref.proxy()
self._logger = logging.getLogger(logger_name)
self._cloud = cloud_client
+ self._arvados = arvados_client
self._timer = timer_actor
self.min_retry_wait = retry_wait
self.max_retry_wait = max_retry_wait
@@ -79,6 +80,18 @@ class ComputeNodeStateChangeBase(config.actor_class):
else:
self.subscribers.add(subscriber)
+ def _clean_arvados_node(self, arvados_node, explanation):
+ return self._arvados.nodes().update(
+ uuid=arvados_node['uuid'],
+ body={'hostname': None,
+ 'ip_address': None,
+ 'slot_number': None,
+ 'first_ping_at': None,
+ 'last_ping_at': None,
+ 'info': {'ec2_instance_id': None,
+ 'last_action': explanation}},
+ ).execute()
+
class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
"""Actor to create and set up a cloud compute node.
@@ -93,9 +106,8 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
cloud_size, arvados_node=None,
retry_wait=1, max_retry_wait=180):
super(ComputeNodeSetupActor, self).__init__(
- 'arvnodeman.nodeup', cloud_client, timer_actor,
+ 'arvnodeman.nodeup', cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
- self._arvados = arvados_client
self.cloud_size = cloud_size
self.arvados_node = None
self.cloud_node = None
@@ -111,16 +123,8 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
@ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
def prepare_arvados_node(self, node):
- self.arvados_node = self._arvados.nodes().update(
- uuid=node['uuid'],
- body={'hostname': None,
- 'ip_address': None,
- 'slot_number': None,
- 'first_ping_at': None,
- 'last_ping_at': None,
- 'info': {'ec2_instance_id': None,
- 'last_action': "Prepared by Node Manager"}}
- ).execute()
+ self.arvados_node = self._clean_arvados_node(
+ node, "Prepared by Node Manager")
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._retry()
@@ -150,7 +154,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
This actor simply destroys a cloud node, retrying as needed.
"""
- def __init__(self, timer_actor, cloud_client, node_monitor,
+ def __init__(self, timer_actor, cloud_client, arvados_client, node_monitor,
cancellable=True, retry_wait=1, max_retry_wait=180):
# If a ShutdownActor is cancellable, it will ask the
# ComputeNodeMonitorActor if it's still eligible before taking each
@@ -158,7 +162,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
# eligible. Normal shutdowns based on job demand should be
# cancellable; shutdowns based on node misbehavior should not.
super(ComputeNodeShutdownActor, self).__init__(
- 'arvnodeman.nodedown', cloud_client, timer_actor,
+ 'arvnodeman.nodedown', cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
self._monitor = node_monitor.proxy()
self.cloud_node = self._monitor.cloud_node.get()
@@ -168,9 +172,16 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
def on_start(self):
self._later.shutdown_node()
+ def _arvados_node(self):
+ return self._monitor.arvados_node.get()
+
+ def _finished(self, success_flag=None):
+ if success_flag is not None:
+ self.success = success_flag
+ return super(ComputeNodeShutdownActor, self)._finished()
+
def cancel_shutdown(self):
- self.success = False
- self._finished()
+ self._finished(success_flag=False)
def _stop_if_window_closed(orig_func):
@functools.wraps(orig_func)
@@ -189,13 +200,20 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
@_stop_if_window_closed
@ComputeNodeStateChangeBase._retry()
def shutdown_node(self):
- if self._cloud.destroy_node(self.cloud_node):
- self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
- self.success = True
- self._finished()
- else:
+ if not self._cloud.destroy_node(self.cloud_node):
# Force a retry.
raise cloud_types.LibcloudError("destroy_node failed")
+ self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
+ arv_node = self._arvados_node()
+ if arv_node is None:
+ self._finished(success_flag=True)
+ else:
+ self._later.clean_arvados_node(arv_node)
+
+ @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
+ def clean_arvados_node(self, arvados_node):
+ self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
+ self._finished(success_flag=True)
# Make the decorator available to subclasses.
_stop_if_window_closed = staticmethod(_stop_if_window_closed)
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 6eaa8b9..71e73f1 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -13,7 +13,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
SLURM_END_STATES = frozenset(['down\n', 'down*\n', 'drain\n', 'fail\n'])
def on_start(self):
- arv_node = self._monitor.arvados_node.get()
+ arv_node = self._arvados_node()
if arv_node is None:
return super(ComputeNodeShutdownActor, self).on_start()
else:
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index af8e608..44f1513 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -311,6 +311,7 @@ class NodeManagerDaemonActor(actor_class):
return None
shutdown = self._node_shutdown.start(
timer_actor=self._timer, cloud_client=self._new_cloud(),
+ arvados_client=self._new_arvados(),
node_monitor=node_actor.actor_ref, cancellable=cancellable).proxy()
self.shutdowns[cloud_node_id] = shutdown
shutdown.subscribe(self._later.node_finished_shutdown)
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 96a70c6..c22e7a0 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -121,6 +121,7 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
self.shutdowns = testutil.MockShutdownTimer()
self.shutdowns._set_state(shutdown_open, 300)
self.cloud_client = mock.MagicMock(name='cloud_client')
+ self.arvados_client = mock.MagicMock(name='arvados_client')
self.updates = mock.MagicMock(name='update_mock')
if cloud_node is None:
cloud_node = testutil.cloud_node_mock()
@@ -135,7 +136,8 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
testutil.cloud_node_fqdn, self.timer, self.updates,
self.arvados_node)
self.shutdown_actor = self.ACTOR_CLASS.start(
- self.timer, self.cloud_client, monitor_actor, cancellable).proxy()
+ self.timer, self.cloud_client, self.arvados_client, monitor_actor,
+ cancellable).proxy()
self.monitor_actor = monitor_actor.proxy()
def check_success_flag(self, expected, allow_msg_count=1):
@@ -157,6 +159,31 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
self.cloud_client.destroy_node.return_value = True
self.check_success_flag(True)
+ def test_arvados_node_cleaned_after_shutdown(self, *mocks):
+ cloud_node = testutil.cloud_node_mock(62)
+ arv_node = testutil.arvados_node_mock(62)
+ self.make_mocks(cloud_node, arv_node)
+ self.make_actor()
+ self.check_success_flag(True, 3)
+ update_mock = self.arvados_client.nodes().update
+ self.assertTrue(update_mock.called)
+ update_kwargs = update_mock.call_args_list[0][1]
+ self.assertEqual(arv_node['uuid'], update_kwargs.get('uuid'))
+ self.assertIn('body', update_kwargs)
+ for clear_key in ['slot_number', 'hostname', 'ip_address',
+ 'first_ping_at', 'last_ping_at']:
+ self.assertIn(clear_key, update_kwargs['body'])
+ self.assertIsNone(update_kwargs['body'][clear_key])
+ self.assertTrue(update_mock().execute.called)
+
+ def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
+ cloud_node = testutil.cloud_node_mock(61)
+ arv_node = testutil.arvados_node_mock(61)
+ self.make_mocks(cloud_node, arv_node, shutdown_open=False)
+ self.make_actor(cancellable=True)
+ self.check_success_flag(False, 2)
+ self.assertFalse(self.arvados_client.nodes().update.called)
+
class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
unittest.TestCase):
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 93cc60d..ac3ebf0 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -66,3 +66,8 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
self.check_success_flag(False, 2)
self.check_slurm_got_args(proc_mock, 'NodeName=compute99',
'State=RESUME')
+
+ def test_arvados_node_cleaned_after_shutdown(self, proc_mock):
+ proc_mock.return_value = 'drain\n'
+ super(SLURMComputeNodeShutdownActorTestCase,
+ self).test_arvados_node_cleaned_after_shutdown()
commit 44684242aba028470311a4991dcd1456df5c84fd
Author: Brett Smith <brett at curoverse.com>
Date: Mon May 11 14:57:49 2015 -0400
5736: Fix typo in Node Manager test comments.
diff --git a/services/nodemanager/tests/__init__.py b/services/nodemanager/tests/__init__.py
index c5eaf76..e5e9ee3 100644
--- a/services/nodemanager/tests/__init__.py
+++ b/services/nodemanager/tests/__init__.py
@@ -7,7 +7,7 @@ import os
loglevel = os.environ.get('ANMTEST_LOGLEVEL', 'CRITICAL')
logging.basicConfig(level=getattr(logging, loglevel.upper()))
-# Set the ANM_TIMEOUT environment variable to the maximum amount of time to
+# Set the ANMTEST_TIMEOUT environment variable to the maximum amount of time to
# wait for tested actors to respond to important messages. The default value
# is very conservative, because a small value may produce false negatives on
# slower systems. If you're debugging a known timeout issue, however, you may
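For reference, a test suite typically consumes a setting like ANMTEST_TIMEOUT by reading it from the environment once at import time. A minimal sketch of that pattern (the variable name comes from the diff above; the actor_timeout name and the 10-second default are assumptions, not necessarily Node Manager's actual values):

    import os

    # Maximum time, in seconds, to wait for a tested actor to answer a message.
    actor_timeout = int(os.environ.get('ANMTEST_TIMEOUT', '10'))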
-----------------------------------------------------------------------
hooks/post-receive
--