[ARVADOS] created: 912c6ffed63537757268fcc5f762e20bc5b8b860
Git user
git at public.curoverse.com
Thu Mar 23 14:09:46 EDT 2017
at 912c6ffed63537757268fcc5f762e20bc5b8b860 (commit)
commit 912c6ffed63537757268fcc5f762e20bc5b8b860
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Mar 23 13:30:35 2017 -0400
11323: Don't try to offer_arvados_pair on unpaired nodes which are being shut down.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index d99917b..e21bf3d 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -232,7 +232,7 @@ class NodeManagerDaemonActor(actor_class):
def try_pairing(self):
for record in self.cloud_nodes.unpaired():
for arv_rec in self.arvados_nodes.unpaired():
- if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
+ if record.actor is not None and record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
self._pair_nodes(record, arv_rec.arvados_node)
break
commit 4bdebc393b74aab0511ec2e34ee7cc52f1804927
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Mar 23 14:05:51 2017 -0400
11324: Fix crash in NodeManagerDaemonActor when receiving a node_can_shutdown
message for a node that has already been shut down.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index b4f1784..d99917b 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -426,16 +426,25 @@ class NodeManagerDaemonActor(actor_class):
@_check_poll_freshness
def node_can_shutdown(self, node_actor):
- if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
- self._begin_node_shutdown(node_actor, cancellable=True)
- elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
- # Node is unpaired, which means it probably exceeded its booting
- # grace period without a ping, so shut it down so we can boot a new
- # node in its place.
- self._begin_node_shutdown(node_actor, cancellable=False)
- elif node_actor.in_state('down').get():
- # Node is down and unlikely to come back.
- self._begin_node_shutdown(node_actor, cancellable=False)
+ try:
+ if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
+ self._begin_node_shutdown(node_actor, cancellable=True)
+ elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
+ # Node is unpaired, which means it probably exceeded its booting
+ # grace period without a ping, so shut it down so we can boot a new
+ # node in its place.
+ self._begin_node_shutdown(node_actor, cancellable=False)
+ elif node_actor.in_state('down').get():
+ # Node is down and unlikely to come back.
+ self._begin_node_shutdown(node_actor, cancellable=False)
+ except pykka.ActorDeadError:
+ # The monitor actor sends shutdown suggestions every time the
+ # node's state is updated, and these go into the daemon actor's
+ # message queue. It's possible that the node has already been shut
+ # down (which shuts down the node monitor actor). In that case,
+ # this message is stale and we'll get ActorDeadError when we try to
+ # access node_actor. Suppress the error.
+ return
def node_finished_shutdown(self, shutdown_actor):
try:
commit 2aef6ca08d80c0fd25d74ddb9ab52cf535a33d3e
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Mar 23 13:23:54 2017 -0400
11325: Remove "broken node" check. Assume if the node really isn't
functioning, it should be "down" in SLURM anyway. Remove test_broken_node_not_counted because broken node check is removed.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index fc3ff05..71f9083 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -348,10 +348,6 @@ class ComputeNodeMonitorActor(config.actor_class):
if self.arvados_node is None:
return 'unpaired'
- # This node is indicated as non-functioning by the cloud
- if self._cloud.broken(self.cloud_node):
- return 'down'
-
state = self.arvados_node['crunch_worker_state']
# If state information is not available because it is missing or the
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index f1d168c..e49fc39 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -529,27 +529,6 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.daemon.node_finished_shutdown(self.last_shutdown).get(self.TIMEOUT)
self.assertEqual(0, self.alive_monitor_count())
- def test_broken_node_not_counted(self):
- size = testutil.MockSize(8)
- cloud_node = testutil.cloud_node_mock(8, size=size)
- wishlist = [size]
- self.make_daemon([cloud_node], [testutil.arvados_node_mock(8)],
- wishlist, avail_sizes=[(size, {"cores":1})])
- self.assertEqual(1, self.alive_monitor_count())
- self.assertFalse(self.node_setup.start.called)
- monitor = self.monitor_list()[0].proxy()
- shutdown_proxy = self.node_shutdown.start().proxy
- shutdown_proxy().cloud_node.get.return_value = cloud_node
- shutdown_proxy().success.get.return_value = False
- self.cloud_factory().broken.return_value = True
- self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
- self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
- self.daemon.node_finished_shutdown(shutdown_proxy()).get(self.TIMEOUT)
- self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
- self.daemon.update_server_wishlist(wishlist).get(self.TIMEOUT)
- self.stop_proxy(self.daemon)
- self.assertEqual(1, self.node_setup.start.call_count)
-
def test_nodes_shutting_down_replaced_below_max_nodes(self):
size = testutil.MockSize(6)
cloud_node = testutil.cloud_node_mock(6, size=size)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list