[ARVADOS] updated: ae8aaa4c55762222c837fcce8e9ad6800ff8b128
Git user
git at public.curoverse.com
Thu Mar 23 16:16:41 EDT 2017
Summary of changes:
.../arvnodeman/computenode/dispatch/__init__.py | 4 ---
services/nodemanager/arvnodeman/daemon.py | 31 ++++++++++++++--------
services/nodemanager/tests/test_daemon.py | 21 ---------------
3 files changed, 20 insertions(+), 36 deletions(-)
via ae8aaa4c55762222c837fcce8e9ad6800ff8b128 (commit)
via b8000c3cb38b77c5c429e0fd591a43f5eeee64d1 (commit)
via 2e32ef1657b439c0398e66930c3d17437032fb1a (commit)
via 2aef6ca08d80c0fd25d74ddb9ab52cf535a33d3e (commit)
from 8ada36c931712304c4b2c70bdcbc316b1ad2c4e2 (commit)
The revisions listed above that are new to this repository have
not appeared in any other notification email, so they are listed
in full below.
commit ae8aaa4c55762222c837fcce8e9ad6800ff8b128
Merge: 8ada36c b8000c3
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Mar 23 16:16:33 2017 -0400
Merge branch '11325-no-broken-nodes' refs #11323, refs #11324, refs #11325
commit b8000c3cb38b77c5c429e0fd591a43f5eeee64d1
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Mar 23 13:30:35 2017 -0400
11323: Don't try to offer_arvados_pair on unpaired nodes which are being shut down.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 8890e83..f23b261 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -232,7 +232,7 @@ class NodeManagerDaemonActor(actor_class):
def try_pairing(self):
for record in self.cloud_nodes.unpaired():
for arv_rec in self.arvados_nodes.unpaired():
- if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
+ if record.actor is not None and record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
self._pair_nodes(record, arv_rec.arvados_node)
break
commit 2e32ef1657b439c0398e66930c3d17437032fb1a
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Mar 23 14:05:51 2017 -0400
11324: Fix crash in NodeManagerDaemonActor when receiving a node_can_shutdown
message for a node that has already been shut down.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index b4f1784..8890e83 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -426,16 +426,25 @@ class NodeManagerDaemonActor(actor_class):
@_check_poll_freshness
def node_can_shutdown(self, node_actor):
- if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
- self._begin_node_shutdown(node_actor, cancellable=True)
- elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
- # Node is unpaired, which means it probably exceeded its booting
- # grace period without a ping, so shut it down so we can boot a new
- # node in its place.
- self._begin_node_shutdown(node_actor, cancellable=False)
- elif node_actor.in_state('down').get():
- # Node is down and unlikely to come back.
- self._begin_node_shutdown(node_actor, cancellable=False)
+ try:
+ if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
+ self._begin_node_shutdown(node_actor, cancellable=True)
+ elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
+ # Node is unpaired, which means it probably exceeded its booting
+ # grace period without a ping, so shut it down so we can boot a new
+ # node in its place.
+ self._begin_node_shutdown(node_actor, cancellable=False)
+ elif node_actor.in_state('down').get():
+ # Node is down and unlikely to come back.
+ self._begin_node_shutdown(node_actor, cancellable=False)
+ except pykka.ActorDeadError as e:
+ # The monitor actor sends shutdown suggestions every time the
+ # node's state is updated, and these go into the daemon actor's
+ # message queue. It's possible that the node has already been shut
+ # down (which shuts down the node monitor actor). In that case,
+ # this message is stale and we'll get ActorDeadError when we try to
+ # access node_actor. Log the error.
+ self._logger.debug("ActorDeadError in node_can_shutdown: %s", e)
def node_finished_shutdown(self, shutdown_actor):
try:
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list