[ARVADOS] updated: ae8aaa4c55762222c837fcce8e9ad6800ff8b128

Git user git at public.curoverse.com
Thu Mar 23 16:16:41 EDT 2017


Summary of changes:
 .../arvnodeman/computenode/dispatch/__init__.py    |  4 ---
 services/nodemanager/arvnodeman/daemon.py          | 31 ++++++++++++++--------
 services/nodemanager/tests/test_daemon.py          | 21 ---------------
 3 files changed, 20 insertions(+), 36 deletions(-)

       via  ae8aaa4c55762222c837fcce8e9ad6800ff8b128 (commit)
       via  b8000c3cb38b77c5c429e0fd591a43f5eeee64d1 (commit)
       via  2e32ef1657b439c0398e66930c3d17437032fb1a (commit)
       via  2aef6ca08d80c0fd25d74ddb9ab52cf535a33d3e (commit)
      from  8ada36c931712304c4b2c70bdcbc316b1ad2c4e2 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit ae8aaa4c55762222c837fcce8e9ad6800ff8b128
Merge: 8ada36c b8000c3
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Mar 23 16:16:33 2017 -0400

    Merge branch '11325-no-broken-nodes' refs #11323, refs #11324, refs #11325


commit b8000c3cb38b77c5c429e0fd591a43f5eeee64d1
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Mar 23 13:30:35 2017 -0400

    11323: Don't try to offer_arvados_pair on unpaired nodes which are being shut down.

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 8890e83..f23b261 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -232,7 +232,7 @@ class NodeManagerDaemonActor(actor_class):
     def try_pairing(self):
         for record in self.cloud_nodes.unpaired():
             for arv_rec in self.arvados_nodes.unpaired():
-                if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
+                if record.actor is not None and record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
                     self._pair_nodes(record, arv_rec.arvados_node)
                     break
 

commit 2e32ef1657b439c0398e66930c3d17437032fb1a
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Mar 23 14:05:51 2017 -0400

    11324: Fix crash in NodeManagerDaemonActor when receiving a node_can_shutdown
    message for a node that has already been shut down.

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index b4f1784..8890e83 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -426,16 +426,25 @@ class NodeManagerDaemonActor(actor_class):
 
     @_check_poll_freshness
     def node_can_shutdown(self, node_actor):
-        if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
-            self._begin_node_shutdown(node_actor, cancellable=True)
-        elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
-            # Node is unpaired, which means it probably exceeded its booting
-            # grace period without a ping, so shut it down so we can boot a new
-            # node in its place.
-            self._begin_node_shutdown(node_actor, cancellable=False)
-        elif node_actor.in_state('down').get():
-            # Node is down and unlikely to come back.
-            self._begin_node_shutdown(node_actor, cancellable=False)
+        try:
+            if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
+                self._begin_node_shutdown(node_actor, cancellable=True)
+            elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
+                # Node is unpaired, which means it probably exceeded its booting
+                # grace period without a ping, so shut it down so we can boot a new
+                # node in its place.
+                self._begin_node_shutdown(node_actor, cancellable=False)
+            elif node_actor.in_state('down').get():
+                # Node is down and unlikely to come back.
+                self._begin_node_shutdown(node_actor, cancellable=False)
+        except pykka.ActorDeadError as e:
+            # The monitor actor sends shutdown suggestions every time the
+            # node's state is updated, and these go into the daemon actor's
+            # message queue.  It's possible that the node has already been shut
+            # down (which shuts down the node monitor actor).  In that case,
+            # this message is stale and we'll get ActorDeadError when we try to
+            # access node_actor.  Log the error.
+            self._logger.debug("ActorDeadError in node_can_shutdown: %s", e)
 
     def node_finished_shutdown(self, shutdown_actor):
         try:

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list