[ARVADOS] created: d5f5f869d46f9096c7c680d608c1cc654d1d7fa0

git at public.curoverse.com git at public.curoverse.com
Mon Oct 5 14:15:27 EDT 2015


        at  d5f5f869d46f9096c7c680d608c1cc654d1d7fa0 (commit)


commit d5f5f869d46f9096c7c680d608c1cc654d1d7fa0
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Oct 1 13:00:00 2015 -0400

    7286: Add BaseHTTPError to list of "cloud errors"

diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
index ba3c9b0..ca6ed7c 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/azure.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/azure.py
@@ -16,6 +16,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
 
     DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE_ARM)
     SEARCH_CACHE = {}
+    CLOUD_ERRORS = BaseComputeNodeDriver.CLOUD_ERRORS + (BaseHTTPError,)
 
     def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
                  driver_class=DEFAULT_DRIVER):

commit 11df73b96ae395fca11b4006253475046e3b74cc
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Oct 1 09:32:33 2015 -0400

    7286: Add drain* and fail* to SLURM_END_STATES, because the '*' means the node
    is out of contact with slurm.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 3c26629..225d856 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -10,7 +10,9 @@ from . import \
 from . import ComputeNodeShutdownActor as ShutdownActorBase
 
 class ComputeNodeShutdownActor(ShutdownActorBase):
-    SLURM_END_STATES = frozenset(['down\n', 'down*\n', 'drain\n', 'fail\n'])
+    SLURM_END_STATES = frozenset(['down\n', 'down*\n',
+                                  'drain\n', 'drain*\n',
+                                  'fail\n', 'fail*\n'])
 
     def on_start(self):
         arv_node = self._arvados_node()

commit 2e919859109fe27d552b81b13d47aed61e80eca6
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 30 17:16:09 2015 -0400

    7286: Fix double count of missing nodes in shutdown

diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index b47866d..e7bd7bf 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -31,7 +31,10 @@ def timestamp_fresh(timestamp, fresh_time):
     return (time.time() - timestamp) < fresh_time
 
 def arvados_node_missing(arvados_node, fresh_time):
-    return not timestamp_fresh(arvados_timestamp(arvados_node["last_ping_at"]), fresh_time)
+    if arvados_node["last_ping_at"] is None:
+        return None
+    else:
+        return not timestamp_fresh(arvados_timestamp(arvados_node["last_ping_at"]), fresh_time)
 
 class ShutdownTimer(object):
     """Keep track of a cloud node's shutdown windows.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 6f33a3b..ddddd41 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -212,7 +212,8 @@ class NodeManagerDaemonActor(actor_class):
     def _nodes_missing(self):
         return sum(1 for arv_node in
                    pykka.get_all(rec.actor.arvados_node for rec in
-                                 self.cloud_nodes.nodes.itervalues())
+                                 self.cloud_nodes.nodes.itervalues()
+                                 if rec.actor.cloud_node.get().id not in self.shutdowns)
                    if arv_node and cnode.arvados_node_missing(arv_node, self.node_stale_after))
 
     def _nodes_wanted(self):
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 57ea46c..0206f4c 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -148,8 +148,8 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
 
     def test_excess_counts_missing(self):
         size = testutil.MockSize(1)
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
-                                      testutil.cloud_node_mock(2)],
+        cloud_nodes = [testutil.cloud_node_mock(1), testutil.cloud_node_mock(2)]
+        self.make_daemon(cloud_nodes=cloud_nodes,
                          arvados_nodes=[testutil.arvados_node_mock(1),
                                         testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
                          want_sizes=[size])
@@ -158,6 +158,19 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
             self.daemon.node_can_shutdown(mon_ref.proxy()).get(self.TIMEOUT)
         self.assertEqual(1, self.node_shutdown.start.call_count)
 
+    def test_missing_shutdown_not_excess(self):
+        size = testutil.MockSize(1)
+        cloud_nodes = [testutil.cloud_node_mock(1), testutil.cloud_node_mock(2)]
+        self.make_daemon(cloud_nodes=cloud_nodes,
+                         arvados_nodes=[testutil.arvados_node_mock(1),
+                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
+                         want_sizes=[size])
+        self.daemon.shutdowns.get()[cloud_nodes[1].id] = True
+        self.assertEqual(2, self.alive_monitor_count())
+        for mon_ref in self.monitor_list():
+            self.daemon.node_can_shutdown(mon_ref.proxy()).get(self.TIMEOUT)
+        self.assertEqual(0, self.node_shutdown.start.call_count)
+
     def test_booting_nodes_counted(self):
         cloud_node = testutil.cloud_node_mock(1)
         arv_node = testutil.arvados_node_mock(1)

commit be81c03a3c26f365eba35b91e4f0827244a02ef7
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 30 16:26:46 2015 -0400

    7286: Missing nodes are considered in "excess" count (reverts previous change).  Added test.  Also remove debug log statement.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4ebd437..9ea6c32 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -332,7 +332,6 @@ class ComputeNodeMonitorActor(config.actor_class):
         elif arvados_node_missing(self.arvados_node, self.node_stale_after) and self._cloud.broken(self.cloud_node):
             # Node is paired, but Arvados says it is missing and the cloud says the node
             # is in an error state, so shut it down.
-            self._logger.warn("blah %s %s", arvados_node_missing(self.arvados_node, self.node_stale_after), self._cloud.broken(self.cloud_node))
             return True
         else:
             return self.in_state('idle')
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 30592ab..6f33a3b 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -228,7 +228,7 @@ class NodeManagerDaemonActor(actor_class):
             return len(self.last_wishlist) - up_count
 
     def _nodes_excess(self):
-        up_count = self._nodes_up() - len(self.shutdowns) - self._nodes_missing()
+        up_count = self._nodes_up() - len(self.shutdowns)
         over_min = up_count - self.min_nodes
         if over_min <= 0:
             return over_min
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 02dec42..57ea46c 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -146,6 +146,18 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.stop_proxy(self.daemon)
         self.assertFalse(self.node_setup.start.called)
 
+    def test_excess_counts_missing(self):
+        size = testutil.MockSize(1)
+        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
+                                      testutil.cloud_node_mock(2)],
+                         arvados_nodes=[testutil.arvados_node_mock(1),
+                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
+                         want_sizes=[size])
+        self.assertEqual(2, self.alive_monitor_count())
+        for mon_ref in self.monitor_list():
+            self.daemon.node_can_shutdown(mon_ref.proxy()).get(self.TIMEOUT)
+        self.assertEqual(1, self.node_shutdown.start.call_count)
+
     def test_booting_nodes_counted(self):
         cloud_node = testutil.cloud_node_mock(1)
         arv_node = testutil.arvados_node_mock(1)

commit c0f33379c7fd062fc097ecef92808334e821cb6b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 30 14:23:25 2015 -0400

    7286: Compute "missing" based on "last_ping_at" instead of using API server's
    buggy "status" field.

diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index f518607..b47866d 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -30,6 +30,9 @@ def arvados_timestamp(timestr):
 def timestamp_fresh(timestamp, fresh_time):
     return (time.time() - timestamp) < fresh_time
 
+def arvados_node_missing(arvados_node, fresh_time):
+    return not timestamp_fresh(arvados_timestamp(arvados_node["last_ping_at"]), fresh_time)
+
 class ShutdownTimer(object):
     """Keep track of a cloud node's shutdown windows.
 
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4557198..4ebd437 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -10,7 +10,7 @@ import libcloud.common.types as cloud_types
 import pykka
 
 from .. import \
-    arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh
+    arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh, arvados_node_missing
 from ...clientactor import _notify_subscribers
 from ... import config
 
@@ -329,9 +329,10 @@ class ComputeNodeMonitorActor(config.actor_class):
             # Node is unpaired.
             # If it hasn't pinged Arvados after boot_fail seconds, shut it down
             return not timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after)
-        elif self.arvados_node.get('status') == "missing" and self._cloud.broken(self.cloud_node):
+        elif arvados_node_missing(self.arvados_node, self.node_stale_after) and self._cloud.broken(self.cloud_node):
             # Node is paired, but Arvados says it is missing and the cloud says the node
             # is in an error state, so shut it down.
+            self._logger.warn("blah %s %s", arvados_node_missing(self.arvados_node, self.node_stale_after), self._cloud.broken(self.cloud_node))
             return True
         else:
             return self.in_state('idle')
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index ed8c7d5..30592ab 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -213,7 +213,7 @@ class NodeManagerDaemonActor(actor_class):
         return sum(1 for arv_node in
                    pykka.get_all(rec.actor.arvados_node for rec in
                                  self.cloud_nodes.nodes.itervalues())
-                   if arv_node and arv_node.get("status") == "missing")
+                   if arv_node and cnode.arvados_node_missing(arv_node, self.node_stale_after))
 
     def _nodes_wanted(self):
         up_count = self._nodes_up()
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 707cdc6..e718fc1 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -244,6 +244,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.cloud_mock = testutil.cloud_node_mock(node_num)
         self.subscriber = mock.Mock(name='subscriber_mock')
         self.cloud_client = mock.MagicMock(name='cloud_client')
+        self.cloud_client.broken.return_value = False
 
     def make_actor(self, node_num=1, arv_node=None, start_time=None):
         if not hasattr(self, 'cloud_mock'):
@@ -321,16 +322,14 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_missing(self):
         arv_node = testutil.arvados_node_mock(10, job_uuid=None,
                                               crunch_worker_state="down",
-                                              status="missing")
+                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.cloud_client.broken.return_value = False
         self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_running_broken(self):
         arv_node = testutil.arvados_node_mock(12, job_uuid=None,
-                                              crunch_worker_state="down",
-                                              status="running")
+                                              crunch_worker_state="down")
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
@@ -339,7 +338,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_shutdown_missing_broken(self):
         arv_node = testutil.arvados_node_mock(11, job_uuid=None,
                                               crunch_worker_state="down",
-                                              status="missing")
+                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 8c622ec..02dec42 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -130,7 +130,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
                                       testutil.cloud_node_mock(2)],
                          arvados_nodes=[testutil.arvados_node_mock(1),
-                                      testutil.arvados_node_mock(2, status="missing")],
+                                      testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
                          want_sizes=[size, size])
         self.stop_proxy(self.daemon)
         self.assertTrue(self.node_setup.start.called)
@@ -140,7 +140,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
                                       testutil.cloud_node_mock(2)],
                          arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2, status="missing")],
+                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
                          want_sizes=[size, size],
                          max_nodes=2)
         self.stop_proxy(self.daemon)

commit f6aa7c0c8c84b85b550d73117c6fdbd663a38c4c
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 30 10:35:04 2015 -0400

    7286: Add test that "missing" nodes are not counted towards "busy" (but are
    counted towards node max).

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 70e6e8e..4557198 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -323,9 +323,6 @@ class ComputeNodeMonitorActor(config.actor_class):
         return result
 
     def shutdown_eligible(self):
-        import logging
-        logging.warn("XXX %s %s", self.arvados_node, self._cloud.broken(self.cloud_node))
-
         if not self._shutdowns.window_open():
             return False
         elif self.arvados_node is None:
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index b406f13..8c622ec 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -123,7 +123,28 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.make_daemon([testutil.cloud_node_mock()],
                          want_sizes=[testutil.MockSize(1)])
         self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_setup.called)
+        self.assertFalse(self.node_setup.start.called)
+
+    def test_dont_count_missing_as_busy(self):
+        size = testutil.MockSize(1)
+        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
+                                      testutil.cloud_node_mock(2)],
+                         arvados_nodes=[testutil.arvados_node_mock(1),
+                                      testutil.arvados_node_mock(2, status="missing")],
+                         want_sizes=[size, size])
+        self.stop_proxy(self.daemon)
+        self.assertTrue(self.node_setup.start.called)
+
+    def test_missing_counts_towards_max(self):
+        size = testutil.MockSize(1)
+        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
+                                      testutil.cloud_node_mock(2)],
+                         arvados_nodes=[testutil.arvados_node_mock(1),
+                                        testutil.arvados_node_mock(2, status="missing")],
+                         want_sizes=[size, size],
+                         max_nodes=2)
+        self.stop_proxy(self.daemon)
+        self.assertFalse(self.node_setup.start.called)
 
     def test_booting_nodes_counted(self):
         cloud_node = testutil.cloud_node_mock(1)

commit 560c318fdc49835b03f96af35774fbbfa7984fe7
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Sep 29 16:43:41 2015 -0400

    7286: Tests for new "missing and broken" shutdown policy.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4557198..70e6e8e 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -323,6 +323,9 @@ class ComputeNodeMonitorActor(config.actor_class):
         return result
 
     def shutdown_eligible(self):
+        import logging
+        logging.warn("XXX %s %s", self.arvados_node, self._cloud.broken(self.cloud_node))
+
         if not self._shutdowns.window_open():
             return False
         elif self.arvados_node is None:
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 71e73f1..3c26629 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -15,6 +15,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
     def on_start(self):
         arv_node = self._arvados_node()
         if arv_node is None:
+            self._nodename = None
             return super(ComputeNodeShutdownActor, self).on_start()
         else:
             self._nodename = arv_node['hostname']
@@ -29,7 +30,8 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
 
     @ShutdownActorBase._retry((subprocess.CalledProcessError,))
     def cancel_shutdown(self):
-        self._set_node_state('RESUME')
+        if self._nodename:
+            self._set_node_state('RESUME')
         return super(ComputeNodeShutdownActor, self).cancel_shutdown()
 
     @ShutdownActorBase._stop_if_window_closed
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index c22e7a0..707cdc6 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -128,12 +128,14 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         self.cloud_node = cloud_node
         self.arvados_node = arvados_node
 
-    def make_actor(self, cancellable=True):
+    def make_actor(self, cancellable=True, start_time=None):
         if not hasattr(self, 'timer'):
             self.make_mocks()
+        if start_time is None:
+            start_time = time.time()
         monitor_actor = dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_node, time.time(), self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates,
+            self.cloud_node, start_time, self.shutdowns,
+            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
             self.arvados_node)
         self.shutdown_actor = self.ACTOR_CLASS.start(
             self.timer, self.cloud_client, self.arvados_client, monitor_actor,
@@ -190,7 +192,7 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
     ACTOR_CLASS = dispatch.ComputeNodeShutdownActor
 
     def test_easy_shutdown(self):
-        self.make_actor()
+        self.make_actor(start_time=0)
         self.check_success_flag(True)
         self.assertTrue(self.cloud_client.destroy_node.called)
 
@@ -203,7 +205,7 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
     def test_shutdown_retries_when_cloud_fails(self):
         self.make_mocks()
         self.cloud_client.destroy_node.return_value = False
-        self.make_actor()
+        self.make_actor(start_time=0)
         self.assertIsNone(self.shutdown_actor.success.get(self.TIMEOUT))
         self.cloud_client.destroy_node.return_value = True
         self.check_success_flag(True)
@@ -241,6 +243,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.updates = mock.MagicMock(name='update_mock')
         self.cloud_mock = testutil.cloud_node_mock(node_num)
         self.subscriber = mock.Mock(name='subscriber_mock')
+        self.cloud_client = mock.MagicMock(name='cloud_client')
 
     def make_actor(self, node_num=1, arv_node=None, start_time=None):
         if not hasattr(self, 'cloud_mock'):
@@ -249,8 +252,8 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
             start_time = time.time()
         self.node_actor = dispatch.ComputeNodeMonitorActor.start(
             self.cloud_mock, start_time, self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates,
-            arv_node).proxy()
+            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
+            arv_node, boot_fail_after=300).proxy()
         self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
 
     def node_state(self, *states):
@@ -298,23 +301,50 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertFalse(self.subscriber.called)
 
     def test_shutdown_subscription(self):
-        self.make_actor()
+        self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
         self.node_actor.consider_shutdown().get(self.TIMEOUT)
         self.assertTrue(self.subscriber.called)
         self.assertEqual(self.node_actor.actor_ref.actor_urn,
                          self.subscriber.call_args[0][0].actor_ref.actor_urn)
 
-    def test_shutdown_without_arvados_node(self):
+    def test_no_shutdown_booting(self):
         self.make_actor()
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
-    def test_no_shutdown_without_arvados_node_and_old_cloud_node(self):
+    def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
+    def test_no_shutdown_missing(self):
+        arv_node = testutil.arvados_node_mock(10, job_uuid=None,
+                                              crunch_worker_state="down",
+                                              status="missing")
+        self.make_actor(10, arv_node)
+        self.shutdowns._set_state(True, 600)
+        self.cloud_client.broken.return_value = False
         self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
+    def test_no_shutdown_running_broken(self):
+        arv_node = testutil.arvados_node_mock(12, job_uuid=None,
+                                              crunch_worker_state="down",
+                                              status="running")
+        self.make_actor(12, arv_node)
+        self.shutdowns._set_state(True, 600)
+        self.cloud_client.broken.return_value = True
+        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
+    def test_shutdown_missing_broken(self):
+        arv_node = testutil.arvados_node_mock(11, job_uuid=None,
+                                              crunch_worker_state="down",
+                                              status="missing")
+        self.make_actor(11, arv_node)
+        self.shutdowns._set_state(True, 600)
+        self.cloud_client.broken.return_value = True
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
         self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index ac3ebf0..c5097a7 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -55,7 +55,7 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
     def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
         # Test we correctly handle a node that failed to bootstrap.
         proc_mock.return_value = 'idle\n'
-        self.make_actor()
+        self.make_actor(start_time=0)
         self.check_success_flag(True)
         self.assertFalse(proc_mock.called)
 

commit 72e3566f2cdacd44f095183ebf88f7aab8b0d8dc
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Sep 29 10:27:10 2015 -0400

    7286: Move logic to shut down newly booted nodes nodes that haven't pinged to
    ComputeNodeMonitorActor.  Shut down nodes if they have "missing" status and are
    "broken" according to the cloud client.  Don't count "missing" nodes as "up"
    when deciding whether to boot more nodes.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 6d5c223..4557198 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -273,8 +273,10 @@ class ComputeNodeMonitorActor(config.actor_class):
     for shutdown.
     """
     def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
-                 cloud_fqdn_func, timer_actor, update_actor, arvados_node=None,
-                 poll_stale_after=600, node_stale_after=3600):
+                 cloud_fqdn_func, timer_actor, update_actor, cloud_client,
+                 arvados_node=None, poll_stale_after=600, node_stale_after=3600,
+                 boot_fail_after=1800
+    ):
         super(ComputeNodeMonitorActor, self).__init__()
         self._later = self.actor_ref.proxy()
         self._logger = logging.getLogger('arvnodeman.computenode')
@@ -283,10 +285,12 @@ class ComputeNodeMonitorActor(config.actor_class):
         self._cloud_node_fqdn = cloud_fqdn_func
         self._timer = timer_actor
         self._update = update_actor
+        self._cloud = cloud_client
         self.cloud_node = cloud_node
         self.cloud_node_start_time = cloud_node_start_time
         self.poll_stale_after = poll_stale_after
         self.node_stale_after = node_stale_after
+        self.boot_fail_after = boot_fail_after
         self.subscribers = set()
         self.arvados_node = None
         self._later.update_arvados_node(arvados_node)
@@ -322,10 +326,13 @@ class ComputeNodeMonitorActor(config.actor_class):
         if not self._shutdowns.window_open():
             return False
         elif self.arvados_node is None:
-            # If this is a new, unpaired node, it's eligible for
-            # shutdown--we figure there was an error during bootstrap.
-            return timestamp_fresh(self.cloud_node_start_time,
-                                   self.node_stale_after)
+            # Node is unpaired.
+            # If it hasn't pinged Arvados after boot_fail seconds, shut it down
+            return not timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after)
+        elif self.arvados_node.get('status') == "missing" and self._cloud.broken(self.cloud_node):
+            # Node is paired, but Arvados says it is missing and the cloud says the node
+            # is in an error state, so shut it down.
+            return True
         else:
             return self.in_state('idle')
 
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 724c772..14e804f 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -103,6 +103,10 @@ class BaseComputeNodeDriver(object):
         """
         raise NotImplementedError("BaseComputeNodeDriver.arvados_create_kwargs")
 
+    def broken(self, cloud_node):
+        """Return true if libcloud has indicated the node is in a "broken" state."""
+        return False
+
     def _make_ping_url(self, arvados_node):
         return 'https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.format(
             self.ping_host, arvados_node['uuid'],
diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
index b1494d0..ba3c9b0 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/azure.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/azure.py
@@ -81,6 +81,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
                 super(ComputeNodeDriver, self).list_nodes()
                 if node.extra["tags"].get("arvados-class") == self.tags["arvados-class"]]
 
+    def broken(self, cloud_node):
+        """Return true if libcloud has indicated the node is in a "broken" state."""
+        return (cloud_node.state in (cloud_types.NodeState.ERROR, cloud_types.NodeState.UNKNOWN))
+
     @classmethod
     def node_fqdn(cls, node):
         return node.extra["tags"].get("hostname")
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 44f1513..ed8c7d5 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -159,7 +159,9 @@ class NodeManagerDaemonActor(actor_class):
             timer_actor=self._timer,
             arvados_node=None,
             poll_stale_after=self.poll_stale_after,
-            node_stale_after=self.node_stale_after).proxy()
+            node_stale_after=self.node_stale_after,
+            cloud_client=self._cloud_driver,
+            boot_fail_after=self.boot_fail_after).proxy()
         actor.subscribe(self._later.node_can_shutdown)
         self._cloud_nodes_actor.subscribe_to(cloud_node.id,
                                              actor.update_cloud_node)
@@ -207,6 +209,12 @@ class NodeManagerDaemonActor(actor_class):
                                  self.cloud_nodes.nodes.itervalues())
                    if busy)
 
+    def _nodes_missing(self):
+        return sum(1 for arv_node in
+                   pykka.get_all(rec.actor.arvados_node for rec in
+                                 self.cloud_nodes.nodes.itervalues())
+                   if arv_node and arv_node.get("status") == "missing")
+
     def _nodes_wanted(self):
         up_count = self._nodes_up()
         under_min = self.min_nodes - up_count
@@ -216,11 +224,11 @@ class NodeManagerDaemonActor(actor_class):
         elif under_min > 0:
             return under_min
         else:
-            up_count -= len(self.shutdowns) + self._nodes_busy()
+            up_count -= len(self.shutdowns) + self._nodes_busy() + self._nodes_missing()
             return len(self.last_wishlist) - up_count
 
     def _nodes_excess(self):
-        up_count = self._nodes_up() - len(self.shutdowns)
+        up_count = self._nodes_up() - len(self.shutdowns) - self._nodes_missing()
         over_min = up_count - self.min_nodes
         if over_min <= 0:
             return over_min

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list