[ARVADOS] updated: 0d6070d3b62fc367dbb02e8ead5fe25dce012156

git at public.curoverse.com git at public.curoverse.com
Fri Feb 5 11:10:47 EST 2016


Summary of changes:
 services/nodemanager/arvnodeman/clientactor.py     |  5 +-
 .../arvnodeman/computenode/dispatch/__init__.py    |  6 +--
 services/nodemanager/arvnodeman/daemon.py          | 60 ++++++++++++----------
 services/nodemanager/tests/test_clientactor.py     |  4 +-
 .../nodemanager/tests/test_computenode_dispatch.py | 14 ++---
 5 files changed, 49 insertions(+), 40 deletions(-)

       via  0d6070d3b62fc367dbb02e8ead5fe25dce012156 (commit)
      from  8709ef45c968e79b5e5978759484b994d4789aea (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 0d6070d3b62fc367dbb02e8ead5fe25dce012156
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Feb 5 11:10:43 2016 -0500

    7667: Combine polling logs into fewer lines for less noise.  Adjust message
    when last_ping_at is unexpectedly none to be less severe (can happen in
    innocent circumstances).  Report nodes in "booted" list as "booting" since they
    are unpaired.  Fix tests.

diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
index e5534c5..9a9ce58 100644
--- a/services/nodemanager/arvnodeman/clientactor.py
+++ b/services/nodemanager/arvnodeman/clientactor.py
@@ -72,7 +72,6 @@ class RemotePollLoopActor(actor_class):
         raise NotImplementedError("subclasses must implement request method")
 
     def _got_response(self, response):
-        self._logger.debug("got response with %d items", len(response))
         self.poll_wait = self.min_poll_wait
         _notify_subscribers(response, self.all_subscribers)
         if hasattr(self, '_item_key'):
@@ -105,7 +104,9 @@ class RemotePollLoopActor(actor_class):
         else:
             self._got_response(response)
             next_poll = scheduled_start + self.poll_wait
-        self._logger.info("request took %s seconds", (time.time() - scheduled_start))
+            self._logger.info("got response with %d items in %s seconds, next poll at %s",
+                              len(response), (time.time() - scheduled_start),
+                              time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_poll)))
         end_time = time.time()
         if next_poll < end_time:  # We've drifted too much; start fresh.
             next_poll = end_time + self.poll_wait
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 17d0463..b6c0597 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -374,9 +374,9 @@ class ComputeNodeMonitorActor(config.actor_class):
             # is in an error state, so shut it down.
             return True
         if missing is None and self._cloud.broken(self.cloud_node):
-            self._logger.warning(
-                "cloud reports broken node, but paired node %s never pinged "
-                "(bug?) -- skipped check for node_stale_after",
+            self._logger.info(
+                "Cloud node considered 'broken' but paired node %s last_ping_at is None, " +
+                "cannot check node_stale_after (node may be shut down and we just haven't gotten the message yet).",
                 self.arvados_node['uuid'])
         if self.in_state('idle'):
             return True
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 5616376..47e77d1 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -151,8 +151,8 @@ class NodeManagerDaemonActor(actor_class):
         self.last_polls[poll_key] = time.time()
 
     def _pair_nodes(self, node_record, arvados_node):
-        self._logger.info("Cloud node %s has associated with Arvados node %s",
-                          node_record.cloud_node.id, arvados_node['uuid'])
+        self._logger.info("Cloud node %s is now paired with Arvados node %s",
+                          node_record.cloud_node.name, arvados_node['uuid'])
         self._arvados_nodes_actor.subscribe_to(
             arvados_node['uuid'], node_record.actor.update_arvados_node)
         node_record.arvados_node = arvados_node
@@ -217,14 +217,22 @@ class NodeManagerDaemonActor(actor_class):
                     break
 
     def _nodes_booting(self, size):
+        s = sum(1
+                for c in self.booting.itervalues()
+                if size is None or c.cloud_size.get().id == size.id)
+        s += sum(1
+                 for c in self.booted.itervalues()
+                 if size is None or c.cloud_node.size.id == size.id)
+        return s
+
+    def _nodes_unpaired(self, size):
         return sum(1
-                   for c in self.booting.itervalues()
-                   if size is None or c.cloud_size.get().id == size.id)
+                   for c in self.cloud_nodes.unpaired()
+                   if size is None or c.cloud_node.size.id == size.id)
 
     def _nodes_booted(self, size):
         return sum(1
-                  for i in (self.booted, self.cloud_nodes.nodes)
-                  for c in i.itervalues()
+                  for c in self.cloud_nodes.nodes.itervalues()
                   if size is None or c.cloud_node.size.id == size.id)
 
     def _nodes_up(self, size):
@@ -278,21 +286,18 @@ class NodeManagerDaemonActor(actor_class):
         elif under_min > 0 and size.id == self.min_cloud_size.id:
             return under_min
 
-        try:
-            booting_count = self._nodes_booting(size) + sum(1 for _ in self.cloud_nodes.unpaired())
-            shutdown_count = self._size_shutdowns(size)
-            busy_count = self._nodes_busy(size)
-            up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
-
-            self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
-                              self._size_wishlist(size),
-                              up_count + busy_count,
-                              booting_count,
-                              up_count - booting_count,
-                              busy_count,
-                              shutdown_count)
-        except Exception as e:
-            self._logger.exception("whoops")
+        booting_count = self._nodes_booting(size) + self._nodes_unpaired(size)
+        shutdown_count = self._size_shutdowns(size)
+        busy_count = self._nodes_busy(size)
+        up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
+
+        self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
+                          self._size_wishlist(size),
+                          up_count + busy_count,
+                          booting_count,
+                          up_count - booting_count,
+                          busy_count,
+                          shutdown_count)
 
         wanted = self._size_wishlist(size) - up_count
         if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
@@ -314,11 +319,14 @@ class NodeManagerDaemonActor(actor_class):
         self._update_poll_time('server_wishlist')
         self.last_wishlist = wishlist
         for size in reversed(self.server_calculator.cloud_sizes):
-            nodes_wanted = self._nodes_wanted(size)
-            if nodes_wanted > 0:
-                self._later.start_node(size)
-            elif (nodes_wanted < 0) and self.booting:
-                self._later.stop_booting_node(size)
+            try:
+                nodes_wanted = self._nodes_wanted(size)
+                if nodes_wanted > 0:
+                    self._later.start_node(size)
+                elif (nodes_wanted < 0) and self.booting:
+                    self._later.stop_booting_node(size)
+            except Exception as e:
+                self._logger.exception("while calculating nodes wanted for size %s", size)
 
     def _check_poll_freshness(orig_func):
         """Decorator to inhibit a method when poll information is stale.
diff --git a/services/nodemanager/tests/test_clientactor.py b/services/nodemanager/tests/test_clientactor.py
index 57a0d32..cee9c85 100644
--- a/services/nodemanager/tests/test_clientactor.py
+++ b/services/nodemanager/tests/test_clientactor.py
@@ -47,7 +47,8 @@ class RemotePollLoopActorTestCase(testutil.RemotePollLoopActorTestMixin,
 
     def test_late_subscribers_get_responses(self):
         self.build_monitor(['pre_late_test', 'late_test'])
-        self.monitor.subscribe(lambda response: None).get(self.TIMEOUT)
+        mock_subscriber = mock.Mock(name='mock_subscriber')
+        self.monitor.subscribe(mock_subscriber).get(self.TIMEOUT)
         self.monitor.subscribe(self.subscriber)
         self.monitor.poll().get(self.TIMEOUT)
         self.stop_proxy(self.monitor)
@@ -146,4 +147,3 @@ class RemotePollLoopActorWithKeysTestCase(testutil.RemotePollLoopActorTestMixin,
 
 if __name__ == '__main__':
     unittest.main()
-
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index ecf83c6..040d8d5 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -339,7 +339,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_booting(self):
         self.make_actor()
         self.shutdowns._set_state(True, 600)
-        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting"))
 
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
@@ -352,7 +352,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                               last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
 
     def test_no_shutdown_running_broken(self):
         arv_node = testutil.arvados_node_mock(12, job_uuid=None,
@@ -360,7 +360,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
 
     def test_shutdown_missing_broken(self):
         arv_node = testutil.arvados_node_mock(11, job_uuid=None,
@@ -373,23 +373,23 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
-        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open."))
 
     def test_no_shutdown_when_node_running_job(self):
         self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
         self.shutdowns._set_state(True, 600)
-        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
 
     def test_no_shutdown_when_node_state_unknown(self):
         self.make_actor(5, testutil.arvados_node_mock(
             5, crunch_worker_state=None))
         self.shutdowns._set_state(True, 600)
-        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
 
     def test_no_shutdown_when_node_state_stale(self):
         self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
         self.shutdowns._set_state(True, 600)
-        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
 
     def test_arvados_node_match(self):
         self.make_actor(2)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list