[ARVADOS] updated: 0d6070d3b62fc367dbb02e8ead5fe25dce012156
git at public.curoverse.com
git at public.curoverse.com
Fri Feb 5 11:10:47 EST 2016
Summary of changes:
services/nodemanager/arvnodeman/clientactor.py | 5 +-
.../arvnodeman/computenode/dispatch/__init__.py | 6 +--
services/nodemanager/arvnodeman/daemon.py | 60 ++++++++++++----------
services/nodemanager/tests/test_clientactor.py | 4 +-
.../nodemanager/tests/test_computenode_dispatch.py | 14 ++---
5 files changed, 49 insertions(+), 40 deletions(-)
via 0d6070d3b62fc367dbb02e8ead5fe25dce012156 (commit)
from 8709ef45c968e79b5e5978759484b994d4789aea (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 0d6070d3b62fc367dbb02e8ead5fe25dce012156
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Feb 5 11:10:43 2016 -0500
7667: Combine polling logs into fewer lines for less noise. Adjust the message
logged when last_ping_at is unexpectedly None to be less severe (this can happen
in innocent circumstances). Report nodes in the "booted" list as "booting" since
they are unpaired. Fix tests.
diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
index e5534c5..9a9ce58 100644
--- a/services/nodemanager/arvnodeman/clientactor.py
+++ b/services/nodemanager/arvnodeman/clientactor.py
@@ -72,7 +72,6 @@ class RemotePollLoopActor(actor_class):
raise NotImplementedError("subclasses must implement request method")
def _got_response(self, response):
- self._logger.debug("got response with %d items", len(response))
self.poll_wait = self.min_poll_wait
_notify_subscribers(response, self.all_subscribers)
if hasattr(self, '_item_key'):
@@ -105,7 +104,9 @@ class RemotePollLoopActor(actor_class):
else:
self._got_response(response)
next_poll = scheduled_start + self.poll_wait
- self._logger.info("request took %s seconds", (time.time() - scheduled_start))
+ self._logger.info("got response with %d items in %s seconds, next poll at %s",
+ len(response), (time.time() - scheduled_start),
+ time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_poll)))
end_time = time.time()
if next_poll < end_time: # We've drifted too much; start fresh.
next_poll = end_time + self.poll_wait
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 17d0463..b6c0597 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -374,9 +374,9 @@ class ComputeNodeMonitorActor(config.actor_class):
# is in an error state, so shut it down.
return True
if missing is None and self._cloud.broken(self.cloud_node):
- self._logger.warning(
- "cloud reports broken node, but paired node %s never pinged "
- "(bug?) -- skipped check for node_stale_after",
+ self._logger.info(
+ "Cloud node considered 'broken' but paired node %s last_ping_at is None, " +
+ "cannot check node_stale_after (node may be shut down and we just haven't gotten the message yet).",
self.arvados_node['uuid'])
if self.in_state('idle'):
return True
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 5616376..47e77d1 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -151,8 +151,8 @@ class NodeManagerDaemonActor(actor_class):
self.last_polls[poll_key] = time.time()
def _pair_nodes(self, node_record, arvados_node):
- self._logger.info("Cloud node %s has associated with Arvados node %s",
- node_record.cloud_node.id, arvados_node['uuid'])
+ self._logger.info("Cloud node %s is now paired with Arvados node %s",
+ node_record.cloud_node.name, arvados_node['uuid'])
self._arvados_nodes_actor.subscribe_to(
arvados_node['uuid'], node_record.actor.update_arvados_node)
node_record.arvados_node = arvados_node
@@ -217,14 +217,22 @@ class NodeManagerDaemonActor(actor_class):
break
def _nodes_booting(self, size):
+ s = sum(1
+ for c in self.booting.itervalues()
+ if size is None or c.cloud_size.get().id == size.id)
+ s += sum(1
+ for c in self.booted.itervalues()
+ if size is None or c.cloud_node.size.id == size.id)
+ return s
+
+ def _nodes_unpaired(self, size):
return sum(1
- for c in self.booting.itervalues()
- if size is None or c.cloud_size.get().id == size.id)
+ for c in self.cloud_nodes.unpaired()
+ if size is None or c.cloud_node.size.id == size.id)
def _nodes_booted(self, size):
return sum(1
- for i in (self.booted, self.cloud_nodes.nodes)
- for c in i.itervalues()
+ for c in self.cloud_nodes.nodes.itervalues()
if size is None or c.cloud_node.size.id == size.id)
def _nodes_up(self, size):
@@ -278,21 +286,18 @@ class NodeManagerDaemonActor(actor_class):
elif under_min > 0 and size.id == self.min_cloud_size.id:
return under_min
- try:
- booting_count = self._nodes_booting(size) + sum(1 for _ in self.cloud_nodes.unpaired())
- shutdown_count = self._size_shutdowns(size)
- busy_count = self._nodes_busy(size)
- up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
-
- self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
- self._size_wishlist(size),
- up_count + busy_count,
- booting_count,
- up_count - booting_count,
- busy_count,
- shutdown_count)
- except Exception as e:
- self._logger.exception("whoops")
+ booting_count = self._nodes_booting(size) + self._nodes_unpaired(size)
+ shutdown_count = self._size_shutdowns(size)
+ busy_count = self._nodes_busy(size)
+ up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
+
+ self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
+ self._size_wishlist(size),
+ up_count + busy_count,
+ booting_count,
+ up_count - booting_count,
+ busy_count,
+ shutdown_count)
wanted = self._size_wishlist(size) - up_count
if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
@@ -314,11 +319,14 @@ class NodeManagerDaemonActor(actor_class):
self._update_poll_time('server_wishlist')
self.last_wishlist = wishlist
for size in reversed(self.server_calculator.cloud_sizes):
- nodes_wanted = self._nodes_wanted(size)
- if nodes_wanted > 0:
- self._later.start_node(size)
- elif (nodes_wanted < 0) and self.booting:
- self._later.stop_booting_node(size)
+ try:
+ nodes_wanted = self._nodes_wanted(size)
+ if nodes_wanted > 0:
+ self._later.start_node(size)
+ elif (nodes_wanted < 0) and self.booting:
+ self._later.stop_booting_node(size)
+ except Exception as e:
+ self._logger.exception("while calculating nodes wanted for size %s", size)
def _check_poll_freshness(orig_func):
"""Decorator to inhibit a method when poll information is stale.
diff --git a/services/nodemanager/tests/test_clientactor.py b/services/nodemanager/tests/test_clientactor.py
index 57a0d32..cee9c85 100644
--- a/services/nodemanager/tests/test_clientactor.py
+++ b/services/nodemanager/tests/test_clientactor.py
@@ -47,7 +47,8 @@ class RemotePollLoopActorTestCase(testutil.RemotePollLoopActorTestMixin,
def test_late_subscribers_get_responses(self):
self.build_monitor(['pre_late_test', 'late_test'])
- self.monitor.subscribe(lambda response: None).get(self.TIMEOUT)
+ mock_subscriber = mock.Mock(name='mock_subscriber')
+ self.monitor.subscribe(mock_subscriber).get(self.TIMEOUT)
self.monitor.subscribe(self.subscriber)
self.monitor.poll().get(self.TIMEOUT)
self.stop_proxy(self.monitor)
@@ -146,4 +147,3 @@ class RemotePollLoopActorWithKeysTestCase(testutil.RemotePollLoopActorTestMixin,
if __name__ == '__main__':
unittest.main()
-
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index ecf83c6..040d8d5 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -339,7 +339,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
def test_no_shutdown_booting(self):
self.make_actor()
self.shutdowns._set_state(True, 600)
- self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting"))
def test_shutdown_without_arvados_node(self):
self.make_actor(start_time=0)
@@ -352,7 +352,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
last_ping_at='1970-01-01T01:02:03.04050607Z')
self.make_actor(10, arv_node)
self.shutdowns._set_state(True, 600)
- self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
def test_no_shutdown_running_broken(self):
arv_node = testutil.arvados_node_mock(12, job_uuid=None,
@@ -360,7 +360,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
self.make_actor(12, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
def test_shutdown_missing_broken(self):
arv_node = testutil.arvados_node_mock(11, job_uuid=None,
@@ -373,23 +373,23 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
def test_no_shutdown_when_window_closed(self):
self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
- self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open."))
def test_no_shutdown_when_node_running_job(self):
self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
self.shutdowns._set_state(True, 600)
- self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
def test_no_shutdown_when_node_state_unknown(self):
self.make_actor(5, testutil.arvados_node_mock(
5, crunch_worker_state=None))
self.shutdowns._set_state(True, 600)
- self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
def test_no_shutdown_when_node_state_stale(self):
self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
self.shutdowns._set_state(True, 600)
- self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
def test_arvados_node_match(self):
self.make_actor(2)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list