[ARVADOS] updated: c0f33379c7fd062fc097ecef92808334e821cb6b
git at public.curoverse.com
git at public.curoverse.com
Wed Sep 30 14:21:59 EDT 2015
Summary of changes:
services/nodemanager/arvnodeman/computenode/__init__.py | 3 +++
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py | 5 +++--
services/nodemanager/arvnodeman/daemon.py | 2 +-
services/nodemanager/tests/test_computenode_dispatch.py | 9 ++++-----
services/nodemanager/tests/test_daemon.py | 4 ++--
5 files changed, 13 insertions(+), 10 deletions(-)
via c0f33379c7fd062fc097ecef92808334e821cb6b (commit)
from f6aa7c0c8c84b85b550d73117c6fdbd663a38c4c (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit c0f33379c7fd062fc097ecef92808334e821cb6b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Sep 30 14:23:25 2015 -0400
7286: Compute "missing" based on "last_ping_at" instead of using API server's
buggy "status" field.
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index f518607..b47866d 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -30,6 +30,9 @@ def arvados_timestamp(timestr):
def timestamp_fresh(timestamp, fresh_time):
return (time.time() - timestamp) < fresh_time
+def arvados_node_missing(arvados_node, fresh_time):
+ return not timestamp_fresh(arvados_timestamp(arvados_node["last_ping_at"]), fresh_time)
+
class ShutdownTimer(object):
"""Keep track of a cloud node's shutdown windows.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4557198..4ebd437 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -10,7 +10,7 @@ import libcloud.common.types as cloud_types
import pykka
from .. import \
- arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh
+ arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh, arvados_node_missing
from ...clientactor import _notify_subscribers
from ... import config
@@ -329,9 +329,10 @@ class ComputeNodeMonitorActor(config.actor_class):
# Node is unpaired.
# If it hasn't pinged Arvados after boot_fail seconds, shut it down
return not timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after)
- elif self.arvados_node.get('status') == "missing" and self._cloud.broken(self.cloud_node):
+ elif arvados_node_missing(self.arvados_node, self.node_stale_after) and self._cloud.broken(self.cloud_node):
# Node is paired, but Arvados says it is missing and the cloud says the node
# is in an error state, so shut it down.
+ self._logger.warn("blah %s %s", arvados_node_missing(self.arvados_node, self.node_stale_after), self._cloud.broken(self.cloud_node))
return True
else:
return self.in_state('idle')
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index ed8c7d5..30592ab 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -213,7 +213,7 @@ class NodeManagerDaemonActor(actor_class):
return sum(1 for arv_node in
pykka.get_all(rec.actor.arvados_node for rec in
self.cloud_nodes.nodes.itervalues())
- if arv_node and arv_node.get("status") == "missing")
+ if arv_node and cnode.arvados_node_missing(arv_node, self.node_stale_after))
def _nodes_wanted(self):
up_count = self._nodes_up()
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 707cdc6..e718fc1 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -244,6 +244,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
self.cloud_mock = testutil.cloud_node_mock(node_num)
self.subscriber = mock.Mock(name='subscriber_mock')
self.cloud_client = mock.MagicMock(name='cloud_client')
+ self.cloud_client.broken.return_value = False
def make_actor(self, node_num=1, arv_node=None, start_time=None):
if not hasattr(self, 'cloud_mock'):
@@ -321,16 +322,14 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
def test_no_shutdown_missing(self):
arv_node = testutil.arvados_node_mock(10, job_uuid=None,
crunch_worker_state="down",
- status="missing")
+ last_ping_at='1970-01-01T01:02:03.04050607Z')
self.make_actor(10, arv_node)
self.shutdowns._set_state(True, 600)
- self.cloud_client.broken.return_value = False
self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_running_broken(self):
arv_node = testutil.arvados_node_mock(12, job_uuid=None,
- crunch_worker_state="down",
- status="running")
+ crunch_worker_state="down")
self.make_actor(12, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
@@ -339,7 +338,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
def test_shutdown_missing_broken(self):
arv_node = testutil.arvados_node_mock(11, job_uuid=None,
crunch_worker_state="down",
- status="missing")
+ last_ping_at='1970-01-01T01:02:03.04050607Z')
self.make_actor(11, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 8c622ec..02dec42 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -130,7 +130,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
testutil.cloud_node_mock(2)],
arvados_nodes=[testutil.arvados_node_mock(1),
- testutil.arvados_node_mock(2, status="missing")],
+ testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
want_sizes=[size, size])
self.stop_proxy(self.daemon)
self.assertTrue(self.node_setup.start.called)
@@ -140,7 +140,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
testutil.cloud_node_mock(2)],
arvados_nodes=[testutil.arvados_node_mock(1),
- testutil.arvados_node_mock(2, status="missing")],
+ testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
want_sizes=[size, size],
max_nodes=2)
self.stop_proxy(self.daemon)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list