[ARVADOS] updated: c0f33379c7fd062fc097ecef92808334e821cb6b

git at public.curoverse.com git at public.curoverse.com
Wed Sep 30 14:21:59 EDT 2015


Summary of changes:
 services/nodemanager/arvnodeman/computenode/__init__.py          | 3 +++
 services/nodemanager/arvnodeman/computenode/dispatch/__init__.py | 5 +++--
 services/nodemanager/arvnodeman/daemon.py                        | 2 +-
 services/nodemanager/tests/test_computenode_dispatch.py          | 9 ++++-----
 services/nodemanager/tests/test_daemon.py                        | 4 ++--
 5 files changed, 13 insertions(+), 10 deletions(-)

       via  c0f33379c7fd062fc097ecef92808334e821cb6b (commit)
      from  f6aa7c0c8c84b85b550d73117c6fdbd663a38c4c (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit c0f33379c7fd062fc097ecef92808334e821cb6b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 30 14:23:25 2015 -0400

    7286: Compute "missing" based on "last_ping_at" instead of using API server's
    buggy "status" field.

diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index f518607..b47866d 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -30,6 +30,9 @@ def arvados_timestamp(timestr):
 def timestamp_fresh(timestamp, fresh_time):
     return (time.time() - timestamp) < fresh_time
 
+def arvados_node_missing(arvados_node, fresh_time):
+    return not timestamp_fresh(arvados_timestamp(arvados_node["last_ping_at"]), fresh_time)
+
 class ShutdownTimer(object):
     """Keep track of a cloud node's shutdown windows.
 
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4557198..4ebd437 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -10,7 +10,7 @@ import libcloud.common.types as cloud_types
 import pykka
 
 from .. import \
-    arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh
+    arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh, arvados_node_missing
 from ...clientactor import _notify_subscribers
 from ... import config
 
@@ -329,9 +329,10 @@ class ComputeNodeMonitorActor(config.actor_class):
             # Node is unpaired.
             # If it hasn't pinged Arvados after boot_fail seconds, shut it down
             return not timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after)
-        elif self.arvados_node.get('status') == "missing" and self._cloud.broken(self.cloud_node):
+        elif arvados_node_missing(self.arvados_node, self.node_stale_after) and self._cloud.broken(self.cloud_node):
             # Node is paired, but Arvados says it is missing and the cloud says the node
             # is in an error state, so shut it down.
+            self._logger.warn("blah %s %s", arvados_node_missing(self.arvados_node, self.node_stale_after), self._cloud.broken(self.cloud_node))
             return True
         else:
             return self.in_state('idle')
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index ed8c7d5..30592ab 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -213,7 +213,7 @@ class NodeManagerDaemonActor(actor_class):
         return sum(1 for arv_node in
                    pykka.get_all(rec.actor.arvados_node for rec in
                                  self.cloud_nodes.nodes.itervalues())
-                   if arv_node and arv_node.get("status") == "missing")
+                   if arv_node and cnode.arvados_node_missing(arv_node, self.node_stale_after))
 
     def _nodes_wanted(self):
         up_count = self._nodes_up()
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 707cdc6..e718fc1 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -244,6 +244,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.cloud_mock = testutil.cloud_node_mock(node_num)
         self.subscriber = mock.Mock(name='subscriber_mock')
         self.cloud_client = mock.MagicMock(name='cloud_client')
+        self.cloud_client.broken.return_value = False
 
     def make_actor(self, node_num=1, arv_node=None, start_time=None):
         if not hasattr(self, 'cloud_mock'):
@@ -321,16 +322,14 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_missing(self):
         arv_node = testutil.arvados_node_mock(10, job_uuid=None,
                                               crunch_worker_state="down",
-                                              status="missing")
+                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.cloud_client.broken.return_value = False
         self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_running_broken(self):
         arv_node = testutil.arvados_node_mock(12, job_uuid=None,
-                                              crunch_worker_state="down",
-                                              status="running")
+                                              crunch_worker_state="down")
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
@@ -339,7 +338,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_shutdown_missing_broken(self):
         arv_node = testutil.arvados_node_mock(11, job_uuid=None,
                                               crunch_worker_state="down",
-                                              status="missing")
+                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 8c622ec..02dec42 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -130,7 +130,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
                                       testutil.cloud_node_mock(2)],
                          arvados_nodes=[testutil.arvados_node_mock(1),
-                                      testutil.arvados_node_mock(2, status="missing")],
+                                      testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
                          want_sizes=[size, size])
         self.stop_proxy(self.daemon)
         self.assertTrue(self.node_setup.start.called)
@@ -140,7 +140,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
                                       testutil.cloud_node_mock(2)],
                          arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2, status="missing")],
+                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
                          want_sizes=[size, size],
                          max_nodes=2)
         self.stop_proxy(self.daemon)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list