[ARVADOS] updated: f178db3f1a81cb5f4f155baf385ef889e0911a79

Git user git at public.curoverse.com
Tue Jun 13 15:04:58 EDT 2017


Summary of changes:
 services/nodemanager/arvnodeman/__init__.py        |  1 +
 services/nodemanager/arvnodeman/baseactor.py       |  3 ++
 services/nodemanager/arvnodeman/clientactor.py     |  3 ++
 .../nodemanager/arvnodeman/computenode/__init__.py |  2 +-
 .../arvnodeman/computenode/dispatch/__init__.py    |  2 ++
 services/nodemanager/arvnodeman/daemon.py          | 41 +++++++++++++---------
 services/nodemanager/arvnodeman/launcher.py        |  3 +-
 services/nodemanager/arvnodeman/status.py          | 30 +++++++++++++++-
 8 files changed, 65 insertions(+), 20 deletions(-)

       via  f178db3f1a81cb5f4f155baf385ef889e0911a79 (commit)
      from  1227ea2b5795e34a75c62cb9eae91d46ef7cfb6a (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit f178db3f1a81cb5f4f155baf385ef889e0911a79
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jun 13 12:49:25 2017 -0400

    11836: Report "warning" and "ok" status on exceptions or on completion of successful process loop.
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>

diff --git a/services/nodemanager/arvnodeman/__init__.py b/services/nodemanager/arvnodeman/__init__.py
index a1ecac7..083b62a 100644
--- a/services/nodemanager/arvnodeman/__init__.py
+++ b/services/nodemanager/arvnodeman/__init__.py
@@ -7,3 +7,4 @@ import logging
 
 logger = logging.getLogger('arvnodeman')
 logger.addHandler(logging.NullHandler())
+ARVADOS_TIMEFMT = '%Y-%m-%dT%H:%M:%SZ'
diff --git a/services/nodemanager/arvnodeman/baseactor.py b/services/nodemanager/arvnodeman/baseactor.py
index 68ea97a..db8093d 100644
--- a/services/nodemanager/arvnodeman/baseactor.py
+++ b/services/nodemanager/arvnodeman/baseactor.py
@@ -7,6 +7,7 @@ import signal
 import time
 import threading
 import traceback
+from . import ARVADOS_TIMEFMT
 
 import pykka
 
@@ -85,6 +86,8 @@ class BaseNodeManagerActor(pykka.ThreadingActor):
             exception_type is OSError and exception_value.errno == errno.ENOMEM):
             lg.critical("Unhandled exception is a fatal error, killing Node Manager")
             os.kill(os.getpid(), signal.SIGKILL)
+        status.tracker.report_error(getattr(lg, "name", "unknown"),
+                                    exception_type, exception_value, tb)
 
     def ping(self):
         return True
diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
index e130749..e920b19 100644
--- a/services/nodemanager/arvnodeman/clientactor.py
+++ b/services/nodemanager/arvnodeman/clientactor.py
@@ -4,6 +4,7 @@ from __future__ import absolute_import, print_function
 
 import logging
 import time
+from . import status
 
 import pykka
 
@@ -94,12 +95,14 @@ class RemotePollLoopActor(actor_class):
             scheduled_start = start_time
         try:
             response = self._send_request()
+            status.tracker.report_ok(self._logger.name)
         except Exception as error:
             errmsg = self._got_error(error)
             if self.is_common_error(error):
                 self._logger.warning(errmsg)
             else:
                 self._logger.exception(errmsg)
+            status.tracker.report_error(self._logger.name)
             next_poll = start_time + self.poll_wait
         else:
             self._got_response(response)
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index d2c3d0c..79314b1 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -9,9 +9,9 @@ import re
 import time
 
 from ..config import CLOUD_ERRORS
+from .. import ARVADOS_TIMEFMT
 from libcloud.common.exceptions import BaseHTTPError
 
-ARVADOS_TIMEFMT = '%Y-%m-%dT%H:%M:%SZ'
 ARVADOS_TIMESUBSEC_RE = re.compile(r'(\.\d+)Z$')
 
 def arvados_node_fqdn(arvados_node, default_hostname='dynamic.compute'):
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 633dc9f..60cd76e 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -79,6 +79,7 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
                 return orig_func(self, *args, **kwargs)
             except Exception as error:
                 self._logger.error("Actor error %s", error)
+                status.tracker_full.report_error(self._logger.name)
                 self._finished()
         return finish_wrapper
 
@@ -470,6 +471,7 @@ class ComputeNodeMonitorActor(config.actor_class):
                     self.last_shutdown_opening = next_opening
         except Exception:
             self._logger.exception("Unexpected exception")
+            status.tracker_full.report_error(self._logger.name)
 
     def offer_arvados_pair(self, arvados_node):
         first_ping_s = arvados_node.get('first_ping_at')
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 20cfb37..9397eb8 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -257,21 +257,30 @@ class NodeManagerDaemonActor(actor_class):
         return states + pykka.get_all(proxy_states)
 
     def _update_tracker(self):
-        updates = {"nodes_"+k: v for k,v in self._state_counts(None).items()}
-        updates['timestamp'] = time.strftime(cnode.ARVADOS_TIMEFMT, time.gmtime())
-        updates['nodes_wish'] = len(self.last_wishlist)
-        updates['nodes_max'] = self.max_nodes
-        updates['nodes_quota'] = self.node_quota
-        updates['nodes_wish'] = len(self.last_wishlist)
-        updates['status'] = "OK"
+        updates = {}
+        try:
+            updates['nodes_wish'] = len(self.last_wishlist)
+            updates['nodes_max'] = self.max_nodes
+            updates['nodes_quota'] = self.node_quota
+            updates['nodes_wish'] = len(self.last_wishlist)
+            updates.update({"nodes_"+k: v for k,v in self._state_counts(None).items()})
+            status.tracker.report_ok(self._logger.name)
+        except:
+            self._logger.exception("while updating tracker")
+            status.tracker.report_error(self._logger.name)
+
         status.tracker.update(updates)
-        status.tracker_full.update(updates)
 
-        updates = {}
-        for size in self.server_calculator.cloud_sizes:
-            updates["size_"+size.name] = {"nodes_"+k: v for k,v in self._state_counts(size).items()}
-            for attr in ['id', 'name', 'ram', 'disk', 'bandwidth', 'price']:
-                updates["size_"+size.name][attr] = getattr(size, attr)
+        try:
+            for size in self.server_calculator.cloud_sizes:
+                updates["size_"+size.name] = {"nodes_"+k: v for k,v in self._state_counts(size).items()}
+                for attr in ['id', 'name', 'ram', 'disk', 'bandwidth', 'price']:
+                    updates["size_"+size.name][attr] = getattr(size, attr)
+                status.tracker_full.report_ok(self._logger.name)
+        except:
+            self._logger.exception("while updating tracker")
+            status.tracker_full.report_error(self._logger.name)
+
         status.tracker_full.update(updates)
 
     def _state_counts(self, size):
@@ -359,10 +368,8 @@ class NodeManagerDaemonActor(actor_class):
                     self._later.stop_booting_node(size)
             except Exception as e:
                 self._logger.exception("while calculating nodes wanted for size %s", getattr(size, "id", "(id not available)"))
-        try:
-            self._update_tracker()
-        except:
-            self._logger.exception("while updating tracker")
+
+        self._update_tracker()
 
     def _check_poll_freshness(orig_func):
         """Decorator to inhibit a method when poll information is stale.
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 2529a9e..f9a681a 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -13,7 +13,7 @@ import pykka
 import libcloud
 
 from . import config as nmconfig
-from . import status
+from . import status, ARVADOS_TIMEFMT
 from .baseactor import WatchdogActor
 from .daemon import NodeManagerDaemonActor
 from .jobqueue import JobQueueMonitorActor, ServerCalculator
@@ -124,6 +124,7 @@ def main(args=None):
     updates['hostname'] = socket.getfqdn()
     updates['servicetype'] = "arvados_nodemanager"
     updates['version'] = __version__
+    updates['started_at'] = time.strftime(ARVADOS_TIMEFMT, time.gmtime())
     status.tracker.update(updates)
     status.tracker_full.update(updates)
 
diff --git a/services/nodemanager/arvnodeman/status.py b/services/nodemanager/arvnodeman/status.py
index 23dda47..e5949ae 100644
--- a/services/nodemanager/arvnodeman/status.py
+++ b/services/nodemanager/arvnodeman/status.py
@@ -6,6 +6,9 @@ import json
 import logging
 import socketserver
 import threading
+import time
+
+from . import ARVADOS_TIMEFMT
 
 _logger = logging.getLogger('status.Handler')
 
@@ -52,7 +55,7 @@ class Handler(http.server.BaseHTTPRequestHandler, object):
 class Tracker(object):
     def __init__(self):
         self._mtx = threading.Lock()
-        self._latest = {}
+        self._latest = {"error_count": 0}
 
     def get_json(self):
         with self._mtx:
@@ -64,7 +67,32 @@ class Tracker(object):
 
     def update(self, updates):
         with self._mtx:
+            self._latest['timestamp'] = time.strftime(ARVADOS_TIMEFMT, time.gmtime())
             self._latest.update(updates)
+            st = "OK"
+            for k,v in self._latest.iteritems():
+                if k.startswith("status_") and v != "OK":
+                    st = v
+                    break
+            self._latest["status"] = st
+
+    def report_ok(self, name):
+        self.update({
+            "status_"+name: "OK",
+            "exception_"+name: None
+        })
+
+    def report_error(self, name, type=None, value=None, tb=None):
+        if type is not None:
+            msg = "\n".join(traceback.format_exception(type, value, tb))
+        else:
+            msg = traceback.format_exc()
+        with self._mtx:
+            self._latest["error_count"] += 1
+        self.update({
+            "status_"+name: "WARNING",
+            "exception_"+name: msg
+        })
 
 
 tracker = Tracker()

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list