[ARVADOS] updated: f178db3f1a81cb5f4f155baf385ef889e0911a79
Git user
git at public.curoverse.com
Tue Jun 13 15:04:58 EDT 2017
Summary of changes:
services/nodemanager/arvnodeman/__init__.py | 1 +
services/nodemanager/arvnodeman/baseactor.py | 3 ++
services/nodemanager/arvnodeman/clientactor.py | 3 ++
.../nodemanager/arvnodeman/computenode/__init__.py | 2 +-
.../arvnodeman/computenode/dispatch/__init__.py | 2 ++
services/nodemanager/arvnodeman/daemon.py | 41 +++++++++++++---------
services/nodemanager/arvnodeman/launcher.py | 3 +-
services/nodemanager/arvnodeman/status.py | 30 +++++++++++++++-
8 files changed, 65 insertions(+), 20 deletions(-)
via f178db3f1a81cb5f4f155baf385ef889e0911a79 (commit)
from 1227ea2b5795e34a75c62cb9eae91d46ef7cfb6a (commit)
The revisions listed above that are new to this repository have
not appeared in any other notification email, so they are listed
in full below.
commit f178db3f1a81cb5f4f155baf385ef889e0911a79
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jun 13 12:49:25 2017 -0400
11836: Report "warning" status on exceptions and "ok" status on successful completion of a process loop.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
diff --git a/services/nodemanager/arvnodeman/__init__.py b/services/nodemanager/arvnodeman/__init__.py
index a1ecac7..083b62a 100644
--- a/services/nodemanager/arvnodeman/__init__.py
+++ b/services/nodemanager/arvnodeman/__init__.py
@@ -7,3 +7,4 @@ import logging
logger = logging.getLogger('arvnodeman')
logger.addHandler(logging.NullHandler())
+ARVADOS_TIMEFMT = '%Y-%m-%dT%H:%M:%SZ'
diff --git a/services/nodemanager/arvnodeman/baseactor.py b/services/nodemanager/arvnodeman/baseactor.py
index 68ea97a..db8093d 100644
--- a/services/nodemanager/arvnodeman/baseactor.py
+++ b/services/nodemanager/arvnodeman/baseactor.py
@@ -7,6 +7,7 @@ import signal
import time
import threading
import traceback
+from . import ARVADOS_TIMEFMT
import pykka
@@ -85,6 +86,8 @@ class BaseNodeManagerActor(pykka.ThreadingActor):
exception_type is OSError and exception_value.errno == errno.ENOMEM):
lg.critical("Unhandled exception is a fatal error, killing Node Manager")
os.kill(os.getpid(), signal.SIGKILL)
+ status.tracker.report_error(getattr(lg, "name", "unknown"),
+ exception_type, exception_value, tb)
def ping(self):
return True
diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
index e130749..e920b19 100644
--- a/services/nodemanager/arvnodeman/clientactor.py
+++ b/services/nodemanager/arvnodeman/clientactor.py
@@ -4,6 +4,7 @@ from __future__ import absolute_import, print_function
import logging
import time
+from . import status
import pykka
@@ -94,12 +95,14 @@ class RemotePollLoopActor(actor_class):
scheduled_start = start_time
try:
response = self._send_request()
+ status.tracker.report_ok(self._logger.name)
except Exception as error:
errmsg = self._got_error(error)
if self.is_common_error(error):
self._logger.warning(errmsg)
else:
self._logger.exception(errmsg)
+ status.tracker.report_error(self._logger.name)
next_poll = start_time + self.poll_wait
else:
self._got_response(response)
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index d2c3d0c..79314b1 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -9,9 +9,9 @@ import re
import time
from ..config import CLOUD_ERRORS
+from .. import ARVADOS_TIMEFMT
from libcloud.common.exceptions import BaseHTTPError
-ARVADOS_TIMEFMT = '%Y-%m-%dT%H:%M:%SZ'
ARVADOS_TIMESUBSEC_RE = re.compile(r'(\.\d+)Z$')
def arvados_node_fqdn(arvados_node, default_hostname='dynamic.compute'):
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 633dc9f..60cd76e 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -79,6 +79,7 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
return orig_func(self, *args, **kwargs)
except Exception as error:
self._logger.error("Actor error %s", error)
+ status.tracker_full.report_error(self._logger.name)
self._finished()
return finish_wrapper
@@ -470,6 +471,7 @@ class ComputeNodeMonitorActor(config.actor_class):
self.last_shutdown_opening = next_opening
except Exception:
self._logger.exception("Unexpected exception")
+ status.tracker_full.report_error(self._logger.name)
def offer_arvados_pair(self, arvados_node):
first_ping_s = arvados_node.get('first_ping_at')
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 20cfb37..9397eb8 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -257,21 +257,30 @@ class NodeManagerDaemonActor(actor_class):
return states + pykka.get_all(proxy_states)
def _update_tracker(self):
- updates = {"nodes_"+k: v for k,v in self._state_counts(None).items()}
- updates['timestamp'] = time.strftime(cnode.ARVADOS_TIMEFMT, time.gmtime())
- updates['nodes_wish'] = len(self.last_wishlist)
- updates['nodes_max'] = self.max_nodes
- updates['nodes_quota'] = self.node_quota
- updates['nodes_wish'] = len(self.last_wishlist)
- updates['status'] = "OK"
+ updates = {}
+ try:
+ updates['nodes_wish'] = len(self.last_wishlist)
+ updates['nodes_max'] = self.max_nodes
+ updates['nodes_quota'] = self.node_quota
+ updates['nodes_wish'] = len(self.last_wishlist)
+ updates.update({"nodes_"+k: v for k,v in self._state_counts(None).items()})
+ status.tracker.report_ok(self._logger.name)
+ except:
+ self._logger.exception("while updating tracker")
+ status.tracker.report_error(self._logger.name)
+
status.tracker.update(updates)
- status.tracker_full.update(updates)
- updates = {}
- for size in self.server_calculator.cloud_sizes:
- updates["size_"+size.name] = {"nodes_"+k: v for k,v in self._state_counts(size).items()}
- for attr in ['id', 'name', 'ram', 'disk', 'bandwidth', 'price']:
- updates["size_"+size.name][attr] = getattr(size, attr)
+ try:
+ for size in self.server_calculator.cloud_sizes:
+ updates["size_"+size.name] = {"nodes_"+k: v for k,v in self._state_counts(size).items()}
+ for attr in ['id', 'name', 'ram', 'disk', 'bandwidth', 'price']:
+ updates["size_"+size.name][attr] = getattr(size, attr)
+ status.tracker_full.report_ok(self._logger.name)
+ except:
+ self._logger.exception("while updating tracker")
+ status.tracker_full.report_error(self._logger.name)
+
status.tracker_full.update(updates)
def _state_counts(self, size):
@@ -359,10 +368,8 @@ class NodeManagerDaemonActor(actor_class):
self._later.stop_booting_node(size)
except Exception as e:
self._logger.exception("while calculating nodes wanted for size %s", getattr(size, "id", "(id not available)"))
- try:
- self._update_tracker()
- except:
- self._logger.exception("while updating tracker")
+
+ self._update_tracker()
def _check_poll_freshness(orig_func):
"""Decorator to inhibit a method when poll information is stale.
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 2529a9e..f9a681a 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -13,7 +13,7 @@ import pykka
import libcloud
from . import config as nmconfig
-from . import status
+from . import status, ARVADOS_TIMEFMT
from .baseactor import WatchdogActor
from .daemon import NodeManagerDaemonActor
from .jobqueue import JobQueueMonitorActor, ServerCalculator
@@ -124,6 +124,7 @@ def main(args=None):
updates['hostname'] = socket.getfqdn()
updates['servicetype'] = "arvados_nodemanager"
updates['version'] = __version__
+ updates['started_at'] = time.strftime(ARVADOS_TIMEFMT, time.gmtime())
status.tracker.update(updates)
status.tracker_full.update(updates)
diff --git a/services/nodemanager/arvnodeman/status.py b/services/nodemanager/arvnodeman/status.py
index 23dda47..e5949ae 100644
--- a/services/nodemanager/arvnodeman/status.py
+++ b/services/nodemanager/arvnodeman/status.py
@@ -6,6 +6,9 @@ import json
import logging
import socketserver
import threading
+import time
+
+from . import ARVADOS_TIMEFMT
_logger = logging.getLogger('status.Handler')
@@ -52,7 +55,7 @@ class Handler(http.server.BaseHTTPRequestHandler, object):
class Tracker(object):
def __init__(self):
self._mtx = threading.Lock()
- self._latest = {}
+ self._latest = {"error_count": 0}
def get_json(self):
with self._mtx:
@@ -64,7 +67,32 @@ class Tracker(object):
def update(self, updates):
with self._mtx:
+ self._latest['timestamp'] = time.strftime(ARVADOS_TIMEFMT, time.gmtime())
self._latest.update(updates)
+ st = "OK"
+ for k,v in self._latest.iteritems():
+ if k.startswith("status_") and v != "OK":
+ st = v
+ break
+ self._latest["status"] = st
+
+ def report_ok(self, name):
+ self.update({
+ "status_"+name: "OK",
+ "exception_"+name: None
+ })
+
+ def report_error(self, name, type=None, value=None, tb=None):
+ if type is not None:
+ msg = "\n".join(traceback.format_exception(type, value, tb))
+ else:
+ msg = traceback.format_exc()
+ with self._mtx:
+ self._latest["error_count"] += 1
+ self.update({
+ "status_"+name: "WARNING",
+ "exception_"+name: msg
+ })
tracker = Tracker()
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list