[ARVADOS] created: 8709ef45c968e79b5e5978759484b994d4789aea
git at public.curoverse.com
Thu Feb 4 22:06:47 EST 2016
at 8709ef45c968e79b5e5978759484b994d4789aea (commit)
commit 8709ef45c968e79b5e5978759484b994d4789aea
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Feb 4 18:46:31 2016 -0500
7667: Node manager bug fixes and logging improvements.
* ComputeNodeSetupActor will now finish if there is an unhandled exception.
* ComputeNodeMonitorActor now explains why a node that is in the shutdown window
is not eligible for shutdown.
* Logging in nodes_wanted now distinguishes idle/busy/booting/shutting down.
* Actor log names are now the class name plus a portion of the actor URN, so actions
of a specific actor can be consistently identified.
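A minimal sketch of the naming scheme the last bullet describes (illustrative, not the committed code): Pykka actor URNs look like "urn:uuid:<36-character UUID>", so slicing at offset 33 keeps just the final 12 hex digits as a short per-actor tag.
import logging
import uuid

class FakeActor(object):
    """Stand-in for a Pykka actor; real actors get actor_urn from Pykka."""
    def __init__(self):
        # Pykka URNs look like "urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";
        # the "urn:uuid:" prefix is 9 characters, the whole URN is 45.
        self.actor_urn = uuid.uuid4().urn

    def _set_logger(self):
        # [33:] keeps the last 12 hex digits: short, but unique enough to
        # tell actors apart in the logs.
        self._logger = logging.getLogger(
            "%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))

logging.basicConfig(level=logging.DEBUG)
actor = FakeActor()
actor._set_logger()
actor._logger.debug("subscribed")  # e.g. DEBUG:FakeActor.3f9e2a7c51b4:subscribed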
diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
index 5e5354a..e5534c5 100644
--- a/services/nodemanager/arvnodeman/clientactor.py
+++ b/services/nodemanager/arvnodeman/clientactor.py
@@ -58,14 +58,14 @@ class RemotePollLoopActor(actor_class):
def subscribe(self, subscriber):
self.all_subscribers.add(subscriber)
- self._logger.debug("%r subscribed to all events", subscriber)
+ self._logger.debug("%s subscribed to all events", subscriber.actor_ref.actor_urn)
self._start_polling()
# __init__ exposes this method to the proxy if the subclass defines
# _item_key.
def _subscribe_to(self, key, subscriber):
self.key_subscribers.setdefault(key, set()).add(subscriber)
- self._logger.debug("%r subscribed to events for '%s'", subscriber, key)
+ self._logger.debug("%s subscribed to events for '%s'", subscriber.actor_ref.actor_urn, key)
self._start_polling()
def _send_request(self):
@@ -82,8 +82,8 @@ class RemotePollLoopActor(actor_class):
def _got_error(self, error):
self.poll_wait = min(self.poll_wait * 2, self.max_poll_wait)
- return "{} got error: {} - waiting {} seconds".format(
- self.log_prefix, error, self.poll_wait)
+ return "got error: {} - will try again in {} seconds".format(
+ error, self.poll_wait)
def is_common_error(self, exception):
return False
@@ -105,6 +105,7 @@ class RemotePollLoopActor(actor_class):
else:
self._got_response(response)
next_poll = scheduled_start + self.poll_wait
+ self._logger.info("request took %s seconds", (time.time() - scheduled_start))
end_time = time.time()
if next_poll < end_time: # We've drifted too much; start fresh.
next_poll = end_time + self.poll_wait
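The backoff and drift handling in this file reduces to a few lines; a standalone sketch of the same logic (names are illustrative, the real code lives on the actor):
import time

class PollSchedule(object):
    """Illustrative model of RemotePollLoopActor's scheduling."""
    def __init__(self, poll_wait=60, max_poll_wait=180):
        self.min_poll_wait = poll_wait
        self.max_poll_wait = max_poll_wait
        self.poll_wait = poll_wait

    def on_error(self):
        # Double the wait after each failure, capped at max_poll_wait.
        self.poll_wait = min(self.poll_wait * 2, self.max_poll_wait)

    def on_success(self):
        # A good response resets the backoff.
        self.poll_wait = self.min_poll_wait

    def next_poll(self, scheduled_start):
        # Keep a fixed cadence from the scheduled start, but if the request
        # ran long enough that we have drifted past it, start fresh from now.
        next_poll = scheduled_start + self.poll_wait
        end_time = time.time()
        if next_poll < end_time:
            next_poll = end_time + self.poll_wait
        return next_poll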
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index 3921e09..54d6a82 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -76,6 +76,9 @@ class RetryMixin(object):
if not (isinstance(error, errors) or
self._cloud.is_cloud_exception(error)):
self.retry_wait = self.min_retry_wait
+ self._logger.warning(
+ "Re-raising unknown error (no retry): %s",
+ error, exc_info=error)
raise
self._logger.warning(
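The policy this hunk adds logging to can be sketched as a decorator: known API or cloud errors are retried with backoff, anything unexpected is logged and re-raised. A hedged sketch (the real RetryMixin reschedules through a timer actor instead of sleeping, and lives on the actor class):
import functools
import logging
import time

def retry_known_errors(known_errors, is_cloud_error=lambda e: False,
                       min_wait=1, max_wait=180):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            wait = min_wait
            while True:
                try:
                    return func(*args, **kwargs)
                except Exception as error:
                    if not (isinstance(error, known_errors) or
                            is_cloud_error(error)):
                        # Unknown errors are logged and re-raised, not retried.
                        logging.warning(
                            "Re-raising unknown error (no retry): %s", error)
                        raise
                    logging.warning("Retrying in %s seconds: %s", wait, error)
                    time.sleep(wait)
                    wait = min(wait * 2, max_wait)
        return wrapper
    return decorator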
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 45468c3..17d0463 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -24,25 +24,22 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
def __init__(self, cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait):
super(ComputeNodeStateChangeBase, self).__init__()
+ RetryMixin.__init__(self, retry_wait, max_retry_wait,
+ None, cloud_client, timer_actor)
self._later = self.actor_ref.proxy()
self._arvados = arvados_client
self.subscribers = set()
def _set_logger(self):
- self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[9:]))
+ self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
def on_start(self):
self._set_logger()
- RetryMixin.__init__(self,
- retry_wait,
- max_retry_wait,
- self._logger,
- cloud_client,
- timer_actor)
def _finished(self):
_notify_subscribers(self._later, self.subscribers)
self.subscribers = None
+ self._logger.info("finished")
def subscribe(self, subscriber):
if self.subscribers is None:
@@ -66,6 +63,17 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
'last_action': explanation}},
).execute()
+ @staticmethod
+ def _finish_on_exception(orig_func):
+ @functools.wraps(orig_func)
+ def finish_wrapper(self, *args, **kwargs):
+ try:
+ return orig_func(self, *args, **kwargs)
+ except Exception as error:
+ self._logger.error("Actor error %s", error)
+ self._finished()
+ return finish_wrapper
+
class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
"""Actor to create and set up a cloud compute node.
@@ -90,20 +98,23 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
else:
self._later.prepare_arvados_node(arvados_node)
+ @ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def create_arvados_node(self):
self.arvados_node = self._arvados.nodes().create(body={}).execute()
self._later.create_cloud_node()
+ @ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def prepare_arvados_node(self, node):
self.arvados_node = self._clean_arvados_node(
node, "Prepared by Node Manager")
self._later.create_cloud_node()
+ @ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry()
def create_cloud_node(self):
- self._logger.info("Creating cloud node of size %s.",
+ self._logger.info("Sending create_node request for node size %s.",
self.cloud_size.name)
self.cloud_node = self._cloud.create_node(self.cloud_size,
self.arvados_node)
@@ -112,6 +123,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
self._logger.info("Cloud node %s created.", self.cloud_node.id)
self._later.update_arvados_node_properties()
+ @ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def update_arvados_node_properties(self):
"""Tell Arvados some details about the cloud node.
@@ -175,7 +187,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
self.success = None
def _set_logger(self):
- self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.id))
+ self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
def on_start(self):
super(ComputeNodeShutdownActor, self).on_start()
@@ -199,30 +211,32 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
@functools.wraps(orig_func)
def stop_wrapper(self, *args, **kwargs):
if (self.cancellable and
- (not self._monitor.shutdown_eligible().get())):
+ (self._monitor.shutdown_eligible().get() is not True)):
self._later.cancel_shutdown(self.WINDOW_CLOSED)
return None
else:
return orig_func(self, *args, **kwargs)
return stop_wrapper
+ @ComputeNodeStateChangeBase._finish_on_exception
@_stop_if_window_closed
@RetryMixin._retry()
def shutdown_node(self):
- self._logger.info("Starting shutdown", self.cloud_node.id)
+ self._logger.info("Starting shutdown")
if not self._cloud.destroy_node(self.cloud_node):
if self._cloud.broken(self.cloud_node):
self._later.cancel_shutdown(self.NODE_BROKEN)
else:
# Force a retry.
raise cloud_types.LibcloudError("destroy_node failed")
- self._logger.info("Shutdown success", self.cloud_node.id)
+ self._logger.info("Shutdown success")
arv_node = self._arvados_node()
if arv_node is None:
self._finished(success_flag=True)
else:
self._later.clean_arvados_node(arv_node)
+ @ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def clean_arvados_node(self, arvados_node):
self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
@@ -310,10 +324,11 @@ class ComputeNodeMonitorActor(config.actor_class):
self._later.consider_shutdown()
def _set_logger(self):
- self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.name))
+ self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
def on_start(self):
self._set_logger()
+ self._timer.schedule(self.cloud_node_start_time + self.boot_fail_after, self._later.consider_shutdown)
def subscribe(self, subscriber):
self.subscribers.add(subscriber)
@@ -341,12 +356,18 @@ class ComputeNodeMonitorActor(config.actor_class):
return result
def shutdown_eligible(self):
+ """Return True if eligible for shutdown, or a string explaining why the node
+ is not eligible for shutdown."""
+
if not self._shutdowns.window_open():
- return False
+ return "shutdown window is not open."
if self.arvados_node is None:
# Node is unpaired.
# If it hasn't pinged Arvados after boot_fail seconds, shut it down
- return not timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after)
+ if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
+ return "node is still booting, will be considered a failed boot at %s" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.cloud_node_start_time + self.boot_fail_after))
+ else:
+ return True
missing = arvados_node_missing(self.arvados_node, self.node_stale_after)
if missing and self._cloud.broken(self.cloud_node):
# Node is paired, but Arvados says it is missing and the cloud says the node
@@ -357,21 +378,27 @@ class ComputeNodeMonitorActor(config.actor_class):
"cloud reports broken node, but paired node %s never pinged "
"(bug?) -- skipped check for node_stale_after",
self.arvados_node['uuid'])
- return self.in_state('idle')
+ if self.in_state('idle'):
+ return True
+ else:
+ return "node is not idle."
def consider_shutdown(self):
- next_opening = self._shutdowns.next_opening()
- if self.shutdown_eligible():
- self._debug("Suggesting shutdown.", self.cloud_node.id)
- _notify_subscribers(self._later, self.subscribers)
- elif self._shutdowns.window_open():
- self._debug("Shutdown window open but node busy.",
- self.cloud_node.id)
- elif self.last_shutdown_opening != next_opening:
- self._debug("Shutdown window closed. Next at %s.",
- self.cloud_node.id, time.ctime(next_opening))
- self._timer.schedule(next_opening, self._later.consider_shutdown)
- self.last_shutdown_opening = next_opening
+ try:
+ next_opening = self._shutdowns.next_opening()
+ eligible = self.shutdown_eligible()
+ if eligible is True:
+ self._debug("Suggesting shutdown.")
+ _notify_subscribers(self._later, self.subscribers)
+ elif self._shutdowns.window_open():
+ self._debug("Cannot shut down because %s", eligible)
+ elif self.last_shutdown_opening != next_opening:
+ self._debug("Shutdown window closed. Next at %s.",
+ time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
+ self._timer.schedule(next_opening, self._later.consider_shutdown)
+ self.last_shutdown_opening = next_opening
+ except Exception:
+ self._logger.exception("Unexpected exception")
def offer_arvados_pair(self, arvados_node):
first_ping_s = arvados_node.get('first_ping_at')
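Note why _stop_if_window_closed above switched from "not ...get()" to an explicit "is not True" comparison: shutdown_eligible() now returns either True or a human-readable reason string, and a non-empty string is truthy. A tiny demonstration of the pitfall:
def naive_check(eligible):
    return "shut down" if eligible else "cancel"

def correct_check(eligible):
    # Only the exact value True means "go ahead"; anything else is a reason
    # string explaining why the node must stay up.
    return "shut down" if eligible is True else "cancel: %s" % eligible

print(naive_check("node is not idle."))    # "shut down" -- wrong!
print(correct_check("node is not idle."))  # "cancel: node is not idle."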
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 11cbc99..c98c95a 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -30,6 +30,10 @@ class BaseComputeNodeDriver(RetryMixin):
def _create_driver(self, driver_class, **auth_kwargs):
return driver_class(**auth_kwargs)
+ @RetryMixin._retry()
+ def _set_sizes(self):
+ self.sizes = {sz.id: sz for sz in self.real.list_sizes()}
+
def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
driver_class, retry_wait=1, max_retry_wait=180):
"""Base initializer for compute node drivers.
@@ -46,7 +50,7 @@ class BaseComputeNodeDriver(RetryMixin):
"""
super(BaseComputeNodeDriver, self).__init__(retry_wait, max_retry_wait,
- logging.getLogger(str(type(self))),
+ logging.getLogger(self.__class__.__name__),
type(self),
None)
self.real = self._create_driver(driver_class, **auth_kwargs)
@@ -65,7 +69,7 @@ class BaseComputeNodeDriver(RetryMixin):
if new_pair is not None:
self.create_kwargs[new_pair[0]] = new_pair[1]
- self.sizes = {sz.id: sz for sz in self.real.list_sizes()}
+ self._set_sizes()
def _init_ping_host(self, ping_host):
self.ping_host = ping_host
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 42ee4db..5616376 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -142,10 +142,10 @@ class NodeManagerDaemonActor(actor_class):
self.booting = {} # Actor IDs to ComputeNodeSetupActors
self.booted = {} # Cloud node IDs to _ComputeNodeRecords
self.shutdowns = {} # Cloud node IDs to ComputeNodeShutdownActors
- self._logger.debug("Daemon initialized")
def on_start(self):
- self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
+ self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
+ self._logger.debug("Daemon started")
def _update_poll_time(self, poll_key):
self.last_polls[poll_key] = time.time()
@@ -216,15 +216,19 @@ class NodeManagerDaemonActor(actor_class):
self._pair_nodes(cloud_rec, arv_node)
break
- def _nodes_up(self, size):
- up = 0
- up += sum(1
- for c in self.booting.itervalues()
- if size is None or c.cloud_size.get().id == size.id)
- up += sum(1
+ def _nodes_booting(self, size):
+ return sum(1
+ for c in self.booting.itervalues()
+ if size is None or c.cloud_size.get().id == size.id)
+
+ def _nodes_booted(self, size):
+ return sum(1
for i in (self.booted, self.cloud_nodes.nodes)
for c in i.itervalues()
if size is None or c.cloud_node.size.id == size.id)
+
+ def _nodes_up(self, size):
+ up = self._nodes_booting(size) + self._nodes_booted(size)
return up
def _total_price(self):
@@ -274,11 +278,21 @@ class NodeManagerDaemonActor(actor_class):
elif under_min > 0 and size.id == self.min_cloud_size.id:
return under_min
- up_count = self._nodes_up(size) - (self._size_shutdowns(size) +
- self._nodes_busy(size) +
- self._nodes_missing(size))
-
- self._logger.debug("%s: idle nodes %i, wishlist size %i", size.name, up_count, self._size_wishlist(size))
+ try:
+ booting_count = self._nodes_booting(size) + sum(1 for _ in self.cloud_nodes.unpaired())
+ shutdown_count = self._size_shutdowns(size)
+ busy_count = self._nodes_busy(size)
+ up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
+
+ self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
+ self._size_wishlist(size),
+ up_count + busy_count,
+ booting_count,
+ up_count - booting_count,
+ busy_count,
+ shutdown_count)
+ except Exception as e:
+ self._logger.exception("whoops")
wanted = self._size_wishlist(size) - up_count
if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
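The arithmetic behind the new log line reduces to the following simplified sketch (in the committed code the booting count also includes unpaired cloud nodes, which is omitted here):
def wishlist_counts(booting, booted, busy, shutting_down, missing, wishlist):
    """Derive the numbers in the 'wishlist ... up ... shutting down' line."""
    idle = (booting + booted) - (shutting_down + busy + missing)  # "up_count"
    return {
        "wishlist": wishlist,
        "up": idle + busy,
        "booting": booting,
        "idle": idle - booting,
        "busy": busy,
        "shutting down": shutting_down,
        "wanted": wishlist - idle,  # what the daemon will try to boot
    }

print(wishlist_counts(booting=1, booted=4, busy=2, shutting_down=1,
                      missing=0, wishlist=3))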
@@ -329,7 +343,7 @@ class NodeManagerDaemonActor(actor_class):
if nodes_wanted < 1:
return None
arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
- self._logger.info("Want %s more nodes. Booting a %s node.",
+ self._logger.info("Want %i more %s nodes. Booting a node.",
nodes_wanted, cloud_size.name)
new_setup = self._node_setup.start(
timer_actor=self._timer,
@@ -352,12 +366,13 @@ class NodeManagerDaemonActor(actor_class):
cloud_node = setup_proxy.cloud_node.get()
del self.booting[setup_proxy.actor_ref.actor_urn]
setup_proxy.stop()
- record = self.cloud_nodes.get(cloud_node.id)
- if record is None:
- record = self._new_node(cloud_node)
- self.booted[cloud_node.id] = record
- self._timer.schedule(time.time() + self.boot_fail_after,
- self._later.shutdown_unpaired_node, cloud_node.id)
+ if cloud_node is not None:
+ record = self.cloud_nodes.get(cloud_node.id)
+ if record is None:
+ record = self._new_node(cloud_node)
+ self.booted[cloud_node.id] = record
+ self._timer.schedule(time.time() + self.boot_fail_after,
+ self._later.shutdown_unpaired_node, cloud_node.id)
@_check_poll_freshness
def stop_booting_node(self, size):
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 1ee7928..c8b3d19 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -71,7 +71,7 @@ def launch_pollers(config, server_calculator):
timer = TimedCallBackActor.start(poll_time / 10.0).proxy()
cloud_node_poller = CloudNodeListMonitorActor.start(
- config.new_cloud_client(timer), timer, poll_time, max_poll_time).proxy()
+ config.new_cloud_client(), timer, poll_time, max_poll_time).proxy()
arvados_node_poller = ArvadosNodeListMonitorActor.start(
config.new_arvados_client(), timer, poll_time, max_poll_time).proxy()
job_queue_poller = JobQueueMonitorActor.start(
@@ -103,32 +103,36 @@ def main(args=None):
for sigcode in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
signal.signal(sigcode, shutdown_signal)
- setup_logging(config.get('Logging', 'file'), **config.log_levels())
- node_setup, node_shutdown, node_update, node_monitor = \
- config.dispatch_classes()
- server_calculator = build_server_calculator(config)
- timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
- launch_pollers(config, server_calculator)
- cloud_node_updater = node_update.start(config.new_cloud_client).proxy()
- node_daemon = NodeManagerDaemonActor.start(
- job_queue_poller, arvados_node_poller, cloud_node_poller,
- cloud_node_updater, timer,
- config.new_arvados_client, config.new_cloud_client,
- config.shutdown_windows(),
- server_calculator,
- config.getint('Daemon', 'min_nodes'),
- config.getint('Daemon', 'max_nodes'),
- config.getint('Daemon', 'poll_stale_after'),
- config.getint('Daemon', 'boot_fail_after'),
- config.getint('Daemon', 'node_stale_after'),
- node_setup, node_shutdown, node_monitor,
- max_total_price=config.getfloat('Daemon', 'max_total_price')).proxy()
-
- signal.pause()
- daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
- while not daemon_stopped():
- time.sleep(1)
- pykka.ActorRegistry.stop_all()
+ try:
+ setup_logging(config.get('Logging', 'file'), **config.log_levels())
+ node_setup, node_shutdown, node_update, node_monitor = \
+ config.dispatch_classes()
+ server_calculator = build_server_calculator(config)
+ timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
+ launch_pollers(config, server_calculator)
+ cloud_node_updater = node_update.start(config.new_cloud_client).proxy()
+ node_daemon = NodeManagerDaemonActor.start(
+ job_queue_poller, arvados_node_poller, cloud_node_poller,
+ cloud_node_updater, timer,
+ config.new_arvados_client, config.new_cloud_client,
+ config.shutdown_windows(),
+ server_calculator,
+ config.getint('Daemon', 'min_nodes'),
+ config.getint('Daemon', 'max_nodes'),
+ config.getint('Daemon', 'poll_stale_after'),
+ config.getint('Daemon', 'boot_fail_after'),
+ config.getint('Daemon', 'node_stale_after'),
+ node_setup, node_shutdown, node_monitor,
+ max_total_price=config.getfloat('Daemon', 'max_total_price')).proxy()
+
+ signal.pause()
+ daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
+ while not daemon_stopped():
+ time.sleep(1)
+ except Exception:
+ logging.exception("Uncaught exception during setup")
+ finally:
+ pykka.ActorRegistry.stop_all()
if __name__ == '__main__':
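The shape of this change is easier to see without the actor plumbing: whatever fails during setup, stop_all() in the finally block reaps every registered actor, so stray actor threads cannot keep the process alive. A minimal sketch (the Noop actor is illustrative; the real main() starts the pollers and the daemon):
import logging
import pykka

class Noop(pykka.ThreadingActor):
    pass

def main():
    try:
        Noop.start()  # stands in for the pollers and daemon actors
        raise RuntimeError("simulated setup failure")
    except Exception:
        logging.exception("Uncaught exception during setup")
    finally:
        pykka.ActorRegistry.stop_all()  # stops every registered actor

if __name__ == '__main__':
    main()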
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 26f49a3..f1a661e 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -36,4 +36,5 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
return node.id
def _send_request(self):
- return self._client.list_nodes()
+ n = self._client.list_nodes()
+ return n
commit e9901a3f94eab93f9482b7cb3d7e2c1b50216f08
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Feb 3 17:51:46 2016 -0500
Try to make logging identify the actor consistently
diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
index 6319f4b..5e5354a 100644
--- a/services/nodemanager/arvnodeman/clientactor.py
+++ b/services/nodemanager/arvnodeman/clientactor.py
@@ -38,10 +38,8 @@ class RemotePollLoopActor(actor_class):
super(RemotePollLoopActor, self).__init__()
self._client = client
self._timer = timer_actor
- self._logger = logging.getLogger(self.LOGGER_NAME)
self._later = self.actor_ref.proxy()
self._polling_started = False
- self.log_prefix = "{} (at {})".format(self.__class__.__name__, id(self))
self.min_poll_wait = poll_wait
self.max_poll_wait = max_poll_wait
self.poll_wait = self.min_poll_wait
@@ -50,6 +48,9 @@ class RemotePollLoopActor(actor_class):
if hasattr(self, '_item_key'):
self.subscribe_to = self._subscribe_to
+ def on_start(self):
+ self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
+
def _start_polling(self):
if not self._polling_started:
self._polling_started = True
@@ -71,8 +72,7 @@ class RemotePollLoopActor(actor_class):
raise NotImplementedError("subclasses must implement request method")
def _got_response(self, response):
- self._logger.debug("%s got response with %d items",
- self.log_prefix, len(response))
+ self._logger.debug("got response with %d items", len(response))
self.poll_wait = self.min_poll_wait
_notify_subscribers(response, self.all_subscribers)
if hasattr(self, '_item_key'):
@@ -89,7 +89,7 @@ class RemotePollLoopActor(actor_class):
return False
def poll(self, scheduled_start=None):
- self._logger.debug("%s sending poll", self.log_prefix)
+ self._logger.debug("sending request")
start_time = time.time()
if scheduled_start is None:
scheduled_start = start_time
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 8c983c1..45468c3 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -21,18 +21,24 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
This base class takes care of retrying changes and notifying
subscribers when the change is finished.
"""
- def __init__(self, logger_name, cloud_client, arvados_client, timer_actor,
+ def __init__(self, cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait):
super(ComputeNodeStateChangeBase, self).__init__()
+ self._later = self.actor_ref.proxy()
+ self._arvados = arvados_client
+ self.subscribers = set()
+
+ def _set_logger(self):
+ self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[9:]))
+
+ def on_start(self):
+ self._set_logger()
RetryMixin.__init__(self,
retry_wait,
max_retry_wait,
- logging.getLogger(logger_name),
+ self._logger,
cloud_client,
timer_actor)
- self._later = self.actor_ref.proxy()
- self._arvados = arvados_client
- self.subscribers = set()
def _finished(self):
_notify_subscribers(self._later, self.subscribers)
@@ -74,7 +80,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
cloud_size, arvados_node=None,
retry_wait=1, max_retry_wait=180):
super(ComputeNodeSetupActor, self).__init__(
- 'arvnodeman.nodeup', cloud_client, arvados_client, timer_actor,
+ cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
self.cloud_size = cloud_size
self.arvados_node = None
@@ -97,7 +103,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
@RetryMixin._retry()
def create_cloud_node(self):
- self._logger.info("Creating cloud node with size %s.",
+ self._logger.info("Creating cloud node of size %s.",
self.cloud_size.name)
self.cloud_node = self._cloud.create_node(self.cloud_size,
self.arvados_node)
@@ -160,7 +166,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
# eligible. Normal shutdowns based on job demand should be
# cancellable; shutdowns based on node misbehavior should not.
super(ComputeNodeShutdownActor, self).__init__(
- 'arvnodeman.nodedown', cloud_client, arvados_client, timer_actor,
+ cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
self._monitor = node_monitor.proxy()
self.cloud_node = self._monitor.cloud_node.get()
@@ -168,7 +174,11 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
self.cancel_reason = None
self.success = None
+ def _set_logger(self):
+ self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.id))
+
def on_start(self):
+ super(ComputeNodeShutdownActor, self).on_start()
self._later.shutdown_node()
def _arvados_node(self):
@@ -181,7 +191,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
def cancel_shutdown(self, reason):
self.cancel_reason = reason
- self._logger.info("Cloud node %s shutdown cancelled: %s.",
+ self._logger.info("Shutdown cancelled: %s.",
self.cloud_node.id, reason)
self._finished(success_flag=False)
@@ -199,13 +209,14 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
@_stop_if_window_closed
@RetryMixin._retry()
def shutdown_node(self):
+ self._logger.info("Starting shutdown", self.cloud_node.id)
if not self._cloud.destroy_node(self.cloud_node):
if self._cloud.broken(self.cloud_node):
self._later.cancel_shutdown(self.NODE_BROKEN)
else:
# Force a retry.
raise cloud_types.LibcloudError("destroy_node failed")
- self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
+ self._logger.info("Shutdown success", self.cloud_node.id)
arv_node = self._arvados_node()
if arv_node is None:
self._finished(success_flag=True)
@@ -281,7 +292,6 @@ class ComputeNodeMonitorActor(config.actor_class):
):
super(ComputeNodeMonitorActor, self).__init__()
self._later = self.actor_ref.proxy()
- self._logger = logging.getLogger('arvnodeman.computenode')
self._last_log = None
self._shutdowns = shutdown_timer
self._cloud_node_fqdn = cloud_fqdn_func
@@ -299,6 +309,12 @@ class ComputeNodeMonitorActor(config.actor_class):
self.last_shutdown_opening = None
self._later.consider_shutdown()
+ def _set_logger(self):
+ self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.name))
+
+ def on_start(self):
+ self._set_logger()
+
def subscribe(self, subscriber):
self.subscribers.add(subscriber)
@@ -346,13 +362,13 @@ class ComputeNodeMonitorActor(config.actor_class):
def consider_shutdown(self):
next_opening = self._shutdowns.next_opening()
if self.shutdown_eligible():
- self._debug("Node %s suggesting shutdown.", self.cloud_node.id)
+ self._debug("Suggesting shutdown.", self.cloud_node.id)
_notify_subscribers(self._later, self.subscribers)
elif self._shutdowns.window_open():
- self._debug("Node %s shutdown window open but node busy.",
+ self._debug("Shutdown window open but node busy.",
self.cloud_node.id)
elif self.last_shutdown_opening != next_opening:
- self._debug("Node %s shutdown window closed. Next at %s.",
+ self._debug("Shutdown window closed. Next at %s.",
self.cloud_node.id, time.ctime(next_opening))
self._timer.schedule(next_opening, self._later.consider_shutdown)
self.last_shutdown_opening = next_opening
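One plausible reading of why logger setup moves into on_start, sketched below: the shutdown actor only learns its cloud node after the base __init__ runs, so a logger built once in the base __init__ could never include the node id. Deferring to an overridable _set_logger hook at start time lets each subclass add its own context (cloud_node_id here is a stand-in attribute, not the real field name):
import logging

class BaseActor(object):
    def _set_logger(self):
        self._logger = logging.getLogger(
            "%s.%s" % (self.__class__.__name__, self.actor_urn[9:]))

    def on_start(self):
        # Runs after __init__, once subclass attributes are set, so the
        # subclass override below is what actually takes effect.
        self._set_logger()

class ShutdownActor(BaseActor):
    def _set_logger(self):
        # Shutdown and monitor actors append the node id for traceability.
        self._logger = logging.getLogger(
            "%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:],
                          self.cloud_node_id))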
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 43f61c7..4d70436 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -22,6 +22,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
self._nodename = None
return super(ComputeNodeShutdownActor, self).on_start()
else:
+ self._set_logger()
self._nodename = arv_node['hostname']
self._logger.info("Draining SLURM node %s", self._nodename)
self._later.issue_slurm_drain()
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 243d3bf..42ee4db 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -121,7 +121,6 @@ class NodeManagerDaemonActor(actor_class):
self._new_arvados = arvados_factory
self._new_cloud = cloud_factory
self._cloud_driver = self._new_cloud()
- self._logger = logging.getLogger('arvnodeman.daemon')
self._later = self.actor_ref.proxy()
self.shutdown_windows = shutdown_windows
self.server_calculator = server_calculator
@@ -145,6 +144,9 @@ class NodeManagerDaemonActor(actor_class):
self.shutdowns = {} # Cloud node IDs to ComputeNodeShutdownActors
self._logger.debug("Daemon initialized")
+ def on_start(self):
+ self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
+
def _update_poll_time(self, poll_key):
self.last_polls[poll_key] = time.time()
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index e0f0a5b..87cf738 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -102,7 +102,6 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
"""
CLIENT_ERRORS = ARVADOS_ERRORS
- LOGGER_NAME = 'arvnodeman.jobqueue'
def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
super(JobQueueMonitorActor, self).__init__(
@@ -114,6 +113,6 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
def _got_response(self, queue):
server_list = self._calculator.servers_for_queue(queue)
- self._logger.debug("Sending server wishlist: %s",
+ self._logger.debug("Calculated wishlist: %s",
', '.join(s.name for s in server_list) or "(empty)")
return super(JobQueueMonitorActor, self)._got_response(server_list)
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 83dd93f..26f49a3 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -11,7 +11,6 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
This actor regularly polls the list of Arvados node records, and
sends it to subscribers.
"""
- LOGGER_NAME = 'arvnodeman.arvados_nodes'
def is_common_error(self, exception):
return isinstance(exception, config.ARVADOS_ERRORS)
@@ -29,7 +28,6 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
This actor regularly polls the cloud to get a list of running compute
nodes, and sends it to subscribers.
"""
- LOGGER_NAME = 'arvnodeman.cloud_nodes'
def is_common_error(self, exception):
return self._client.is_cloud_exception(exception)
-----------------------------------------------------------------------