[ARVADOS] created: 8709ef45c968e79b5e5978759484b994d4789aea

git at public.curoverse.com git at public.curoverse.com
Thu Feb 4 22:06:47 EST 2016


        at  8709ef45c968e79b5e5978759484b994d4789aea (commit)


commit 8709ef45c968e79b5e5978759484b994d4789aea
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Feb 4 18:46:31 2016 -0500

    7667: Node manager bug fixes and logging improvements.
    
     * ComputeNodeSetupActor will now finish if there is an unhandled exception.
    
     * ComputeNodeMonitorActor now explains why a node that is in the shutdown window
    is not eligible for shutdown.
    
     * Logging in nodes_wanted now distinguishes idle/busy/booting/shutting down.
    
     * Logging by actors is now class name and a portion of the actor urn, so actions
    of a specific actor can be consistently identified.

diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
index 5e5354a..e5534c5 100644
--- a/services/nodemanager/arvnodeman/clientactor.py
+++ b/services/nodemanager/arvnodeman/clientactor.py
@@ -58,14 +58,14 @@ class RemotePollLoopActor(actor_class):
 
     def subscribe(self, subscriber):
         self.all_subscribers.add(subscriber)
-        self._logger.debug("%r subscribed to all events", subscriber)
+        self._logger.debug("%s subscribed to all events", subscriber.actor_ref.actor_urn)
         self._start_polling()
 
     # __init__ exposes this method to the proxy if the subclass defines
     # _item_key.
     def _subscribe_to(self, key, subscriber):
         self.key_subscribers.setdefault(key, set()).add(subscriber)
-        self._logger.debug("%r subscribed to events for '%s'", subscriber, key)
+        self._logger.debug("%s subscribed to events for '%s'", subscriber.actor_ref.actor_urn, key)
         self._start_polling()
 
     def _send_request(self):
@@ -82,8 +82,8 @@ class RemotePollLoopActor(actor_class):
 
     def _got_error(self, error):
         self.poll_wait = min(self.poll_wait * 2, self.max_poll_wait)
-        return "{} got error: {} - waiting {} seconds".format(
-            self.log_prefix, error, self.poll_wait)
+        return "got error: {} - will try again in {} seconds".format(
+            error, self.poll_wait)
 
     def is_common_error(self, exception):
         return False
@@ -105,6 +105,7 @@ class RemotePollLoopActor(actor_class):
         else:
             self._got_response(response)
             next_poll = scheduled_start + self.poll_wait
+        self._logger.info("request took %s seconds", (time.time() - scheduled_start))
         end_time = time.time()
         if next_poll < end_time:  # We've drifted too much; start fresh.
             next_poll = end_time + self.poll_wait
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index 3921e09..54d6a82 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -76,6 +76,9 @@ class RetryMixin(object):
                         if not (isinstance(error, errors) or
                                 self._cloud.is_cloud_exception(error)):
                             self.retry_wait = self.min_retry_wait
+                            self._logger.warning(
+                                "Re-raising unknown error (no retry): %s",
+                                error, exc_info=error)
                             raise
 
                         self._logger.warning(
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 45468c3..17d0463 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -24,25 +24,22 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
     def __init__(self, cloud_client, arvados_client, timer_actor,
                  retry_wait, max_retry_wait):
         super(ComputeNodeStateChangeBase, self).__init__()
+        RetryMixin.__init__(self, retry_wait, max_retry_wait,
+                            None, cloud_client, timer_actor)
         self._later = self.actor_ref.proxy()
         self._arvados = arvados_client
         self.subscribers = set()
 
     def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[9:]))
+        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
 
     def on_start(self):
         self._set_logger()
-        RetryMixin.__init__(self,
-                            retry_wait,
-                            max_retry_wait,
-                            self._logger,
-                            cloud_client,
-                            timer_actor)
 
     def _finished(self):
         _notify_subscribers(self._later, self.subscribers)
         self.subscribers = None
+        self._logger.info("finished")
 
     def subscribe(self, subscriber):
         if self.subscribers is None:
@@ -66,6 +63,17 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
                            'last_action': explanation}},
             ).execute()
 
+    @staticmethod
+    def _finish_on_exception(orig_func):
+        @functools.wraps(orig_func)
+        def finish_wrapper(self, *args, **kwargs):
+            try:
+                return orig_func(self, *args, **kwargs)
+            except Exception as error:
+                self._logger.error("Actor error %s", error)
+                self._finished()
+        return finish_wrapper
+
 
 class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
     """Actor to create and set up a cloud compute node.
@@ -90,20 +98,23 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
         else:
             self._later.prepare_arvados_node(arvados_node)
 
+    @ComputeNodeStateChangeBase._finish_on_exception
     @RetryMixin._retry(config.ARVADOS_ERRORS)
     def create_arvados_node(self):
         self.arvados_node = self._arvados.nodes().create(body={}).execute()
         self._later.create_cloud_node()
 
+    @ComputeNodeStateChangeBase._finish_on_exception
     @RetryMixin._retry(config.ARVADOS_ERRORS)
     def prepare_arvados_node(self, node):
         self.arvados_node = self._clean_arvados_node(
             node, "Prepared by Node Manager")
         self._later.create_cloud_node()
 
+    @ComputeNodeStateChangeBase._finish_on_exception
     @RetryMixin._retry()
     def create_cloud_node(self):
-        self._logger.info("Creating cloud node of size %s.",
+        self._logger.info("Sending create_node request for node size %s.",
                           self.cloud_size.name)
         self.cloud_node = self._cloud.create_node(self.cloud_size,
                                                   self.arvados_node)
@@ -112,6 +123,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
         self._logger.info("Cloud node %s created.", self.cloud_node.id)
         self._later.update_arvados_node_properties()
 
+    @ComputeNodeStateChangeBase._finish_on_exception
     @RetryMixin._retry(config.ARVADOS_ERRORS)
     def update_arvados_node_properties(self):
         """Tell Arvados some details about the cloud node.
@@ -175,7 +187,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         self.success = None
 
     def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.id))
+        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
 
     def on_start(self):
         super(ComputeNodeShutdownActor, self).on_start()
@@ -199,30 +211,32 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         @functools.wraps(orig_func)
         def stop_wrapper(self, *args, **kwargs):
             if (self.cancellable and
-                  (not self._monitor.shutdown_eligible().get())):
+                  (self._monitor.shutdown_eligible().get() is not True)):
                 self._later.cancel_shutdown(self.WINDOW_CLOSED)
                 return None
             else:
                 return orig_func(self, *args, **kwargs)
         return stop_wrapper
 
+    @ComputeNodeStateChangeBase._finish_on_exception
     @_stop_if_window_closed
     @RetryMixin._retry()
     def shutdown_node(self):
-        self._logger.info("Starting shutdown", self.cloud_node.id)
+        self._logger.info("Starting shutdown")
         if not self._cloud.destroy_node(self.cloud_node):
             if self._cloud.broken(self.cloud_node):
                 self._later.cancel_shutdown(self.NODE_BROKEN)
             else:
                 # Force a retry.
                 raise cloud_types.LibcloudError("destroy_node failed")
-        self._logger.info("Shutdown success", self.cloud_node.id)
+        self._logger.info("Shutdown success")
         arv_node = self._arvados_node()
         if arv_node is None:
             self._finished(success_flag=True)
         else:
             self._later.clean_arvados_node(arv_node)
 
+    @ComputeNodeStateChangeBase._finish_on_exception
     @RetryMixin._retry(config.ARVADOS_ERRORS)
     def clean_arvados_node(self, arvados_node):
         self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
@@ -310,10 +324,11 @@ class ComputeNodeMonitorActor(config.actor_class):
         self._later.consider_shutdown()
 
     def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.name))
+        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
 
     def on_start(self):
         self._set_logger()
+        self._timer.schedule(self.cloud_node_start_time + self.boot_fail_after, self._later.consider_shutdown)
 
     def subscribe(self, subscriber):
         self.subscribers.add(subscriber)
@@ -341,12 +356,18 @@ class ComputeNodeMonitorActor(config.actor_class):
         return result
 
     def shutdown_eligible(self):
+        """Return True if eligible for shutdown, or a string explaining why the node
+        is not eligible for shutdown."""
+
         if not self._shutdowns.window_open():
-            return False
+            return "shutdown window is not open."
         if self.arvados_node is None:
             # Node is unpaired.
             # If it hasn't pinged Arvados after boot_fail seconds, shut it down
-            return not timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after)
+            if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
+                return "node is still booting, will be considered a failed boot at %s" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.cloud_node_start_time + self.boot_fail_after))
+            else:
+                return True
         missing = arvados_node_missing(self.arvados_node, self.node_stale_after)
         if missing and self._cloud.broken(self.cloud_node):
             # Node is paired, but Arvados says it is missing and the cloud says the node
@@ -357,21 +378,27 @@ class ComputeNodeMonitorActor(config.actor_class):
                 "cloud reports broken node, but paired node %s never pinged "
                 "(bug?) -- skipped check for node_stale_after",
                 self.arvados_node['uuid'])
-        return self.in_state('idle')
+        if self.in_state('idle'):
+            return True
+        else:
+            return "node is not idle."
 
     def consider_shutdown(self):
-        next_opening = self._shutdowns.next_opening()
-        if self.shutdown_eligible():
-            self._debug("Suggesting shutdown.", self.cloud_node.id)
-            _notify_subscribers(self._later, self.subscribers)
-        elif self._shutdowns.window_open():
-            self._debug("Shutdown window open but node busy.",
-                        self.cloud_node.id)
-        elif self.last_shutdown_opening != next_opening:
-            self._debug("Shutdown window closed.  Next at %s.",
-                        self.cloud_node.id, time.ctime(next_opening))
-            self._timer.schedule(next_opening, self._later.consider_shutdown)
-            self.last_shutdown_opening = next_opening
+        try:
+            next_opening = self._shutdowns.next_opening()
+            eligible = self.shutdown_eligible()
+            if eligible is True:
+                self._debug("Suggesting shutdown.")
+                _notify_subscribers(self._later, self.subscribers)
+            elif self._shutdowns.window_open():
+                self._debug("Cannot shut down because %s", eligible)
+            elif self.last_shutdown_opening != next_opening:
+                self._debug("Shutdown window closed.  Next at %s.",
+                            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
+                self._timer.schedule(next_opening, self._later.consider_shutdown)
+                self.last_shutdown_opening = next_opening
+        except Exception:
+            self._logger.exception("Unexpected exception")
 
     def offer_arvados_pair(self, arvados_node):
         first_ping_s = arvados_node.get('first_ping_at')
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 11cbc99..c98c95a 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -30,6 +30,10 @@ class BaseComputeNodeDriver(RetryMixin):
     def _create_driver(self, driver_class, **auth_kwargs):
         return driver_class(**auth_kwargs)
 
+    @RetryMixin._retry()
+    def _set_sizes(self):
+        self.sizes = {sz.id: sz for sz in self.real.list_sizes()}
+
     def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
                  driver_class, retry_wait=1, max_retry_wait=180):
         """Base initializer for compute node drivers.
@@ -46,7 +50,7 @@ class BaseComputeNodeDriver(RetryMixin):
         """
 
         super(BaseComputeNodeDriver, self).__init__(retry_wait, max_retry_wait,
-                                         logging.getLogger(str(type(self))),
+                                         logging.getLogger(self.__class__.__name__),
                                          type(self),
                                          None)
         self.real = self._create_driver(driver_class, **auth_kwargs)
@@ -65,7 +69,7 @@ class BaseComputeNodeDriver(RetryMixin):
                 if new_pair is not None:
                     self.create_kwargs[new_pair[0]] = new_pair[1]
 
-        self.sizes = {sz.id: sz for sz in self.real.list_sizes()}
+        self._set_sizes()
 
     def _init_ping_host(self, ping_host):
         self.ping_host = ping_host
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 42ee4db..5616376 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -142,10 +142,10 @@ class NodeManagerDaemonActor(actor_class):
         self.booting = {}       # Actor IDs to ComputeNodeSetupActors
         self.booted = {}        # Cloud node IDs to _ComputeNodeRecords
         self.shutdowns = {}     # Cloud node IDs to ComputeNodeShutdownActors
-        self._logger.debug("Daemon initialized")
 
     def on_start(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
+        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
+        self._logger.debug("Daemon started")
 
     def _update_poll_time(self, poll_key):
         self.last_polls[poll_key] = time.time()
@@ -216,15 +216,19 @@ class NodeManagerDaemonActor(actor_class):
                     self._pair_nodes(cloud_rec, arv_node)
                     break
 
-    def _nodes_up(self, size):
-        up = 0
-        up += sum(1
-                  for c in self.booting.itervalues()
-                  if size is None or c.cloud_size.get().id == size.id)
-        up += sum(1
+    def _nodes_booting(self, size):
+        return sum(1
+                   for c in self.booting.itervalues()
+                   if size is None or c.cloud_size.get().id == size.id)
+
+    def _nodes_booted(self, size):
+        return sum(1
                   for i in (self.booted, self.cloud_nodes.nodes)
                   for c in i.itervalues()
                   if size is None or c.cloud_node.size.id == size.id)
+
+    def _nodes_up(self, size):
+        up = self._nodes_booting(size) + self._nodes_booted(size)
         return up
 
     def _total_price(self):
@@ -274,11 +278,21 @@ class NodeManagerDaemonActor(actor_class):
         elif under_min > 0 and size.id == self.min_cloud_size.id:
             return under_min
 
-        up_count = self._nodes_up(size) - (self._size_shutdowns(size) +
-                                           self._nodes_busy(size) +
-                                           self._nodes_missing(size))
-
-        self._logger.debug("%s: idle nodes %i, wishlist size %i", size.name, up_count, self._size_wishlist(size))
+        try:
+            booting_count = self._nodes_booting(size) + sum(1 for _ in self.cloud_nodes.unpaired())
+            shutdown_count = self._size_shutdowns(size)
+            busy_count = self._nodes_busy(size)
+            up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
+
+            self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
+                              self._size_wishlist(size),
+                              up_count + busy_count,
+                              booting_count,
+                              up_count - booting_count,
+                              busy_count,
+                              shutdown_count)
+        except Exception as e:
+            self._logger.exception("whoops")
 
         wanted = self._size_wishlist(size) - up_count
         if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
@@ -329,7 +343,7 @@ class NodeManagerDaemonActor(actor_class):
         if nodes_wanted < 1:
             return None
         arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
-        self._logger.info("Want %s more nodes.  Booting a %s node.",
+        self._logger.info("Want %i more %s nodes.  Booting a node.",
                           nodes_wanted, cloud_size.name)
         new_setup = self._node_setup.start(
             timer_actor=self._timer,
@@ -352,12 +366,13 @@ class NodeManagerDaemonActor(actor_class):
         cloud_node = setup_proxy.cloud_node.get()
         del self.booting[setup_proxy.actor_ref.actor_urn]
         setup_proxy.stop()
-        record = self.cloud_nodes.get(cloud_node.id)
-        if record is None:
-            record = self._new_node(cloud_node)
-            self.booted[cloud_node.id] = record
-        self._timer.schedule(time.time() + self.boot_fail_after,
-                             self._later.shutdown_unpaired_node, cloud_node.id)
+        if cloud_node is not None:
+            record = self.cloud_nodes.get(cloud_node.id)
+            if record is None:
+                record = self._new_node(cloud_node)
+                self.booted[cloud_node.id] = record
+            self._timer.schedule(time.time() + self.boot_fail_after,
+                                 self._later.shutdown_unpaired_node, cloud_node.id)
 
     @_check_poll_freshness
     def stop_booting_node(self, size):
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 1ee7928..c8b3d19 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -71,7 +71,7 @@ def launch_pollers(config, server_calculator):
 
     timer = TimedCallBackActor.start(poll_time / 10.0).proxy()
     cloud_node_poller = CloudNodeListMonitorActor.start(
-        config.new_cloud_client(timer), timer, poll_time, max_poll_time).proxy()
+        config.new_cloud_client(), timer, poll_time, max_poll_time).proxy()
     arvados_node_poller = ArvadosNodeListMonitorActor.start(
         config.new_arvados_client(), timer, poll_time, max_poll_time).proxy()
     job_queue_poller = JobQueueMonitorActor.start(
@@ -103,32 +103,36 @@ def main(args=None):
     for sigcode in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
         signal.signal(sigcode, shutdown_signal)
 
-    setup_logging(config.get('Logging', 'file'), **config.log_levels())
-    node_setup, node_shutdown, node_update, node_monitor = \
-        config.dispatch_classes()
-    server_calculator = build_server_calculator(config)
-    timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
-        launch_pollers(config, server_calculator)
-    cloud_node_updater = node_update.start(config.new_cloud_client).proxy()
-    node_daemon = NodeManagerDaemonActor.start(
-        job_queue_poller, arvados_node_poller, cloud_node_poller,
-        cloud_node_updater, timer,
-        config.new_arvados_client, config.new_cloud_client,
-        config.shutdown_windows(),
-        server_calculator,
-        config.getint('Daemon', 'min_nodes'),
-        config.getint('Daemon', 'max_nodes'),
-        config.getint('Daemon', 'poll_stale_after'),
-        config.getint('Daemon', 'boot_fail_after'),
-        config.getint('Daemon', 'node_stale_after'),
-        node_setup, node_shutdown, node_monitor,
-        max_total_price=config.getfloat('Daemon', 'max_total_price')).proxy()
-
-    signal.pause()
-    daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
-    while not daemon_stopped():
-        time.sleep(1)
-    pykka.ActorRegistry.stop_all()
+    try:
+        setup_logging(config.get('Logging', 'file'), **config.log_levels())
+        node_setup, node_shutdown, node_update, node_monitor = \
+            config.dispatch_classes()
+        server_calculator = build_server_calculator(config)
+        timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
+            launch_pollers(config, server_calculator)
+        cloud_node_updater = node_update.start(config.new_cloud_client).proxy()
+        node_daemon = NodeManagerDaemonActor.start(
+            job_queue_poller, arvados_node_poller, cloud_node_poller,
+            cloud_node_updater, timer,
+            config.new_arvados_client, config.new_cloud_client,
+            config.shutdown_windows(),
+            server_calculator,
+            config.getint('Daemon', 'min_nodes'),
+            config.getint('Daemon', 'max_nodes'),
+            config.getint('Daemon', 'poll_stale_after'),
+            config.getint('Daemon', 'boot_fail_after'),
+            config.getint('Daemon', 'node_stale_after'),
+            node_setup, node_shutdown, node_monitor,
+            max_total_price=config.getfloat('Daemon', 'max_total_price')).proxy()
+
+        signal.pause()
+        daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
+        while not daemon_stopped():
+            time.sleep(1)
+    except Exception:
+        logging.exception("Uncaught exception during setup")
+    finally:
+        pykka.ActorRegistry.stop_all()
 
 
 if __name__ == '__main__':
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 26f49a3..f1a661e 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -36,4 +36,5 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
         return node.id
 
     def _send_request(self):
-        return self._client.list_nodes()
+        n = self._client.list_nodes()
+        return n

commit e9901a3f94eab93f9482b7cb3d7e2c1b50216f08
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Feb 3 17:51:46 2016 -0500

    Try to make logging identify the actor consistently

diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
index 6319f4b..5e5354a 100644
--- a/services/nodemanager/arvnodeman/clientactor.py
+++ b/services/nodemanager/arvnodeman/clientactor.py
@@ -38,10 +38,8 @@ class RemotePollLoopActor(actor_class):
         super(RemotePollLoopActor, self).__init__()
         self._client = client
         self._timer = timer_actor
-        self._logger = logging.getLogger(self.LOGGER_NAME)
         self._later = self.actor_ref.proxy()
         self._polling_started = False
-        self.log_prefix = "{} (at {})".format(self.__class__.__name__, id(self))
         self.min_poll_wait = poll_wait
         self.max_poll_wait = max_poll_wait
         self.poll_wait = self.min_poll_wait
@@ -50,6 +48,9 @@ class RemotePollLoopActor(actor_class):
         if hasattr(self, '_item_key'):
             self.subscribe_to = self._subscribe_to
 
+    def on_start(self):
+        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
+
     def _start_polling(self):
         if not self._polling_started:
             self._polling_started = True
@@ -71,8 +72,7 @@ class RemotePollLoopActor(actor_class):
         raise NotImplementedError("subclasses must implement request method")
 
     def _got_response(self, response):
-        self._logger.debug("%s got response with %d items",
-                           self.log_prefix, len(response))
+        self._logger.debug("got response with %d items", len(response))
         self.poll_wait = self.min_poll_wait
         _notify_subscribers(response, self.all_subscribers)
         if hasattr(self, '_item_key'):
@@ -89,7 +89,7 @@ class RemotePollLoopActor(actor_class):
         return False
 
     def poll(self, scheduled_start=None):
-        self._logger.debug("%s sending poll", self.log_prefix)
+        self._logger.debug("sending request")
         start_time = time.time()
         if scheduled_start is None:
             scheduled_start = start_time
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 8c983c1..45468c3 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -21,18 +21,24 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
     This base class takes care of retrying changes and notifying
     subscribers when the change is finished.
     """
-    def __init__(self, logger_name, cloud_client, arvados_client, timer_actor,
+    def __init__(self, cloud_client, arvados_client, timer_actor,
                  retry_wait, max_retry_wait):
         super(ComputeNodeStateChangeBase, self).__init__()
+        self._later = self.actor_ref.proxy()
+        self._arvados = arvados_client
+        self.subscribers = set()
+
+    def _set_logger(self):
+        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[9:]))
+
+    def on_start(self):
+        self._set_logger()
         RetryMixin.__init__(self,
                             retry_wait,
                             max_retry_wait,
-                            logging.getLogger(logger_name),
+                            self._logger,
                             cloud_client,
                             timer_actor)
-        self._later = self.actor_ref.proxy()
-        self._arvados = arvados_client
-        self.subscribers = set()
 
     def _finished(self):
         _notify_subscribers(self._later, self.subscribers)
@@ -74,7 +80,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
                  cloud_size, arvados_node=None,
                  retry_wait=1, max_retry_wait=180):
         super(ComputeNodeSetupActor, self).__init__(
-            'arvnodeman.nodeup', cloud_client, arvados_client, timer_actor,
+            cloud_client, arvados_client, timer_actor,
             retry_wait, max_retry_wait)
         self.cloud_size = cloud_size
         self.arvados_node = None
@@ -97,7 +103,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
 
     @RetryMixin._retry()
     def create_cloud_node(self):
-        self._logger.info("Creating cloud node with size %s.",
+        self._logger.info("Creating cloud node of size %s.",
                           self.cloud_size.name)
         self.cloud_node = self._cloud.create_node(self.cloud_size,
                                                   self.arvados_node)
@@ -160,7 +166,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         # eligible.  Normal shutdowns based on job demand should be
         # cancellable; shutdowns based on node misbehavior should not.
         super(ComputeNodeShutdownActor, self).__init__(
-            'arvnodeman.nodedown', cloud_client, arvados_client, timer_actor,
+            cloud_client, arvados_client, timer_actor,
             retry_wait, max_retry_wait)
         self._monitor = node_monitor.proxy()
         self.cloud_node = self._monitor.cloud_node.get()
@@ -168,7 +174,11 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         self.cancel_reason = None
         self.success = None
 
+    def _set_logger(self):
+        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.id))
+
     def on_start(self):
+        super(ComputeNodeShutdownActor, self).on_start()
         self._later.shutdown_node()
 
     def _arvados_node(self):
@@ -181,7 +191,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
 
     def cancel_shutdown(self, reason):
         self.cancel_reason = reason
-        self._logger.info("Cloud node %s shutdown cancelled: %s.",
+        self._logger.info("Shutdown cancelled: %s.",
                           self.cloud_node.id, reason)
         self._finished(success_flag=False)
 
@@ -199,13 +209,14 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
     @_stop_if_window_closed
     @RetryMixin._retry()
     def shutdown_node(self):
+        self._logger.info("Starting shutdown", self.cloud_node.id)
         if not self._cloud.destroy_node(self.cloud_node):
             if self._cloud.broken(self.cloud_node):
                 self._later.cancel_shutdown(self.NODE_BROKEN)
             else:
                 # Force a retry.
                 raise cloud_types.LibcloudError("destroy_node failed")
-        self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
+        self._logger.info("Shutdown success", self.cloud_node.id)
         arv_node = self._arvados_node()
         if arv_node is None:
             self._finished(success_flag=True)
@@ -281,7 +292,6 @@ class ComputeNodeMonitorActor(config.actor_class):
     ):
         super(ComputeNodeMonitorActor, self).__init__()
         self._later = self.actor_ref.proxy()
-        self._logger = logging.getLogger('arvnodeman.computenode')
         self._last_log = None
         self._shutdowns = shutdown_timer
         self._cloud_node_fqdn = cloud_fqdn_func
@@ -299,6 +309,12 @@ class ComputeNodeMonitorActor(config.actor_class):
         self.last_shutdown_opening = None
         self._later.consider_shutdown()
 
+    def _set_logger(self):
+        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.name))
+
+    def on_start(self):
+        self._set_logger()
+
     def subscribe(self, subscriber):
         self.subscribers.add(subscriber)
 
@@ -346,13 +362,12 @@ class ComputeNodeMonitorActor(config.actor_class):
     def consider_shutdown(self):
         next_opening = self._shutdowns.next_opening()
         if self.shutdown_eligible():
-            self._debug("Node %s suggesting shutdown.", self.cloud_node.id)
+            self._debug("Suggesting shutdown.")
             _notify_subscribers(self._later, self.subscribers)
         elif self._shutdowns.window_open():
-            self._debug("Node %s shutdown window open but node busy.",
-                        self.cloud_node.id)
+            self._debug("Shutdown window open but node busy.")
         elif self.last_shutdown_opening != next_opening:
-            self._debug("Node %s shutdown window closed.  Next at %s.",
-                        self.cloud_node.id, time.ctime(next_opening))
+            self._debug("Shutdown window closed.  Next at %s.",
+                        time.ctime(next_opening))
             self._timer.schedule(next_opening, self._later.consider_shutdown)
             self.last_shutdown_opening = next_opening
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 43f61c7..4d70436 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -22,6 +22,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
             self._nodename = None
             return super(ComputeNodeShutdownActor, self).on_start()
         else:
+            self._set_logger()
             self._nodename = arv_node['hostname']
             self._logger.info("Draining SLURM node %s", self._nodename)
             self._later.issue_slurm_drain()
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 243d3bf..42ee4db 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -121,7 +121,6 @@ class NodeManagerDaemonActor(actor_class):
         self._new_arvados = arvados_factory
         self._new_cloud = cloud_factory
         self._cloud_driver = self._new_cloud()
-        self._logger = logging.getLogger('arvnodeman.daemon')
         self._later = self.actor_ref.proxy()
         self.shutdown_windows = shutdown_windows
         self.server_calculator = server_calculator
@@ -145,6 +144,9 @@ class NodeManagerDaemonActor(actor_class):
         self.shutdowns = {}     # Cloud node IDs to ComputeNodeShutdownActors
-        self._logger.debug("Daemon initialized")
 
+    def on_start(self):
+        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[9:]))
+        self._logger.debug("Daemon initialized")
+
     def _update_poll_time(self, poll_key):
         self.last_polls[poll_key] = time.time()
 
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index e0f0a5b..87cf738 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -102,7 +102,6 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
     """
 
     CLIENT_ERRORS = ARVADOS_ERRORS
-    LOGGER_NAME = 'arvnodeman.jobqueue'
 
     def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
         super(JobQueueMonitorActor, self).__init__(
@@ -114,6 +113,6 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
 
     def _got_response(self, queue):
         server_list = self._calculator.servers_for_queue(queue)
-        self._logger.debug("Sending server wishlist: %s",
+        self._logger.debug("Calculated wishlist: %s",
                            ', '.join(s.name for s in server_list) or "(empty)")
         return super(JobQueueMonitorActor, self)._got_response(server_list)
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 83dd93f..26f49a3 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -11,7 +11,6 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
     This actor regularly polls the list of Arvados node records, and
     sends it to subscribers.
     """
-    LOGGER_NAME = 'arvnodeman.arvados_nodes'
 
     def is_common_error(self, exception):
         return isinstance(exception, config.ARVADOS_ERRORS)
@@ -29,7 +28,6 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
     This actor regularly polls the cloud to get a list of running compute
     nodes, and sends it to subscribers.
     """
-    LOGGER_NAME = 'arvnodeman.cloud_nodes'
 
     def is_common_error(self, exception):
         return self._client.is_cloud_exception(exception)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list