[ARVADOS] created: 7efb310100364181c3fb1edc70d1fc1f6f16c4ae
From: Git user <git at public.curoverse.com>
Date: Thu Jun 8 21:44:47 EDT 2017
at 7efb310100364181c3fb1edc70d1fc1f6f16c4ae (commit)
commit 7efb310100364181c3fb1edc70d1fc1f6f16c4ae
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 17:26:27 2017 -0400
11461: Support providing hostname override for testing compute images.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
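In outline, the patch threads an optional hostname from the new [Testing] hostnames setting through ComputeNodeSetupActor so that a test node registers in Arvados under a predictable name. A minimal sketch of that flow, assuming a comma-separated hostnames value; the helper names here are illustrative and not part of the patch:

# Illustrative sketch only -- these helper names are hypothetical.
def parse_assigned_hostnames(raw):
    # Split the comma-separated "hostnames" option, dropping blanks so
    # an unset option yields an empty list.
    return [h.strip() for h in raw.split(",") if h.strip()]

def next_assigned_hostname(assigned_hostnames):
    # Mirror the daemon's behaviour: hand hostnames out in order, and
    # fall back to None (let the API server pick) once the list is empty.
    return assigned_hostnames.pop(0) if assigned_hostnames else None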
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4463ec6..9a7d5ca 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -58,10 +58,10 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
else:
self.subscribers.add(subscriber)
- def _clean_arvados_node(self, arvados_node, explanation):
+ def _clean_arvados_node(self, arvados_node, explanation, hostname=None):
return self._arvados.nodes().update(
uuid=arvados_node['uuid'],
- body={'hostname': None,
+ body={'hostname': hostname,
'ip_address': None,
'slot_number': None,
'first_ping_at': None,
@@ -94,7 +94,8 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
"""
def __init__(self, timer_actor, arvados_client, cloud_client,
cloud_size, arvados_node=None,
- retry_wait=1, max_retry_wait=180):
+ retry_wait=1, max_retry_wait=180,
+ assigned_hostname=None):
super(ComputeNodeSetupActor, self).__init__(
cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
@@ -102,6 +103,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
self.arvados_node = None
self.cloud_node = None
self.error = None
+ self.assigned_hostname = assigned_hostname
if arvados_node is None:
self._later.create_arvados_node()
else:
@@ -110,14 +112,14 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def create_arvados_node(self):
- self.arvados_node = self._arvados.nodes().create(body={}).execute()
+ self.arvados_node = self._arvados.nodes().create(body={"hostname": self.assigned_hostname}).execute()
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def prepare_arvados_node(self, node):
self.arvados_node = self._clean_arvados_node(
- node, "Prepared by Node Manager")
+ node, "Prepared by Node Manager", hostname=self.assigned_hostname)
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index cbadc5f..2655c90 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -58,7 +58,10 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
'Manage': {'address': '127.0.0.1',
'port': '-1'},
'Logging': {'file': '/dev/stderr',
- 'level': 'WARNING'}
+ 'level': 'WARNING'},
+ 'Testing': {'wishlist': '',
+ 'hostnames': ''
+ }
}.iteritems():
if not self.has_section(sec_name):
self.add_section(sec_name)
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 68dd54b..f3b9765 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -107,7 +107,8 @@ class NodeManagerDaemonActor(actor_class):
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
max_total_price=0,
- destroy_on_shutdown=False):
+ destroy_on_shutdown=False,
+ assigned_hostnames=[]):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
@@ -129,6 +130,7 @@ class NodeManagerDaemonActor(actor_class):
self.boot_fail_after = boot_fail_after
self.node_stale_after = node_stale_after
self.last_polls = {}
+ self.assigned_hostnames = assigned_hostnames
for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
poll_actor = locals()[poll_name + '_actor']
poll_actor.subscribe(getattr(self._later, 'update_' + poll_name))
@@ -384,6 +386,11 @@ class NodeManagerDaemonActor(actor_class):
if nodes_wanted < 1:
return None
arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
+
+ assigned_hostname = None
+ if self.assigned_hostnames:
+ assigned_hostname = self.assigned_hostnames.pop(0)
+
self._logger.info("Want %i more %s nodes. Booting a node.",
nodes_wanted, cloud_size.name)
new_setup = self._node_setup.start(
@@ -391,7 +398,8 @@ class NodeManagerDaemonActor(actor_class):
arvados_client=self._new_arvados(),
arvados_node=arvados_node,
cloud_client=self._new_cloud(),
- cloud_size=cloud_size).proxy()
+ cloud_size=cloud_size,
+ assigned_hostname=assigned_hostname).proxy()
self.booting[new_setup.actor_ref.actor_urn] = new_setup
self.sizes_booting[new_setup.actor_ref.actor_urn] = cloud_size
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 911c70a..582f39c 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -143,7 +143,8 @@ def main(args=None):
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
max_total_price=config.getfloat('Daemon', 'max_total_price'),
- destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown')).tell_proxy()
+ destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown'),
+ assigned_hostnames=config.get('Testing', 'hostnames').split(",")).tell_proxy()
watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
commit 29c90ad946574534f88bd85df2999ce09ec9cdd9
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 15:16:50 2017 -0400
11461: Add configuration options suitable to run an independent instance of
node manager to boot a node, for compute image testing.
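The intent is that a standalone node manager instance can be pointed at a test cluster with a fixed wishlist (so it boots a node even when the job and SLURM queues are empty) and told to destroy its nodes on shutdown. A rough paraphrase of the two behaviours these options control, based on the diffs below; the function names are illustrative, not the actor code itself:

# Illustrative paraphrase; not the actual actor methods.
def wishlist_with_override(calculated_sizes, override_sizes):
    # Sizes named in [Testing] wishlist are appended to whatever the
    # server calculator derives from the job/SLURM queues.
    return list(calculated_sizes) + list(override_sizes or [])

def shutdown_still_pending(booting, cloud_node_records, destroy_on_shutdown):
    # With destroy_on_shutdown set, keep polling until no setup actors
    # are booting and every cloud node's monitor actor has gone away.
    nodes_up = 0
    if destroy_on_shutdown:
        nodes_up = sum(1 for rec in cloud_node_records if rec.actor)
    return bool(booting) or nodes_up > 0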
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index a16e0a8..cbadc5f 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -53,7 +53,8 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
'boot_fail_after': str(sys.maxint),
'node_stale_after': str(60 * 60 * 2),
'watchdog': '600',
- 'node_mem_scaling': '0.95'},
+ 'node_mem_scaling': '0.95',
+ 'destroy_on_shutdown': "no"},
'Manage': {'address': '127.0.0.1',
'port': '-1'},
'Logging': {'file': '/dev/stderr',
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 7e63c78..68dd54b 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -106,7 +106,8 @@ class NodeManagerDaemonActor(actor_class):
node_setup_class=dispatch.ComputeNodeSetupActor,
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
- max_total_price=0):
+ max_total_price=0,
+ destroy_on_shutdown=False):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
@@ -137,6 +138,7 @@ class NodeManagerDaemonActor(actor_class):
self.arvados_nodes = _ArvadosNodeTracker()
self.booting = {} # Actor IDs to ComputeNodeSetupActors
self.sizes_booting = {} # Actor IDs to node size
+ self.destroy_on_shutdown = destroy_on_shutdown
def on_start(self):
self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
@@ -199,6 +201,8 @@ class NodeManagerDaemonActor(actor_class):
except pykka.ActorDeadError:
pass
record.shutdown_actor = None
+ if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
+ del record.cloud_node._nodemanager_recently_booted
# A recently booted node is a node that successfully completed the
# setup actor but has not yet appeared in the cloud node list.
@@ -516,25 +520,35 @@ class NodeManagerDaemonActor(actor_class):
def shutdown(self):
self._logger.info("Shutting down after signal.")
- self.poll_stale_after = -1 # Inhibit starting/stopping nodes
# Shut down pollers
self._server_wishlist_actor.stop()
self._arvados_nodes_actor.stop()
- self._cloud_nodes_actor.stop()
-
- # Clear cloud node list
- self.update_cloud_nodes([])
# Stop setup actors unless they are in the middle of setup.
setup_stops = {key: node.stop_if_no_cloud_node()
for key, node in self.booting.iteritems()}
self.booting = {key: self.booting[key]
for key in setup_stops if not setup_stops[key].get()}
+
+ if not self.destroy_on_shutdown:
+ # Clear cloud node list
+ self._cloud_nodes_actor.stop()
+ self.update_cloud_nodes([])
+ self.poll_stale_after = -1 # Inhibit starting/stopping nodes
+
self._later.await_shutdown()
def await_shutdown(self):
- if self.booting:
+ nodes_up = 0
+ if self.destroy_on_shutdown:
+ for node in self.cloud_nodes.nodes.itervalues():
+ # Begin shutdown of all nodes.
+ if node.actor and not node.shutdown_actor:
+ self._begin_node_shutdown(node.actor, cancellable=False)
+ nodes_up = sum(1 for node in self.cloud_nodes.nodes.itervalues() if node.actor)
+
+ if self.booting or nodes_up:
self._timer.schedule(time.time() + 1, self._later.await_shutdown)
else:
self.stop()
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 1716a57..c3d8f59 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -113,12 +113,13 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
CLIENT_ERRORS = ARVADOS_ERRORS
def __init__(self, client, timer_actor, server_calc,
- jobs_queue, slurm_queue, *args, **kwargs):
+ jobs_queue, slurm_queue, override_wishlist, *args, **kwargs):
super(JobQueueMonitorActor, self).__init__(
client, timer_actor, *args, **kwargs)
self.jobs_queue = jobs_queue
self.slurm_queue = slurm_queue
self._calculator = server_calc
+ self.override_wishlist = override_wishlist
@staticmethod
def coerce_to_mb(x):
@@ -161,6 +162,8 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
def _got_response(self, queue):
server_list = self._calculator.servers_for_queue(queue)
+ if self.override_wishlist:
+ server_list.extend(self.override_wishlist)
self._logger.debug("Calculated wishlist: %s",
', '.join(s.name for s in server_list) or "(empty)")
return super(JobQueueMonitorActor, self)._got_response(server_list)
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 72a285b..911c70a 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -89,6 +89,7 @@ def launch_pollers(config, server_calculator):
config.new_arvados_client(), timer, server_calculator,
config.getboolean('Arvados', 'jobs_queue'),
config.getboolean('Arvados', 'slurm_queue'),
+ [server_calculator.find_size(sz) for sz in config.get('Testing', 'wishlist').split(",")],
poll_time, max_poll_time
).tell_proxy()
return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
@@ -141,7 +142,8 @@ def main(args=None):
config.getint('Daemon', 'boot_fail_after'),
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
- max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
+ max_total_price=config.getfloat('Daemon', 'max_total_price'),
+ destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown')).tell_proxy()
watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
diff --git a/services/nodemanager/doc/compute-image-test.cfg b/services/nodemanager/doc/compute-image-test.cfg
new file mode 100644
index 0000000..4372f66
--- /dev/null
+++ b/services/nodemanager/doc/compute-image-test.cfg
@@ -0,0 +1,208 @@
+# Sample template for running node manager in compute image testing mode.
+#
+# Relevant sections:
+#
+## Wishlist to use instead of getting it from API or squeue
+#[Testing]
+#wishlist = Standard_D3
+#
+## Destroy compute nodes on shutdown
+#[Daemon]
+#destroy_on_shutdown = yes
+#
+## Disable populating wishlist from jobs queue, slurm queue.
+#[Arvados]
+#jobs_queue = no
+#slurm_queue = no
+
+
+[Manage]
+# The management server responds to http://addr:port/status.json with
+# a snapshot of internal state.
+
+# Management server listening address (default 127.0.0.1)
+#address = 0.0.0.0
+
+# Management server port number (default -1, server is disabled)
+#port = 8989
+
+[Testing]
+wishlist = Standard_D3
+
+[Daemon]
+destroy_on_shutdown = yes
+
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+#dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on the rate of spending (in $/hr); Node Manager will not boot
+# additional nodes if the total price of already running nodes meets or
+# exceeds this threshold. The default of 0 means no limit.
+max_total_price = 0
+
+# Poll Azure nodes and Arvados for new information every N seconds.
+poll_time = 5
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 45
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+#file = node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = DEBUG
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = DEBUG
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host =
+token =
+timeout = 15
+jobs_queue = no
+slurm_queue = no
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = yes
+
+[Cloud]
+provider = azure
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down. These are windows in full minutes, separated by commas. Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on. For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# Azure bills by the minute, so it makes sense to aggressively shut down idle
+# nodes. Specify at least two windows. You can add as many as you need beyond
+# that.
+shutdown_windows = 1, 999999
+
+[Cloud Credentials]
+# Use "azure account list" with the azure CLI to get these values.
+tenant_id = 00000000-0000-0000-0000-000000000000
+subscription_id = 00000000-0000-0000-0000-000000000000
+
+# The following directions are based on
+# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
+#
+# azure config mode arm
+# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
+# azure ad sp create "<Application_Id>"
+# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
+#
+# Use <Application_Id> for "key" and <Your_Password> for "secret"
+#
+key = 00000000-0000-0000-0000-000000000000
+secret = PASSWORD
+timeout = 60
+region = East US
+
+[Cloud List]
+# The resource group in which the compute node virtual machines will be created
+# and listed.
+ex_resource_group = ArvadosResourceGroup
+
+[Cloud Create]
+# The image id, in the form "Publisher:Offer:SKU:Version"
+image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+
+# Path to a local ssh key file that will be used to provision new nodes.
+ssh_key = /dev/null
+
+# The account name for the admin user that will be provisioned on new nodes.
+ex_user_name = arvadosuser
+
+# The Azure storage account that will be used to store the node OS disk images.
+ex_storage_account = arvadosstorage
+
+# The virtual network the VMs will be associated with.
+ex_network = ArvadosNetwork
+
+# Optional subnet of the virtual network.
+#ex_subnet = default
+
+# Node tags
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+# the API server to ping
+ping_host =
+
+# You can define any number of Size sections to list Azure sizes you're willing
+# to use. The Node Manager should boot the cheapest size(s) that can run jobs
+# in the queue. You must also provide the price per hour, as the Azure
+# compute driver currently does not report prices.
+#
+# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
+# for a list of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Microsoft's provided
+# data fields by setting them here.
+
+[Size Standard_D3]
+cores = 4
+price = 0.56
+
+[Size Standard_D4]
+cores = 8
+price = 1.12
commit 1e47e184d3b156dd2902a7d6d86cbf872522ffeb
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 16:56:10 2017 -0400
11345: Clamp retry-after to (0, max_retry_wait). Deindent retry_wrapper a bit for readability.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
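The rewritten wrapper treats a Retry-After header as a hint that is clamped before use, and makes the retry/re-raise decision once, after the try/except, instead of inside the exception handler. A small sketch of the clamping rule as the diff applies it; the helper is hypothetical, and a missing or unparseable header is assumed to leave the current wait unchanged:

# Hypothetical helper illustrating the clamping rule from the diff.
def clamped_retry_wait(retry_after_header, current_wait, max_retry_wait):
    try:
        wait = int(retry_after_header)
    except (TypeError, ValueError):
        # Missing or unparseable Retry-After: keep the current wait.
        return current_wait
    if wait < 0 or wait > max_retry_wait:
        # Out-of-range values are pinned to the configured maximum.
        wait = max_retry_wait
    return wait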
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index b11b2de..5c14c1c 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -73,56 +73,66 @@ class RetryMixin(object):
@functools.wraps(orig_func)
def retry_wrapper(self, *args, **kwargs):
while True:
+ should_retry = False
try:
ret = orig_func(self, *args, **kwargs)
- except Exception as error:
- should_retry = False
-
- if isinstance(error, BaseHTTPError):
- if error.headers and error.headers.get("retry-after"):
- try:
- self.retry_wait = int(error.headers["retry-after"])
- should_retry = True
- except ValueError:
- pass
- if error.code == 429 or error.code >= 500:
+ except BaseHTTPError as error:
+ if error.headers and error.headers.get("retry-after"):
+ try:
+ self.retry_wait = int(error.headers["retry-after"])
+ if self.retry_wait < 0 or self.retry_wait > self.max_retry_wait:
+ self.retry_wait = self.max_retry_wait
should_retry = True
- elif isinstance(error, CLOUD_ERRORS) or isinstance(error, errors) or type(error) is Exception:
+ except ValueError:
+ pass
+ if error.code == 429 or error.code >= 500:
should_retry = True
-
- if not should_retry:
- self.retry_wait = self.min_retry_wait
- self._logger.warning(
- "Re-raising error (no retry): %s",
- error, exc_info=error)
- raise
-
- self._logger.warning(
- "Client error: %s - %s %s seconds",
- error,
- "scheduling retry in" if self._timer else "sleeping",
- self.retry_wait,
- exc_info=error)
-
- if self._timer:
- start_time = time.time()
- # reschedule to be called again
- self._timer.schedule(start_time + self.retry_wait,
- getattr(self._later,
- orig_func.__name__),
- *args, **kwargs)
- else:
- # sleep on it.
- time.sleep(self.retry_wait)
-
- self.retry_wait = min(self.retry_wait * 2,
- self.max_retry_wait)
- if self._timer:
- # expect to be called again by timer so don't loop
- return
+ except CLOUD_ERRORS:
+ should_retry = True
+ except errors:
+ should_retry = True
+ except Exception as error:
+ # As a libcloud workaround for drivers that don't use
+ # typed exceptions, consider bare Exception() objects
+ # retryable.
+ should_retry = type(error) is Exception
else:
+ # No exception: reset the retry wait and return the result.
self.retry_wait = self.min_retry_wait
return ret
+
+ # Only got here if an exception was caught. Now determine what to do about it.
+ if not should_retry:
+ self.retry_wait = self.min_retry_wait
+ self._logger.warning(
+ "Re-raising error (no retry): %s",
+ error, exc_info=error)
+ raise
+
+ self._logger.warning(
+ "Client error: %s - %s %s seconds",
+ error,
+ "scheduling retry in" if self._timer else "sleeping",
+ self.retry_wait,
+ exc_info=error)
+
+ if self._timer:
+ start_time = time.time()
+ # reschedule to be called again
+ self._timer.schedule(start_time + self.retry_wait,
+ getattr(self._later,
+ orig_func.__name__),
+ *args, **kwargs)
+ else:
+ # sleep on it.
+ time.sleep(self.retry_wait)
+
+ self.retry_wait = min(self.retry_wait * 2,
+ self.max_retry_wait)
+ if self._timer:
+ # expect to be called again by timer so don't loop
+ return
+
return retry_wrapper
return decorator
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index ad0e683..f024b0c 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -332,7 +332,7 @@ def main():
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
}),
- "test6": (
+ "test_retry_create": (
[
(r".*Daemon started", set_squeue),
(r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
commit 2fa4de30af88afd7bcac2f603497dcbe36e48429
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 13:13:58 2017 -0400
11345: Fix race-prone test test_issue_slurm_drain_retry
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
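The fix makes the retry timing deterministic: instead of letting a real timer fire on its own schedule, the test installs a mock timer and advances it explicitly with deliver() between checks, so the mocked SLURM command responses are consumed in a fixed order. A sketch of that deliver-on-demand pattern; the class below is illustrative and stands in for testutil.MockTimer rather than reproducing it:

# Illustrative deliver-on-demand timer for deterministic tests.
class DeliverOnDemandTimer(object):
    def __init__(self):
        self.pending = []

    def schedule(self, when, callback, *args, **kwargs):
        # Queue the callback instead of arming a real timer.
        self.pending.append((callback, args, kwargs))

    def deliver(self):
        # Run everything queued so far; the test decides when.
        pending, self.pending = self.pending, []
        for callback, args, kwargs in pending:
            callback(*args, **kwargs)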
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 73bcb57..4218980 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -23,11 +23,15 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
for s in args:
self.assertIn(s, slurm_cmd)
- def check_success_after_reset(self, proc_mock, end_state='drain\n'):
+ def check_success_after_reset(self, proc_mock, end_state='drain\n', timer=False):
self.make_mocks(arvados_node=testutil.arvados_node_mock(63))
+ if not timer:
+ self.timer = testutil.MockTimer(False)
self.make_actor()
self.check_success_flag(None, 0)
+ self.timer.deliver()
self.check_success_flag(None, 0)
+ self.timer.deliver()
# Order is critical here: if the mock gets called when no return value
# or side effect is set, we may invoke a real subprocess.
proc_mock.return_value = end_state
@@ -84,8 +88,8 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
self.check_success_flag(False, 2)
def test_issue_slurm_drain_retry(self, proc_mock):
- proc_mock.side_effect = iter([OSError, '', OSError, 'drng\n', 'drain\n', 'drain\n'])
- self.check_success_after_reset(proc_mock)
+ proc_mock.side_effect = iter([OSError, '', OSError, 'drng\n'])
+ self.check_success_after_reset(proc_mock, timer=False)
def test_arvados_node_cleaned_after_shutdown(self, proc_mock):
proc_mock.return_value = 'drain\n'
-----------------------------------------------------------------------