[ARVADOS] created: 7efb310100364181c3fb1edc70d1fc1f6f16c4ae
From: Git user <git at public.curoverse.com>
Date: Thu Jun 8 21:44:47 EDT 2017
at 7efb310100364181c3fb1edc70d1fc1f6f16c4ae (commit)
commit 7efb310100364181c3fb1edc70d1fc1f6f16c4ae
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 17:26:27 2017 -0400
11461: Support providing hostname override for testing compute images.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
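In outline, the patch threads an optional hostname from the new [Testing] hostnames setting through ComputeNodeSetupActor so that a test node registers in Arvados under a predictable name. A minimal sketch of that flow, assuming a comma-separated hostnames value; the helper names here are illustrative and not part of the patch:

# Illustrative sketch only -- these helper names are hypothetical.
def parse_assigned_hostnames(raw):
    # Split the comma-separated "hostnames" option, dropping blanks so
    # an unset option yields an empty list.
    return [h.strip() for h in raw.split(",") if h.strip()]

def next_assigned_hostname(assigned_hostnames):
    # Mirror the daemon's behaviour: hand hostnames out in order, and
    # fall back to None (let the API server pick) once the list is empty.
    return assigned_hostnames.pop(0) if assigned_hostnames else None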
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4463ec6..9a7d5ca 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -58,10 +58,10 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
else:
self.subscribers.add(subscriber)
- def _clean_arvados_node(self, arvados_node, explanation):
+ def _clean_arvados_node(self, arvados_node, explanation, hostname=None):
return self._arvados.nodes().update(
uuid=arvados_node['uuid'],
- body={'hostname': None,
+ body={'hostname': hostname,
'ip_address': None,
'slot_number': None,
'first_ping_at': None,
@@ -94,7 +94,8 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
"""
def __init__(self, timer_actor, arvados_client, cloud_client,
cloud_size, arvados_node=None,
- retry_wait=1, max_retry_wait=180):
+ retry_wait=1, max_retry_wait=180,
+ assigned_hostname=None):
super(ComputeNodeSetupActor, self).__init__(
cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
@@ -102,6 +103,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
self.arvados_node = None
self.cloud_node = None
self.error = None
+ self.assigned_hostname = assigned_hostname
if arvados_node is None:
self._later.create_arvados_node()
else:
@@ -110,14 +112,14 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def create_arvados_node(self):
- self.arvados_node = self._arvados.nodes().create(body={}).execute()
+ self.arvados_node = self._arvados.nodes().create(body={"hostname": self.assigned_hostname}).execute()
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def prepare_arvados_node(self, node):
self.arvados_node = self._clean_arvados_node(
- node, "Prepared by Node Manager")
+ node, "Prepared by Node Manager", hostname=self.assigned_hostname)
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index cbadc5f..2655c90 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -58,7 +58,10 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
'Manage': {'address': '127.0.0.1',
'port': '-1'},
'Logging': {'file': '/dev/stderr',
- 'level': 'WARNING'}
+ 'level': 'WARNING'},
+ 'Testing': {'wishlist': '',
+ 'hostnames': ''
+ }
}.iteritems():
if not self.has_section(sec_name):
self.add_section(sec_name)
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 68dd54b..f3b9765 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -107,7 +107,8 @@ class NodeManagerDaemonActor(actor_class):
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
max_total_price=0,
- destroy_on_shutdown=False):
+ destroy_on_shutdown=False,
+ assigned_hostnames=[]):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
@@ -129,6 +130,7 @@ class NodeManagerDaemonActor(actor_class):
self.boot_fail_after = boot_fail_after
self.node_stale_after = node_stale_after
self.last_polls = {}
+ self.assigned_hostnames = assigned_hostnames
for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
poll_actor = locals()[poll_name + '_actor']
poll_actor.subscribe(getattr(self._later, 'update_' + poll_name))
@@ -384,6 +386,11 @@ class NodeManagerDaemonActor(actor_class):
if nodes_wanted < 1:
return None
arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
+
+ assigned_hostname = None
+ if self.assigned_hostnames:
+ assigned_hostname = self.assigned_hostnames.pop(0)
+
self._logger.info("Want %i more %s nodes. Booting a node.",
nodes_wanted, cloud_size.name)
new_setup = self._node_setup.start(
@@ -391,7 +398,8 @@ class NodeManagerDaemonActor(actor_class):
arvados_client=self._new_arvados(),
arvados_node=arvados_node,
cloud_client=self._new_cloud(),
- cloud_size=cloud_size).proxy()
+ cloud_size=cloud_size,
+ assigned_hostname=assigned_hostname).proxy()
self.booting[new_setup.actor_ref.actor_urn] = new_setup
self.sizes_booting[new_setup.actor_ref.actor_urn] = cloud_size
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 911c70a..582f39c 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -143,7 +143,8 @@ def main(args=None):
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
max_total_price=config.getfloat('Daemon', 'max_total_price'),
- destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown')).tell_proxy()
+ destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown'),
+ assigned_hostnames=config.get('Testing', 'hostnames').split(",")).tell_proxy()
watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
commit 29c90ad946574534f88bd85df2999ce09ec9cdd9
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 15:16:50 2017 -0400
11461: Add configuration options suitable to run an independent instance of
node manager to boot a node, for compute image testing.
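The intent is that a standalone node manager instance can be pointed at a test cluster with a fixed wishlist (so it boots a node even when the job and SLURM queues are empty) and told to destroy its nodes on shutdown. A rough paraphrase of the two behaviours these options control, based on the diffs below; the function names are illustrative, not the actor code itself:

# Illustrative paraphrase; not the actual actor methods.
def wishlist_with_override(calculated_sizes, override_sizes):
    # Sizes named in [Testing] wishlist are appended to whatever the
    # server calculator derives from the job/SLURM queues.
    return list(calculated_sizes) + list(override_sizes or [])

def shutdown_still_pending(booting, cloud_node_records, destroy_on_shutdown):
    # With destroy_on_shutdown set, keep polling until no setup actors
    # are booting and every cloud node's monitor actor has gone away.
    nodes_up = 0
    if destroy_on_shutdown:
        nodes_up = sum(1 for rec in cloud_node_records if rec.actor)
    return bool(booting) or nodes_up > 0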
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index a16e0a8..cbadc5f 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -53,7 +53,8 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
'boot_fail_after': str(sys.maxint),
'node_stale_after': str(60 * 60 * 2),
'watchdog': '600',
- 'node_mem_scaling': '0.95'},
+ 'node_mem_scaling': '0.95',
+ 'destroy_on_shutdown': "no"},
'Manage': {'address': '127.0.0.1',
'port': '-1'},
'Logging': {'file': '/dev/stderr',
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 7e63c78..68dd54b 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -106,7 +106,8 @@ class NodeManagerDaemonActor(actor_class):
node_setup_class=dispatch.ComputeNodeSetupActor,
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
- max_total_price=0):
+ max_total_price=0,
+ destroy_on_shutdown=False):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
@@ -137,6 +138,7 @@ class NodeManagerDaemonActor(actor_class):
self.arvados_nodes = _ArvadosNodeTracker()
self.booting = {} # Actor IDs to ComputeNodeSetupActors
self.sizes_booting = {} # Actor IDs to node size
+ self.destroy_on_shutdown = destroy_on_shutdown
def on_start(self):
self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
@@ -199,6 +201,8 @@ class NodeManagerDaemonActor(actor_class):
except pykka.ActorDeadError:
pass
record.shutdown_actor = None
+ if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
+ del record.cloud_node._nodemanager_recently_booted
# A recently booted node is a node that successfully completed the
# setup actor but has not yet appeared in the cloud node list.
@@ -516,25 +520,35 @@ class NodeManagerDaemonActor(actor_class):
def shutdown(self):
self._logger.info("Shutting down after signal.")
- self.poll_stale_after = -1 # Inhibit starting/stopping nodes
# Shut down pollers
self._server_wishlist_actor.stop()
self._arvados_nodes_actor.stop()
- self._cloud_nodes_actor.stop()
-
- # Clear cloud node list
- self.update_cloud_nodes([])
# Stop setup actors unless they are in the middle of setup.
setup_stops = {key: node.stop_if_no_cloud_node()
for key, node in self.booting.iteritems()}
self.booting = {key: self.booting[key]
for key in setup_stops if not setup_stops[key].get()}
+
+ if not self.destroy_on_shutdown:
+ # Clear cloud node list
+ self._cloud_nodes_actor.stop()
+ self.update_cloud_nodes([])
+ self.poll_stale_after = -1 # Inhibit starting/stopping nodes
+
self._later.await_shutdown()
def await_shutdown(self):
- if self.booting:
+ nodes_up = 0
+ if self.destroy_on_shutdown:
+ for node in self.cloud_nodes.nodes.itervalues():
+ # Begin shutdown of all nodes.
+ if node.actor and not node.shutdown_actor:
+ self._begin_node_shutdown(node.actor, cancellable=False)
+ nodes_up = sum(1 for node in self.cloud_nodes.nodes.itervalues() if node.actor)
+
+ if self.booting or nodes_up:
self._timer.schedule(time.time() + 1, self._later.await_shutdown)
else:
self.stop()
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 1716a57..c3d8f59 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -113,12 +113,13 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
CLIENT_ERRORS = ARVADOS_ERRORS
def __init__(self, client, timer_actor, server_calc,
- jobs_queue, slurm_queue, *args, **kwargs):
+ jobs_queue, slurm_queue, override_wishlist, *args, **kwargs):
super(JobQueueMonitorActor, self).__init__(
client, timer_actor, *args, **kwargs)
self.jobs_queue = jobs_queue
self.slurm_queue = slurm_queue
self._calculator = server_calc
+ self.override_wishlist = override_wishlist
@staticmethod
def coerce_to_mb(x):
@@ -161,6 +162,8 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
def _got_response(self, queue):
server_list = self._calculator.servers_for_queue(queue)
+ if self.override_wishlist:
+ server_list.extend(self.override_wishlist)
self._logger.debug("Calculated wishlist: %s",
', '.join(s.name for s in server_list) or "(empty)")
return super(JobQueueMonitorActor, self)._got_response(server_list)
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 72a285b..911c70a 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -89,6 +89,7 @@ def launch_pollers(config, server_calculator):
config.new_arvados_client(), timer, server_calculator,
config.getboolean('Arvados', 'jobs_queue'),
config.getboolean('Arvados', 'slurm_queue'),
+ [server_calculator.find_size(sz) for sz in config.get('Testing', 'wishlist').split(",")],
poll_time, max_poll_time
).tell_proxy()
return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
@@ -141,7 +142,8 @@ def main(args=None):
config.getint('Daemon', 'boot_fail_after'),
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
- max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
+ max_total_price=config.getfloat('Daemon', 'max_total_price'),
+ destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown')).tell_proxy()
watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
diff --git a/services/nodemanager/doc/compute-image-test.cfg b/services/nodemanager/doc/compute-image-test.cfg
new file mode 100644
index 0000000..4372f66
--- /dev/null
+++ b/services/nodemanager/doc/compute-image-test.cfg
@@ -0,0 +1,208 @@
+# Sample template for running node manager in compute image testing mode.
+#
+# Relevant sections:
+#
+## Wishlist to use instead of getting it from API or squeue
+#[Testing]
+#wishlist = Standard_D3
+#
+## Destroy compute nodes on shutdown
+#[Daemon]
+#destroy_on_shutdown = yes
+#
+## Disable populating wishlist from jobs queue, slurm queue.
+#[Arvados]
+#jobs_queue = no
+#slurm_queue = no
+
+
+[Manage]
+# The management server responds to http://addr:port/status.json with
+# a snapshot of internal state.
+
+# Management server listening address (default 127.0.0.1)
+#address = 0.0.0.0
+
+# Management server port number (default -1, server is disabled)
+#port = 8989
+
+[Testing]
+wishlist = Standard_D3
+
+[Daemon]
+destroy_on_shutdown = yes
+
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+#dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on the rate of spending (in $/hr); Node Manager will not boot
+# additional nodes if the total price of already running nodes meets or
+# exceeds this threshold. The default of 0 means no limit.
+max_total_price = 0
+
+# Poll Azure nodes and Arvados for new information every N seconds.
+poll_time = 5
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 45
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+#file = node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = DEBUG
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = DEBUG
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host =
+token =
+timeout = 15
+jobs_queue = no
+slurm_queue = no
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = yes
+
+[Cloud]
+provider = azure
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down. These are windows in full minutes, separated by commas. Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on. For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# Azure bills by the minute, so it makes sense to aggressively shut down idle
+# nodes. Specify at least two windows. You can add as many as you need beyond
+# that.
+shutdown_windows = 1, 999999
+
+[Cloud Credentials]
+# Use "azure account list" with the azure CLI to get these values.
+tenant_id = 00000000-0000-0000-0000-000000000000
+subscription_id = 00000000-0000-0000-0000-000000000000
+
+# The following directions are based on
+# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
+#
+# azure config mode arm
+# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
+# azure ad sp create "<Application_Id>"
+# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
+#
+# Use <Application_Id> for "key" and <Your_Password> for "secret"
+#
+key = 00000000-0000-0000-0000-000000000000
+secret = PASSWORD
+timeout = 60
+region = East US
+
+[Cloud List]
+# The resource group in which the compute node virtual machines will be created
+# and listed.
+ex_resource_group = ArvadosResourceGroup
+
+[Cloud Create]
+# The image id, in the form "Publisher:Offer:SKU:Version"
+image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+
+# Path to a local ssh key file that will be used to provision new nodes.
+ssh_key = /dev/null
+
+# The account name for the admin user that will be provisioned on new nodes.
+ex_user_name = arvadosuser
+
+# The Azure storage account that will be used to store the node OS disk images.
+ex_storage_account = arvadosstorage
+
+# The virtual network the VMs will be associated with.
+ex_network = ArvadosNetwork
+
+# Optional subnet of the virtual network.
+#ex_subnet = default
+
+# Node tags
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+# the API server to ping
+ping_host =
+
+# You can define any number of Size sections to list Azure sizes you're willing
+# to use. The Node Manager should boot the cheapest size(s) that can run jobs
+# in the queue. You must also provide the price per hour, as the Azure
+# compute driver currently does not report prices.
+#
+# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
+# for a list of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Microsoft's provided
+# data fields by setting them here.
+
+[Size Standard_D3]
+cores = 4
+price = 0.56
+
+[Size Standard_D4]
+cores = 8
+price = 1.12
commit 1e47e184d3b156dd2902a7d6d86cbf872522ffeb
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 16:56:10 2017 -0400
11345: Clamp retry-after to (0, max_retry_wait). Deindent retry_wrapper a bit for readability.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
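The rewritten wrapper treats a Retry-After header as a hint that is clamped before use, and makes the retry/re-raise decision once, after the try/except, instead of inside the exception handler. A small sketch of the clamping rule as the diff applies it; the helper is hypothetical, and a missing or unparseable header is assumed to leave the current wait unchanged:

# Hypothetical helper illustrating the clamping rule from the diff.
def clamped_retry_wait(retry_after_header, current_wait, max_retry_wait):
    try:
        wait = int(retry_after_header)
    except (TypeError, ValueError):
        # Missing or unparseable Retry-After: keep the current wait.
        return current_wait
    if wait < 0 or wait > max_retry_wait:
        # Out-of-range values are pinned to the configured maximum.
        wait = max_retry_wait
    return wait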
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
index b11b2de..5c14c1c 100644
--- a/services/nodemanager/arvnodeman/computenode/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/__init__.py
@@ -73,56 +73,66 @@ class RetryMixin(object):
@functools.wraps(orig_func)
def retry_wrapper(self, *args, **kwargs):
while True:
+ should_retry = False
try:
ret = orig_func(self, *args, **kwargs)
- except Exception as error:
- should_retry = False
-
- if isinstance(error, BaseHTTPError):
- if error.headers and error.headers.get("retry-after"):
- try:
- self.retry_wait = int(error.headers["retry-after"])
- should_retry = True
- except ValueError:
- pass
- if error.code == 429 or error.code >= 500:
+ except BaseHTTPError as error:
+ if error.headers and error.headers.get("retry-after"):
+ try:
+ self.retry_wait = int(error.headers["retry-after"])
+ if self.retry_wait < 0 or self.retry_wait > self.max_retry_wait:
+ self.retry_wait = self.max_retry_wait
should_retry = True
- elif isinstance(error, CLOUD_ERRORS) or isinstance(error, errors) or type(error) is Exception:
+ except ValueError:
+ pass
+ if error.code == 429 or error.code >= 500:
should_retry = True
-
- if not should_retry:
- self.retry_wait = self.min_retry_wait
- self._logger.warning(
- "Re-raising error (no retry): %s",
- error, exc_info=error)
- raise
-
- self._logger.warning(
- "Client error: %s - %s %s seconds",
- error,
- "scheduling retry in" if self._timer else "sleeping",
- self.retry_wait,
- exc_info=error)
-
- if self._timer:
- start_time = time.time()
- # reschedule to be called again
- self._timer.schedule(start_time + self.retry_wait,
- getattr(self._later,
- orig_func.__name__),
- *args, **kwargs)
- else:
- # sleep on it.
- time.sleep(self.retry_wait)
-
- self.retry_wait = min(self.retry_wait * 2,
- self.max_retry_wait)
- if self._timer:
- # expect to be called again by timer so don't loop
- return
+ except CLOUD_ERRORS:
+ should_retry = True
+ except errors:
+ should_retry = True
+ except Exception as error:
+ # As a libcloud workaround for drivers that don't use
+ # typed exceptions, consider bare Exception() objects
+ # retryable.
+ should_retry = type(error) is Exception
else:
+ # No exception: reset the retry wait and return the result.
self.retry_wait = self.min_retry_wait
return ret
+
+ # Only got here if an exception was caught. Now determine what to do about it.
+ if not should_retry:
+ self.retry_wait = self.min_retry_wait
+ self._logger.warning(
+ "Re-raising error (no retry): %s",
+ error, exc_info=error)
+ raise
+
+ self._logger.warning(
+ "Client error: %s - %s %s seconds",
+ error,
+ "scheduling retry in" if self._timer else "sleeping",
+ self.retry_wait,
+ exc_info=error)
+
+ if self._timer:
+ start_time = time.time()
+ # reschedule to be called again
+ self._timer.schedule(start_time + self.retry_wait,
+ getattr(self._later,
+ orig_func.__name__),
+ *args, **kwargs)
+ else:
+ # sleep on it.
+ time.sleep(self.retry_wait)
+
+ self.retry_wait = min(self.retry_wait * 2,
+ self.max_retry_wait)
+ if self._timer:
+ # expect to be called again by timer so don't loop
+ return
+
return retry_wrapper
return decorator
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index ad0e683..f024b0c 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -332,7 +332,7 @@ def main():
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
}),
- "test6": (
+ "test_retry_create": (
[
(r".*Daemon started", set_squeue),
(r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
commit 2fa4de30af88afd7bcac2f603497dcbe36e48429
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 13:13:58 2017 -0400
11345: Fix race-prone test test_issue_slurm_drain_retry
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
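The fix makes the retry timing deterministic: instead of letting a real timer fire on its own schedule, the test installs a mock timer and advances it explicitly with deliver() between checks, so the mocked SLURM command responses are consumed in a fixed order. A sketch of that deliver-on-demand pattern; the class below is illustrative and stands in for testutil.MockTimer rather than reproducing it:

# Illustrative deliver-on-demand timer for deterministic tests.
class DeliverOnDemandTimer(object):
    def __init__(self):
        self.pending = []

    def schedule(self, when, callback, *args, **kwargs):
        # Queue the callback instead of arming a real timer.
        self.pending.append((callback, args, kwargs))

    def deliver(self):
        # Run everything queued so far; the test decides when.
        pending, self.pending = self.pending, []
        for callback, args, kwargs in pending:
            callback(*args, **kwargs)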
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 73bcb57..4218980 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -23,11 +23,15 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
for s in args:
self.assertIn(s, slurm_cmd)
- def check_success_after_reset(self, proc_mock, end_state='drain\n'):
+ def check_success_after_reset(self, proc_mock, end_state='drain\n', timer=False):
self.make_mocks(arvados_node=testutil.arvados_node_mock(63))
+ if not timer:
+ self.timer = testutil.MockTimer(False)
self.make_actor()
self.check_success_flag(None, 0)
+ self.timer.deliver()
self.check_success_flag(None, 0)
+ self.timer.deliver()
# Order is critical here: if the mock gets called when no return value
# or side effect is set, we may invoke a real subprocess.
proc_mock.return_value = end_state
@@ -84,8 +88,8 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
self.check_success_flag(False, 2)
def test_issue_slurm_drain_retry(self, proc_mock):
- proc_mock.side_effect = iter([OSError, '', OSError, 'drng\n', 'drain\n', 'drain\n'])
- self.check_success_after_reset(proc_mock)
+ proc_mock.side_effect = iter([OSError, '', OSError, 'drng\n'])
+ self.check_success_after_reset(proc_mock, timer=False)
def test_arvados_node_cleaned_after_shutdown(self, proc_mock):
proc_mock.return_value = 'drain\n'
-----------------------------------------------------------------------