Subject: [ARVADOS] created: 028b97d2b758d6494b7ce5a63e10d2a1e6aa3a23
From: Git user <git at public.curoverse.com>
Date: Mon Jun 12 13:01:01 EDT 2017

        at 028b97d2b758d6494b7ce5a63e10d2a1e6aa3a23 (commit)
commit 028b97d2b758d6494b7ce5a63e10d2a1e6aa3a23
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Jun 9 16:21:54 2017 -0400
11461: When destroy_on_shutdown is true, only shut down nodes known to have been booted by the current process.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
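For orientation, a minimal sketch of the bookkeeping pattern this change introduces (the class and method names below are illustrative only, not the actual NodeManagerDaemonActor API): remember the IDs of cloud nodes booted by this process, and at shutdown consider only those nodes for destruction.

    class BootTracker(object):
        # Illustrative stand-in for the daemon actor's new bookkeeping.
        def __init__(self, destroy_on_shutdown=False):
            self.destroy_on_shutdown = destroy_on_shutdown
            self.booted_by_this_process = []   # cloud node IDs booted by us
            self.cloud_nodes = {}              # cloud node ID -> node record

        def node_booted(self, node_id, record):
            # Called when a setup actor reports a successfully booted node.
            self.cloud_nodes[node_id] = record
            if self.destroy_on_shutdown:
                self.booted_by_this_process.append(node_id)

        def shutdown_candidates(self):
            # Only nodes this process booted are shut down; pre-existing
            # cloud nodes are left alone.
            return [self.cloud_nodes[i] for i in self.booted_by_this_process
                    if i in self.cloud_nodes]

    if __name__ == "__main__":
        tracker = BootTracker(destroy_on_shutdown=True)
        tracker.cloud_nodes["pre-existing"] = {"hostname": "compute-old"}
        tracker.node_booted("new-node", {"hostname": "compute-new"})
        print(tracker.shutdown_candidates())   # only the node booted here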
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index f3b9765..7ef628d 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -141,6 +141,7 @@ class NodeManagerDaemonActor(actor_class):
self.booting = {} # Actor IDs to ComputeNodeSetupActors
self.sizes_booting = {} # Actor IDs to node size
self.destroy_on_shutdown = destroy_on_shutdown
+ self.booted_by_this_process = []
def on_start(self):
self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
@@ -447,6 +448,9 @@ class NodeManagerDaemonActor(actor_class):
self.node_quota = len(self.cloud_nodes)+1
self._logger.warning("After successful boot setting node quota to %s", self.node_quota)
+ if self.destroy_on_shutdown:
+ self.booted_by_this_process.append(cloud_node.id)
+
self.node_quota = min(self.node_quota, self.max_nodes)
del self.booting[setup_proxy.actor_ref.actor_urn]
del self.sizes_booting[setup_proxy.actor_ref.actor_urn]
@@ -550,11 +554,13 @@ class NodeManagerDaemonActor(actor_class):
def await_shutdown(self):
nodes_up = 0
if self.destroy_on_shutdown:
- for node in self.cloud_nodes.nodes.itervalues():
- # Begin shutdown of all nodes.
- if node.actor and not node.shutdown_actor:
- self._begin_node_shutdown(node.actor, cancellable=False)
- nodes_up = sum(1 for node in self.cloud_nodes.nodes.itervalues() if node.actor)
+ for nodeid in self.booted_by_this_process:
+ # Begin shutdown of nodes booted by the current process.
+ node = self.cloud_nodes.nodes[nodeid]
+ if node.actor:
+ nodes_up += 1
+ if not node.shutdown_actor:
+ self._begin_node_shutdown(node.actor, cancellable=False)
if self.booting or nodes_up:
self._timer.schedule(time.time() + 1, self._later.await_shutdown)
commit 36294bbd0433cca38297673aa06e181f2becab64
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 17:26:27 2017 -0400
11461: Support providing hostname override for testing compute images.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
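For context, a minimal configuration sketch for the option this change reads (placeholder hostnames; the [Testing] section and hostnames key come from the config and launcher diffs below). Each node booted by this process takes the next hostname from the comma-separated list, in order:

    [Testing]
    # Placeholder values; assigned to newly booted nodes in boot order.
    hostnames = compute-test-0,compute-test-1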
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4463ec6..dbeb131 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -58,10 +58,10 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
else:
self.subscribers.add(subscriber)
- def _clean_arvados_node(self, arvados_node, explanation):
+ def _clean_arvados_node(self, arvados_node, explanation, hostname=None):
return self._arvados.nodes().update(
uuid=arvados_node['uuid'],
- body={'hostname': None,
+ body={'hostname': hostname,
'ip_address': None,
'slot_number': None,
'first_ping_at': None,
@@ -94,7 +94,8 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
"""
def __init__(self, timer_actor, arvados_client, cloud_client,
cloud_size, arvados_node=None,
- retry_wait=1, max_retry_wait=180):
+ retry_wait=1, max_retry_wait=180,
+ assigned_hostname=None):
super(ComputeNodeSetupActor, self).__init__(
cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
@@ -102,6 +103,8 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
self.arvados_node = None
self.cloud_node = None
self.error = None
+ self.assigned_hostname = assigned_hostname
+
if arvados_node is None:
self._later.create_arvados_node()
else:
@@ -110,14 +113,14 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def create_arvados_node(self):
- self.arvados_node = self._arvados.nodes().create(body={}).execute()
+ self.arvados_node = self._arvados.nodes().create(body={"hostname": self.assigned_hostname}).execute()
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def prepare_arvados_node(self, node):
self.arvados_node = self._clean_arvados_node(
- node, "Prepared by Node Manager")
+ node, "Prepared by Node Manager", hostname=self.assigned_hostname)
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index cbadc5f..2655c90 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -58,7 +58,10 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
'Manage': {'address': '127.0.0.1',
'port': '-1'},
'Logging': {'file': '/dev/stderr',
- 'level': 'WARNING'}
+ 'level': 'WARNING'},
+ 'Testing': {'wishlist': '',
+ 'hostnames': ''
+ }
}.iteritems():
if not self.has_section(sec_name):
self.add_section(sec_name)
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 68dd54b..f3b9765 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -107,7 +107,8 @@ class NodeManagerDaemonActor(actor_class):
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
max_total_price=0,
- destroy_on_shutdown=False):
+ destroy_on_shutdown=False,
+ assigned_hostnames=[]):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
@@ -129,6 +130,7 @@ class NodeManagerDaemonActor(actor_class):
self.boot_fail_after = boot_fail_after
self.node_stale_after = node_stale_after
self.last_polls = {}
+ self.assigned_hostnames = assigned_hostnames
for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
poll_actor = locals()[poll_name + '_actor']
poll_actor.subscribe(getattr(self._later, 'update_' + poll_name))
@@ -384,6 +386,11 @@ class NodeManagerDaemonActor(actor_class):
if nodes_wanted < 1:
return None
arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
+
+ assigned_hostname = None
+ if self.assigned_hostnames:
+ assigned_hostname = self.assigned_hostnames.pop(0)
+
self._logger.info("Want %i more %s nodes. Booting a node.",
nodes_wanted, cloud_size.name)
new_setup = self._node_setup.start(
@@ -391,7 +398,8 @@ class NodeManagerDaemonActor(actor_class):
arvados_client=self._new_arvados(),
arvados_node=arvados_node,
cloud_client=self._new_cloud(),
- cloud_size=cloud_size).proxy()
+ cloud_size=cloud_size,
+ assigned_hostname=assigned_hostname).proxy()
self.booting[new_setup.actor_ref.actor_urn] = new_setup
self.sizes_booting[new_setup.actor_ref.actor_urn] = cloud_size
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 911c70a..582f39c 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -143,7 +143,8 @@ def main(args=None):
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
max_total_price=config.getfloat('Daemon', 'max_total_price'),
- destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown')).tell_proxy()
+ destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown'),
+ assigned_hostnames=config.get('Testing', 'hostnames').split(",")).tell_proxy()
watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
commit 64da95d5525598c1821d5801f4c8e43462c8cf1f
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 15:16:50 2017 -0400
11461: Add configuration options suitable to run an independent instance of
node manager to boot a node, for compute image testing.
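A hedged usage sketch: with the sample file added below saved locally and its Arvados and Azure credentials filled in, a standalone test instance can be started by pointing node manager at that config (assuming the launcher's usual --config invocation; adjust to however your installation runs arvados-node-manager):

    arvados-node-manager --config compute-image-test.cfg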
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index a16e0a8..cbadc5f 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -53,7 +53,8 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
'boot_fail_after': str(sys.maxint),
'node_stale_after': str(60 * 60 * 2),
'watchdog': '600',
- 'node_mem_scaling': '0.95'},
+ 'node_mem_scaling': '0.95',
+ 'destroy_on_shutdown': "no"},
'Manage': {'address': '127.0.0.1',
'port': '-1'},
'Logging': {'file': '/dev/stderr',
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 7e63c78..68dd54b 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -106,7 +106,8 @@ class NodeManagerDaemonActor(actor_class):
node_setup_class=dispatch.ComputeNodeSetupActor,
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
- max_total_price=0):
+ max_total_price=0,
+ destroy_on_shutdown=False):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
@@ -137,6 +138,7 @@ class NodeManagerDaemonActor(actor_class):
self.arvados_nodes = _ArvadosNodeTracker()
self.booting = {} # Actor IDs to ComputeNodeSetupActors
self.sizes_booting = {} # Actor IDs to node size
+ self.destroy_on_shutdown = destroy_on_shutdown
def on_start(self):
self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
@@ -199,6 +201,8 @@ class NodeManagerDaemonActor(actor_class):
except pykka.ActorDeadError:
pass
record.shutdown_actor = None
+ if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
+ del record.cloud_node._nodemanager_recently_booted
# A recently booted node is a node that successfully completed the
# setup actor but has not yet appeared in the cloud node list.
@@ -516,25 +520,35 @@ class NodeManagerDaemonActor(actor_class):
def shutdown(self):
self._logger.info("Shutting down after signal.")
- self.poll_stale_after = -1 # Inhibit starting/stopping nodes
# Shut down pollers
self._server_wishlist_actor.stop()
self._arvados_nodes_actor.stop()
- self._cloud_nodes_actor.stop()
-
- # Clear cloud node list
- self.update_cloud_nodes([])
# Stop setup actors unless they are in the middle of setup.
setup_stops = {key: node.stop_if_no_cloud_node()
for key, node in self.booting.iteritems()}
self.booting = {key: self.booting[key]
for key in setup_stops if not setup_stops[key].get()}
+
+ if not self.destroy_on_shutdown:
+ # Clear cloud node list
+ self._cloud_nodes_actor.stop()
+ self.update_cloud_nodes([])
+ self.poll_stale_after = -1 # Inhibit starting/stopping nodes
+
self._later.await_shutdown()
def await_shutdown(self):
- if self.booting:
+ nodes_up = 0
+ if self.destroy_on_shutdown:
+ for node in self.cloud_nodes.nodes.itervalues():
+ # Begin shutdown of all nodes.
+ if node.actor and not node.shutdown_actor:
+ self._begin_node_shutdown(node.actor, cancellable=False)
+ nodes_up = sum(1 for node in self.cloud_nodes.nodes.itervalues() if node.actor)
+
+ if self.booting or nodes_up:
self._timer.schedule(time.time() + 1, self._later.await_shutdown)
else:
self.stop()
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 1716a57..c3d8f59 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -113,12 +113,13 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
CLIENT_ERRORS = ARVADOS_ERRORS
def __init__(self, client, timer_actor, server_calc,
- jobs_queue, slurm_queue, *args, **kwargs):
+ jobs_queue, slurm_queue, override_wishlist, *args, **kwargs):
super(JobQueueMonitorActor, self).__init__(
client, timer_actor, *args, **kwargs)
self.jobs_queue = jobs_queue
self.slurm_queue = slurm_queue
self._calculator = server_calc
+ self.override_wishlist = override_wishlist
@staticmethod
def coerce_to_mb(x):
@@ -161,6 +162,8 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
def _got_response(self, queue):
server_list = self._calculator.servers_for_queue(queue)
+ if self.override_wishlist:
+ server_list.extend(self.override_wishlist)
self._logger.debug("Calculated wishlist: %s",
', '.join(s.name for s in server_list) or "(empty)")
return super(JobQueueMonitorActor, self)._got_response(server_list)
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 72a285b..911c70a 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -89,6 +89,7 @@ def launch_pollers(config, server_calculator):
config.new_arvados_client(), timer, server_calculator,
config.getboolean('Arvados', 'jobs_queue'),
config.getboolean('Arvados', 'slurm_queue'),
+ [server_calculator.find_size(sz) for sz in config.get('Testing', 'wishlist').split(",")],
poll_time, max_poll_time
).tell_proxy()
return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
@@ -141,7 +142,8 @@ def main(args=None):
config.getint('Daemon', 'boot_fail_after'),
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
- max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
+ max_total_price=config.getfloat('Daemon', 'max_total_price'),
+ destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown')).tell_proxy()
watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
diff --git a/services/nodemanager/doc/compute-image-test.cfg b/services/nodemanager/doc/compute-image-test.cfg
new file mode 100644
index 0000000..4372f66
--- /dev/null
+++ b/services/nodemanager/doc/compute-image-test.cfg
@@ -0,0 +1,208 @@
+# Sample template for running node manager in compute image testing mode.
+#
+# Relevant sections:
+#
+## Wishlist to use instead of getting it from API or squeue
+#[Testing]
+#wishlist = Standard_D3
+#
+## Destroy compute nodes on shutdown
+#[Daemon]
+#destroy_on_shutdown = yes
+#
+## Disable populating wishlist from jobs queue, slurm queue.
+#[Arvados]
+#jobs_queue = no
+#slurm_queue = no
+
+
+[Manage]
+# The management server responds to http://addr:port/status.json with
+# a snapshot of internal state.
+
+# Management server listening address (default 127.0.0.1)
+#address = 0.0.0.0
+
+# Management server port number (default -1, server is disabled)
+#port = 8989
+
+[Testing]
+wishlist = Standard_D3
+
+[Daemon]
+destroy_on_shutdown = yes
+
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+#dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on rate of spending (in $/hr), will not boot additional nodes
+# if total price of already running nodes meets or exceeds this threshold.
+# default 0 means no limit.
+max_total_price = 0
+
+# Poll Azure nodes and Arvados for new information every N seconds.
+poll_time = 5
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 45
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+#file = node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = DEBUG
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = DEBUG
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host =
+token =
+timeout = 15
+jobs_queue = no
+slurm_queue = no
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = yes
+
+[Cloud]
+provider = azure
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down. These are windows in full minutes, separated by commas. Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on. For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# Azure bills by the minute, so it makes sense to aggressively shut down idle
+# nodes. Specify at least two windows. You can add as many as you need beyond
+# that.
+shutdown_windows = 1, 999999
+
+[Cloud Credentials]
+# Use "azure account list" with the azure CLI to get these values.
+tenant_id = 00000000-0000-0000-0000-000000000000
+subscription_id = 00000000-0000-0000-0000-000000000000
+
+# The following directions are based on
+# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
+#
+# azure config mode arm
+# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
+# azure ad sp create "<Application_Id>"
+# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
+#
+# Use <Application_Id> for "key" and the <Your_Password> for "secret"
+#
+key = 00000000-0000-0000-0000-000000000000
+secret = PASSWORD
+timeout = 60
+region = East US
+
+[Cloud List]
+# The resource group in which the compute node virtual machines will be created
+# and listed.
+ex_resource_group = ArvadosResourceGroup
+
+[Cloud Create]
+# The image id, in the form "Publisher:Offer:SKU:Version"
+image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+
+# Path to a local ssh key file that will be used to provision new nodes.
+ssh_key = /dev/null
+
+# The account name for the admin user that will be provisioned on new nodes.
+ex_user_name = arvadosuser
+
+# The Azure storage account that will be used to store the node OS disk images.
+ex_storage_account = arvadosstorage
+
+# The virtual network the VMs will be associated with.
+ex_network = ArvadosNetwork
+
+# Optional subnet of the virtual network.
+#ex_subnet = default
+
+# Node tags
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+# the API server to ping
+ping_host =
+
+# You can define any number of Size sections to list Azure sizes you're willing
+# to use. The Node Manager should boot the cheapest size(s) that can run jobs
+# in the queue. You must also provide the price per hour, as the Azure compute
+# driver currently does not report prices.
+#
+# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
+# for a list of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Microsoft's provided
+# data fields by setting them here.
+
+[Size Standard_D3]
+cores = 4
+price = 0.56
+
+[Size Standard_D4]
+cores = 8
+price = 1.12
-----------------------------------------------------------------------
hooks/post-receive
--