Subject: [ARVADOS] created: 028b97d2b758d6494b7ce5a63e10d2a1e6aa3a23
From: Git user <git at public.curoverse.com>
Date: Mon Jun 12 13:01:01 EDT 2017

        at 028b97d2b758d6494b7ce5a63e10d2a1e6aa3a23 (commit)
commit 028b97d2b758d6494b7ce5a63e10d2a1e6aa3a23
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Jun 9 16:21:54 2017 -0400
11461: When destroy_on_shutdown is true, only shut down nodes known to have been booted by the current process.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
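For orientation, a minimal sketch of the bookkeeping pattern this change introduces (the class and method names below are illustrative only, not the actual NodeManagerDaemonActor API): remember the IDs of cloud nodes booted by this process, and at shutdown consider only those nodes for destruction.

    class BootTracker(object):
        # Illustrative stand-in for the daemon actor's new bookkeeping.
        def __init__(self, destroy_on_shutdown=False):
            self.destroy_on_shutdown = destroy_on_shutdown
            self.booted_by_this_process = []   # cloud node IDs booted by us
            self.cloud_nodes = {}              # cloud node ID -> node record

        def node_booted(self, node_id, record):
            # Called when a setup actor reports a successfully booted node.
            self.cloud_nodes[node_id] = record
            if self.destroy_on_shutdown:
                self.booted_by_this_process.append(node_id)

        def shutdown_candidates(self):
            # Only nodes this process booted are shut down; pre-existing
            # cloud nodes are left alone.
            return [self.cloud_nodes[i] for i in self.booted_by_this_process
                    if i in self.cloud_nodes]

    if __name__ == "__main__":
        tracker = BootTracker(destroy_on_shutdown=True)
        tracker.cloud_nodes["pre-existing"] = {"hostname": "compute-old"}
        tracker.node_booted("new-node", {"hostname": "compute-new"})
        print(tracker.shutdown_candidates())   # only the node booted here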
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index f3b9765..7ef628d 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -141,6 +141,7 @@ class NodeManagerDaemonActor(actor_class):
self.booting = {} # Actor IDs to ComputeNodeSetupActors
self.sizes_booting = {} # Actor IDs to node size
self.destroy_on_shutdown = destroy_on_shutdown
+ self.booted_by_this_process = []
def on_start(self):
self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
@@ -447,6 +448,9 @@ class NodeManagerDaemonActor(actor_class):
self.node_quota = len(self.cloud_nodes)+1
self._logger.warning("After successful boot setting node quota to %s", self.node_quota)
+ if self.destroy_on_shutdown:
+ self.booted_by_this_process.append(cloud_node.id)
+
self.node_quota = min(self.node_quota, self.max_nodes)
del self.booting[setup_proxy.actor_ref.actor_urn]
del self.sizes_booting[setup_proxy.actor_ref.actor_urn]
@@ -550,11 +554,13 @@ class NodeManagerDaemonActor(actor_class):
def await_shutdown(self):
nodes_up = 0
if self.destroy_on_shutdown:
- for node in self.cloud_nodes.nodes.itervalues():
- # Begin shutdown of all nodes.
- if node.actor and not node.shutdown_actor:
- self._begin_node_shutdown(node.actor, cancellable=False)
- nodes_up = sum(1 for node in self.cloud_nodes.nodes.itervalues() if node.actor)
+ for nodeid in self.booted_by_this_process:
+ # Begin shutdown of nodes booted by the current process.
+ node = self.cloud_nodes.nodes[nodeid]
+ if node.actor:
+ nodes_up += 1
+ if not node.shutdown_actor:
+ self._begin_node_shutdown(node.actor, cancellable=False)
if self.booting or nodes_up:
self._timer.schedule(time.time() + 1, self._later.await_shutdown)
commit 36294bbd0433cca38297673aa06e181f2becab64
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 17:26:27 2017 -0400
11461: Support providing hostname override for testing compute images.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
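For context, a minimal configuration sketch for the option this change reads (placeholder hostnames; the [Testing] section and hostnames key come from the config and launcher diffs below). Each node booted by this process takes the next hostname from the comma-separated list, in order:

    [Testing]
    # Placeholder values; assigned to newly booted nodes in boot order.
    hostnames = compute-test-0,compute-test-1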
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4463ec6..dbeb131 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -58,10 +58,10 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
else:
self.subscribers.add(subscriber)
- def _clean_arvados_node(self, arvados_node, explanation):
+ def _clean_arvados_node(self, arvados_node, explanation, hostname=None):
return self._arvados.nodes().update(
uuid=arvados_node['uuid'],
- body={'hostname': None,
+ body={'hostname': hostname,
'ip_address': None,
'slot_number': None,
'first_ping_at': None,
@@ -94,7 +94,8 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
"""
def __init__(self, timer_actor, arvados_client, cloud_client,
cloud_size, arvados_node=None,
- retry_wait=1, max_retry_wait=180):
+ retry_wait=1, max_retry_wait=180,
+ assigned_hostname=None):
super(ComputeNodeSetupActor, self).__init__(
cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
@@ -102,6 +103,8 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
self.arvados_node = None
self.cloud_node = None
self.error = None
+ self.assigned_hostname = assigned_hostname
+
if arvados_node is None:
self._later.create_arvados_node()
else:
@@ -110,14 +113,14 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def create_arvados_node(self):
- self.arvados_node = self._arvados.nodes().create(body={}).execute()
+ self.arvados_node = self._arvados.nodes().create(body={"hostname": self.assigned_hostname}).execute()
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def prepare_arvados_node(self, node):
self.arvados_node = self._clean_arvados_node(
- node, "Prepared by Node Manager")
+ node, "Prepared by Node Manager", hostname=self.assigned_hostname)
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index cbadc5f..2655c90 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -58,7 +58,10 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
'Manage': {'address': '127.0.0.1',
'port': '-1'},
'Logging': {'file': '/dev/stderr',
- 'level': 'WARNING'}
+ 'level': 'WARNING'},
+ 'Testing': {'wishlist': '',
+ 'hostnames': ''
+ }
}.iteritems():
if not self.has_section(sec_name):
self.add_section(sec_name)
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 68dd54b..f3b9765 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -107,7 +107,8 @@ class NodeManagerDaemonActor(actor_class):
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
max_total_price=0,
- destroy_on_shutdown=False):
+ destroy_on_shutdown=False,
+ assigned_hostnames=[]):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
@@ -129,6 +130,7 @@ class NodeManagerDaemonActor(actor_class):
self.boot_fail_after = boot_fail_after
self.node_stale_after = node_stale_after
self.last_polls = {}
+ self.assigned_hostnames = assigned_hostnames
for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
poll_actor = locals()[poll_name + '_actor']
poll_actor.subscribe(getattr(self._later, 'update_' + poll_name))
@@ -384,6 +386,11 @@ class NodeManagerDaemonActor(actor_class):
if nodes_wanted < 1:
return None
arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
+
+ assigned_hostname = None
+ if self.assigned_hostnames:
+ assigned_hostname = self.assigned_hostnames.pop(0)
+
self._logger.info("Want %i more %s nodes. Booting a node.",
nodes_wanted, cloud_size.name)
new_setup = self._node_setup.start(
@@ -391,7 +398,8 @@ class NodeManagerDaemonActor(actor_class):
arvados_client=self._new_arvados(),
arvados_node=arvados_node,
cloud_client=self._new_cloud(),
- cloud_size=cloud_size).proxy()
+ cloud_size=cloud_size,
+ assigned_hostname=assigned_hostname).proxy()
self.booting[new_setup.actor_ref.actor_urn] = new_setup
self.sizes_booting[new_setup.actor_ref.actor_urn] = cloud_size
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 911c70a..582f39c 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -143,7 +143,8 @@ def main(args=None):
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
max_total_price=config.getfloat('Daemon', 'max_total_price'),
- destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown')).tell_proxy()
+ destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown'),
+ assigned_hostnames=config.get('Testing', 'hostnames').split(",")).tell_proxy()
watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
commit 64da95d5525598c1821d5801f4c8e43462c8cf1f
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 8 15:16:50 2017 -0400
11461: Add configuration options suitable to run an independent instance of
node manager to boot a node, for compute image testing.
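A hedged usage sketch: with the sample file added below saved locally and its Arvados and Azure credentials filled in, a standalone test instance can be started by pointing node manager at that config (assuming the launcher's usual --config invocation; adjust to however your installation runs arvados-node-manager):

    arvados-node-manager --config compute-image-test.cfg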
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index a16e0a8..cbadc5f 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -53,7 +53,8 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
'boot_fail_after': str(sys.maxint),
'node_stale_after': str(60 * 60 * 2),
'watchdog': '600',
- 'node_mem_scaling': '0.95'},
+ 'node_mem_scaling': '0.95',
+ 'destroy_on_shutdown': "no"},
'Manage': {'address': '127.0.0.1',
'port': '-1'},
'Logging': {'file': '/dev/stderr',
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 7e63c78..68dd54b 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -106,7 +106,8 @@ class NodeManagerDaemonActor(actor_class):
node_setup_class=dispatch.ComputeNodeSetupActor,
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
- max_total_price=0):
+ max_total_price=0,
+ destroy_on_shutdown=False):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
@@ -137,6 +138,7 @@ class NodeManagerDaemonActor(actor_class):
self.arvados_nodes = _ArvadosNodeTracker()
self.booting = {} # Actor IDs to ComputeNodeSetupActors
self.sizes_booting = {} # Actor IDs to node size
+ self.destroy_on_shutdown = destroy_on_shutdown
def on_start(self):
self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
@@ -199,6 +201,8 @@ class NodeManagerDaemonActor(actor_class):
except pykka.ActorDeadError:
pass
record.shutdown_actor = None
+ if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
+ del record.cloud_node._nodemanager_recently_booted
# A recently booted node is a node that successfully completed the
# setup actor but has not yet appeared in the cloud node list.
@@ -516,25 +520,35 @@ class NodeManagerDaemonActor(actor_class):
def shutdown(self):
self._logger.info("Shutting down after signal.")
- self.poll_stale_after = -1 # Inhibit starting/stopping nodes
# Shut down pollers
self._server_wishlist_actor.stop()
self._arvados_nodes_actor.stop()
- self._cloud_nodes_actor.stop()
-
- # Clear cloud node list
- self.update_cloud_nodes([])
# Stop setup actors unless they are in the middle of setup.
setup_stops = {key: node.stop_if_no_cloud_node()
for key, node in self.booting.iteritems()}
self.booting = {key: self.booting[key]
for key in setup_stops if not setup_stops[key].get()}
+
+ if not self.destroy_on_shutdown:
+ # Clear cloud node list
+ self._cloud_nodes_actor.stop()
+ self.update_cloud_nodes([])
+ self.poll_stale_after = -1 # Inhibit starting/stopping nodes
+
self._later.await_shutdown()
def await_shutdown(self):
- if self.booting:
+ nodes_up = 0
+ if self.destroy_on_shutdown:
+ for node in self.cloud_nodes.nodes.itervalues():
+ # Begin shutdown of all nodes.
+ if node.actor and not node.shutdown_actor:
+ self._begin_node_shutdown(node.actor, cancellable=False)
+ nodes_up = sum(1 for node in self.cloud_nodes.nodes.itervalues() if node.actor)
+
+ if self.booting or nodes_up:
self._timer.schedule(time.time() + 1, self._later.await_shutdown)
else:
self.stop()
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 1716a57..c3d8f59 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -113,12 +113,13 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
CLIENT_ERRORS = ARVADOS_ERRORS
def __init__(self, client, timer_actor, server_calc,
- jobs_queue, slurm_queue, *args, **kwargs):
+ jobs_queue, slurm_queue, override_wishlist, *args, **kwargs):
super(JobQueueMonitorActor, self).__init__(
client, timer_actor, *args, **kwargs)
self.jobs_queue = jobs_queue
self.slurm_queue = slurm_queue
self._calculator = server_calc
+ self.override_wishlist = override_wishlist
@staticmethod
def coerce_to_mb(x):
@@ -161,6 +162,8 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
def _got_response(self, queue):
server_list = self._calculator.servers_for_queue(queue)
+ if self.override_wishlist:
+ server_list.extend(self.override_wishlist)
self._logger.debug("Calculated wishlist: %s",
', '.join(s.name for s in server_list) or "(empty)")
return super(JobQueueMonitorActor, self)._got_response(server_list)
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 72a285b..911c70a 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -89,6 +89,7 @@ def launch_pollers(config, server_calculator):
config.new_arvados_client(), timer, server_calculator,
config.getboolean('Arvados', 'jobs_queue'),
config.getboolean('Arvados', 'slurm_queue'),
+ [server_calculator.find_size(sz) for sz in config.get('Testing', 'wishlist').split(",")],
poll_time, max_poll_time
).tell_proxy()
return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
@@ -141,7 +142,8 @@ def main(args=None):
config.getint('Daemon', 'boot_fail_after'),
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
- max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
+ max_total_price=config.getfloat('Daemon', 'max_total_price'),
+ destroy_on_shutdown=config.getboolean('Daemon', 'destroy_on_shutdown')).tell_proxy()
watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
diff --git a/services/nodemanager/doc/compute-image-test.cfg b/services/nodemanager/doc/compute-image-test.cfg
new file mode 100644
index 0000000..4372f66
--- /dev/null
+++ b/services/nodemanager/doc/compute-image-test.cfg
@@ -0,0 +1,208 @@
+# Sample template for running node manager in compute image testing mode.
+#
+# Relevant sections:
+#
+## Wishlist to use instead of getting it from API or squeue
+#[Testing]
+#wishlist = Standard_D3
+#
+## Destroy compute nodes on shutdown
+#[Daemon]
+#destroy_on_shutdown = yes
+#
+## Disable populating wishlist from jobs queue, slurm queue.
+#[Arvados]
+#jobs_queue = no
+#slurm_queue = no
+
+
+[Manage]
+# The management server responds to http://addr:port/status.json with
+# a snapshot of internal state.
+
+# Management server listening address (default 127.0.0.1)
+#address = 0.0.0.0
+
+# Management server port number (default -1, server is disabled)
+#port = 8989
+
+[Testing]
+wishlist = Standard_D3
+
+[Daemon]
+destroy_on_shutdown = yes
+
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+#dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on rate of spending (in $/hr), will not boot additional nodes
+# if total price of already running nodes meets or exceeds this threshold.
+# default 0 means no limit.
+max_total_price = 0
+
+# Poll Azure nodes and Arvados for new information every N seconds.
+poll_time = 5
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 45
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+#file = node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = DEBUG
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = DEBUG
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host =
+token =
+timeout = 15
+jobs_queue = no
+slurm_queue = no
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = yes
+
+[Cloud]
+provider = azure
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down. These are windows in full minutes, separated by commas. Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on. For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# Azure bills by the minute, so it makes sense to aggressively shut down idle
+# nodes. Specify at least two windows. You can add as many as you need beyond
+# that.
+shutdown_windows = 1, 999999
+
+[Cloud Credentials]
+# Use "azure account list" with the azure CLI to get these values.
+tenant_id = 00000000-0000-0000-0000-000000000000
+subscription_id = 00000000-0000-0000-0000-000000000000
+
+# The following directions are based on
+# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
+#
+# azure config mode arm
+# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
+# azure ad sp create "<Application_Id>"
+# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
+#
+# Use <Application_Id> for "key" and the <Your_Password> for "secret"
+#
+key = 00000000-0000-0000-0000-000000000000
+secret = PASSWORD
+timeout = 60
+region = East US
+
+[Cloud List]
+# The resource group in which the compute node virtual machines will be created
+# and listed.
+ex_resource_group = ArvadosResourceGroup
+
+[Cloud Create]
+# The image id, in the form "Publisher:Offer:SKU:Version"
+image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+
+# Path to a local ssh key file that will be used to provision new nodes.
+ssh_key = /dev/null
+
+# The account name for the admin user that will be provisioned on new nodes.
+ex_user_name = arvadosuser
+
+# The Azure storage account that will be used to store the node OS disk images.
+ex_storage_account = arvadosstorage
+
+# The virtual network the VMs will be associated with.
+ex_network = ArvadosNetwork
+
+# Optional subnet of the virtual network.
+#ex_subnet = default
+
+# Node tags
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+# the API server to ping
+ping_host =
+
+# You can define any number of Size sections to list Azure sizes you're willing
+# to use. The Node Manager should boot the cheapest size(s) that can run jobs
+# in the queue. You must also provide the price per hour, as the Azure compute
+# driver currently does not report prices.
+#
+# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
+# for a list of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Microsoft's provided
+# data fields by setting them here.
+
+[Size Standard_D3]
+cores = 4
+price = 0.56
+
+[Size Standard_D4]
+cores = 8
+price = 1.12
-----------------------------------------------------------------------
hooks/post-receive
--