[ARVADOS] created: c46ab7c622127315b4e90c98e859588a3403267a

git at public.curoverse.com git at public.curoverse.com
Tue Aug 18 17:22:01 EDT 2015


        at  c46ab7c622127315b4e90c98e859588a3403267a (commit)


commit c46ab7c622127315b4e90c98e859588a3403267a
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Aug 18 17:23:14 2015 -0400

    Create, destroy, tag nodes.

diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
index c054fac..ecab75c 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/azure.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/azure.py
@@ -3,41 +3,67 @@
 from __future__ import absolute_import, print_function
 
 import time
+from operator import attrgetter
 
 import libcloud.compute.base as cloud_base
 import libcloud.compute.providers as cloud_provider
 import libcloud.compute.types as cloud_types
 
 from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn
+from .. import arvados_node_fqdn, arvados_timestamp, ARVADOS_TIMEFMT
 
 class ComputeNodeDriver(BaseComputeNodeDriver):
 
-    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE)
+    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE_ARM)
     SEARCH_CACHE = {}
 
     def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
                  driver_class=DEFAULT_DRIVER):
+        list_kwargs["ex_resource_group"] = create_kwargs["ex_resource_group"]
+
+        self.tags = {key[4:]: value
+                     for key, value in create_kwargs.iteritems()
+                     if key.startswith('tag_')}
         super(ComputeNodeDriver, self).__init__(
             auth_kwargs, list_kwargs, create_kwargs,
             driver_class)
 
     def arvados_create_kwargs(self, arvados_node):
-        return {'name': arvados_node["uuid"]}
+        cluster_id, _, node_id = arvados_node['uuid'].split('-')
+        name = 'compute-{}-{}'.format(node_id, cluster_id)
+        tags = {
+            'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
+            'arv-ping-url': self._make_ping_url(arvados_node),
+        }
+        tags.update(self.tags)
+        return {
+            'name': name,
+            'ex_tags': tags,
+        }
 
     def sync_node(self, cloud_node, arvados_node):
-        print("In sync_node")
+        hostname = arvados_node_fqdn(arvados_node)
+        self.real.ex_create_tags(cloud_node.id, {"hostname": hostname})
+
+    def _init_image(self, urn):
+        return "image", self.list_images(ex_urn=urn)[0]
 
-    def _init_image(self, image):
-        return 'image', self.search_for(image, 'list_images')
+    def _init_ssh_key(self, filename):
+        with open(filename) as ssh_file:
+            key = cloud_base.NodeAuthSSHKey(ssh_file.read())
+        return 'auth', key
 
-    def _init_password(self, password):
-        return 'auth', cloud_base.NodeAuthPassword(password)
+    def list_nodes(self):
+        # Azure only supports filtering node lists by resource group.
+        # Do our own filtering based on tag.
+        return [node for node in
+                super(ComputeNodeDriver, self).list_nodes()
+                if node.extra["tags"].get("arvados-class") == self.tags["arvados-class"]]
 
     @classmethod
     def node_fqdn(cls, node):
-        return node.name
+        return node.extra["tags"].get("hostname")
 
     @classmethod
     def node_start_time(cls, node):
-        pass
+        return arvados_timestamp(node.extra["tags"].get("booted_at"))

commit d5341150545efd0960acf34186ca18b98a1b1860
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Aug 14 13:48:23 2015 -0400

    6507: Initial commit

diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 042f6a5..16134a2 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -9,6 +9,8 @@ from libcloud.compute.base import NodeDriver
 
 from ...config import NETWORK_ERRORS
 
+import pprint
+
 class BaseComputeNodeDriver(object):
     """Abstract base class for compute node drivers.
 
@@ -56,7 +58,7 @@ class BaseComputeNodeDriver(object):
     def _init_ping_host(self, ping_host):
         self.ping_host = ping_host
 
-    def search_for(self, term, list_method, key=attrgetter('id')):
+    def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
         """Return one matching item from a list of cloud objects.
 
         Raises ValueError if the number of matching objects is not exactly 1.
@@ -71,7 +73,8 @@ class BaseComputeNodeDriver(object):
         """
         cache_key = (list_method, term)
         if cache_key not in self.SEARCH_CACHE:
-            results = [item for item in getattr(self.real, list_method)()
+            items = getattr(self.real, list_method)(**kwargs)
+            results = [item for item in items
                        if key(item) == term]
             count = len(results)
             if count != 1:
diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
new file mode 100644
index 0000000..c054fac
--- /dev/null
+++ b/services/nodemanager/arvnodeman/computenode/driver/azure.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+
+import libcloud.compute.base as cloud_base
+import libcloud.compute.providers as cloud_provider
+import libcloud.compute.types as cloud_types
+
+from . import BaseComputeNodeDriver
+from .. import arvados_node_fqdn
+
+class ComputeNodeDriver(BaseComputeNodeDriver):
+
+    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE)
+    SEARCH_CACHE = {}
+
+    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
+                 driver_class=DEFAULT_DRIVER):
+        super(ComputeNodeDriver, self).__init__(
+            auth_kwargs, list_kwargs, create_kwargs,
+            driver_class)
+
+    def arvados_create_kwargs(self, arvados_node):
+        return {'name': arvados_node["uuid"]}
+
+    def sync_node(self, cloud_node, arvados_node):
+        print("In sync_node")
+
+    def _init_image(self, image):
+        return 'image', self.search_for(image, 'list_images')
+
+    def _init_password(self, password):
+        return 'auth', cloud_base.NodeAuthPassword(password)
+
+    @classmethod
+    def node_fqdn(cls, node):
+        return node.name
+
+    @classmethod
+    def node_start_time(cls, node):
+        pass
diff --git a/services/nodemanager/doc/azure.example.cfg b/services/nodemanager/doc/azure.example.cfg
new file mode 100644
index 0000000..8fad85d
--- /dev/null
+++ b/services/nodemanager/doc/azure.example.cfg
@@ -0,0 +1,119 @@
+# Azure configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes.  For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+#dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes
+# running at all times.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Poll Azure nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down.  Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 1800
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = azure
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down.  These are windows in full minutes, separated by commas.  Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on.  For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# Azure bills by the minute, so it makes sense to aggressively shut down idle
+# nodes.  Specify at least two windows.  You can add as many as you need beyond
+# that.
+shutdown_windows = 20, 999999
+
+[Cloud Credentials]
+subscription_id = SUBSCRIPTION_ID
+key_file = PATH_TO_PEM_FILE
+timeout = 60
+
+[Cloud List]
+# This section defines filters that find compute nodes.
+# Tags that you specify here will automatically be added to nodes you create.
+# Replace colons in Microsoft filters with underscores
+# (e.g., write "tag:mytag" as "tag_mytag").
+instance-state-name = running
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+[Cloud Create]
+image: ???
+ex_cloud_service_name: ???
+
+[Size A3]
+# You can define any number of Size sections to list Azure sizes you're
+# willing to use.  The Node Manager should boot the cheapest size(s) that
+# can run jobs in the queue (N.B.: defining more than one size has not been
+# tested yet).
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs.  You can also override Microsoft's provided
+# data fields by setting the same names here.
+cores = 4
diff --git a/services/nodemanager/setup.py b/services/nodemanager/setup.py
index d9fcbcf..502690b 100644
--- a/services/nodemanager/setup.py
+++ b/services/nodemanager/setup.py
@@ -25,7 +25,7 @@ setup(name='arvados-node-manager',
       license='GNU Affero General Public License, version 3.0',
       packages=find_packages(),
       install_requires=[
-        'apache-libcloud>=0.16',
+        'apache-libcloud>=0.18',
         'arvados-python-client>=0.1.20150206225333',
         'pykka',
         'python-daemon',

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list