[ARVADOS] created: c46ab7c622127315b4e90c98e859588a3403267a
git at public.curoverse.com
git at public.curoverse.com
Tue Aug 18 17:22:01 EDT 2015
at c46ab7c622127315b4e90c98e859588a3403267a (commit)
commit c46ab7c622127315b4e90c98e859588a3403267a
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Aug 18 17:23:14 2015 -0400
Create, destroy, tag nodes.
diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
index c054fac..ecab75c 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/azure.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/azure.py
@@ -3,41 +3,67 @@
from __future__ import absolute_import, print_function
import time
+from operator import attrgetter
import libcloud.compute.base as cloud_base
import libcloud.compute.providers as cloud_provider
import libcloud.compute.types as cloud_types
from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn
+from .. import arvados_node_fqdn, arvados_timestamp, ARVADOS_TIMEFMT
class ComputeNodeDriver(BaseComputeNodeDriver):
- DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE)
+ DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE_ARM)
SEARCH_CACHE = {}
def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
driver_class=DEFAULT_DRIVER):
+ list_kwargs["ex_resource_group"] = create_kwargs["ex_resource_group"]
+
+ self.tags = {key[4:]: value
+ for key, value in create_kwargs.iteritems()
+ if key.startswith('tag_')}
super(ComputeNodeDriver, self).__init__(
auth_kwargs, list_kwargs, create_kwargs,
driver_class)
def arvados_create_kwargs(self, arvados_node):
- return {'name': arvados_node["uuid"]}
+ cluster_id, _, node_id = arvados_node['uuid'].split('-')
+ name = 'compute-{}-{}'.format(node_id, cluster_id)
+ tags = {
+ 'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
+ 'arv-ping-url': self._make_ping_url(arvados_node),
+ }
+ tags.update(self.tags)
+ return {
+ 'name': name,
+ 'ex_tags': tags,
+ }
def sync_node(self, cloud_node, arvados_node):
- print("In sync_node")
+ hostname = arvados_node_fqdn(arvados_node)
+ self.real.ex_create_tags(cloud_node.id, {"hostname": hostname})
+
+ def _init_image(self, urn):
+ return "image", self.list_images(ex_urn=urn)[0]
- def _init_image(self, image):
- return 'image', self.search_for(image, 'list_images')
+ def _init_ssh_key(self, filename):
+ with open(filename) as ssh_file:
+ key = cloud_base.NodeAuthSSHKey(ssh_file.read())
+ return 'auth', key
- def _init_password(self, password):
- return 'auth', cloud_base.NodeAuthPassword(password)
+ def list_nodes(self):
+ # Azure only supports filtering node lists by resource group.
+ # Do our own filtering based on tag.
+ return [node for node in
+ super(ComputeNodeDriver, self).list_nodes()
+ if node.extra["tags"].get("arvados-class") == self.tags["arvados-class"]]
@classmethod
def node_fqdn(cls, node):
- return node.name
+ return node.extra["tags"].get("hostname")
@classmethod
def node_start_time(cls, node):
- pass
+ return arvados_timestamp(node.extra["tags"].get("booted_at"))
commit d5341150545efd0960acf34186ca18b98a1b1860
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Aug 14 13:48:23 2015 -0400
6507: Initial commit
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 042f6a5..16134a2 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -9,6 +9,8 @@ from libcloud.compute.base import NodeDriver
from ...config import NETWORK_ERRORS
+import pprint
+
class BaseComputeNodeDriver(object):
"""Abstract base class for compute node drivers.
@@ -56,7 +58,7 @@ class BaseComputeNodeDriver(object):
def _init_ping_host(self, ping_host):
self.ping_host = ping_host
- def search_for(self, term, list_method, key=attrgetter('id')):
+ def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
"""Return one matching item from a list of cloud objects.
Raises ValueError if the number of matching objects is not exactly 1.
@@ -71,7 +73,8 @@ class BaseComputeNodeDriver(object):
"""
cache_key = (list_method, term)
if cache_key not in self.SEARCH_CACHE:
- results = [item for item in getattr(self.real, list_method)()
+ items = getattr(self.real, list_method)(**kwargs)
+ results = [item for item in items
if key(item) == term]
count = len(results)
if count != 1:
diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
new file mode 100644
index 0000000..c054fac
--- /dev/null
+++ b/services/nodemanager/arvnodeman/computenode/driver/azure.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+
+import libcloud.compute.base as cloud_base
+import libcloud.compute.providers as cloud_provider
+import libcloud.compute.types as cloud_types
+
+from . import BaseComputeNodeDriver
+from .. import arvados_node_fqdn
+
+class ComputeNodeDriver(BaseComputeNodeDriver):
+
+ DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE)
+ SEARCH_CACHE = {}
+
+ def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
+ driver_class=DEFAULT_DRIVER):
+ super(ComputeNodeDriver, self).__init__(
+ auth_kwargs, list_kwargs, create_kwargs,
+ driver_class)
+
+ def arvados_create_kwargs(self, arvados_node):
+ return {'name': arvados_node["uuid"]}
+
+ def sync_node(self, cloud_node, arvados_node):
+ print("In sync_node")
+
+ def _init_image(self, image):
+ return 'image', self.search_for(image, 'list_images')
+
+ def _init_password(self, password):
+ return 'auth', cloud_base.NodeAuthPassword(password)
+
+ @classmethod
+ def node_fqdn(cls, node):
+ return node.name
+
+ @classmethod
+ def node_start_time(cls, node):
+ pass
diff --git a/services/nodemanager/doc/azure.example.cfg b/services/nodemanager/doc/azure.example.cfg
new file mode 100644
index 0000000..8fad85d
--- /dev/null
+++ b/services/nodemanager/doc/azure.example.cfg
@@ -0,0 +1,119 @@
+# Azure configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+#dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes
+# running at all times.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Poll Azure nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 1800
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = azure
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down. These are windows in full minutes, separated by commas. Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on. For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# Azure bills by the minute, so it makes sense to aggressively shut down idle
+# nodes. Specify at least two windows. You can add as many as you need beyond
+# that.
+shutdown_windows = 20, 999999
+
+[Cloud Credentials]
+subscription_id = SUBSCRIPTION_ID
+key_file = PATH_TO_PEM_FILE
+timeout = 60
+
+[Cloud List]
+# This section defines filters that find compute nodes.
+# Tags that you specify here will automatically be added to nodes you create.
+# Replace colons in Microsoft filters with underscores
+# (e.g., write "tag:mytag" as "tag_mytag").
+instance-state-name = running
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+[Cloud Create]
+image: ???
+ex_cloud_service_name: ???
+
+[Size A3]
+# You can define any number of Size sections to list Azure sizes you're
+# willing to use. The Node Manager should boot the cheapest size(s) that
+# can run jobs in the queue (N.B.: defining more than one size has not been
+# tested yet).
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Microsoft's provided
+# data fields by setting the same names here.
+cores = 4
diff --git a/services/nodemanager/setup.py b/services/nodemanager/setup.py
index d9fcbcf..502690b 100644
--- a/services/nodemanager/setup.py
+++ b/services/nodemanager/setup.py
@@ -25,7 +25,7 @@ setup(name='arvados-node-manager',
license='GNU Affero General Public License, version 3.0',
packages=find_packages(),
install_requires=[
- 'apache-libcloud>=0.16',
+ 'apache-libcloud>=0.18',
'arvados-python-client>=0.1.20150206225333',
'pykka',
'python-daemon',
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list