[ARVADOS] created: 8f28c5303bce12a562eb1e0ee861c7168d1c3f09
git at public.curoverse.com
Mon Feb 16 11:15:09 EST 2015
at 8f28c5303bce12a562eb1e0ee861c7168d1c3f09 (commit)
commit 8f28c5303bce12a562eb1e0ee861c7168d1c3f09
Author: Brett Smith <brett at curoverse.com>
Date: Mon Feb 16 11:06:41 2015 -0500
4138: Prepare Node Manager GCE driver for production.
* Set node metadata in more appropriate places.
* Bridge more differences between GCE and EC2, like the fact that
sizes are listed for each location where they're available.
* Use more infrastructure from BaseComputeNodeDriver to reduce code
duplication.
* Load as many objects as possible at initialization time, to reduce
API overhead of creating nodes.
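
The diff below implements these changes. As a rough sketch of the underlying pattern (class and helper names here are illustrative, not the project's actual code), resolving configured names to libcloud objects once at start-up, keying lookups by name, and constraining size listings to the chosen location looks something like this:

    class SketchGCEDriver(object):
        # Cache resolved objects so repeated lookups don't hit the API again.
        SEARCH_CACHE = {}

        def __init__(self, real_driver, image_name, location_name):
            self.real = real_driver
            # Resolve names to full libcloud objects up front so create_node
            # doesn't need extra API round-trips later.
            self.image = self.search_for(image_name, 'list_images')
            self.location = self.search_for(location_name, 'list_locations')

        @staticmethod
        def _name_key(cloud_object):
            # GCE objects are addressed by name rather than id.
            return cloud_object.name

        def search_for(self, term, list_method, key=None):
            key = key or self._name_key
            cache_key = (list_method, term)
            if cache_key not in self.SEARCH_CACHE:
                matches = [item for item in getattr(self.real, list_method)()
                           if key(item) == term]
                if len(matches) != 1:
                    raise ValueError('{} returned {} matches for {!r}'.format(
                        list_method, len(matches), term))
                self.SEARCH_CACHE[cache_key] = matches[0]
            return self.SEARCH_CACHE[cache_key]

        def list_sizes(self):
            # Unlike EC2, GCE lists sizes per location, so pass the location
            # object that was resolved at initialization time.
            return self.real.list_sizes(self.location)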
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index a8edc43..6921839 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -6,10 +6,8 @@ import functools
import json
import time
-import libcloud.compute.base as cloud_base
import libcloud.compute.providers as cloud_provider
import libcloud.compute.types as cloud_types
-from libcloud.compute.drivers import gce
from . import BaseComputeNodeDriver
from .. import arvados_node_fqdn
@@ -21,73 +19,94 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
"""
DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.GCE)
SEARCH_CACHE = {}
- ssh_key = None
- service_accounts = None
+ node_start_times = {}
def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
driver_class=DEFAULT_DRIVER):
+ list_kwargs = list_kwargs.copy()
+ tags_str = list_kwargs.pop('tags', '')
+ if not tags_str.strip():
+ self.node_tags = frozenset()
+ else:
+ self.node_tags = frozenset(t.strip() for t in tags_str.split(','))
+ create_kwargs = create_kwargs.copy()
+ create_kwargs.setdefault('external_ip', None)
+ create_kwargs.setdefault('ex_metadata', {})
super(ComputeNodeDriver, self).__init__(
auth_kwargs, list_kwargs, create_kwargs,
driver_class)
- for key in self.create_kwargs.keys():
- init_method = getattr(self, '_init_' + key, None)
- if init_method is not None:
- new_pair = init_method(self.create_kwargs.pop(key))
- if new_pair is not None:
- self.create_kwargs[new_pair[0]] = new_pair[1]
+ @staticmethod
+ def _name_key(cloud_object):
+ return cloud_object.name
def _init_image(self, image_name):
- return 'image', image_name
+ return 'image', self.search_for(
+ image_name, 'list_images', self._name_key)
- def _init_location(self, location):
- return 'location', location
+ def _init_location(self, location_name):
+ return 'location', self.search_for(
+ location_name, 'list_locations', self._name_key)
- def _init_ping_host(self, ping_host):
- self.ping_host = ping_host
+ def _init_network(self, network_name):
+ return 'ex_network', self.search_for(
+ network_name, 'ex_list_networks', self._name_key)
def _init_service_accounts(self, service_accounts_str):
- self.service_accounts = json.loads(service_accounts_str)
+ return 'ex_service_accounts', json.loads(service_accounts_str)
def _init_ssh_key(self, filename):
+ # SSH keys are delivered to GCE nodes via ex_metadata: see
+ # http://stackoverflow.com/questions/26752617/creating-sshkeys-for-gce-instance-using-libcloud
with open(filename) as ssh_file:
- self.ssh_key = ssh_file.read().strip()
+ self.create_kwargs['ex_metadata']['sshKeys'] = (
+ 'root:' + ssh_file.read().strip())
- def arvados_create_kwargs(self, arvados_node):
- result = {'ex_metadata': self.list_kwargs.copy() }
- ping_secret = arvados_node['info'].get('ping_secret')
- if ping_secret is not None:
- ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
- format(self.ping_host, arvados_node['uuid'],
- ping_secret))
- result['ex_metadata']['pingUrl'] = ping_url
- if self.service_accounts is not None:
- result['ex_service_accounts'] = self.service_accounts
+ def list_sizes(self):
+ return super(ComputeNodeDriver, self).list_sizes(
+ self.create_kwargs['location'])
- # SSH keys are delivered to GCE nodes via ex_metadata: see
- # http://stackoverflow.com/questions/26752617/creating-sshkeys-for-gce-instance-using-libcloud
- if self.ssh_key is not None:
- result['ex_metadata']['sshKeys'] = 'root:{}'.format(self.ssh_key)
+ def arvados_create_kwargs(self, arvados_node):
+ cluster_id, _, node_id = arvados_node['uuid'].split('-')
+ result = {'name': 'compute-{}-{}'.format(node_id, cluster_id),
+ 'ex_metadata': self.create_kwargs['ex_metadata'].copy(),
+ 'ex_tags': list(self.node_tags)}
+ result['ex_metadata']['hostname'] = arvados_node_fqdn(arvados_node)
+ result['ex_metadata']['user-data'] = self._make_ping_url(arvados_node)
return result
- def create_node(self, size, arvados_node):
- kwargs = self.create_kwargs.copy()
- kwargs.update(self.arvados_create_kwargs(arvados_node))
- kwargs.setdefault('name', 'arv-{}'.format(arvados_node['uuid']))
- kwargs['size'] = size
- return self.real.create_node(**kwargs)
-
- # When an Arvados node is synced with a GCE node, the Arvados hostname
- # is forwarded in a GCE tag 'hostname-foo'.
- # TODO(twp): implement an ex_set_metadata method (at least until
- # libcloud supports the API setMetadata method) so we can pass this
- # sensibly in the node metadata.
+ def list_nodes(self):
+ # The GCE libcloud driver only supports filtering node lists by zone.
+ # Do our own filtering based on tag list.
+ return [node for node in
+ super(ComputeNodeDriver, self).list_nodes()
+ if self.node_tags.issubset(node.extra.get('tags', []))]
+
def sync_node(self, cloud_node, arvados_node):
- tags = ['hostname-{}'.format(arvados_node_fqdn(arvados_node))]
- self.real.ex_set_node_tags(cloud_node, tags)
+ hostname = arvados_node_fqdn(arvados_node)
+ metadata_req = cloud_node.extra['metadata'].copy()
+ for data_dict in metadata_req.setdefault('items', []):
+ if data_dict['key'] == 'hostname':
+ data_dict['value'] = hostname
+ break
+ else:
+ metadata_req['items'].append({'key': 'hostname', 'value': hostname})
+ response = self.real.connection.async_request(
+ '/zones/{}/instances/{}/setMetadata'.format(
+ cloud_node.extra['zone'].name, cloud_node.name),
+ method='POST', data=metadata_req)
+ if not response.success():
+ raise Exception("setMetadata error: {}".format(response.error))
+
+ def destroy_node(self, node):
+ success = super(ComputeNodeDriver, self).destroy_node(node)
+ if success:
+ self.node_start_times.pop(node.id, None)
+ return success
@classmethod
def node_start_time(cls, node):
- time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
- return time.mktime(time.strptime(
- time_str,'%Y-%m-%dT%H:%M:%S%Z')) - time.timezone
+ # Launch time isn't available on GCE node records. Thankfully that's
+ # not too big a deal because they have by-minute billing.
+ # Fake an answer based on the first time we see it.
+ return cls.node_start_times.setdefault(node.id, time.time())
diff --git a/services/nodemanager/doc/gce.example.cfg b/services/nodemanager/doc/gce.example.cfg
index 6369383..3789cea 100644
--- a/services/nodemanager/doc/gce.example.cfg
+++ b/services/nodemanager/doc/gce.example.cfg
@@ -65,18 +65,20 @@ insecure = no
[Cloud]
provider = gce
-# XXX(twp): figure out good default settings for GCE
-# It's usually most cost-effective to shut down compute nodes during narrow
-# windows of time. For example, EC2 bills each node by the hour, so the best
-# time to shut down a node is right before a new hour of uptime starts.
-# Shutdown windows define these periods of time. These are windows in
-# full minutes, separated by commas. Counting from the time the node is
-# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
-# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
-# For example, "54, 5, 1" means the node may shut down from the 54th to the
-# 59th minute of each hour of uptime.
-# Specify at least two windows. You can add as many as you need beyond that.
-shutdown_windows = 54, 5, 1
+# Shutdown windows define periods of time when a node may and may not
+# be shut down. These are windows in full minutes, separated by
+# commas. Counting from the time the node is booted, the node WILL
+# NOT shut down for N1 minutes; then it MAY shut down for N2 minutes;
+# then it WILL NOT shut down for N3 minutes; and so on. For example,
+# "54, 5, 1" means the node may shut down from the 54th to the 59th
+# minute of each hour of uptime.
+# GCE bills by the minute, and does not provide information about
+# when a node booted. Because of that, Node Manager will pretend a
+# node booted when it's first seen by the process. To help
+# counterbalance that, these shutdown settings are very aggressive.
+# You may want to adjust this if you want more continuity of service
+# from a single node.
+shutdown_windows = 20, 999999
[Cloud Credentials]
user_id = client_email_address at developer.gserviceaccount.com
@@ -93,7 +95,10 @@ timeout = 60
# credential_file =
[Cloud List]
-# Keywords here will be used to populate the metadata field for a GCE node.
+# A comma-separated list of tags that must be applied to a node for it to
+# be considered a compute node.
+# The driver will automatically apply these tags to nodes it creates.
+tags = zyxwv, compute
[Cloud Create]
# New compute nodes will send pings to Arvados at this host.
@@ -108,6 +113,7 @@ ping_host = hostname:port
# * Valid location (zone) names: https://cloud.google.com/compute/docs/zones
image = debian-7
location = us-central1-a
+# network = your_network_name
# JSON string of service account authorizations for this cluster.
# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
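
To make the shutdown_windows comment above concrete, here is a small, purely illustrative sketch of the window semantics it describes (this is not the actual Node Manager timer): the windows alternate between 'must stay up' and 'may shut down' periods, and the sequence repeats once it is exhausted.

    def may_shut_down(uptime_minutes, windows):
        """Return True if a node at this uptime is in a 'MAY shut down' window."""
        position = uptime_minutes % sum(windows)  # the window cycle repeats
        may = False  # the first window is always a 'WILL NOT shut down' period
        for length in windows:
            if position < length:
                return may
            position -= length
            may = not may
        return may

    # With the GCE defaults above: never shut down during the first 20 minutes
    # of uptime, then eligible for shutdown essentially indefinitely.
    assert may_shut_down(5, [20, 999999]) is False
    assert may_shut_down(45, [20, 999999]) is True
    # With the EC2-style "54, 5, 1": eligible from the 54th to the 59th minute.
    assert may_shut_down(56, [54, 5, 1]) is True
    assert may_shut_down(61, [54, 5, 1]) is False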
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
index d4b73f7..26f8ca7 100644
--- a/services/nodemanager/tests/test_computenode_driver_gce.py
+++ b/services/nodemanager/tests/test_computenode_driver_gce.py
@@ -2,6 +2,7 @@
from __future__ import absolute_import, print_function
+import json
import time
import unittest
@@ -19,80 +20,145 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
self.assertTrue(self.driver_mock.called)
self.assertEqual(kwargs, self.driver_mock.call_args[1])
- def test_create_location_loaded_at_initialization(self):
- kwargs = {'location': 'testregion'}
- driver = self.new_driver(create_kwargs=kwargs)
- self.assertTrue(self.driver_mock().list_locations)
-
- def test_create_image_loaded_at_initialization(self):
- kwargs = {'image': 'testimage'}
- driver = self.new_driver(create_kwargs=kwargs)
- self.assertTrue(self.driver_mock().list_images)
+ def test_create_image_loaded_at_initialization_by_name(self):
+ image_mocks = [testutil.cloud_object_mock(c) for c in 'abc']
+ list_method = self.driver_mock().list_images
+ list_method.return_value = image_mocks
+ driver = self.new_driver(create_kwargs={'image': 'B'})
+ self.assertEqual(1, list_method.call_count)
+
+ def test_list_sizes_requires_location_match(self):
+ locations = [testutil.cloud_object_mock(name)
+ for name in ['there', 'here', 'other']]
+ self.driver_mock().list_locations.return_value = locations
+ driver = self.new_driver(create_kwargs={'location': 'HERE'})
+ driver.list_sizes()
+ self.assertIs(locations[1],
+ self.driver_mock().list_sizes.call_args[0][0])
def test_create_includes_ping_secret(self):
arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
driver = self.new_driver()
driver.create_node(testutil.MockSize(1), arv_node)
- create_method = self.driver_mock().create_node
- self.assertTrue(create_method.called)
- create_metadata = create_method.call_args[1].get('ex_metadata')
- self.assertIsInstance(create_metadata, dict)
- self.assertIn('ping_secret=ssshh',
- create_metadata.get('pingUrl', 'arg missing'))
-
- def test_generate_metadata_for_new_arvados_node(self):
- arv_node = testutil.arvados_node_mock(8)
- driver = self.new_driver(list_kwargs={'list': 'test'})
- self.assertEqual({'ex_metadata': {'list': 'test'}},
- driver.arvados_create_kwargs(arv_node))
+ metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
+ self.assertIn('ping_secret=ssshh', metadata.get('user-data'))
- def test_tags_set_default_hostname_from_new_arvados_node(self):
- arv_node = testutil.arvados_node_mock(hostname=None)
- cloud_node = testutil.cloud_node_mock(1)
+ def test_create_sets_default_hostname(self):
driver = self.new_driver()
- driver.sync_node(cloud_node, arv_node)
- tag_mock = self.driver_mock().ex_set_node_tags
- self.assertTrue(tag_mock.called)
- self.assertEqual(['hostname-dynamic.compute.zzzzz.arvadosapi.com'],
- tag_mock.call_args[0][1])
-
- def test_sync_node_sets_static_hostname(self):
+ driver.create_node(testutil.MockSize(1),
+ testutil.arvados_node_mock(254, hostname=None))
+ create_kwargs = self.driver_mock().create_node.call_args[1]
+ self.assertEqual('compute-0000000000000fe-zzzzz',
+ create_kwargs.get('name'))
+ self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
+ create_kwargs.get('ex_metadata', {}).get('hostname'))
+
+ def test_create_tags_from_list_tags(self):
+ driver = self.new_driver(list_kwargs={'tags': 'testA, testB'})
+ driver.create_node(testutil.MockSize(1), testutil.arvados_node_mock())
+ self.assertEqual(['testA', 'testB'],
+ self.driver_mock().create_node.call_args[1]['ex_tags'])
+
+ def test_list_nodes_requires_tags_match(self):
+ # A node matches if our list tags are a subset of the node's tags.
+ # Test behavior with no tags, no match, partial matches, different
+ # order, and strict supersets.
+ cloud_mocks = [
+ testutil.cloud_node_mock(node_num, tags=tag_set)
+ for node_num, tag_set in enumerate(
+ [[], ['bad'], ['good'], ['great'], ['great', 'ok'],
+ ['great', 'good'], ['good', 'fantastic', 'great']])]
+ cloud_mocks.append(testutil.cloud_node_mock())
+ self.driver_mock().list_nodes.return_value = cloud_mocks
+ driver = self.new_driver(list_kwargs={'tags': 'good, great'})
+ self.assertItemsEqual(['5', '6'], [n.id for n in driver.list_nodes()])
+
+ def check_sync_node_updates_hostname_tag(self, plain_metadata):
+ start_metadata = {
+ 'kind': 'compute#metadata',
+ 'fingerprint': 'testprint',
+ 'items': [{'key': key, 'value': plain_metadata[key]}
+ for key in plain_metadata],
+ }
arv_node = testutil.arvados_node_mock(1)
- cloud_node = testutil.cloud_node_mock(2)
+ cloud_node = testutil.cloud_node_mock(
+ 2, metadata=start_metadata.copy(),
+ zone=testutil.cloud_object_mock('testzone'))
driver = self.new_driver()
driver.sync_node(cloud_node, arv_node)
- tag_mock = self.driver_mock().ex_set_node_tags
- self.assertTrue(tag_mock.called)
- self.assertEqual(['hostname-compute1.zzzzz.arvadosapi.com'],
- tag_mock.call_args[0][1])
-
- def test_node_create_time(self):
- refsecs = int(time.time())
- reftuple = time.gmtime(refsecs)
- node = testutil.cloud_node_mock()
- node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
- reftuple)}
- self.assertEqual(refsecs, gce.ComputeNodeDriver.node_start_time(node))
-
- def test_generate_metadata_for_new_arvados_node(self):
+ args, kwargs = self.driver_mock().connection.async_request.call_args
+ self.assertEqual('/zones/TESTZONE/instances/2/setMetadata', args[0])
+ for key in ['kind', 'fingerprint']:
+ self.assertEqual(start_metadata[key], kwargs['data'][key])
+ plain_metadata['hostname'] = 'compute1.zzzzz.arvadosapi.com'
+ self.assertEqual(
+ plain_metadata,
+ {item['key']: item['value'] for item in kwargs['data']['items']})
+
+ def test_sync_node_updates_hostname_tag(self):
+ self.check_sync_node_updates_hostname_tag(
+ {'testkey': 'testvalue', 'hostname': 'startvalue'})
+
+ def test_sync_node_adds_hostname_tag(self):
+ self.check_sync_node_updates_hostname_tag({'testkey': 'testval'})
+
+ def test_sync_node_raises_exception_on_failure(self):
arv_node = testutil.arvados_node_mock(8)
- driver = self.new_driver(list_kwargs={'list': 'test'})
- self.assertEqual({'ex_metadata': {'list': 'test'}},
- driver.arvados_create_kwargs(arv_node))
+ cloud_node = testutil.cloud_node_mock(
+ 9, metadata={}, zone=testutil.cloud_object_mock('failzone'))
+ mock_response = self.driver_mock().connection.async_request()
+ mock_response.success.return_value = False
+ mock_response.error = 'sync error test'
+ driver = self.new_driver()
+ with self.assertRaises(Exception) as err_check:
+ driver.sync_node(cloud_node, arv_node)
+ self.assertIs(err_check.exception.__class__, Exception)
+ self.assertIn('sync error test', str(err_check.exception))
+
+ def test_node_create_time_static(self):
+ node = testutil.cloud_node_mock()
+ with mock.patch('time.time', return_value=1) as time_mock:
+ result1 = gce.ComputeNodeDriver.node_start_time(node)
+ time_mock.return_value += 1
+ self.assertEqual(result1,
+ gce.ComputeNodeDriver.node_start_time(node))
+
+ def test_node_create_time_varies_by_node(self):
+ node1 = testutil.cloud_node_mock(1)
+ node2 = testutil.cloud_node_mock(2)
+ with mock.patch('time.time', return_value=1) as time_mock:
+ start_time1 = gce.ComputeNodeDriver.node_start_time(node1)
+ time_mock.return_value += 1
+ self.assertNotEqual(start_time1,
+ gce.ComputeNodeDriver.node_start_time(node2))
+
+ def test_node_create_time_not_remembered_after_delete(self):
+ node = testutil.cloud_node_mock()
+ driver = self.new_driver()
+ with mock.patch('time.time', return_value=1) as time_mock:
+ result1 = gce.ComputeNodeDriver.node_start_time(node)
+ driver.destroy_node(node)
+ time_mock.return_value += 1
+ self.assertNotEqual(result1,
+ gce.ComputeNodeDriver.node_start_time(node))
def test_deliver_ssh_key_in_metadata(self):
test_ssh_key = 'ssh-rsa-foo'
arv_node = testutil.arvados_node_mock(1)
- with mock.patch('__builtin__.open', mock.mock_open(read_data=test_ssh_key)) as mock_file:
+ with mock.patch('__builtin__.open',
+ mock.mock_open(read_data=test_ssh_key)) as mock_file:
driver = self.new_driver(create_kwargs={'ssh_key': 'ssh-key-file'})
mock_file.assert_called_once_with('ssh-key-file')
- self.assertEqual({'ex_metadata': {'sshKeys': 'root:ssh-rsa-foo'}},
- driver.arvados_create_kwargs(arv_node))
+ driver.create_node(testutil.MockSize(1), arv_node)
+ metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
+ self.assertEqual('root:ssh-rsa-foo', metadata.get('sshKeys'))
def test_create_driver_with_service_accounts(self):
- srv_acct_config = { 'service_accounts': '{ "email": "foo at bar", "scopes":["storage-full"]}' }
+ service_accounts = {'email': 'foo at bar', 'scopes': ['storage-full']}
+ srv_acct_config = {'service_accounts': json.dumps(service_accounts)}
arv_node = testutil.arvados_node_mock(1)
driver = self.new_driver(create_kwargs=srv_acct_config)
- create_kwargs = driver.arvados_create_kwargs(arv_node)
- self.assertEqual({u'email': u'foo at bar', u'scopes': [u'storage-full']},
- create_kwargs['ex_service_accounts'])
+ driver.create_node(testutil.MockSize(1), arv_node)
+ self.assertEqual(
+ service_accounts,
+ self.driver_mock().create_node.call_args[1]['ex_service_accounts'])
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index 633aac6..d51aab2 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -39,7 +39,7 @@ def cloud_object_mock(name_id):
cloud_object.name = cloud_object.id.upper()
return cloud_object
-def cloud_node_mock(node_num=99):
+def cloud_node_mock(node_num=99, **extra):
node = mock.NonCallableMagicMock(
['id', 'name', 'state', 'public_ips', 'private_ips', 'driver', 'size',
'image', 'extra'],
@@ -48,6 +48,7 @@ def cloud_node_mock(node_num=99):
node.name = node.id
node.public_ips = []
node.private_ips = [ip_address_mock(node_num)]
+ node.extra = extra
return node
def ip_address_mock(last_octet):
commit 82bb5282db6a64cc6523c5bdbc31d4b8247ed5d9
Author: Brett Smith <brett at curoverse.com>
Date: Fri Feb 13 15:24:04 2015 -0500
4138: Revamp Node Manager driver proxying in BaseComputeNodeDriver.
Accessing attributes through a super() proxy does not invoke
__getattr__ on base classes, so the old implementation made it
impossible for subclasses to be agnostic about whether a method was
implemented in BaseComputeNodeDriver or the real libcloud driver.
This version makes that possible. It's also a little nicer because
now the class will report these method names to dir(), hasattr(), etc.
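
A compressed illustration of the issue and the fix (simplified, with read-only properties and made-up method names; not the real BaseComputeNodeDriver): super() looks attributes up directly in the base classes' dictionaries and never falls back to __getattr__, so delegated methods must exist as real class attributes before subclasses can extend them.

    class RealDriver(object):
        def list_nodes(self):
            return ['node-1']

    class Wrapper(object):
        # Old approach: a __getattr__ that forwards to self.real lets an
        # instance call wrapper.list_nodes(), but a subclass that overrides
        # list_nodes cannot call super(Sub, self).list_nodes(), because the
        # super() proxy never consults __getattr__ and raises AttributeError.
        def __init__(self, real):
            self.real = real

    def _delegate_to_real(attr_name):
        # New approach: generate real class attributes that forward to the
        # wrapped driver, so they are visible to super(), dir(), hasattr().
        return property(lambda self: getattr(self.real, attr_name))

    for _attr_name in ['list_nodes', 'list_sizes']:
        setattr(Wrapper, _attr_name, _delegate_to_real(_attr_name))

    class Sub(Wrapper):
        def list_nodes(self):
            # Works, because Wrapper.list_nodes is now an actual property.
            return [name.upper() for name in super(Sub, self).list_nodes()]

    print(Sub(RealDriver()).list_nodes())   # ['NODE-1']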
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index c06bba3..369bb8f 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -3,6 +3,7 @@
from __future__ import absolute_import, print_function
import libcloud.common.types as cloud_types
+from libcloud.compute.base import NodeDriver
from ...config import NETWORK_ERRORS
@@ -35,14 +36,6 @@ class BaseComputeNodeDriver(object):
def _init_ping_host(self, ping_host):
self.ping_host = ping_host
- def __getattr__(self, name):
- # Proxy non-extension methods to the real driver.
- if (not name.startswith('_') and not name.startswith('ex_')
- and hasattr(self.real, name)):
- return getattr(self.real, name)
- else:
- return super(BaseComputeNodeDriver, self).__getattr__(name)
-
def search_for(self, term, list_method, key=lambda item: item.id):
cache_key = (list_method, term)
if cache_key not in self.SEARCH_CACHE:
@@ -96,3 +89,16 @@ class BaseComputeNodeDriver(object):
# exactly an Exception, or a better-known higher-level exception.
return (isinstance(exception, cls.CLOUD_ERRORS) or
getattr(exception, '__class__', None) is Exception)
+
+ # Now that we've defined all our own methods, delegate generic, public
+ # attributes of libcloud drivers that we haven't defined ourselves.
+ def _delegate_to_real(attr_name):
+ return property(
+ lambda self: getattr(self.real, attr_name),
+ lambda self, value: setattr(self.real, attr_name, value),
+ doc=getattr(getattr(NodeDriver, attr_name), '__doc__', None))
+
+ _locals = locals()
+ for _attr_name in dir(NodeDriver):
+ if (not _attr_name.startswith('_')) and (_attr_name not in _locals):
+ _locals[_attr_name] = _delegate_to_real(_attr_name)
commit 1e0fa5965513b3396296568d45c5b97b27a96dd8
Author: Brett Smith <brett at curoverse.com>
Date: Thu Feb 12 17:22:12 2015 -0500
4138: Refactor out Node Manager DriverTestMixin.
diff --git a/services/nodemanager/tests/test_computenode_driver_ec2.py b/services/nodemanager/tests/test_computenode_driver_ec2.py
index 10a41a8..8e52824 100644
--- a/services/nodemanager/tests/test_computenode_driver_ec2.py
+++ b/services/nodemanager/tests/test_computenode_driver_ec2.py
@@ -12,15 +12,8 @@ import mock
import arvnodeman.computenode.driver.ec2 as ec2
from . import testutil
-class EC2ComputeNodeDriverTestCase(unittest.TestCase):
- def setUp(self):
- self.driver_mock = mock.MagicMock(name='driver_mock')
-
- def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
- create_kwargs.setdefault('ping_host', '100::')
- return ec2.ComputeNodeDriver(
- auth_kwargs, list_kwargs, create_kwargs,
- driver_class=self.driver_mock)
+class EC2ComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
+ TEST_CLASS = ec2.ComputeNodeDriver
def test_driver_instantiation(self):
kwargs = {'key': 'testkey'}
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
index 0e10e2f..d4b73f7 100644
--- a/services/nodemanager/tests/test_computenode_driver_gce.py
+++ b/services/nodemanager/tests/test_computenode_driver_gce.py
@@ -10,15 +10,8 @@ import mock
import arvnodeman.computenode.driver.gce as gce
from . import testutil
-class GCEComputeNodeDriverTestCase(unittest.TestCase):
- def setUp(self):
- self.driver_mock = mock.MagicMock(name='driver_mock')
-
- def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
- create_kwargs.setdefault('ping_host', '100::')
- return gce.ComputeNodeDriver(
- auth_kwargs, list_kwargs, create_kwargs,
- driver_class=self.driver_mock)
+class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
+ TEST_CLASS = gce.ComputeNodeDriver
def test_driver_instantiation(self):
kwargs = {'user_id': 'foo'}
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index c5b6539..633aac6 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -115,6 +115,21 @@ class ActorTestMixin(object):
return result
+class DriverTestMixin(object):
+ def setUp(self):
+ self.driver_mock = mock.MagicMock(name='driver_mock')
+ super(DriverTestMixin, self).setUp()
+
+ def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
+ create_kwargs.setdefault('ping_host', '100::')
+ return self.TEST_CLASS(
+ auth_kwargs, list_kwargs, create_kwargs,
+ driver_class=self.driver_mock)
+
+ def driver_method_args(self, method_name):
+ return getattr(self.driver_mock(), method_name).call_args
+
+
class RemotePollLoopActorTestMixin(ActorTestMixin):
def build_monitor(self, *args, **kwargs):
self.timer = mock.MagicMock(name='timer_mock')
commit 58be160ab5549fc03c7c09b90c8fad7ff1234096
Author: Brett Smith <brett at curoverse.com>
Date: Thu Feb 12 15:53:16 2015 -0500
4138: Fix noop Node Manager EC2 driver tests.
The previous tests simply instantiated the driver, then checked that a
mock method was truthy (which it always is). This change makes the tests
work as intended.
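
For context, a small demonstration of the pitfall (illustrative, using the same mock library the tests already rely on): attribute access on a MagicMock always returns another truthy mock, so asserting on the attribute itself proves nothing, while asserting on its call count does.

    import mock

    driver_mock = mock.MagicMock(name='driver_mock')

    # Always truthy, even though nothing has been called yet:
    assert driver_mock().list_images
    assert not driver_mock().list_images.called

    # Checking the call count, as the fixed test does, verifies real behavior:
    driver_mock().list_images()
    assert driver_mock().list_images.call_count == 1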
diff --git a/services/nodemanager/tests/test_computenode_driver_ec2.py b/services/nodemanager/tests/test_computenode_driver_ec2.py
index fae63a5..10a41a8 100644
--- a/services/nodemanager/tests/test_computenode_driver_ec2.py
+++ b/services/nodemanager/tests/test_computenode_driver_ec2.py
@@ -37,15 +37,12 @@ class EC2ComputeNodeDriverTestCase(unittest.TestCase):
self.assertEqual({'tag:test': 'true'},
list_method.call_args[1].get('ex_filters'))
- def test_create_location_loaded_at_initialization(self):
- kwargs = {'location': 'testregion'}
- driver = self.new_driver(create_kwargs=kwargs)
- self.assertTrue(self.driver_mock().list_locations)
-
def test_create_image_loaded_at_initialization(self):
- kwargs = {'image': 'testimage'}
- driver = self.new_driver(create_kwargs=kwargs)
- self.assertTrue(self.driver_mock().list_images)
+ list_method = self.driver_mock().list_images
+ list_method.return_value = [testutil.cloud_object_mock(c)
+ for c in 'abc']
+ driver = self.new_driver(create_kwargs={'image_id': 'b'})
+ self.assertEqual(1, list_method.call_count)
def test_create_includes_ping_secret(self):
arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index ff525f0..c5b6539 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -30,6 +30,15 @@ def arvados_node_mock(node_num=99, job_uuid=None, age=0, **kwargs):
node.update(kwargs)
return node
+def cloud_object_mock(name_id):
+ # A very generic mock, useful for stubbing libcloud objects we
+ # only search for and pass around, like locations, subnets, etc.
+ cloud_object = mock.NonCallableMagicMock(['id', 'name'],
+ name='cloud_object')
+ cloud_object.id = str(name_id)
+ cloud_object.name = cloud_object.id.upper()
+ return cloud_object
+
def cloud_node_mock(node_num=99):
node = mock.NonCallableMagicMock(
['id', 'name', 'state', 'public_ips', 'private_ips', 'driver', 'size',
commit 6bd3ba9693f500162bd2744b2e79edf91fab6b02
Author: Brett Smith <brett at curoverse.com>
Date: Fri Feb 13 16:00:30 2015 -0500
4138: Refactor common Node Manager driver initialization to base driver.
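
The pattern being hoisted into the base class is the _init_<key> convention: each key from the [Cloud Create] config section may have a matching hook that either consumes the value or rewrites it into a libcloud create_node keyword. A condensed sketch (keys and values here are illustrative, not the real driver):

    class SketchDriver(object):
        def __init__(self, create_kwargs):
            self.create_kwargs = dict(create_kwargs)
            for key in list(self.create_kwargs.keys()):
                init_method = getattr(self, '_init_' + key, None)
                if init_method is not None:
                    new_pair = init_method(self.create_kwargs.pop(key))
                    if new_pair is not None:
                        self.create_kwargs[new_pair[0]] = new_pair[1]

        def _init_ping_host(self, ping_host):
            # Consumed: stored on the driver, no create_node keyword produced.
            self.ping_host = ping_host

        def _init_image_id(self, image_id):
            # Rewritten: the config key becomes a different create_node keyword.
            return 'image', image_id.upper()

    driver = SketchDriver({'ping_host': 'hostname:port', 'image_id': 'debian-7'})
    print(driver.create_kwargs)   # {'image': 'DEBIAN-7'}
    print(driver.ping_host)       # hostname:port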
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 3a0c206..c06bba3 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -25,6 +25,15 @@ class BaseComputeNodeDriver(object):
self.real = driver_class(**auth_kwargs)
self.list_kwargs = list_kwargs
self.create_kwargs = create_kwargs
+ for key in self.create_kwargs.keys():
+ init_method = getattr(self, '_init_' + key, None)
+ if init_method is not None:
+ new_pair = init_method(self.create_kwargs.pop(key))
+ if new_pair is not None:
+ self.create_kwargs[new_pair[0]] = new_pair[1]
+
+ def _init_ping_host(self, ping_host):
+ self.ping_host = ping_host
def __getattr__(self, name):
# Proxy non-extension methods to the real driver.
@@ -52,6 +61,11 @@ class BaseComputeNodeDriver(object):
def arvados_create_kwargs(self, arvados_node):
raise NotImplementedError("BaseComputeNodeDriver.arvados_create_kwargs")
+ def _make_ping_url(self, arvados_node):
+ return 'https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.format(
+ self.ping_host, arvados_node['uuid'],
+ arvados_node['info']['ping_secret'])
+
def create_node(self, size, arvados_node):
kwargs = self.create_kwargs.copy()
kwargs.update(self.arvados_create_kwargs(arvados_node))
diff --git a/services/nodemanager/arvnodeman/computenode/driver/ec2.py b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
index 255a948..9db3d89 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/ec2.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
@@ -52,19 +52,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
super(ComputeNodeDriver, self).__init__(
auth_kwargs, {'ex_filters': list_kwargs}, create_kwargs,
driver_class)
- for key in self.create_kwargs.keys():
- init_method = getattr(self, '_init_' + key, None)
- if init_method is not None:
- new_pair = init_method(self.create_kwargs.pop(key))
- if new_pair is not None:
- self.create_kwargs[new_pair[0]] = new_pair[1]
def _init_image_id(self, image_id):
return 'image', self.search_for(image_id, 'list_images')
- def _init_ping_host(self, ping_host):
- self.ping_host = ping_host
-
def _init_security_groups(self, group_names):
return 'ex_security_groups', [
self.search_for(gname.strip(), 'ex_get_security_groups')
@@ -79,14 +70,8 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
return 'auth', key
def arvados_create_kwargs(self, arvados_node):
- result = {'name': arvados_node_fqdn(arvados_node)}
- ping_secret = arvados_node['info'].get('ping_secret')
- if ping_secret is not None:
- ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
- format(self.ping_host, arvados_node['uuid'],
- ping_secret))
- result['ex_userdata'] = ping_url
- return result
+ return {'name': arvados_node_fqdn(arvados_node),
+ 'ex_userdata': self._make_ping_url(arvados_node)}
def post_create_node(self, cloud_node):
self.real.ex_create_tags(cloud_node, self.tags)
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index f254e72..ff525f0 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -26,7 +26,7 @@ def arvados_node_mock(node_num=99, job_uuid=None, age=0, **kwargs):
'ip_address': ip_address_mock(node_num),
'job_uuid': job_uuid,
'crunch_worker_state': crunch_worker_state,
- 'info': {}}
+ 'info': {'ping_secret': 'defaulttestsecret'}}
node.update(kwargs)
return node
commit 0d0a1fe02eb3775fd48e54a5d2408d3220090ed8
Author: Brett Smith <brett at curoverse.com>
Date: Wed Feb 11 15:12:37 2015 -0500
4138: Simplify Node Manager GCE credential handling.
Because libcloud's GCE driver accepts a key path as a constructor
argument, it's relatively straightforward to put all the constructor
arguments directly in the Node Manager configuration. No need to
parse out JSON.
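
Roughly what the simplified setup amounts to (values are the placeholders from gce.example.cfg; the real code builds module.ComputeNodeDriver(auth_kwargs, ...) from the config sections rather than calling libcloud directly): libcloud's GCE driver takes the service account email, a key file path, and the project as plain constructor arguments, so no JSON parsing is needed.

    import libcloud.compute.providers as cloud_provider
    import libcloud.compute.types as cloud_types

    GCEDriver = cloud_provider.get_driver(cloud_types.Provider.GCE)
    gce_driver = GCEDriver(
        'client_email_address@developer.gserviceaccount.com',  # user_id
        'path_to_certificate.pem',                              # key: a file path
        project='project-id-from-google-cloud-dashboard')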
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index f386653..315df1c 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -4,7 +4,6 @@ from __future__ import absolute_import, print_function
import ConfigParser
import importlib
-import json
import logging
import ssl
import sys
@@ -99,13 +98,6 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
module = importlib.import_module('arvnodeman.computenode.driver.' +
self.get('Cloud', 'provider'))
auth_kwargs = self.get_section('Cloud Credentials')
- # GCE credentials are delivered in a JSON file.
- if 'json_credential_file' in auth_kwargs:
- with open(auth_kwargs['json_credential_file']) as jf:
- json_creds = json.load(jf)
- auth_kwargs['user_id'] = json_creds['client_email']
- auth_kwargs['key'] = json_creds['private_key']
-
if 'timeout' in auth_kwargs:
auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
return module.ComputeNodeDriver(auth_kwargs,
diff --git a/services/nodemanager/doc/gce.example.cfg b/services/nodemanager/doc/gce.example.cfg
index 73cd2e1..6369383 100644
--- a/services/nodemanager/doc/gce.example.cfg
+++ b/services/nodemanager/doc/gce.example.cfg
@@ -79,7 +79,8 @@ provider = gce
shutdown_windows = 54, 5, 1
[Cloud Credentials]
-json_credential_file = /path/to/credential_file.json
+user_id = client_email_address at developer.gserviceaccount.com
+key = path_to_certificate.pem
project = project-id-from-google-cloud-dashboard
timeout = 60
commit 36929497f4825cde6c9c386217300cf4fd43f4f4
Author: Tim Pierce <twp at curoverse.com>
Date: Fri Jan 23 17:44:41 2015 -0500
4138: updated unit test
Corrected test_create_includes_ping_secret to account for delivering the
ping secret via metadata in GCE.
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
index 075760a..0e10e2f 100644
--- a/services/nodemanager/tests/test_computenode_driver_gce.py
+++ b/services/nodemanager/tests/test_computenode_driver_gce.py
@@ -42,9 +42,10 @@ class GCEComputeNodeDriverTestCase(unittest.TestCase):
driver.create_node(testutil.MockSize(1), arv_node)
create_method = self.driver_mock().create_node
self.assertTrue(create_method.called)
+ create_metadata = create_method.call_args[1].get('ex_metadata')
+ self.assertIsInstance(create_metadata, dict)
self.assertIn('ping_secret=ssshh',
- create_method.call_args[1].get('ex_userdata',
- 'arg missing'))
+ create_metadata.get('pingUrl', 'arg missing'))
def test_generate_metadata_for_new_arvados_node(self):
arv_node = testutil.arvados_node_mock(8)
commit 0d0de3867410b415620573ae13706192ddcd8961
Author: Tim Pierce <twp at curoverse.com>
Date: Fri Jan 23 17:24:54 2015 -0500
4138: GCE fixes
The 'network_id' parameter needs to be delivered as 'location' in GCE.
The ping_url parameter is now delivered in the node metadata as
'pingUrl'.
When creating a new GCE instance, 'name' is a required parameter and
must begin with a letter. The default name is the UUID of the
corresponding Arvados node, prepended with 'arv-'.
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index 125a090..a8edc43 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -37,8 +37,11 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
if new_pair is not None:
self.create_kwargs[new_pair[0]] = new_pair[1]
- def _init_image_id(self, image_id):
- return 'image', image_id
+ def _init_image(self, image_name):
+ return 'image', image_name
+
+ def _init_location(self, location):
+ return 'location', location
def _init_ping_host(self, ping_host):
self.ping_host = ping_host
@@ -46,9 +49,6 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
def _init_service_accounts(self, service_accounts_str):
self.service_accounts = json.loads(service_accounts_str)
- def _init_network_id(self, subnet_id):
- return 'ex_network', subnet_id
-
def _init_ssh_key(self, filename):
with open(filename) as ssh_file:
self.ssh_key = ssh_file.read().strip()
@@ -60,7 +60,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
format(self.ping_host, arvados_node['uuid'],
ping_secret))
- result['ex_userdata'] = ping_url
+ result['ex_metadata']['pingUrl'] = ping_url
if self.service_accounts is not None:
result['ex_service_accounts'] = self.service_accounts
@@ -70,6 +70,13 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
result['ex_metadata']['sshKeys'] = 'root:{}'.format(self.ssh_key)
return result
+ def create_node(self, size, arvados_node):
+ kwargs = self.create_kwargs.copy()
+ kwargs.update(self.arvados_create_kwargs(arvados_node))
+ kwargs.setdefault('name', 'arv-{}'.format(arvados_node['uuid']))
+ kwargs['size'] = size
+ return self.real.create_node(**kwargs)
+
# When an Arvados node is synced with a GCE node, the Arvados hostname
# is forwarded in a GCE tag 'hostname-foo'.
# TODO(twp): implement an ex_set_metadata method (at least until
diff --git a/services/nodemanager/doc/gce.example.cfg b/services/nodemanager/doc/gce.example.cfg
index 11289bc..73cd2e1 100644
--- a/services/nodemanager/doc/gce.example.cfg
+++ b/services/nodemanager/doc/gce.example.cfg
@@ -104,9 +104,9 @@ ping_host = hostname:port
# The GCE image name and network zone name to use when creating new nodes.
# * Valid image aliases: https://cloud.google.com/sdk/gcloud/reference/compute/instances/create
-# * Valid network zones: https://cloud.google.com/compute/docs/zones
-image_id = debian-7
-network_id = us-central1-a
+# * Valid location (zone) names: https://cloud.google.com/compute/docs/zones
+image = debian-7
+location = us-central1-a
# JSON string of service account authorizations for this cluster.
# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
commit 2b6cfad3ac09667655e5e5a745cffe97b215ab4b
Author: Tim Pierce <twp at curoverse.com>
Date: Wed Jan 21 13:06:35 2015 -0500
4138: general GCE fixes
* JSON credential file
** GCE credentials are delivered as a JSON string (and the key is formatted as a multi-line RSA private key). Let the GCE config file specify a path to the JSON credential file for simplicity.
* Accept NodeSizes addressed by id or name
** In EC2, NodeSizes are identified by the 'id' field. In GCE they are identified by the 'name' field. Allow the Node Manager config module to accept either.
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index a4fd57d..125a090 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -38,7 +38,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
self.create_kwargs[new_pair[0]] = new_pair[1]
def _init_image_id(self, image_id):
- return 'image', self.search_for(image_id, 'list_images')
+ return 'image', image_id
def _init_ping_host(self, ping_host):
self.ping_host = ping_host
@@ -47,7 +47,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
self.service_accounts = json.loads(service_accounts_str)
def _init_network_id(self, subnet_id):
- return 'ex_network', self.search_for(subnet_id, 'ex_list_networks')
+ return 'ex_network', subnet_id
def _init_ssh_key(self, filename):
with open(filename) as ssh_file:
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index e15f023..f386653 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -4,6 +4,7 @@ from __future__ import absolute_import, print_function
import ConfigParser
import importlib
+import json
import logging
import ssl
import sys
@@ -98,6 +99,13 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
module = importlib.import_module('arvnodeman.computenode.driver.' +
self.get('Cloud', 'provider'))
auth_kwargs = self.get_section('Cloud Credentials')
+ # GCE credentials are delivered in a JSON file.
+ if 'json_credential_file' in auth_kwargs:
+ with open(auth_kwargs['json_credential_file']) as jf:
+ json_creds = json.load(jf)
+ auth_kwargs['user_id'] = json_creds['client_email']
+ auth_kwargs['key'] = json_creds['private_key']
+
if 'timeout' in auth_kwargs:
auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
return module.ComputeNodeDriver(auth_kwargs,
@@ -105,14 +113,26 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
self.get_section('Cloud Create'))
def node_sizes(self, all_sizes):
+ """Finds all acceptable NodeSizes for our installation.
+
+ Returns a list of (NodeSize, kwargs) pairs for each NodeSize object
+ returned by libcloud that matches a size listed in our config file.
+ """
+
size_kwargs = {}
for sec_name in self.sections():
sec_words = sec_name.split(None, 2)
if sec_words[0] != 'Size':
continue
size_kwargs[sec_words[1]] = self.get_section(sec_name, int)
- return [(size, size_kwargs[size.id]) for size in all_sizes
- if size.id in size_kwargs]
+ # EC2 node sizes are identified by id. GCE sizes are identified by name.
+ matching_sizes = []
+ for size in all_sizes:
+ if size.id in size_kwargs:
+ matching_sizes.append((size, size_kwargs[size.id]))
+ elif size.name in size_kwargs:
+ matching_sizes.append((size, size_kwargs[size.name]))
+ return matching_sizes
def shutdown_windows(self):
return [int(n)
diff --git a/services/nodemanager/doc/gce.example.cfg b/services/nodemanager/doc/gce.example.cfg
index adc0300..11289bc 100644
--- a/services/nodemanager/doc/gce.example.cfg
+++ b/services/nodemanager/doc/gce.example.cfg
@@ -79,9 +79,8 @@ provider = gce
shutdown_windows = 54, 5, 1
[Cloud Credentials]
-user_id = USERID
-key = SECRET_KEY
-project = project_name
+json_credential_file = /path/to/credential_file.json
+project = project-id-from-google-cloud-dashboard
timeout = 60
# Optional settings. For full documentation see
@@ -103,9 +102,11 @@ ping_host = hostname:port
# A file path for an SSH key that can log in to the compute node.
# ssh_key = path
-# The GCE IDs of the image and network compute nodes should use.
-image_id = idstring
-network_id = idstring
+# The GCE image name and network zone name to use when creating new nodes.
+# * Valid image aliases: https://cloud.google.com/sdk/gcloud/reference/compute/instances/create
+# * Valid network zones: https://cloud.google.com/compute/docs/zones
+image_id = debian-7
+network_id = us-central1-a
# JSON string of service account authorizations for this cluster.
# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
@@ -120,6 +121,9 @@ network_id = idstring
# The Size fields are interpreted the same way as with a libcloud NodeSize:
# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize
#
+# See https://cloud.google.com/compute/docs/machine-types for a list
+# of known machine types that may be used as a Size parameter.
+#
# Each size section MUST define the number of cores available in this
# size class (since libcloud does not provide any consistent API for exposing
# this setting).
commit c2fb3f5752b87617cb42761e4963126c549b3edc
Author: Tim Pierce <twp at curoverse.com>
Date: Mon Nov 24 17:12:07 2014 -0500
4138: code review feedback
diff --git a/services/nodemanager/doc/gce.example.cfg b/services/nodemanager/doc/gce.example.cfg
index 4886cb2..adc0300 100644
--- a/services/nodemanager/doc/gce.example.cfg
+++ b/services/nodemanager/doc/gce.example.cfg
@@ -82,15 +82,15 @@ shutdown_windows = 54, 5, 1
user_id = USERID
key = SECRET_KEY
project = project_name
-timeout = 60 # used by NodeManagerConfig
+timeout = 60
# Optional settings. For full documentation see
# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver
#
-# datacenter = 'us-central1-a'
-# auth_type = 'SA' # SA, IA or GCE
+# datacenter = us-central1-a
+# auth_type = SA # SA, IA or GCE
# scopes = https://www.googleapis.com/auth/compute
-# credential_file =
+# credential_file =
[Cloud List]
# Keywords here will be used to populate the metadata field for a GCE node.
commit 650bd4aea584857b5ba8e1e3657a07ff540e9ab8
Author: Tim Pierce <twp at curoverse.com>
Date: Tue Nov 18 13:49:10 2014 -0500
4138: support for Google Cloud Engine.
* Added:
** nodemanager/arvnodeman/computenode/drivers/gce.py
** nodemanager/doc/gce.example.cfg
** nodemanager/tests/test_computenode_driver_gce.py
Updated comment in nodemanager/arvnodeman/computenode/drivers/ec2.py.
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
new file mode 100644
index 0000000..a4fd57d
--- /dev/null
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import functools
+import json
+import time
+
+import libcloud.compute.base as cloud_base
+import libcloud.compute.providers as cloud_provider
+import libcloud.compute.types as cloud_types
+from libcloud.compute.drivers import gce
+
+from . import BaseComputeNodeDriver
+from .. import arvados_node_fqdn
+
+class ComputeNodeDriver(BaseComputeNodeDriver):
+ """Compute node driver wrapper for GCE
+
+ This translates cloud driver requests to GCE's specific parameters.
+ """
+ DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.GCE)
+ SEARCH_CACHE = {}
+ ssh_key = None
+ service_accounts = None
+
+ def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
+ driver_class=DEFAULT_DRIVER):
+ super(ComputeNodeDriver, self).__init__(
+ auth_kwargs, list_kwargs, create_kwargs,
+ driver_class)
+
+ for key in self.create_kwargs.keys():
+ init_method = getattr(self, '_init_' + key, None)
+ if init_method is not None:
+ new_pair = init_method(self.create_kwargs.pop(key))
+ if new_pair is not None:
+ self.create_kwargs[new_pair[0]] = new_pair[1]
+
+ def _init_image_id(self, image_id):
+ return 'image', self.search_for(image_id, 'list_images')
+
+ def _init_ping_host(self, ping_host):
+ self.ping_host = ping_host
+
+ def _init_service_accounts(self, service_accounts_str):
+ self.service_accounts = json.loads(service_accounts_str)
+
+ def _init_network_id(self, subnet_id):
+ return 'ex_network', self.search_for(subnet_id, 'ex_list_networks')
+
+ def _init_ssh_key(self, filename):
+ with open(filename) as ssh_file:
+ self.ssh_key = ssh_file.read().strip()
+
+ def arvados_create_kwargs(self, arvados_node):
+ result = {'ex_metadata': self.list_kwargs.copy() }
+ ping_secret = arvados_node['info'].get('ping_secret')
+ if ping_secret is not None:
+ ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
+ format(self.ping_host, arvados_node['uuid'],
+ ping_secret))
+ result['ex_userdata'] = ping_url
+ if self.service_accounts is not None:
+ result['ex_service_accounts'] = self.service_accounts
+
+ # SSH keys are delivered to GCE nodes via ex_metadata: see
+ # http://stackoverflow.com/questions/26752617/creating-sshkeys-for-gce-instance-using-libcloud
+ if self.ssh_key is not None:
+ result['ex_metadata']['sshKeys'] = 'root:{}'.format(self.ssh_key)
+ return result
+
+ # When an Arvados node is synced with a GCE node, the Arvados hostname
+ # is forwarded in a GCE tag 'hostname-foo'.
+ # TODO(twp): implement an ex_set_metadata method (at least until
+ # libcloud supports the API setMetadata method) so we can pass this
+ # sensibly in the node metadata.
+ def sync_node(self, cloud_node, arvados_node):
+ tags = ['hostname-{}'.format(arvados_node_fqdn(arvados_node))]
+ self.real.ex_set_node_tags(cloud_node, tags)
+
+ @classmethod
+ def node_start_time(cls, node):
+ time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
+ return time.mktime(time.strptime(
+ time_str,'%Y-%m-%dT%H:%M:%S%Z')) - time.timezone
diff --git a/services/nodemanager/doc/ec2.example.cfg b/services/nodemanager/doc/ec2.example.cfg
index 024ed2b..9b41ca1 100644
--- a/services/nodemanager/doc/ec2.example.cfg
+++ b/services/nodemanager/doc/ec2.example.cfg
@@ -128,9 +128,11 @@ security_groups = idstring1, idstring2
# willing to use. The Node Manager should boot the cheapest size(s) that
# can run jobs in the queue (N.B.: defining more than one size has not been
# tested yet).
-# Each size section MUST define the number of cores it has. You may also
-# want to define the number of mebibytes of scratch space for Crunch jobs.
-# You can also override Amazon's provided data fields by setting the same
-# names here.
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Amazon's provided
+# data fields by setting the same names here.
cores = 2
-scratch = 100
\ No newline at end of file
+scratch = 100
diff --git a/services/nodemanager/doc/ec2.example.cfg b/services/nodemanager/doc/gce.example.cfg
similarity index 64%
copy from services/nodemanager/doc/ec2.example.cfg
copy to services/nodemanager/doc/gce.example.cfg
index 024ed2b..4886cb2 100644
--- a/services/nodemanager/doc/ec2.example.cfg
+++ b/services/nodemanager/doc/gce.example.cfg
@@ -1,12 +1,7 @@
-# EC2 configuration for Arvados Node Manager.
+# Google Compute Engine configuration for Arvados Node Manager.
# All times are in seconds unless specified otherwise.
[Daemon]
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes. For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-
# Node Manager will ensure that there are at least this many nodes
# running at all times.
min_nodes = 0
@@ -15,7 +10,7 @@ min_nodes = 0
# many are running.
max_nodes = 8
-# Poll EC2 nodes and Arvados for new information every N seconds.
+# Poll compute nodes and Arvados for new information every N seconds.
poll_time = 60
# Polls have exponential backoff when services fail to respond.
@@ -27,12 +22,6 @@ max_poll_time = 300
# information is too outdated.
poll_stale_after = 600
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down. Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
-
# "Node stale time" affects two related behaviors.
# 1. If a compute node has been running for at least this long, but it
# isn't paired with an Arvados node, do not shut it down, but leave it alone.
@@ -74,8 +63,9 @@ timeout = 15
insecure = no
[Cloud]
-provider = ec2
+provider = gce
+# XXX(twp): figure out good default settings for GCE
# It's usually most cost-effective to shut down compute nodes during narrow
# windows of time. For example, EC2 bills each node by the hour, so the best
# time to shut down a node is right before a new hour of uptime starts.
@@ -89,48 +79,52 @@ provider = ec2
shutdown_windows = 54, 5, 1
[Cloud Credentials]
-key = KEY
-secret = SECRET_KEY
-region = us-east-1
-timeout = 60
+user_id = USERID
+key = SECRET_KEY
+project = project_name
+timeout = 60 # used by NodeManagerConfig
+
+# Optional settings. For full documentation see
+# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver
+#
+# datacenter = 'us-central1-a'
+# auth_type = 'SA' # SA, IA or GCE
+# scopes = https://www.googleapis.com/auth/compute
+# credential_file =
[Cloud List]
-# This section defines filters that find compute nodes.
-# Tags that you specify here will automatically be added to nodes you create.
-# Replace colons in Amazon filters with underscores
-# (e.g., write "tag:mytag" as "tag_mytag").
-instance-state-name = running
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
+# Keywords here will be used to populate the metadata field for a GCE node.
[Cloud Create]
# New compute nodes will send pings to Arvados at this host.
# You may specify a port, and use brackets to disambiguate IPv6 addresses.
ping_host = hostname:port
-# Give the name of an SSH key on AWS...
-ex_keyname = string
-
-# ... or a file path for an SSH key that can log in to the compute node.
-# (One or the other, not both.)
+# A file path for an SSH key that can log in to the compute node.
# ssh_key = path
-# The EC2 IDs of the image and subnet compute nodes should use.
+# The GCE IDs of the image and network compute nodes should use.
image_id = idstring
-subnet_id = idstring
+network_id = idstring
-# Comma-separated EC2 IDs for the security group(s) assigned to each
-# compute node.
-security_groups = idstring1, idstring2
+# JSON string of service account authorizations for this cluster.
+# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
+# service_accounts = [ { 'email': 'ex at mple.com', 'scopes': ['storage-ro'] } ]
-[Size t2.medium]
-# You can define any number of Size sections to list EC2 sizes you're
+[Size n1-standard-2]
+# You can define any number of Size sections to list node sizes you're
# willing to use. The Node Manager should boot the cheapest size(s) that
# can run jobs in the queue (N.B.: defining more than one size has not been
# tested yet).
-# Each size section MUST define the number of cores it has. You may also
-# want to define the number of mebibytes of scratch space for Crunch jobs.
-# You can also override Amazon's provided data fields by setting the same
-# names here.
+#
+# The Size fields are interpreted the same way as with a libcloud NodeSize:
+# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs.
cores = 2
-scratch = 100
\ No newline at end of file
+scratch = 100
+ram = 512
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
new file mode 100644
index 0000000..075760a
--- /dev/null
+++ b/services/nodemanager/tests/test_computenode_driver_gce.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import mock
+
+import arvnodeman.computenode.driver.gce as gce
+from . import testutil
+
+class GCEComputeNodeDriverTestCase(unittest.TestCase):
+ def setUp(self):
+ self.driver_mock = mock.MagicMock(name='driver_mock')
+
+ def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
+ create_kwargs.setdefault('ping_host', '100::')
+ return gce.ComputeNodeDriver(
+ auth_kwargs, list_kwargs, create_kwargs,
+ driver_class=self.driver_mock)
+
+ def test_driver_instantiation(self):
+ kwargs = {'user_id': 'foo'}
+ driver = self.new_driver(auth_kwargs=kwargs)
+ self.assertTrue(self.driver_mock.called)
+ self.assertEqual(kwargs, self.driver_mock.call_args[1])
+
+ def test_create_location_loaded_at_initialization(self):
+ kwargs = {'location': 'testregion'}
+ driver = self.new_driver(create_kwargs=kwargs)
+ self.assertTrue(self.driver_mock().list_locations)
+
+ def test_create_image_loaded_at_initialization(self):
+ kwargs = {'image': 'testimage'}
+ driver = self.new_driver(create_kwargs=kwargs)
+ self.assertTrue(self.driver_mock().list_images)
+
+ def test_create_includes_ping_secret(self):
+ arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
+ driver = self.new_driver()
+ driver.create_node(testutil.MockSize(1), arv_node)
+ create_method = self.driver_mock().create_node
+ self.assertTrue(create_method.called)
+ self.assertIn('ping_secret=ssshh',
+ create_method.call_args[1].get('ex_userdata',
+ 'arg missing'))
+
+ def test_generate_metadata_for_new_arvados_node(self):
+ arv_node = testutil.arvados_node_mock(8)
+ driver = self.new_driver(list_kwargs={'list': 'test'})
+ self.assertEqual({'ex_metadata': {'list': 'test'}},
+ driver.arvados_create_kwargs(arv_node))
+
+ def test_tags_set_default_hostname_from_new_arvados_node(self):
+ arv_node = testutil.arvados_node_mock(hostname=None)
+ cloud_node = testutil.cloud_node_mock(1)
+ driver = self.new_driver()
+ driver.sync_node(cloud_node, arv_node)
+ tag_mock = self.driver_mock().ex_set_node_tags
+ self.assertTrue(tag_mock.called)
+ self.assertEqual(['hostname-dynamic.compute.zzzzz.arvadosapi.com'],
+ tag_mock.call_args[0][1])
+
+ def test_sync_node_sets_static_hostname(self):
+ arv_node = testutil.arvados_node_mock(1)
+ cloud_node = testutil.cloud_node_mock(2)
+ driver = self.new_driver()
+ driver.sync_node(cloud_node, arv_node)
+ tag_mock = self.driver_mock().ex_set_node_tags
+ self.assertTrue(tag_mock.called)
+ self.assertEqual(['hostname-compute1.zzzzz.arvadosapi.com'],
+ tag_mock.call_args[0][1])
+
+ def test_node_create_time(self):
+ refsecs = int(time.time())
+ reftuple = time.gmtime(refsecs)
+ node = testutil.cloud_node_mock()
+ node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
+ reftuple)}
+ self.assertEqual(refsecs, gce.ComputeNodeDriver.node_start_time(node))
+
+ def test_generate_metadata_for_new_arvados_node(self):
+ arv_node = testutil.arvados_node_mock(8)
+ driver = self.new_driver(list_kwargs={'list': 'test'})
+ self.assertEqual({'ex_metadata': {'list': 'test'}},
+ driver.arvados_create_kwargs(arv_node))
+
+ def test_deliver_ssh_key_in_metadata(self):
+ test_ssh_key = 'ssh-rsa-foo'
+ arv_node = testutil.arvados_node_mock(1)
+ with mock.patch('__builtin__.open', mock.mock_open(read_data=test_ssh_key)) as mock_file:
+ driver = self.new_driver(create_kwargs={'ssh_key': 'ssh-key-file'})
+ mock_file.assert_called_once_with('ssh-key-file')
+ self.assertEqual({'ex_metadata': {'sshKeys': 'root:ssh-rsa-foo'}},
+ driver.arvados_create_kwargs(arv_node))
+
+ def test_create_driver_with_service_accounts(self):
+ srv_acct_config = { 'service_accounts': '{ "email": "foo at bar", "scopes":["storage-full"]}' }
+ arv_node = testutil.arvados_node_mock(1)
+ driver = self.new_driver(create_kwargs=srv_acct_config)
+ create_kwargs = driver.arvados_create_kwargs(arv_node)
+ self.assertEqual({u'email': u'foo at bar', u'scopes': [u'storage-full']},
+ create_kwargs['ex_service_accounts'])
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list