[ARVADOS] created: a03ce4056e503710caa1e95d315b92fb74c96abf
Git user
git at public.curoverse.com
Tue Apr 4 13:21:18 EDT 2017
at a03ce4056e503710caa1e95d315b92fb74c96abf (commit)
commit a03ce4056e503710caa1e95d315b92fb74c96abf
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Apr 4 11:40:58 2017 -0400
11413: Wrap destroy_node with similar logic to create_node: on exception check
the node list to determine if the node was actually destroyed successfully.
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index c78f1c6..29b0484 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -219,6 +219,23 @@ class BaseComputeNodeDriver(RetryMixin):
return (isinstance(exception, cls.CLOUD_ERRORS) or
type(exception) is Exception)
+ def destroy_node(self, cloud_node):
+ try:
+ return self.real.destroy_node(cloud_node)
+ except self.CLOUD_ERRORS as destroy_error:
+ # Sometimes the destroy node request succeeds but times out and
+ # raises an exception instead of returning success. If this
+ # happens, we get a noisy stack trace. Check if the node is still
+ # on the node list. If it is gone, we can declare victory.
+ try:
+ self.search_for_now(cloud_node.id, 'list_nodes')
+ except ValueError:
+ # If we catch ValueError, that means search_for_now didn't find
+ # it, which means destroy_node actually succeeded.
+ return True
+ # The node is still on the list. Re-raise.
+ raise
+
# Now that we've defined all our own methods, delegate generic, public
# attributes of libcloud drivers that we haven't defined ourselves.
def _delegate_to_real(attr_name):
commit 4a35e06b098bdd44a24fcaf77921aea5f371c84b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Apr 4 11:21:57 2017 -0400
11413: Fix issues with node manager on GCE:
* Always override Node.size with CloudSizeWrapper
* Get updated node record before setting metadata to minimize 'Supplied
fingerprint does not match current metadata fingerprint.' error.
* Use ex_set_node_metadata() instead of issuing request directly.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 71f9083..9ee26e3 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -121,8 +121,14 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
self.cloud_size.name)
self.cloud_node = self._cloud.create_node(self.cloud_size,
self.arvados_node)
- if not self.cloud_node.size:
- self.cloud_node.size = self.cloud_size
+
+ # The information included in the node size object we get from libcloud
+ # is inconsistent between cloud providers. Replace libcloud NodeSize
+ # object with compatible CloudSizeWrapper object which merges the size
+ # info reported from the cloud with size information from the
+ # configuration file.
+ self.cloud_node.size = self.cloud_size
+
self._logger.info("Cloud node %s created.", self.cloud_node.id)
self._later.update_arvados_node_properties()
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index 1c6d214..79e43cb 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -136,6 +136,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
raise
def sync_node(self, cloud_node, arvados_node):
+ # Update the cloud node record to ensure we have the correct metadata
+ # fingerprint.
+ cloud_node = self.real.ex_get_node(cloud_node.name, cloud_node.extra['zone'])
+
# We can't store the FQDN on the name attribute or anything like it,
# because (a) names are static throughout the node's life (so FQDN
# isn't available because we don't know it at node creation time) and
@@ -147,12 +151,8 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
self._find_metadata(metadata_items, 'hostname')['value'] = hostname
except KeyError:
metadata_items.append({'key': 'hostname', 'value': hostname})
- response = self.real.connection.async_request(
- '/zones/{}/instances/{}/setMetadata'.format(
- cloud_node.extra['zone'].name, cloud_node.name),
- method='POST', data=metadata_req)
- if not response.success():
- raise Exception("setMetadata error: {}".format(response.error))
+
+ self.real.ex_set_node_metadata(cloud_node, metadata_items)
@classmethod
def node_fqdn(cls, node):
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index f23b261..2287724 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -336,7 +336,7 @@ class NodeManagerDaemonActor(actor_class):
elif (nodes_wanted < 0) and self.booting:
self._later.stop_booting_node(size)
except Exception as e:
- self._logger.exception("while calculating nodes wanted for size %s", size)
+ self._logger.exception("while calculating nodes wanted for size %s", size.id)
def _check_poll_freshness(orig_func):
"""Decorator to inhibit a method when poll information is stale.
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
index 84e061d..d47dbdf 100644
--- a/services/nodemanager/tests/test_computenode_driver_gce.py
+++ b/services/nodemanager/tests/test_computenode_driver_gce.py
@@ -123,16 +123,15 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
cloud_node = testutil.cloud_node_mock(
2, metadata=start_metadata.copy(),
zone=testutil.cloud_object_mock('testzone'))
+ self.driver_mock().ex_get_node.return_value = cloud_node
driver = self.new_driver()
driver.sync_node(cloud_node, arv_node)
- args, kwargs = self.driver_mock().connection.async_request.call_args
- self.assertEqual('/zones/testzone/instances/2/setMetadata', args[0])
- for key in ['kind', 'fingerprint']:
- self.assertEqual(start_metadata[key], kwargs['data'][key])
+ args, kwargs = self.driver_mock().ex_set_node_metadata.call_args
+ self.assertEqual(cloud_node, args[0])
plain_metadata['hostname'] = 'compute1.zzzzz.arvadosapi.com'
self.assertEqual(
plain_metadata,
- {item['key']: item['value'] for item in kwargs['data']['items']})
+ {item['key']: item['value'] for item in args[1]})
def test_sync_node_updates_hostname_tag(self):
self.check_sync_node_updates_hostname_tag(
@@ -145,9 +144,7 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
arv_node = testutil.arvados_node_mock(8)
cloud_node = testutil.cloud_node_mock(
9, metadata={}, zone=testutil.cloud_object_mock('failzone'))
- mock_response = self.driver_mock().connection.async_request()
- mock_response.success.return_value = False
- mock_response.error = 'sync error test'
+ mock_response = self.driver_mock().ex_set_node_metadata.side_effect = (Exception('sync error test'),)
driver = self.new_driver()
with self.assertRaises(Exception) as err_check:
driver.sync_node(cloud_node, arv_node)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list