[ARVADOS] created: a03ce4056e503710caa1e95d315b92fb74c96abf

Git user git at public.curoverse.com
Tue Apr 4 13:21:18 EDT 2017


        at  a03ce4056e503710caa1e95d315b92fb74c96abf (commit)


commit a03ce4056e503710caa1e95d315b92fb74c96abf
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Apr 4 11:40:58 2017 -0400

    11413: Wrap destroy_node with similar logic to create_node: on exception check
    the node list to determine if the node was actually destroyed successfully.

diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index c78f1c6..29b0484 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -219,6 +219,23 @@ class BaseComputeNodeDriver(RetryMixin):
         return (isinstance(exception, cls.CLOUD_ERRORS) or
                 type(exception) is Exception)
 
+    def destroy_node(self, cloud_node):
+        try:
+            return self.real.destroy_node(cloud_node)
+        except self.CLOUD_ERRORS as destroy_error:
+            # Sometimes the destroy node request succeeds but times out and
+            # raises an exception instead of returning success.  If this
+            # happens, we get a noisy stack trace.  Check if the node is still
+            # on the node list.  If it is gone, we can declare victory.
+            try:
+                self.search_for_now(cloud_node.id, 'list_nodes')
+            except ValueError:
+                # If we catch ValueError, that means search_for_now didn't find
+                # it, which means destroy_node actually succeeded.
+                return True
+            # The node is still on the list.  Re-raise.
+            raise
+
     # Now that we've defined all our own methods, delegate generic, public
     # attributes of libcloud drivers that we haven't defined ourselves.
     def _delegate_to_real(attr_name):

commit 4a35e06b098bdd44a24fcaf77921aea5f371c84b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Apr 4 11:21:57 2017 -0400

    11413: Fix issues with node manager on GCE:
    
    * Always override Node.size with CloudSizeWrapper
    
    * Get updated node record before setting metadata to minimize 'Supplied
      fingerprint does not match current metadata fingerprint.' error.
    
    * Use ex_set_node_metadata() instead of issuing request directly.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 71f9083..9ee26e3 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -121,8 +121,14 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
                           self.cloud_size.name)
         self.cloud_node = self._cloud.create_node(self.cloud_size,
                                                   self.arvados_node)
-        if not self.cloud_node.size:
-             self.cloud_node.size = self.cloud_size
+
+        # The information included in the node size object we get from libcloud
+        # is inconsistent between cloud providers.  Replace libcloud NodeSize
+        # object with compatible CloudSizeWrapper object which merges the size
+        # info reported from the cloud with size information from the
+        # configuration file.
+        self.cloud_node.size = self.cloud_size
+
         self._logger.info("Cloud node %s created.", self.cloud_node.id)
         self._later.update_arvados_node_properties()
 
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index 1c6d214..79e43cb 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -136,6 +136,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             raise
 
     def sync_node(self, cloud_node, arvados_node):
+        # Update the cloud node record to ensure we have the correct metadata
+        # fingerprint.
+        cloud_node = self.real.ex_get_node(cloud_node.name, cloud_node.extra['zone'])
+
         # We can't store the FQDN on the name attribute or anything like it,
         # because (a) names are static throughout the node's life (so FQDN
         # isn't available because we don't know it at node creation time) and
@@ -147,12 +151,8 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             self._find_metadata(metadata_items, 'hostname')['value'] = hostname
         except KeyError:
             metadata_items.append({'key': 'hostname', 'value': hostname})
-        response = self.real.connection.async_request(
-            '/zones/{}/instances/{}/setMetadata'.format(
-                cloud_node.extra['zone'].name, cloud_node.name),
-            method='POST', data=metadata_req)
-        if not response.success():
-            raise Exception("setMetadata error: {}".format(response.error))
+
+        self.real.ex_set_node_metadata(cloud_node, metadata_items)
 
     @classmethod
     def node_fqdn(cls, node):
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index f23b261..2287724 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -336,7 +336,7 @@ class NodeManagerDaemonActor(actor_class):
                 elif (nodes_wanted < 0) and self.booting:
                     self._later.stop_booting_node(size)
             except Exception as e:
-                self._logger.exception("while calculating nodes wanted for size %s", size)
+                self._logger.exception("while calculating nodes wanted for size %s", size.id)
 
     def _check_poll_freshness(orig_func):
         """Decorator to inhibit a method when poll information is stale.
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
index 84e061d..d47dbdf 100644
--- a/services/nodemanager/tests/test_computenode_driver_gce.py
+++ b/services/nodemanager/tests/test_computenode_driver_gce.py
@@ -123,16 +123,15 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
         cloud_node = testutil.cloud_node_mock(
             2, metadata=start_metadata.copy(),
             zone=testutil.cloud_object_mock('testzone'))
+        self.driver_mock().ex_get_node.return_value = cloud_node
         driver = self.new_driver()
         driver.sync_node(cloud_node, arv_node)
-        args, kwargs = self.driver_mock().connection.async_request.call_args
-        self.assertEqual('/zones/testzone/instances/2/setMetadata', args[0])
-        for key in ['kind', 'fingerprint']:
-            self.assertEqual(start_metadata[key], kwargs['data'][key])
+        args, kwargs = self.driver_mock().ex_set_node_metadata.call_args
+        self.assertEqual(cloud_node, args[0])
         plain_metadata['hostname'] = 'compute1.zzzzz.arvadosapi.com'
         self.assertEqual(
             plain_metadata,
-            {item['key']: item['value'] for item in kwargs['data']['items']})
+            {item['key']: item['value'] for item in args[1]})
 
     def test_sync_node_updates_hostname_tag(self):
         self.check_sync_node_updates_hostname_tag(
@@ -145,9 +144,7 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
         arv_node = testutil.arvados_node_mock(8)
         cloud_node = testutil.cloud_node_mock(
             9, metadata={}, zone=testutil.cloud_object_mock('failzone'))
-        mock_response = self.driver_mock().connection.async_request()
-        mock_response.success.return_value = False
-        mock_response.error = 'sync error test'
+        mock_response = self.driver_mock().ex_set_node_metadata.side_effect = (Exception('sync error test'),)
         driver = self.new_driver()
         with self.assertRaises(Exception) as err_check:
             driver.sync_node(cloud_node, arv_node)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list