[ARVADOS] updated: 1.1.2-141-g3ccc70f

Git user git at public.curoverse.com
Tue Jan 30 13:39:04 EST 2018


Summary of changes:
 .../api/app/controllers/application_controller.rb  |  4 ++
 .../app/controllers/arvados/v1/nodes_controller.rb | 26 +++++++++++++
 services/api/app/models/node.rb                    | 44 +++++++++++-----------
 .../functional/arvados/v1/nodes_controller_test.rb | 34 +++++++++++++++++
 .../arvnodeman/computenode/dispatch/__init__.py    | 25 ++++++------
 .../arvnodeman/computenode/dispatch/slurm.py       | 22 +++++++----
 services/nodemanager/arvnodeman/daemon.py          |  1 -
 .../nodemanager/tests/test_computenode_dispatch.py | 26 ++++++-------
 .../tests/test_computenode_dispatch_slurm.py       |  6 ++-
 services/nodemanager/tests/testutil.py             |  2 +-
 10 files changed, 133 insertions(+), 57 deletions(-)

       via  3ccc70f4bb06bab6c0b3c71f555cba24cc5c6a47 (commit)
       via  5120666e073018f6821bab07f0bbb788098f97b1 (commit)
      from  80da19707253af74bd78c374bfcab64b04d2dbde (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 3ccc70f4bb06bab6c0b3c71f555cba24cc5c6a47
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Jan 30 13:33:12 2018 -0500

    12199: Keep SLURM's node properties up to date.
    
    Change the semantics of ComputeNodeUpdateActor.sync_node so it gets
    called every time a new Arvados node record appears, even if hostnames
    match. The base actor's implementation now compares hostnames itself
    before calling the cloud driver.
    
    This allows the slurm update actor's sync_node method to sync SLURM
    state periodically, even if hostnames don't go out of sync.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 67ea8a2..597a011 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -317,7 +317,8 @@ class ComputeNodeUpdateActor(config.actor_class, RetryMixin):
 
     @RetryMixin._retry()
     def sync_node(self, cloud_node, arvados_node):
-        return self._cloud.sync_node(cloud_node, arvados_node)
+        if self._cloud.node_fqdn(cloud_node) != arvados_node_fqdn(arvados_node):
+            return self._cloud.sync_node(cloud_node, arvados_node)
 
 
 class ComputeNodeMonitorActor(config.actor_class):
@@ -328,14 +329,13 @@ class ComputeNodeMonitorActor(config.actor_class):
     for shutdown.
     """
     def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
-                 cloud_fqdn_func, timer_actor, update_actor, cloud_client,
+                 timer_actor, update_actor, cloud_client,
                  arvados_node=None, poll_stale_after=600, node_stale_after=3600,
                  boot_fail_after=1800
     ):
         super(ComputeNodeMonitorActor, self).__init__()
         self._later = self.actor_ref.tell_proxy()
         self._shutdowns = shutdown_timer
-        self._cloud_node_fqdn = cloud_fqdn_func
         self._timer = timer_actor
         self._update = update_actor
         self._cloud = cloud_client
@@ -488,8 +488,11 @@ class ComputeNodeMonitorActor(config.actor_class):
             self._later.consider_shutdown()
 
     def update_arvados_node(self, arvados_node):
-        # If the cloud node's FQDN doesn't match what's in the Arvados node
-        # record, make them match.
+        """Called when the latest Arvados node record is retrieved.
+
+        Calls the updater's sync_node() method.
+
+        """
         # This method is a little unusual in the way it just fires off the
         # request without checking the result or retrying errors.  That's
         # because this update happens every time we reload the Arvados node
@@ -498,7 +501,5 @@ class ComputeNodeMonitorActor(config.actor_class):
         # the logic to throttle those effective retries when there's trouble.
         if arvados_node is not None:
             self.arvados_node = arvados_node
-            if (self._cloud_node_fqdn(self.cloud_node) !=
-                  arvados_node_fqdn(self.arvados_node)):
-                self._update.sync_node(self.cloud_node, self.arvados_node)
+            self._update.sync_node(self.cloud_node, self.arvados_node)
             self._later.consider_shutdown()
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index d022892..13a2a6a 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -28,6 +28,12 @@ class SlurmMixin(object):
             self._logger.error(
                 "SLURM update %r failed", cmd, exc_info=True)
 
+    def _update_slurm_size_attrs(self, nodename, size):
+        self._update_slurm_node(nodename, [
+            'Weight=%i' % int(size.price * 1000),
+            'Features=instancetype=' + size.name,
+        ])
+
     def _get_slurm_state(self, nodename):
         return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', nodename])
 
@@ -36,10 +42,7 @@ class ComputeNodeSetupActor(SlurmMixin, SetupActorBase):
     def create_cloud_node(self):
         hostname = self.arvados_node.get("hostname")
         if hostname:
-            self._update_slurm_node(self.arvados_node['hostname'], [
-                'Weight=%i' % int(self.cloud_size.price * 1000),
-                'Features=instancetype='+self.cloud_size.name,
-            ])
+            self._update_slurm_size_attrs(hostname, self.cloud_size)
         return super(ComputeNodeSetupActor, self).create_cloud_node()
 
 
@@ -103,11 +106,14 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
 
 class ComputeNodeUpdateActor(SlurmMixin, UpdateActorBase):
     def sync_node(self, cloud_node, arvados_node):
+        """Keep SLURM's node properties up to date."""
         hostname = arvados_node.get("hostname")
         if hostname:
-            self._update_slurm_node(hostname, [
-                'Weight=%i' % int(cloud_node.size.price * 1000),
-                'Features=instancetype=' + cloud_node.size.name,
-            ])
+            # This is only needed when slurm has restarted and lost
+            # the dynamically configured node properties. So it's
+            # usually redundant, but detecting when it's necessary
+            # would be about the same amount of work as doing it
+            # repetitively.
+            self._update_slurm_size_attrs(hostname, cloud_node.size)
         return super(ComputeNodeUpdateActor, self).sync_node(
             cloud_node, arvados_node)
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index dd441ed..73b58bf 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -167,7 +167,6 @@ class NodeManagerDaemonActor(actor_class):
             cloud_node=cloud_node,
             cloud_node_start_time=start_time,
             shutdown_timer=shutdown_timer,
-            cloud_fqdn_func=self._cloud_driver.node_fqdn,
             update_actor=self._cloud_updater,
             timer_actor=self._timer,
             arvados_node=None,
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 1102bf7..0a2deb8 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -63,6 +63,7 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         self.assertEqual(self.arvados_effect[-1],
                          self.setup_actor.arvados_node.get(self.TIMEOUT))
         assert(finished.wait(self.TIMEOUT))
+        self.api_client.nodes().create.called_with(body={}, assign_slot=True)
         self.assertEqual(1, self.api_client.nodes().create().execute.call_count)
         self.assertEqual(1, self.api_client.nodes().update().execute.call_count)
         self.assert_node_properties_updated()
@@ -78,7 +79,8 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
                          self.setup_actor.arvados_node.get(self.TIMEOUT))
         assert(finished.wait(self.TIMEOUT))
         self.assert_node_properties_updated()
-        self.assertEqual(2, self.api_client.nodes().update().execute.call_count)
+        self.api_client.nodes().create.called_with(body={}, assign_slot=True)
+        self.assertEqual(3, self.api_client.nodes().update().execute.call_count)
         self.assertEqual(self.cloud_client.create_node(),
                          self.setup_actor.cloud_node.get(self.TIMEOUT))
 
@@ -195,7 +197,7 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
             start_time = time.time()
         monitor_actor = dispatch.ComputeNodeMonitorActor.start(
             self.cloud_node, start_time, self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
+            self.timer, self.updates, self.cloud_client,
             self.arvados_node)
         self.shutdown_actor = self.ACTOR_CLASS.start(
             self.timer, self.cloud_client, self.arvados_client, monitor_actor,
@@ -333,7 +335,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
             start_time = time.time()
         self.node_actor = dispatch.ComputeNodeMonitorActor.start(
             self.cloud_mock, start_time, self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
+            self.timer, self.updates, self.cloud_client,
             arv_node, boot_fail_after=300).proxy()
         self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
 
@@ -518,19 +520,10 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertEqual(testutil.ip_address_mock(4),
                          current_arvados['ip_address'])
 
-    def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
+    def test_update_arvados_node_calls_sync_node(self):
         self.make_mocks(5)
         self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
         self.make_actor()
         arv_node = testutil.arvados_node_mock(5)
         self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
         self.assertEqual(1, self.updates.sync_node.call_count)
-
-    def test_update_arvados_node_skips_sync_when_fqdn_match(self):
-        self.make_mocks(6)
-        arv_node = testutil.arvados_node_mock(6)
-        self.cloud_mock.extra['testname'] ='{n[hostname]}.{n[domain]}'.format(
-            n=arv_node)
-        self.make_actor()
-        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
-        self.assertEqual(0, self.updates.sync_node.call_count)
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index f896684..b61db5c 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -133,7 +133,11 @@ class SLURMComputeNodeSetupActorTestCase(ComputeNodeSetupActorTestCase):
 
     @mock.patch('subprocess.check_output')
     def test_update_node_features(self, check_output):
-        self.make_mocks()
+        # `scontrol update` happens only if the Arvados node record
+        # has a hostname. ComputeNodeSetupActorTestCase.make_mocks
+        # uses mocks with scrubbed hostnames, so we override with the
+        # default testutil.arvados_node_mock.
+        self.make_mocks(arvados_effect=[testutil.arvados_node_mock()])
         self.make_actor()
         self.wait_for_assignment(self.setup_actor, 'cloud_node')
         check_output.assert_called_with(['scontrol', 'update', 'NodeName=compute99', 'Weight=1000', 'Features=instancetype=z1.test'])
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index 6e13437..d13475b 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -55,7 +55,7 @@ def cloud_object_mock(name_id, **extra):
 def cloud_node_fqdn(node):
     # We intentionally put the FQDN somewhere goofy to make sure tested code is
     # using this function for lookups.
-    return node.extra.get('testname', 'NoTestName')
+    return node.extra.get('testname', node.name+'.NoTestName.invalid')
 
 def ip_address_mock(last_octet):
     return '10.20.30.{}'.format(last_octet)

commit 5120666e073018f6821bab07f0bbb788098f97b1
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Jan 29 20:03:23 2018 -0500

    12199: Assign slot and hostname when creating/reusing a node record.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/api/app/controllers/application_controller.rb b/services/api/app/controllers/application_controller.rb
index c94ce89..c4f64f6 100644
--- a/services/api/app/controllers/application_controller.rb
+++ b/services/api/app/controllers/application_controller.rb
@@ -554,6 +554,10 @@ class ApplicationController < ActionController::Base
     }
   end
 
+  def self._update_requires_parameters
+    {}
+  end
+
   def self._index_requires_parameters
     {
       filters: { type: 'array', required: false },
diff --git a/services/api/app/controllers/arvados/v1/nodes_controller.rb b/services/api/app/controllers/arvados/v1/nodes_controller.rb
index 7ee8c2f..73f1dee 100644
--- a/services/api/app/controllers/arvados/v1/nodes_controller.rb
+++ b/services/api/app/controllers/arvados/v1/nodes_controller.rb
@@ -20,6 +20,32 @@ class Arvados::V1::NodesController < ApplicationController
     { ping_secret: {required: true} }
   end
 
+  def self._create_requires_parameters
+    super.merge(
+      { assign_slot: {required: false, type: 'boolean', description: 'assign slot and hostname'} })
+  end
+
+  def self._update_requires_parameters
+    super.merge(
+      { assign_slot: {required: false, type: 'boolean', description: 'assign slot and hostname'} })
+  end
+
+  def create
+    @object = model_class.new(resource_attrs)
+    @object.assign_slot if params[:assign_slot]
+    @object.save!
+    show
+  end
+
+  def update
+    attrs_to_update = resource_attrs.reject { |k,v|
+      [:kind, :etag, :href].index k
+    }
+    @object.update_attributes!(attrs_to_update)
+    @object.assign_slot if params[:assign_slot]
+    show
+  end
+
   def ping
     act_as_system_user do
       @object = Node.where(uuid: (params[:id] || params[:uuid])).first
diff --git a/services/api/app/models/node.rb b/services/api/app/models/node.rb
index bf1b636..3d8b91b 100644
--- a/services/api/app/models/node.rb
+++ b/services/api/app/models/node.rb
@@ -106,27 +106,7 @@ class Node < ArvadosModel
       end
     end
 
-    # Assign slot_number
-    if self.slot_number.nil?
-      while true
-        n = self.class.available_slot_number
-        if n.nil?
-          raise "No available node slots"
-        end
-        self.slot_number = n
-        begin
-          self.save!
-          break
-        rescue ActiveRecord::RecordNotUnique
-          # try again
-        end
-      end
-    end
-
-    # Assign hostname
-    if self.hostname.nil? and Rails.configuration.assign_node_hostname
-      self.hostname = self.class.hostname_for_slot(self.slot_number)
-    end
+    assign_slot
 
     # Record other basic stats
     ['total_cpu_cores', 'total_ram_mb', 'total_scratch_mb'].each do |key|
@@ -140,8 +120,30 @@ class Node < ArvadosModel
     save!
   end
 
+  def assign_slot
+    return if self.slot_number.andand > 0
+    while true
+      self.slot_number = self.class.available_slot_number
+      if self.slot_number.nil?
+        raise "No available node slots"
+      end
+      begin
+        save!
+        return assign_hostname
+      rescue ActiveRecord::RecordNotUnique
+        # try again
+      end
+    end
+  end
+
   protected
 
+  def assign_hostname
+    if self.hostname.nil? and Rails.configuration.assign_node_hostname
+      self.hostname = self.class.hostname_for_slot(self.slot_number)
+    end
+  end
+
   def self.available_slot_number
     # Join the sequence 1..max with the nodes table. Return the first
     # (i.e., smallest) value that doesn't match the slot_number of any
diff --git a/services/api/test/functional/arvados/v1/nodes_controller_test.rb b/services/api/test/functional/arvados/v1/nodes_controller_test.rb
index f9e5be4..c198c4c 100644
--- a/services/api/test/functional/arvados/v1/nodes_controller_test.rb
+++ b/services/api/test/functional/arvados/v1/nodes_controller_test.rb
@@ -78,6 +78,40 @@ class Arvados::V1::NodesControllerTest < ActionController::TestCase
     assert_not_nil json_response['uuid']
     assert_not_nil json_response['info'].is_a? Hash
     assert_not_nil json_response['info']['ping_secret']
+    assert_nil json_response['slot_number']
+    assert_nil json_response['hostname']
+  end
+
+  test "create node and assign slot" do
+    authorize_with :admin
+    post :create, {node: {}, assign_slot: true}
+    assert_response :success
+    assert_not_nil json_response['uuid']
+    assert_not_nil json_response['info'].is_a? Hash
+    assert_not_nil json_response['info']['ping_secret']
+    assert_operator 0, :<, json_response['slot_number']
+    n = json_response['slot_number']
+    assert_equal "compute#{n}", json_response['hostname']
+  end
+
+  test "update node and assign slot" do
+    authorize_with :admin
+    node = nodes(:new_with_no_hostname)
+    post :update, {id: node.uuid, node: {}, assign_slot: true}
+    assert_response :success
+    assert_operator 0, :<, json_response['slot_number']
+    n = json_response['slot_number']
+    assert_equal "compute#{n}", json_response['hostname']
+  end
+
+  test "update node and assign slot, don't clobber hostname" do
+    authorize_with :admin
+    node = nodes(:new_with_custom_hostname)
+    post :update, {id: node.uuid, node: {}, assign_slot: true}
+    assert_response :success
+    assert_operator 0, :<, json_response['slot_number']
+    n = json_response['slot_number']
+    assert_equal "custom1", json_response['hostname']
   end
 
   test "ping adds node stats to info" do
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 6c61e32..67ea8a2 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -113,14 +113,16 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
     @ComputeNodeStateChangeBase._finish_on_exception
     @RetryMixin._retry(config.ARVADOS_ERRORS)
     def create_arvados_node(self):
-        self.arvados_node = self._arvados.nodes().create(body={}).execute()
+        self.arvados_node = self._arvados.nodes().create(
+            body={}, assign_slot=True).execute()
         self._later.create_cloud_node()
 
     @ComputeNodeStateChangeBase._finish_on_exception
     @RetryMixin._retry(config.ARVADOS_ERRORS)
     def prepare_arvados_node(self, node):
-        self.arvados_node = self._clean_arvados_node(
-            node, "Prepared by Node Manager")
+        self._clean_arvados_node(node, "Prepared by Node Manager")
+        self.arvados_node = self._arvados.nodes().update(
+            body={}, assign_slot=True).execute()
         self._later.create_cloud_node()
 
     @ComputeNodeStateChangeBase._finish_on_exception
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index b62ce56..1102bf7 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -25,7 +25,12 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
 
     def make_mocks(self, arvados_effect=None):
         if arvados_effect is None:
-            arvados_effect = [testutil.arvados_node_mock()]
+            arvados_effect = [testutil.arvados_node_mock(
+                slot_number=None,
+                hostname=None,
+                first_ping_at=None,
+                last_ping_at=None,
+            )]
         self.arvados_effect = arvados_effect
         self.timer = testutil.MockTimer()
         self.api_client = mock.MagicMock(name='api_client')

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list