[ARVADOS] updated: 1.1.2-141-g3ccc70f
Git user
git at public.curoverse.com
Tue Jan 30 13:39:04 EST 2018
Summary of changes:
.../api/app/controllers/application_controller.rb | 4 ++
.../app/controllers/arvados/v1/nodes_controller.rb | 26 +++++++++++++
services/api/app/models/node.rb | 44 +++++++++++-----------
.../functional/arvados/v1/nodes_controller_test.rb | 34 +++++++++++++++++
.../arvnodeman/computenode/dispatch/__init__.py | 25 ++++++------
.../arvnodeman/computenode/dispatch/slurm.py | 22 +++++++----
services/nodemanager/arvnodeman/daemon.py | 1 -
.../nodemanager/tests/test_computenode_dispatch.py | 26 ++++++-------
.../tests/test_computenode_dispatch_slurm.py | 6 ++-
services/nodemanager/tests/testutil.py | 2 +-
10 files changed, 133 insertions(+), 57 deletions(-)
via 3ccc70f4bb06bab6c0b3c71f555cba24cc5c6a47 (commit)
via 5120666e073018f6821bab07f0bbb788098f97b1 (commit)
from 80da19707253af74bd78c374bfcab64b04d2dbde (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 3ccc70f4bb06bab6c0b3c71f555cba24cc5c6a47
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Jan 30 13:33:12 2018 -0500
12199: Keep SLURM's node properties up to date.
Change the semantics of ComputeNodeUpdateActor.sync_node so it gets
called every time a new Arvados node record appears, even if hostnames
match. The base actor's implementation now compares hostnames itself
before calling the cloud driver.
This allows the slurm update actor's sync_node method to sync SLURM
state periodically, even if hostnames don't go out of sync.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 67ea8a2..597a011 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -317,7 +317,8 @@ class ComputeNodeUpdateActor(config.actor_class, RetryMixin):
@RetryMixin._retry()
def sync_node(self, cloud_node, arvados_node):
- return self._cloud.sync_node(cloud_node, arvados_node)
+ if self._cloud.node_fqdn(cloud_node) != arvados_node_fqdn(arvados_node):
+ return self._cloud.sync_node(cloud_node, arvados_node)
class ComputeNodeMonitorActor(config.actor_class):
@@ -328,14 +329,13 @@ class ComputeNodeMonitorActor(config.actor_class):
for shutdown.
"""
def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
- cloud_fqdn_func, timer_actor, update_actor, cloud_client,
+ timer_actor, update_actor, cloud_client,
arvados_node=None, poll_stale_after=600, node_stale_after=3600,
boot_fail_after=1800
):
super(ComputeNodeMonitorActor, self).__init__()
self._later = self.actor_ref.tell_proxy()
self._shutdowns = shutdown_timer
- self._cloud_node_fqdn = cloud_fqdn_func
self._timer = timer_actor
self._update = update_actor
self._cloud = cloud_client
@@ -488,8 +488,11 @@ class ComputeNodeMonitorActor(config.actor_class):
self._later.consider_shutdown()
def update_arvados_node(self, arvados_node):
- # If the cloud node's FQDN doesn't match what's in the Arvados node
- # record, make them match.
+ """Called when the latest Arvados node record is retrieved.
+
+ Calls the updater's sync_node() method.
+
+ """
# This method is a little unusual in the way it just fires off the
# request without checking the result or retrying errors. That's
# because this update happens every time we reload the Arvados node
@@ -498,7 +501,5 @@ class ComputeNodeMonitorActor(config.actor_class):
# the logic to throttle those effective retries when there's trouble.
if arvados_node is not None:
self.arvados_node = arvados_node
- if (self._cloud_node_fqdn(self.cloud_node) !=
- arvados_node_fqdn(self.arvados_node)):
- self._update.sync_node(self.cloud_node, self.arvados_node)
+ self._update.sync_node(self.cloud_node, self.arvados_node)
self._later.consider_shutdown()
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index d022892..13a2a6a 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -28,6 +28,12 @@ class SlurmMixin(object):
self._logger.error(
"SLURM update %r failed", cmd, exc_info=True)
+ def _update_slurm_size_attrs(self, nodename, size):
+ self._update_slurm_node(nodename, [
+ 'Weight=%i' % int(size.price * 1000),
+ 'Features=instancetype=' + size.name,
+ ])
+
def _get_slurm_state(self, nodename):
return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', nodename])
@@ -36,10 +42,7 @@ class ComputeNodeSetupActor(SlurmMixin, SetupActorBase):
def create_cloud_node(self):
hostname = self.arvados_node.get("hostname")
if hostname:
- self._update_slurm_node(self.arvados_node['hostname'], [
- 'Weight=%i' % int(self.cloud_size.price * 1000),
- 'Features=instancetype='+self.cloud_size.name,
- ])
+ self._update_slurm_size_attrs(hostname, self.cloud_size)
return super(ComputeNodeSetupActor, self).create_cloud_node()
@@ -103,11 +106,14 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
class ComputeNodeUpdateActor(SlurmMixin, UpdateActorBase):
def sync_node(self, cloud_node, arvados_node):
+ """Keep SLURM's node properties up to date."""
hostname = arvados_node.get("hostname")
if hostname:
- self._update_slurm_node(hostname, [
- 'Weight=%i' % int(cloud_node.size.price * 1000),
- 'Features=instancetype=' + cloud_node.size.name,
- ])
+ # This is only needed when slurm has restarted and lost
+ # the dynamically configured node properties. So it's
+ # usually redundant, but detecting when it's necessary
+ # would be about the same amount of work as doing it
+ # repetitively.
+ self._update_slurm_size_attrs(hostname, cloud_node.size)
return super(ComputeNodeUpdateActor, self).sync_node(
cloud_node, arvados_node)
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index dd441ed..73b58bf 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -167,7 +167,6 @@ class NodeManagerDaemonActor(actor_class):
cloud_node=cloud_node,
cloud_node_start_time=start_time,
shutdown_timer=shutdown_timer,
- cloud_fqdn_func=self._cloud_driver.node_fqdn,
update_actor=self._cloud_updater,
timer_actor=self._timer,
arvados_node=None,
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 1102bf7..0a2deb8 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -63,6 +63,7 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
self.assertEqual(self.arvados_effect[-1],
self.setup_actor.arvados_node.get(self.TIMEOUT))
assert(finished.wait(self.TIMEOUT))
+ self.api_client.nodes().create.called_with(body={}, assign_slot=True)
self.assertEqual(1, self.api_client.nodes().create().execute.call_count)
self.assertEqual(1, self.api_client.nodes().update().execute.call_count)
self.assert_node_properties_updated()
@@ -78,7 +79,8 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
self.setup_actor.arvados_node.get(self.TIMEOUT))
assert(finished.wait(self.TIMEOUT))
self.assert_node_properties_updated()
- self.assertEqual(2, self.api_client.nodes().update().execute.call_count)
+ self.api_client.nodes().create.called_with(body={}, assign_slot=True)
+ self.assertEqual(3, self.api_client.nodes().update().execute.call_count)
self.assertEqual(self.cloud_client.create_node(),
self.setup_actor.cloud_node.get(self.TIMEOUT))
@@ -195,7 +197,7 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
start_time = time.time()
monitor_actor = dispatch.ComputeNodeMonitorActor.start(
self.cloud_node, start_time, self.shutdowns,
- testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
+ self.timer, self.updates, self.cloud_client,
self.arvados_node)
self.shutdown_actor = self.ACTOR_CLASS.start(
self.timer, self.cloud_client, self.arvados_client, monitor_actor,
@@ -333,7 +335,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
start_time = time.time()
self.node_actor = dispatch.ComputeNodeMonitorActor.start(
self.cloud_mock, start_time, self.shutdowns,
- testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
+ self.timer, self.updates, self.cloud_client,
arv_node, boot_fail_after=300).proxy()
self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
@@ -518,19 +520,10 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
self.assertEqual(testutil.ip_address_mock(4),
current_arvados['ip_address'])
- def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
+ def test_update_arvados_node_calls_sync_node(self):
self.make_mocks(5)
self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
self.make_actor()
arv_node = testutil.arvados_node_mock(5)
self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
self.assertEqual(1, self.updates.sync_node.call_count)
-
- def test_update_arvados_node_skips_sync_when_fqdn_match(self):
- self.make_mocks(6)
- arv_node = testutil.arvados_node_mock(6)
- self.cloud_mock.extra['testname'] ='{n[hostname]}.{n[domain]}'.format(
- n=arv_node)
- self.make_actor()
- self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
- self.assertEqual(0, self.updates.sync_node.call_count)
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index f896684..b61db5c 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -133,7 +133,11 @@ class SLURMComputeNodeSetupActorTestCase(ComputeNodeSetupActorTestCase):
@mock.patch('subprocess.check_output')
def test_update_node_features(self, check_output):
- self.make_mocks()
+ # `scontrol update` happens only if the Arvados node record
+ # has a hostname. ComputeNodeSetupActorTestCase.make_mocks
+ # uses mocks with scrubbed hostnames, so we override with the
+ # default testutil.arvados_node_mock.
+ self.make_mocks(arvados_effect=[testutil.arvados_node_mock()])
self.make_actor()
self.wait_for_assignment(self.setup_actor, 'cloud_node')
check_output.assert_called_with(['scontrol', 'update', 'NodeName=compute99', 'Weight=1000', 'Features=instancetype=z1.test'])
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
index 6e13437..d13475b 100644
--- a/services/nodemanager/tests/testutil.py
+++ b/services/nodemanager/tests/testutil.py
@@ -55,7 +55,7 @@ def cloud_object_mock(name_id, **extra):
def cloud_node_fqdn(node):
# We intentionally put the FQDN somewhere goofy to make sure tested code is
# using this function for lookups.
- return node.extra.get('testname', 'NoTestName')
+ return node.extra.get('testname', node.name+'.NoTestName.invalid')
def ip_address_mock(last_octet):
return '10.20.30.{}'.format(last_octet)
commit 5120666e073018f6821bab07f0bbb788098f97b1
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Mon Jan 29 20:03:23 2018 -0500
12199: Assign slot and hostname when creating/reusing a node record.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/api/app/controllers/application_controller.rb b/services/api/app/controllers/application_controller.rb
index c94ce89..c4f64f6 100644
--- a/services/api/app/controllers/application_controller.rb
+++ b/services/api/app/controllers/application_controller.rb
@@ -554,6 +554,10 @@ class ApplicationController < ActionController::Base
}
end
+ def self._update_requires_parameters
+ {}
+ end
+
def self._index_requires_parameters
{
filters: { type: 'array', required: false },
diff --git a/services/api/app/controllers/arvados/v1/nodes_controller.rb b/services/api/app/controllers/arvados/v1/nodes_controller.rb
index 7ee8c2f..73f1dee 100644
--- a/services/api/app/controllers/arvados/v1/nodes_controller.rb
+++ b/services/api/app/controllers/arvados/v1/nodes_controller.rb
@@ -20,6 +20,32 @@ class Arvados::V1::NodesController < ApplicationController
{ ping_secret: {required: true} }
end
+ def self._create_requires_parameters
+ super.merge(
+ { assign_slot: {required: false, type: 'boolean', description: 'assign slot and hostname'} })
+ end
+
+ def self._update_requires_parameters
+ super.merge(
+ { assign_slot: {required: false, type: 'boolean', description: 'assign slot and hostname'} })
+ end
+
+ def create
+ @object = model_class.new(resource_attrs)
+ @object.assign_slot if params[:assign_slot]
+ @object.save!
+ show
+ end
+
+ def update
+ attrs_to_update = resource_attrs.reject { |k,v|
+ [:kind, :etag, :href].index k
+ }
+ @object.update_attributes!(attrs_to_update)
+ @object.assign_slot if params[:assign_slot]
+ show
+ end
+
def ping
act_as_system_user do
@object = Node.where(uuid: (params[:id] || params[:uuid])).first
diff --git a/services/api/app/models/node.rb b/services/api/app/models/node.rb
index bf1b636..3d8b91b 100644
--- a/services/api/app/models/node.rb
+++ b/services/api/app/models/node.rb
@@ -106,27 +106,7 @@ class Node < ArvadosModel
end
end
- # Assign slot_number
- if self.slot_number.nil?
- while true
- n = self.class.available_slot_number
- if n.nil?
- raise "No available node slots"
- end
- self.slot_number = n
- begin
- self.save!
- break
- rescue ActiveRecord::RecordNotUnique
- # try again
- end
- end
- end
-
- # Assign hostname
- if self.hostname.nil? and Rails.configuration.assign_node_hostname
- self.hostname = self.class.hostname_for_slot(self.slot_number)
- end
+ assign_slot
# Record other basic stats
['total_cpu_cores', 'total_ram_mb', 'total_scratch_mb'].each do |key|
@@ -140,8 +120,30 @@ class Node < ArvadosModel
save!
end
+ def assign_slot
+ return if self.slot_number.andand > 0
+ while true
+ self.slot_number = self.class.available_slot_number
+ if self.slot_number.nil?
+ raise "No available node slots"
+ end
+ begin
+ save!
+ return assign_hostname
+ rescue ActiveRecord::RecordNotUnique
+ # try again
+ end
+ end
+ end
+
protected
+ def assign_hostname
+ if self.hostname.nil? and Rails.configuration.assign_node_hostname
+ self.hostname = self.class.hostname_for_slot(self.slot_number)
+ end
+ end
+
def self.available_slot_number
# Join the sequence 1..max with the nodes table. Return the first
# (i.e., smallest) value that doesn't match the slot_number of any
diff --git a/services/api/test/functional/arvados/v1/nodes_controller_test.rb b/services/api/test/functional/arvados/v1/nodes_controller_test.rb
index f9e5be4..c198c4c 100644
--- a/services/api/test/functional/arvados/v1/nodes_controller_test.rb
+++ b/services/api/test/functional/arvados/v1/nodes_controller_test.rb
@@ -78,6 +78,40 @@ class Arvados::V1::NodesControllerTest < ActionController::TestCase
assert_not_nil json_response['uuid']
assert_not_nil json_response['info'].is_a? Hash
assert_not_nil json_response['info']['ping_secret']
+ assert_nil json_response['slot_number']
+ assert_nil json_response['hostname']
+ end
+
+ test "create node and assign slot" do
+ authorize_with :admin
+ post :create, {node: {}, assign_slot: true}
+ assert_response :success
+ assert_not_nil json_response['uuid']
+ assert_not_nil json_response['info'].is_a? Hash
+ assert_not_nil json_response['info']['ping_secret']
+ assert_operator 0, :<, json_response['slot_number']
+ n = json_response['slot_number']
+ assert_equal "compute#{n}", json_response['hostname']
+ end
+
+ test "update node and assign slot" do
+ authorize_with :admin
+ node = nodes(:new_with_no_hostname)
+ post :update, {id: node.uuid, node: {}, assign_slot: true}
+ assert_response :success
+ assert_operator 0, :<, json_response['slot_number']
+ n = json_response['slot_number']
+ assert_equal "compute#{n}", json_response['hostname']
+ end
+
+ test "update node and assign slot, don't clobber hostname" do
+ authorize_with :admin
+ node = nodes(:new_with_custom_hostname)
+ post :update, {id: node.uuid, node: {}, assign_slot: true}
+ assert_response :success
+ assert_operator 0, :<, json_response['slot_number']
+ n = json_response['slot_number']
+ assert_equal "custom1", json_response['hostname']
end
test "ping adds node stats to info" do
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 6c61e32..67ea8a2 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -113,14 +113,16 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def create_arvados_node(self):
- self.arvados_node = self._arvados.nodes().create(body={}).execute()
+ self.arvados_node = self._arvados.nodes().create(
+ body={}, assign_slot=True).execute()
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
def prepare_arvados_node(self, node):
- self.arvados_node = self._clean_arvados_node(
- node, "Prepared by Node Manager")
+ self._clean_arvados_node(node, "Prepared by Node Manager")
+ self.arvados_node = self._arvados.nodes().update(
+ body={}, assign_slot=True).execute()
self._later.create_cloud_node()
@ComputeNodeStateChangeBase._finish_on_exception
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index b62ce56..1102bf7 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -25,7 +25,12 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
def make_mocks(self, arvados_effect=None):
if arvados_effect is None:
- arvados_effect = [testutil.arvados_node_mock()]
+ arvados_effect = [testutil.arvados_node_mock(
+ slot_number=None,
+ hostname=None,
+ first_ping_at=None,
+ last_ping_at=None,
+ )]
self.arvados_effect = arvados_effect
self.timer = testutil.MockTimer()
self.api_client = mock.MagicMock(name='api_client')
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list