[ARVADOS] created: 3ee90fd536fbfc9d8d91d5fac7c12d0ebe0df5ab
git at public.curoverse.com
git at public.curoverse.com
Mon Nov 10 16:28:46 EST 2014
at 3ee90fd536fbfc9d8d91d5fac7c12d0ebe0df5ab (commit)
commit 3ee90fd536fbfc9d8d91d5fac7c12d0ebe0df5ab
Author: Tim Pierce <twp at curoverse.com>
Date: Fri Nov 7 09:37:24 2014 -0500
4294: added min_nodes config parameter
Added min_nodes configuration setting. The job queue will return a
wishlist with at least min_nodes elements in it, and the node manager
daemon will avoid shutting down a node if doing so would bring the total
node count below min_nodes.
* arvnodeman.config sets default min_nodes to 0.
* NodeManagerDaemonActor:
** _nodes_wanted returns at least enough to keep the node count at or above
min_nodes
** _nodes_excess will not allow the node count to drop below min_nodes
* jobqueue.ServerCalculator.servers_for_queue returns a list with at least
min_nodes elements
* added test cases:
** NodeManagerDaemonActorTestCase.test_shutdown_declined_below_min_nodes
** ServerCalculatorTestCase.test_server_calc_returns_at_least_min_nodes
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index 1699584..754b931 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -37,7 +37,8 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
for sec_name, settings in {
'Arvados': {'insecure': 'no',
'timeout': '15'},
- 'Daemon': {'max_nodes': '1',
+ 'Daemon': {'min_nodes': '0',
+ 'max_nodes': '1',
'poll_time': '60',
'max_poll_time': '300',
'poll_stale_after': '600',
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 83e3ec9..d950e2a 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -94,7 +94,7 @@ class NodeManagerDaemonActor(actor_class):
def __init__(self, server_wishlist_actor, arvados_nodes_actor,
cloud_nodes_actor, cloud_update_actor, timer_actor,
arvados_factory, cloud_factory,
- shutdown_windows, max_nodes,
+ shutdown_windows, min_nodes, max_nodes,
poll_stale_after=600, node_stale_after=7200,
node_setup_class=cnode.ComputeNodeSetupActor,
node_shutdown_class=cnode.ComputeNodeShutdownActor,
@@ -111,6 +111,7 @@ class NodeManagerDaemonActor(actor_class):
self._logger = logging.getLogger('arvnodeman.daemon')
self._later = self.actor_ref.proxy()
self.shutdown_windows = shutdown_windows
+ self.min_nodes = min_nodes
self.max_nodes = max_nodes
self.poll_stale_after = poll_stale_after
self.node_stale_after = node_stale_after
@@ -203,7 +204,8 @@ class NodeManagerDaemonActor(actor_class):
self.max_nodes) - self._nodes_up()
def _nodes_excess(self):
- return self._nodes_up() - self._nodes_busy() - len(self.last_wishlist)
+ idle_nodes = self._nodes_busy() + len(self.last_wishlist)
+ return (self._nodes_up() - max(self.min_nodes, idle_nodes))
def update_server_wishlist(self, wishlist):
self._update_poll_time('server_wishlist')
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 59659fe..239934f 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -38,10 +38,11 @@ class ServerCalculator(object):
return True
- def __init__(self, server_list, max_nodes=None):
+ def __init__(self, server_list, min_nodes=0, max_nodes=None):
self.cloud_sizes = [self.CloudSizeWrapper(s, **kws)
for s, kws in server_list]
self.cloud_sizes.sort(key=lambda s: s.price)
+ self.min_nodes = min_nodes
self.max_nodes = max_nodes or float('inf')
self.logger = logging.getLogger('arvnodeman.jobqueue')
self.logged_jobs = set()
@@ -78,6 +79,13 @@ class ServerCalculator(object):
elif (want_count <= self.max_nodes):
servers.extend([cloud_size.real] * max(1, want_count))
self.logged_jobs.intersection_update(seen_jobs)
+
+ # Make sure the server queue has at least enough entries to
+ # satisfy min_nodes.
+ node_shortfall = self.min_nodes - len(servers)
+ if node_shortfall > 0:
+ basic_node = self.cloud_size_for_constraints({})
+ servers.extend([basic_node.real] * node_shortfall)
return servers
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 87f2dda..f4ad716 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -68,7 +68,9 @@ def launch_pollers(config):
abort("No valid node sizes configured")
server_calculator = ServerCalculator(
- cloud_size_list, config.getint('Daemon', 'max_nodes'))
+ cloud_size_list,
+ config.getint('Daemon', 'min_nodes'),
+ config.getint('Daemon', 'max_nodes'))
poll_time = config.getint('Daemon', 'poll_time')
max_poll_time = config.getint('Daemon', 'max_poll_time')
@@ -115,7 +117,9 @@ def main(args=None):
job_queue_poller, arvados_node_poller, cloud_node_poller,
cloud_node_updater, timer,
config.new_arvados_client, config.new_cloud_client,
- config.shutdown_windows(), config.getint('Daemon', 'max_nodes'),
+ config.shutdown_windows(),
+ config.getint('Daemon', 'min_nodes'),
+ config.getint('Daemon', 'max_nodes'),
config.getint('Daemon', 'poll_stale_after'),
config.getint('Daemon', 'node_stale_after')).proxy()
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 93e4435..4bffd09 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -15,7 +15,7 @@ from . import testutil
class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
unittest.TestCase):
def make_daemon(self, cloud_nodes=[], arvados_nodes=[], want_sizes=[],
- max_nodes=8):
+ min_nodes=0, max_nodes=8):
for name in ['cloud_nodes', 'arvados_nodes', 'server_wishlist']:
setattr(self, name + '_poller', mock.MagicMock(name=name + '_mock'))
self.arv_factory = mock.MagicMock(name='arvados_mock')
@@ -29,7 +29,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.server_wishlist_poller, self.arvados_nodes_poller,
self.cloud_nodes_poller, self.cloud_updates, self.timer,
self.arv_factory, self.cloud_factory,
- [54, 5, 1], max_nodes, 600, 3600,
+ [54, 5, 1], min_nodes, max_nodes, 600, 3600,
self.node_setup, self.node_shutdown).proxy()
if cloud_nodes is not None:
self.daemon.update_cloud_nodes(cloud_nodes).get(self.TIMEOUT)
@@ -227,6 +227,15 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.stop_proxy(self.daemon)
self.assertFalse(self.node_shutdown.start.called)
+ def test_shutdown_declined_below_min_nodes(self):
+ cloud_node = testutil.cloud_node_mock(1)
+ self.make_daemon(cloud_nodes=[cloud_node], min_nodes=1)
+ self.assertEqual(1, self.alive_monitor_count())
+ monitor = self.monitor_list()[0].proxy()
+ self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertFalse(self.node_shutdown.start.called)
+
def test_shutdown_accepted_below_capacity(self):
self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
self.assertEqual(1, self.alive_monitor_count())
diff --git a/services/nodemanager/tests/test_jobqueue.py b/services/nodemanager/tests/test_jobqueue.py
index 158a3fd..6b5f532 100644
--- a/services/nodemanager/tests/test_jobqueue.py
+++ b/services/nodemanager/tests/test_jobqueue.py
@@ -48,6 +48,11 @@ class ServerCalculatorTestCase(unittest.TestCase):
{'min_scratch_mb_per_node': 200})
self.assertEqual(6, len(servlist))
+ def test_server_calc_returns_at_least_min_nodes(self):
+ servcalc = self.make_calculator([1], min_nodes=5, max_nodes=9)
+ servlist = self.calculate(servcalc, {})
+ self.assertEqual(5, len(servlist))
+
def test_job_requesting_max_nodes_accepted(self):
servcalc = self.make_calculator([1], max_nodes=4)
servlist = self.calculate(servcalc, {'min_nodes': 4})
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list