[ARVADOS] updated: 9b128b6391d9bbe7302f5df47aba139e09a267fd
Git user  git at public.curoverse.com
Fri Jun 2 17:35:39 EDT 2017
       via  9b128b6391d9bbe7302f5df47aba139e09a267fd (commit)
      from  09794d996eca79b85d3ac0c21a4a43c65a51d0d7 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

Summary of changes:
 services/nodemanager/arvnodeman/daemon.py      |   7 +-
 .../nodemanager/arvnodeman/test/fake_driver.py |  31 +++
 services/nodemanager/tests/fake.cfg.template   |   2 +-
 services/nodemanager/tests/integration_test.py | 253 ++++++++++++++++-----
 4 files changed, 238 insertions(+), 55 deletions(-)
commit 9b128b6391d9bbe7302f5df47aba139e09a267fd
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Jun 2 17:35:15 2017 -0400
10312: Tests pass for booting single node, multiple nodes, hitting quota. Working on quota probe behavior.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 029d818..58fa0a4 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -407,13 +407,13 @@ class NodeManagerDaemonActor(actor_class):
setup_proxy, 'cloud_node', 'arvados_node', 'error')
setup_proxy.stop()
- total_node_count = self._nodes_booting(None) + len(self.cloud_nodes)
+ up_node_count = len(self.cloud_nodes)
if cloud_node is None:
# If cloud_node is None then the node create wasn't successful.
if error == dispatch.QuotaExceeded:
# We've hit a quota limit, so adjust node_quota to stop trying to
# boot new nodes until the node count goes down.
- self.node_quota = min(total_node_count-1, self.max_nodes)
+ self.node_quota = min(up_node_count, self.max_nodes)
self._logger.warning("Setting node quota to %s", self.node_quota)
else:
# Node creation succeeded. Update cloud node list.
@@ -432,8 +432,9 @@ class NodeManagerDaemonActor(actor_class):
# now booting single core machines (actual quota 20), we want to
# allow the quota to expand so we don't get stuck at 10 machines
# forever.
- if total_node_count == self.node_quota and self.node_quota < self.max_nodes:
+ if up_node_count == self.node_quota and self.node_quota < self.max_nodes:
self.node_quota += 1
+ self._logger.warning("Setting node quota to %s", self.node_quota)
del self.booting[setup_proxy.actor_ref.actor_urn]
del self.sizes_booting[setup_proxy.actor_ref.actor_urn]
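
The daemon.py hunk above swaps total_node_count (which also counted nodes still booting) for up_node_count when reacting to a create failure, and now logs the quota whenever it moves so the integration tests can match on "Setting node quota". The underlying idea is clamp-then-probe: on a QuotaExceeded error, clamp node_quota to the number of nodes actually up; after a successful boot that fills the current quota, nudge the quota up by one so an initially low estimate does not pin the pool below the real cloud limit. A minimal standalone sketch of that idea, assuming a hypothetical QuotaTracker helper rather than the real NodeManagerDaemonActor:

# Sketch only: QuotaTracker is a hypothetical helper, not part of
# arvados-node-manager; it isolates the clamp-then-probe quota logic.

class QuotaTracker(object):
    def __init__(self, max_nodes):
        self.max_nodes = max_nodes
        self.node_quota = max_nodes

    def on_quota_exceeded(self, up_node_count):
        # Back off: don't try to run more nodes than are already up.
        self.node_quota = min(up_node_count, self.max_nodes)
        return self.node_quota

    def on_node_up(self, up_node_count):
        # Probe: once the fleet fills the current quota, allow one more node,
        # so a low estimate can grow back toward max_nodes.
        if up_node_count == self.node_quota and self.node_quota < self.max_nodes:
            self.node_quota += 1
        return self.node_quota

if __name__ == "__main__":
    q = QuotaTracker(max_nodes=8)
    assert q.on_quota_exceeded(up_node_count=2) == 2   # clamp to what is running
    assert q.on_node_up(up_node_count=2) == 3          # probe upward again
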
diff --git a/services/nodemanager/arvnodeman/test/fake_driver.py b/services/nodemanager/arvnodeman/test/fake_driver.py
index be0789e..1da61ff 100644
--- a/services/nodemanager/arvnodeman/test/fake_driver.py
+++ b/services/nodemanager/arvnodeman/test/fake_driver.py
@@ -3,12 +3,14 @@ import urllib
import ssl
from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState
+from libcloud.common.exceptions import BaseHTTPError
all_nodes = []
class FakeDriver(NodeDriver):
def __init__(self, *args, **kwargs):
self.name = "FakeDriver"
+ self.create_calls = 0
def list_sizes(self, **kwargs):
return [NodeSize("Standard_D3", "Standard_D3", 3500, 200, 0, 0, self),
@@ -27,6 +29,7 @@ class FakeDriver(NodeDriver):
ex_user_name=None,
ex_tags=None,
ex_network=None):
+ self.create_calls += 1
global all_nodes
all_nodes.append(Node(name, name, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags}))
ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0] + "&instance_id=" + name
@@ -46,3 +49,31 @@ class FakeDriver(NodeDriver):
def ex_create_tags(self, cloud_node, tags):
pass
+
+class QuotaDriver(FakeDriver):
+
+ def create_node(self, name=None,
+ size=None,
+ image=None,
+ auth=None,
+ ex_storage_account=None,
+ ex_customdata=None,
+ ex_resource_group=None,
+ ex_user_name=None,
+ ex_tags=None,
+ ex_network=None):
+ global all_nodes
+ if len(all_nodes) >= 2 and self.create_calls < 3:
+ self.create_calls += 1
+ raise BaseHTTPError(503, "Quota exceeded")
+ else:
+ return super(QuotaDriver, self).create_node(name=name,
+ size=size,
+ image=image,
+ auth=auth,
+ ex_storage_account=ex_storage_account,
+ ex_customdata=ex_customdata,
+ ex_resource_group=ex_resource_group,
+ ex_user_name=ex_user_name,
+ ex_tags=ex_tags,
+ ex_network=ex_network)
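
QuotaDriver above extends FakeDriver so that, once two fake nodes exist, the next few create_node calls raise libcloud's BaseHTTPError(503, "Quota exceeded"), which lets the integration tests exercise the daemon's quota back-off and recovery without touching a real cloud. Stripped of libcloud, the pattern is simply a fake that refuses creates past a threshold for a limited number of attempts. A self-contained sketch in that spirit, using stand-in names (FakeQuotaError, TinyQuotaDriver) rather than the classes in the diff:

# Sketch only: FakeQuotaError and TinyQuotaDriver are illustrative stand-ins,
# not libcloud's BaseHTTPError or the QuotaDriver class above.

class FakeQuotaError(Exception):
    def __init__(self, http_code, message):
        super(FakeQuotaError, self).__init__(message)
        self.http_code = http_code

class TinyQuotaDriver(object):
    def __init__(self, quota=2, refusals=3):
        self.nodes = []
        self.refused = 0
        self.quota = quota        # pretend provider limit
        self.refusals = refusals  # how many creates to refuse before relenting

    def create_node(self, name):
        # Refuse creates with a 503 once `quota` nodes exist, but only for a
        # limited number of attempts, so a caller can observe both the
        # back-off and the later recovery.
        if len(self.nodes) >= self.quota and self.refused < self.refusals:
            self.refused += 1
            raise FakeQuotaError(503, "Quota exceeded")
        self.nodes.append(name)
        return name

if __name__ == "__main__":
    d = TinyQuotaDriver()
    d.create_node("n1"); d.create_node("n2")
    try:
        d.create_node("n3")
    except FakeQuotaError as e:
        print("refused with HTTP %d" % e.http_code)
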
diff --git a/services/nodemanager/tests/fake.cfg.template b/services/nodemanager/tests/fake.cfg.template
index 631745a..7d43dec 100644
--- a/services/nodemanager/tests/fake.cfg.template
+++ b/services/nodemanager/tests/fake.cfg.template
@@ -49,7 +49,7 @@ poll_stale_after = 600
# node before this long, assume that there was a cloud bootstrap failure and
# shut it down. Note that normal shutdown windows apply (see the Cloud
# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 20
+boot_fail_after = 45
# "Node stale time" affects two related behaviors.
# 1. If a compute node has been running for at least this long, but it
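
Raising boot_fail_after from 20 to 45 seconds gives the slower multi-node and quota scenarios added below enough time to pair their fake nodes before node manager treats them as bootstrap failures. The rule the comment describes amounts to a timeout on unpaired nodes; a tiny illustrative sketch, with hypothetical names rather than node manager's internals:

import time

# Sketch only: a node that has not paired within boot_fail_after seconds of
# booting is treated as a bootstrap failure and becomes a shutdown candidate.

BOOT_FAIL_AFTER = 45  # seconds, matching the test config above

def boot_failed(boot_started_at, paired, now=None):
    now = time.time() if now is None else now
    return (not paired) and (now - boot_started_at > BOOT_FAIL_AFTER)

assert boot_failed(boot_started_at=0.0, paired=False, now=46.0)
assert not boot_failed(boot_started_at=0.0, paired=True, now=1000.0)
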
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index 90bf237..37ca206 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -8,52 +8,87 @@ import logging
import stat
import tempfile
import shutil
+from functools import partial
logging.basicConfig(level=logging.INFO)
fake_slurm = None
compute_nodes = None
+all_jobs = None
def update_script(path, val):
with open(path+"_", "w") as f:
f.write(val)
os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
os.rename(path+"_", path)
+ logging.info("Update script %s: %s", path, val)
+def set_squeue(g):
+ global all_jobs
+ update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
+ "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
+ return 0
+
+
+def node_paired(g):
+ global compute_nodes
+ compute_nodes[g.group(1)] = g.group(3)
+
+ update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
+ "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
+
+ for k,v in all_jobs.items():
+ if v == "ReqNodeNotAvail":
+ all_jobs[k] = "Running"
+ break
+
+ set_squeue(g)
-def set_squeue(actions, checks, k, g):
- update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
-echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
-""")
return 0
-def set_sinfo_alloc(actions, checks, k, g):
- update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
-echo '%s alloc'
-""" % (g.group(3)))
+def remaining_jobs(g):
+ update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
+ "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
+
+ for k,v in all_jobs.items():
+ all_jobs[k] = "Running"
- update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
-echo '1|100|100|Running|34t0i-dz642-h42bg3hq4bdfpf9'
-""")
+ set_squeue(g)
+ return 0
+
+
+def node_busy(g):
+ update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
+ "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
+ return 0
+
+def node_shutdown(g):
global compute_nodes
- compute_nodes[g.group(1)] = g.group(3)
+ del compute_nodes[g.group(1)]
return 0
-def set_sinfo_idle(actions, checks, k, g):
- update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
-echo '%s idle'
-""" % (compute_nodes[g.group(1)]))
+def jobs_req(g):
+ global all_jobs
+ for k,v in all_jobs.items():
+ all_jobs[k] = "ReqNodeNotAvail"
+ set_squeue(g)
return 0
-def noop(actions, checks, k, g):
+def noop(g):
return 0
-def down_fail(actions, checks, k, g):
+def fail(checks, pattern, g):
return 1
+def expect_count(count, checks, pattern, g):
+ if count == 0:
+ return 1
+ else:
+ checks[pattern] = partial(expect_count, count-1)
+ return 0
-def run_test(actions, checks, driver_class):
+def run_test(actions, checks, driver_class, jobs):
code = 0
global fake_slurm
@@ -63,6 +98,9 @@ def run_test(actions, checks, driver_class):
global compute_nodes
compute_nodes = {}
+ global all_jobs
+ all_jobs = jobs
+
env = os.environ.copy()
env["PATH"] = fake_slurm + ":" + env["PATH"]
@@ -78,56 +116,169 @@ def run_test(actions, checks, driver_class):
driver_class=driver_class,
ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
- timeout = time.time() + 300
+ timeout = time.time() + 180
+ terminated = False
p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
- bufsize=1, stderr=subprocess.PIPE, env=env)
- for line in p.stderr:
- sys.stdout.write(line)
+ bufsize=0, stderr=subprocess.PIPE, env=env)
- if time.time() > timeout:
- logging.error("Exceeded timeout")
- code = 1
- p.terminate()
-
- for k,v in actions.items():
- g = re.match(k, line)
- if g:
- logging.info("Triggered action %s", k)
- del actions[k]
- code = v(actions, checks, k, g)
- if code != 0:
- logging.error("Action failed")
- p.terminate()
+ # naive line iteration over pipes gets buffered, which isn't what we want,
+ # see https://bugs.python.org/issue3907
+ for line in iter(p.stderr.readline, ""):
+ sys.stdout.write(line)
for k,v in checks.items():
g = re.match(k, line)
if g:
- logging.info("Triggered check %s", k)
- code = v(actions, checks, k, g)
+ logging.info("Matched check %s", k)
+ code += v(checks, k, g)
if code != 0:
logging.error("Check failed")
- p.terminate()
+ if not terminated:
+ p.terminate()
+ terminated = True
+
+ if terminated:
+ continue
+
+ if time.time() > timeout:
+ logging.error("Exceeded timeout with actions remaining: %s", actions)
+ code += 1
+ if not terminated:
+ p.terminate()
+ terminated = True
+
+ k, v = actions[0]
+ g = re.match(k, line)
+ if g:
+ logging.info("Matched action %s", k)
+ actions.pop(0)
+ code += v(g)
+ if code != 0:
+ logging.error("Action failed")
+ p.terminate()
+ terminated = True
if not actions:
p.terminate()
+ terminated = True
- #shutil.rmtree(fake_slurm)
+ shutil.rmtree(fake_slurm)
return code
def main():
- code = run_test({
- r".*Daemon started": set_squeue,
- r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": set_sinfo_alloc,
- r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)": set_sinfo_idle,
- r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)": noop,
- r".*Shutdown success": noop,
- }, {
- r".*Suggesting shutdown because node state is \('down', .*\)": down_fail
- },
- "arvnodeman.test.fake_driver.FakeDriver")
+ # Test lifecycle.
+
+ tests = {
+ "test1": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}),
+ "test2": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
+ }),
+ "test3": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Setting node quota to 2", noop),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
+ r".*Sending create_node request.*": partial(expect_count, 4)
+ },
+ "arvnodeman.test.fake_driver.QuotaDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
+ }),
+ "test4": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Setting node quota to 2", noop),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*sending request", jobs_req),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
+ r".*Sending create_node request.*": partial(expect_count, 6)
+ },
+ "arvnodeman.test.fake_driver.QuotaDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
+ })
+ }
+
+ code = 0
+ if len(sys.argv) > 1:
+ code = run_test(*tests[sys.argv[1]])
+ else:
+ for t in tests:
+ code += run_test(*tests[t])
+
+ if code == 0:
+ logging.info("Test passed")
+ else:
+ logging.info("Test failed")
+
exit(code)
main()
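
The rewritten integration_test.py is a log-driven harness: it launches arvados-node-manager as a subprocess, reads its stderr unbuffered with iter(p.stderr.readline, "") (plain iteration over a pipe buffers, see https://bugs.python.org/issue3907), pops an ordered list of (pattern, callback) actions as each expected log line appears, and keeps a dict of standing checks such as expect_count that can fail the test on unexpected lines or cap how often a pattern may match. A pared-down sketch of that loop, run against a stand-in command instead of arvados-node-manager (drive and the demo command are illustrative, not part of the test suite):

import re
import subprocess
import sys
import time
from functools import partial

def expect_count(count, checks, pattern, match):
    # Allow `pattern` to match `count` more times; one match beyond that fails.
    if count == 0:
        return 1
    checks[pattern] = partial(expect_count, count - 1)
    return 0

def drive(cmd, actions, checks, timeout_s=60):
    # Run cmd, matching its stderr against ordered actions and standing checks.
    code = 0
    deadline = time.time() + timeout_s
    p = subprocess.Popen(cmd, bufsize=0, stderr=subprocess.PIPE)
    # Unbuffered readline loop; `for line in p.stderr` would buffer.
    for raw in iter(p.stderr.readline, b""):
        line = raw.decode("utf-8", "replace")
        sys.stdout.write(line)
        for pattern, check in list(checks.items()):
            m = re.match(pattern, line)
            if m:
                code += check(checks, pattern, m)
        if time.time() > deadline:
            code += 1
            break
        if actions:
            pattern, action = actions[0]
            m = re.match(pattern, line)
            if m:
                actions.pop(0)
                code += action(m)
        if not actions or code != 0:
            break
    p.terminate()
    p.wait()
    return code

if __name__ == "__main__":
    # Stand-in subprocess that just logs two lines to stderr.
    cmd = [sys.executable, "-c",
           "import sys; sys.stderr.write('booted node1\\n'); sys.stderr.write('node1 idle\\n')"]
    actions = [(r".*booted (\S+)", lambda m: 0),
               (r".*(\S+) idle", lambda m: 0)]
    checks = {r".*booted (\S+)": partial(expect_count, 1)}
    sys.exit(drive(cmd, actions, checks))
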
-----------------------------------------------------------------------
hooks/post-receive
--