[ARVADOS] updated: 9b128b6391d9bbe7302f5df47aba139e09a267fd
Git user  git at public.curoverse.com
Fri Jun 2 17:35:39 EDT 2017
       via  9b128b6391d9bbe7302f5df47aba139e09a267fd (commit)
      from  09794d996eca79b85d3ac0c21a4a43c65a51d0d7 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

Summary of changes:
 services/nodemanager/arvnodeman/daemon.py      |   7 +-
 .../nodemanager/arvnodeman/test/fake_driver.py |  31 +++
 services/nodemanager/tests/fake.cfg.template   |   2 +-
 services/nodemanager/tests/integration_test.py | 253 ++++++++++++++++-----
 4 files changed, 238 insertions(+), 55 deletions(-)
commit 9b128b6391d9bbe7302f5df47aba139e09a267fd
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Jun 2 17:35:15 2017 -0400
10312: Tests pass for booting single node, multiple nodes, hitting quota. Working on quota probe behavior.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 029d818..58fa0a4 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -407,13 +407,13 @@ class NodeManagerDaemonActor(actor_class):
setup_proxy, 'cloud_node', 'arvados_node', 'error')
setup_proxy.stop()
- total_node_count = self._nodes_booting(None) + len(self.cloud_nodes)
+ up_node_count = len(self.cloud_nodes)
if cloud_node is None:
# If cloud_node is None then the node create wasn't successful.
if error == dispatch.QuotaExceeded:
# We've hit a quota limit, so adjust node_quota to stop trying to
# boot new nodes until the node count goes down.
- self.node_quota = min(total_node_count-1, self.max_nodes)
+ self.node_quota = min(up_node_count, self.max_nodes)
self._logger.warning("Setting node quota to %s", self.node_quota)
else:
# Node creation succeeded. Update cloud node list.
@@ -432,8 +432,9 @@ class NodeManagerDaemonActor(actor_class):
# now booting single core machines (actual quota 20), we want to
# allow the quota to expand so we don't get stuck at 10 machines
# forever.
- if total_node_count == self.node_quota and self.node_quota < self.max_nodes:
+ if up_node_count == self.node_quota and self.node_quota < self.max_nodes:
self.node_quota += 1
+ self._logger.warning("Setting node quota to %s", self.node_quota)
del self.booting[setup_proxy.actor_ref.actor_urn]
del self.sizes_booting[setup_proxy.actor_ref.actor_urn]
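
The daemon.py hunk above swaps total_node_count (which also counted nodes still booting) for up_node_count when reacting to a create failure, and now logs the quota whenever it moves so the integration tests can match on "Setting node quota". The underlying idea is clamp-then-probe: on a QuotaExceeded error, clamp node_quota to the number of nodes actually up; after a successful boot that fills the current quota, nudge the quota up by one so an initially low estimate does not pin the pool below the real cloud limit. A minimal standalone sketch of that idea, assuming a hypothetical QuotaTracker helper rather than the real NodeManagerDaemonActor:

# Sketch only: QuotaTracker is a hypothetical helper, not part of
# arvados-node-manager; it isolates the clamp-then-probe quota logic.

class QuotaTracker(object):
    def __init__(self, max_nodes):
        self.max_nodes = max_nodes
        self.node_quota = max_nodes

    def on_quota_exceeded(self, up_node_count):
        # Back off: don't try to run more nodes than are already up.
        self.node_quota = min(up_node_count, self.max_nodes)
        return self.node_quota

    def on_node_up(self, up_node_count):
        # Probe: once the fleet fills the current quota, allow one more node,
        # so a low estimate can grow back toward max_nodes.
        if up_node_count == self.node_quota and self.node_quota < self.max_nodes:
            self.node_quota += 1
        return self.node_quota

if __name__ == "__main__":
    q = QuotaTracker(max_nodes=8)
    assert q.on_quota_exceeded(up_node_count=2) == 2   # clamp to what is running
    assert q.on_node_up(up_node_count=2) == 3          # probe upward again
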
diff --git a/services/nodemanager/arvnodeman/test/fake_driver.py b/services/nodemanager/arvnodeman/test/fake_driver.py
index be0789e..1da61ff 100644
--- a/services/nodemanager/arvnodeman/test/fake_driver.py
+++ b/services/nodemanager/arvnodeman/test/fake_driver.py
@@ -3,12 +3,14 @@ import urllib
import ssl
from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState
+from libcloud.common.exceptions import BaseHTTPError
all_nodes = []
class FakeDriver(NodeDriver):
def __init__(self, *args, **kwargs):
self.name = "FakeDriver"
+ self.create_calls = 0
def list_sizes(self, **kwargs):
return [NodeSize("Standard_D3", "Standard_D3", 3500, 200, 0, 0, self),
@@ -27,6 +29,7 @@ class FakeDriver(NodeDriver):
ex_user_name=None,
ex_tags=None,
ex_network=None):
+ self.create_calls += 1
global all_nodes
all_nodes.append(Node(name, name, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags}))
ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0] + "&instance_id=" + name
@@ -46,3 +49,31 @@ class FakeDriver(NodeDriver):
def ex_create_tags(self, cloud_node, tags):
pass
+
+class QuotaDriver(FakeDriver):
+
+ def create_node(self, name=None,
+ size=None,
+ image=None,
+ auth=None,
+ ex_storage_account=None,
+ ex_customdata=None,
+ ex_resource_group=None,
+ ex_user_name=None,
+ ex_tags=None,
+ ex_network=None):
+ global all_nodes
+ if len(all_nodes) >= 2 and self.create_calls < 3:
+ self.create_calls += 1
+ raise BaseHTTPError(503, "Quota exceeded")
+ else:
+ return super(QuotaDriver, self).create_node(name=name,
+ size=size,
+ image=image,
+ auth=auth,
+ ex_storage_account=ex_storage_account,
+ ex_customdata=ex_customdata,
+ ex_resource_group=ex_resource_group,
+ ex_user_name=ex_user_name,
+ ex_tags=ex_tags,
+ ex_network=ex_network)
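
QuotaDriver above extends FakeDriver so that, once two fake nodes exist, the next few create_node calls raise libcloud's BaseHTTPError(503, "Quota exceeded"), which lets the integration tests exercise the daemon's quota back-off and recovery without touching a real cloud. Stripped of libcloud, the pattern is simply a fake that refuses creates past a threshold for a limited number of attempts. A self-contained sketch in that spirit, using stand-in names (FakeQuotaError, TinyQuotaDriver) rather than the classes in the diff:

# Sketch only: FakeQuotaError and TinyQuotaDriver are illustrative stand-ins,
# not libcloud's BaseHTTPError or the QuotaDriver class above.

class FakeQuotaError(Exception):
    def __init__(self, http_code, message):
        super(FakeQuotaError, self).__init__(message)
        self.http_code = http_code

class TinyQuotaDriver(object):
    def __init__(self, quota=2, refusals=3):
        self.nodes = []
        self.refused = 0
        self.quota = quota        # pretend provider limit
        self.refusals = refusals  # how many creates to refuse before relenting

    def create_node(self, name):
        # Refuse creates with a 503 once `quota` nodes exist, but only for a
        # limited number of attempts, so a caller can observe both the
        # back-off and the later recovery.
        if len(self.nodes) >= self.quota and self.refused < self.refusals:
            self.refused += 1
            raise FakeQuotaError(503, "Quota exceeded")
        self.nodes.append(name)
        return name

if __name__ == "__main__":
    d = TinyQuotaDriver()
    d.create_node("n1"); d.create_node("n2")
    try:
        d.create_node("n3")
    except FakeQuotaError as e:
        print("refused with HTTP %d" % e.http_code)
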
diff --git a/services/nodemanager/tests/fake.cfg.template b/services/nodemanager/tests/fake.cfg.template
index 631745a..7d43dec 100644
--- a/services/nodemanager/tests/fake.cfg.template
+++ b/services/nodemanager/tests/fake.cfg.template
@@ -49,7 +49,7 @@ poll_stale_after = 600
# node before this long, assume that there was a cloud bootstrap failure and
# shut it down. Note that normal shutdown windows apply (see the Cloud
# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 20
+boot_fail_after = 45
# "Node stale time" affects two related behaviors.
# 1. If a compute node has been running for at least this long, but it
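
Raising boot_fail_after from 20 to 45 seconds gives the slower multi-node and quota scenarios added below enough time to pair their fake nodes before node manager treats them as bootstrap failures. The rule the comment describes amounts to a timeout on unpaired nodes; a tiny illustrative sketch, with hypothetical names rather than node manager's internals:

import time

# Sketch only: a node that has not paired within boot_fail_after seconds of
# booting is treated as a bootstrap failure and becomes a shutdown candidate.

BOOT_FAIL_AFTER = 45  # seconds, matching the test config above

def boot_failed(boot_started_at, paired, now=None):
    now = time.time() if now is None else now
    return (not paired) and (now - boot_started_at > BOOT_FAIL_AFTER)

assert boot_failed(boot_started_at=0.0, paired=False, now=46.0)
assert not boot_failed(boot_started_at=0.0, paired=True, now=1000.0)
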
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index 90bf237..37ca206 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -8,52 +8,87 @@ import logging
import stat
import tempfile
import shutil
+from functools import partial
logging.basicConfig(level=logging.INFO)
fake_slurm = None
compute_nodes = None
+all_jobs = None
def update_script(path, val):
with open(path+"_", "w") as f:
f.write(val)
os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
os.rename(path+"_", path)
+ logging.info("Update script %s: %s", path, val)
+def set_squeue(g):
+ global all_jobs
+ update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
+ "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
+ return 0
+
+
+def node_paired(g):
+ global compute_nodes
+ compute_nodes[g.group(1)] = g.group(3)
+
+ update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
+ "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
+
+ for k,v in all_jobs.items():
+ if v == "ReqNodeNotAvail":
+ all_jobs[k] = "Running"
+ break
+
+ set_squeue(g)
-def set_squeue(actions, checks, k, g):
- update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
-echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
-""")
return 0
-def set_sinfo_alloc(actions, checks, k, g):
- update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
-echo '%s alloc'
-""" % (g.group(3)))
+def remaining_jobs(g):
+ update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
+ "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
+
+ for k,v in all_jobs.items():
+ all_jobs[k] = "Running"
- update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
-echo '1|100|100|Running|34t0i-dz642-h42bg3hq4bdfpf9'
-""")
+ set_squeue(g)
+ return 0
+
+
+def node_busy(g):
+ update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
+ "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
+ return 0
+
+def node_shutdown(g):
global compute_nodes
- compute_nodes[g.group(1)] = g.group(3)
+ del compute_nodes[g.group(1)]
return 0
-def set_sinfo_idle(actions, checks, k, g):
- update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
-echo '%s idle'
-""" % (compute_nodes[g.group(1)]))
+def jobs_req(g):
+ global all_jobs
+ for k,v in all_jobs.items():
+ all_jobs[k] = "ReqNodeNotAvail"
+ set_squeue(g)
return 0
-def noop(actions, checks, k, g):
+def noop(g):
return 0
-def down_fail(actions, checks, k, g):
+def fail(checks, pattern, g):
return 1
+def expect_count(count, checks, pattern, g):
+ if count == 0:
+ return 1
+ else:
+ checks[pattern] = partial(expect_count, count-1)
+ return 0
-def run_test(actions, checks, driver_class):
+def run_test(actions, checks, driver_class, jobs):
code = 0
global fake_slurm
@@ -63,6 +98,9 @@ def run_test(actions, checks, driver_class):
global compute_nodes
compute_nodes = {}
+ global all_jobs
+ all_jobs = jobs
+
env = os.environ.copy()
env["PATH"] = fake_slurm + ":" + env["PATH"]
@@ -78,56 +116,169 @@ def run_test(actions, checks, driver_class):
driver_class=driver_class,
ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
- timeout = time.time() + 300
+ timeout = time.time() + 180
+ terminated = False
p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
- bufsize=1, stderr=subprocess.PIPE, env=env)
- for line in p.stderr:
- sys.stdout.write(line)
+ bufsize=0, stderr=subprocess.PIPE, env=env)
- if time.time() > timeout:
- logging.error("Exceeded timeout")
- code = 1
- p.terminate()
-
- for k,v in actions.items():
- g = re.match(k, line)
- if g:
- logging.info("Triggered action %s", k)
- del actions[k]
- code = v(actions, checks, k, g)
- if code != 0:
- logging.error("Action failed")
- p.terminate()
+ # naive line iteration over pipes gets buffered, which isn't what we want,
+ # see https://bugs.python.org/issue3907
+ for line in iter(p.stderr.readline, ""):
+ sys.stdout.write(line)
for k,v in checks.items():
g = re.match(k, line)
if g:
- logging.info("Triggered check %s", k)
- code = v(actions, checks, k, g)
+ logging.info("Matched check %s", k)
+ code += v(checks, k, g)
if code != 0:
logging.error("Check failed")
- p.terminate()
+ if not terminated:
+ p.terminate()
+ terminated = True
+
+ if terminated:
+ continue
+
+ if time.time() > timeout:
+ logging.error("Exceeded timeout with actions remaining: %s", actions)
+ code += 1
+ if not terminated:
+ p.terminate()
+ terminated = True
+
+ k, v = actions[0]
+ g = re.match(k, line)
+ if g:
+ logging.info("Matched action %s", k)
+ actions.pop(0)
+ code += v(g)
+ if code != 0:
+ logging.error("Action failed")
+ p.terminate()
+ terminated = True
if not actions:
p.terminate()
+ terminated = True
- #shutil.rmtree(fake_slurm)
+ shutil.rmtree(fake_slurm)
return code
def main():
- code = run_test({
- r".*Daemon started": set_squeue,
- r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": set_sinfo_alloc,
- r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)": set_sinfo_idle,
- r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)": noop,
- r".*Shutdown success": noop,
- }, {
- r".*Suggesting shutdown because node state is \('down', .*\)": down_fail
- },
- "arvnodeman.test.fake_driver.FakeDriver")
+ # Test lifecycle.
+
+ tests = {
+ "test1": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}),
+ "test2": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
+ }),
+ "test3": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Setting node quota to 2", noop),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
+ r".*Sending create_node request.*": partial(expect_count, 4)
+ },
+ "arvnodeman.test.fake_driver.QuotaDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
+ }),
+ "test4": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Setting node quota to 2", noop),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*sending request", jobs_req),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
+ r".*Sending create_node request.*": partial(expect_count, 6)
+ },
+ "arvnodeman.test.fake_driver.QuotaDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+ "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
+ })
+ }
+
+ code = 0
+ if len(sys.argv) > 1:
+ code = run_test(*tests[sys.argv[1]])
+ else:
+ for t in tests:
+ code += run_test(*tests[t])
+
+ if code == 0:
+ logging.info("Test passed")
+ else:
+ logging.info("Test failed")
+
exit(code)
main()
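
The rewritten integration_test.py is a log-driven harness: it launches arvados-node-manager as a subprocess, reads its stderr unbuffered with iter(p.stderr.readline, "") (plain iteration over a pipe buffers, see https://bugs.python.org/issue3907), pops an ordered list of (pattern, callback) actions as each expected log line appears, and keeps a dict of standing checks such as expect_count that can fail the test on unexpected lines or cap how often a pattern may match. A pared-down sketch of that loop, run against a stand-in command instead of arvados-node-manager (drive and the demo command are illustrative, not part of the test suite):

import re
import subprocess
import sys
import time
from functools import partial

def expect_count(count, checks, pattern, match):
    # Allow `pattern` to match `count` more times; one match beyond that fails.
    if count == 0:
        return 1
    checks[pattern] = partial(expect_count, count - 1)
    return 0

def drive(cmd, actions, checks, timeout_s=60):
    # Run cmd, matching its stderr against ordered actions and standing checks.
    code = 0
    deadline = time.time() + timeout_s
    p = subprocess.Popen(cmd, bufsize=0, stderr=subprocess.PIPE)
    # Unbuffered readline loop; `for line in p.stderr` would buffer.
    for raw in iter(p.stderr.readline, b""):
        line = raw.decode("utf-8", "replace")
        sys.stdout.write(line)
        for pattern, check in list(checks.items()):
            m = re.match(pattern, line)
            if m:
                code += check(checks, pattern, m)
        if time.time() > deadline:
            code += 1
            break
        if actions:
            pattern, action = actions[0]
            m = re.match(pattern, line)
            if m:
                actions.pop(0)
                code += action(m)
        if not actions or code != 0:
            break
    p.terminate()
    p.wait()
    return code

if __name__ == "__main__":
    # Stand-in subprocess that just logs two lines to stderr.
    cmd = [sys.executable, "-c",
           "import sys; sys.stderr.write('booted node1\\n'); sys.stderr.write('node1 idle\\n')"]
    actions = [(r".*booted (\S+)", lambda m: 0),
               (r".*(\S+) idle", lambda m: 0)]
    checks = {r".*booted (\S+)": partial(expect_count, 1)}
    sys.exit(drive(cmd, actions, checks))
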
-----------------------------------------------------------------------
hooks/post-receive
--