[ARVADOS] updated: 09794d996eca79b85d3ac0c21a4a43c65a51d0d7

Git user git at public.curoverse.com
Fri Jun 2 11:59:05 EDT 2017


Summary of changes:
 services/nodemanager/arvnodeman/daemon.py          |   4 +-
 services/nodemanager/arvnodeman/launcher.py        |   6 +-
 .../nodemanager/arvnodeman/test}/__init__.py       |   0
 .../arvnodeman/{ => test}/fake_driver.py           |   7 +-
 services/nodemanager/fake_slurm/sinfo              |   2 -
 services/nodemanager/fake_slurm/squeue             |   2 -
 .../tests/{fake.azure.cfg => fake.cfg.template}    |  18 +--
 services/nodemanager/tests/integration_test.py     | 133 +++++++++++++++++++++
 8 files changed, 152 insertions(+), 20 deletions(-)
 copy {sdk/cwl/tests => services/nodemanager/arvnodeman/test}/__init__.py (100%)
 rename services/nodemanager/arvnodeman/{ => test}/fake_driver.py (91%)
 delete mode 100755 services/nodemanager/fake_slurm/sinfo
 delete mode 100755 services/nodemanager/fake_slurm/squeue
 rename services/nodemanager/tests/{fake.azure.cfg => fake.cfg.template} (95%)
 create mode 100755 services/nodemanager/tests/integration_test.py

       via  09794d996eca79b85d3ac0c21a4a43c65a51d0d7 (commit)
      from  ea47e67af0e7528f0bcb23f3b34019b308eaa68a (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 09794d996eca79b85d3ac0c21a4a43c65a51d0d7
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Jun 2 11:58:55 2017 -0400

    10312: Integration test framework for node manager, runs full node manager with
    fake cloud driver and monitors logging output.
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 8f9207e..029d818 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -146,8 +146,8 @@ class NodeManagerDaemonActor(actor_class):
         self.last_polls[poll_key] = time.time()
 
     def _pair_nodes(self, node_record, arvados_node):
-        self._logger.info("Cloud node %s is now paired with Arvados node %s",
-                          node_record.cloud_node.name, arvados_node['uuid'])
+        self._logger.info("Cloud node %s is now paired with Arvados node %s with hostname %s",
+                          node_record.cloud_node.name, arvados_node['uuid'], arvados_node['hostname'])
         self._arvados_nodes_actor.subscribe_to(
             arvados_node['uuid'], node_record.actor.update_arvados_node)
         node_record.arvados_node = arvados_node
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 11d38ec..cb80bbf 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -22,6 +22,7 @@ from .timedcallback import TimedCallBackActor
 from ._version import __version__
 
 node_daemon = None
+watchdog = None
 
 def abort(msg, code=1):
     print("arvados-node-manager: " + msg)
@@ -97,6 +98,7 @@ def shutdown_signal(signal_code, frame):
         pykka.ActorRegistry.stop_all()
         sys.exit(-signal_code)
     elif current_count == 0:
+        watchdog.stop()
         node_daemon.shutdown()
     elif current_count == 1:
         pykka.ActorRegistry.stop_all()
@@ -104,7 +106,7 @@ def shutdown_signal(signal_code, frame):
         sys.exit(-signal_code)
 
 def main(args=None):
-    global node_daemon
+    global node_daemon, watchdog
     args = parse_cli(args)
     config = load_config(args.config)
 
@@ -138,7 +140,7 @@ def main(args=None):
             node_setup, node_shutdown, node_monitor,
             max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
 
-        WatchdogActor.start(config.getint('Daemon', 'watchdog'),
+        watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
                             cloud_node_poller.actor_ref,
                             arvados_node_poller.actor_ref,
                             job_queue_poller.actor_ref,
diff --git a/services/nodemanager/arvnodeman/test/__init__.py b/services/nodemanager/arvnodeman/test/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/services/nodemanager/arvnodeman/test/__init__.py
@@ -0,0 +1 @@
+
diff --git a/services/nodemanager/arvnodeman/fake_driver.py b/services/nodemanager/arvnodeman/test/fake_driver.py
similarity index 91%
rename from services/nodemanager/arvnodeman/fake_driver.py
rename to services/nodemanager/arvnodeman/test/fake_driver.py
index 89a3dbb..be0789e 100644
--- a/services/nodemanager/arvnodeman/fake_driver.py
+++ b/services/nodemanager/arvnodeman/test/fake_driver.py
@@ -27,18 +27,19 @@ class FakeDriver(NodeDriver):
                     ex_user_name=None,
                     ex_tags=None,
                     ex_network=None):
+        global all_nodes
         all_nodes.append(Node(name, name, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags}))
         ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0] + "&instance_id=" + name
-        print(ping_url)
         ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
         ctx.verify_mode = ssl.CERT_NONE
         f = urllib.urlopen(ping_url, "", context=ctx)
-        print(f.read())
         f.close()
         return all_nodes[-1]
 
     def destroy_node(self, cloud_node):
-        return None
+        global all_nodes
+        all_nodes = [n for n in all_nodes if n.id != cloud_node.id]
+        return True
 
     def get_image(self, img):
         pass
diff --git a/services/nodemanager/fake_slurm/sinfo b/services/nodemanager/fake_slurm/sinfo
deleted file mode 100755
index e57d0d3..0000000
--- a/services/nodemanager/fake_slurm/sinfo
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-echo
\ No newline at end of file
diff --git a/services/nodemanager/fake_slurm/squeue b/services/nodemanager/fake_slurm/squeue
deleted file mode 100755
index dd114a0..0000000
--- a/services/nodemanager/fake_slurm/squeue
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
diff --git a/services/nodemanager/tests/fake.azure.cfg b/services/nodemanager/tests/fake.cfg.template
similarity index 95%
rename from services/nodemanager/tests/fake.azure.cfg
rename to services/nodemanager/tests/fake.cfg.template
index 7f7629f..631745a 100644
--- a/services/nodemanager/tests/fake.azure.cfg
+++ b/services/nodemanager/tests/fake.cfg.template
@@ -34,7 +34,7 @@ max_nodes = 8
 max_total_price = 0
 
 # Poll Azure nodes and Arvados for new information every N seconds.
-poll_time = 15
+poll_time = 5
 
 # Polls have exponential backoff when services fail to respond.
 # This is the longest time to wait between polls.
@@ -49,7 +49,7 @@ poll_stale_after = 600
 # node before this long, assume that there was a cloud bootstrap failure and
 # shut it down.  Note that normal shutdown windows apply (see the Cloud
 # section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
+boot_fail_after = 20
 
 # "Node stale time" affects two related behaviors.
 # 1. If a compute node has been running for at least this long, but it
@@ -90,8 +90,8 @@ pykka = WARNING
 apiclient = WARNING
 
 [Arvados]
-host = 192.168.5.2:8000
-token = 2tnmn9ou33o3vk3bynzyzrc7aedhijo7ufa11j9kyv7509cygx
+host = {host}
+token = {token}
 timeout = 15
 
 # Accept an untrusted SSL certificate from the API server?
@@ -99,7 +99,7 @@ insecure = yes
 
 [Cloud]
 provider = azure
-driver_class = arvnodeman.fake_driver.FakeDriver
+driver_class = {driver_class}
 
 # Shutdown windows define periods of time when a node may and may not be shut
 # down.  These are windows in full minutes, separated by commas.  Counting from
@@ -110,7 +110,7 @@ driver_class = arvnodeman.fake_driver.FakeDriver
 # Azure bills by the minute, so it makes sense to agressively shut down idle
 # nodes.  Specify at least two windows.  You can add as many as you need beyond
 # that.
-shutdown_windows = 5, 999999
+shutdown_windows = 1, 999999
 
 [Cloud Credentials]
 # Use "azure account list" with the azure CLI to get these values.
@@ -123,7 +123,7 @@ subscription_id = 00000000-0000-0000-0000-000000000000
 # azure config mode arm
 # azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
 # azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/{subscriptionId}/
+# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
 #
 # Use <Application_Id> for "key" and the <Your_Password> for "secret"
 #
@@ -142,7 +142,7 @@ ex_resource_group = ArvadosResourceGroup
 image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
 
 # Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = /home/peter/.ssh/id_rsa.pub
+ssh_key = {ssh_key}
 
 # The account name for the admin user that will be provisioned on new nodes.
 ex_user_name = arvadosuser
@@ -161,7 +161,7 @@ tag_arvados-class = dynamic-compute
 tag_cluster = zyxwv
 
 # the API server to ping
-ping_host = 192.168.5.2:8000
+ping_host = {host}
 
 # You can define any number of Size sections to list Azure sizes you're willing
 # to use.  The Node Manager should boot the cheapest size(s) that can run jobs
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
new file mode 100755
index 0000000..90bf237
--- /dev/null
+++ b/services/nodemanager/tests/integration_test.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+import subprocess
+import os
+import sys
+import re
+import time
+import logging
+import stat
+import tempfile
+import shutil
+
+logging.basicConfig(level=logging.INFO)
+
+fake_slurm = None
+compute_nodes = None
+
+def update_script(path, val):
+    with open(path+"_", "w") as f:
+        f.write(val)
+    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
+    os.rename(path+"_", path)
+
+
+def set_squeue(actions, checks, k, g):
+    update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
+echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
+""")
+    return 0
+
+def set_sinfo_alloc(actions, checks, k, g):
+    update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
+echo '%s alloc'
+""" % (g.group(3)))
+
+    update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
+echo '1|100|100|Running|34t0i-dz642-h42bg3hq4bdfpf9'
+""")
+
+    global compute_nodes
+    compute_nodes[g.group(1)] = g.group(3)
+    return 0
+
+def set_sinfo_idle(actions, checks, k, g):
+    update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
+echo '%s idle'
+""" % (compute_nodes[g.group(1)]))
+    return 0
+
+def noop(actions, checks, k, g):
+    return 0
+
+def down_fail(actions, checks, k, g):
+    return 1
+
+
+def run_test(actions, checks, driver_class):
+    code = 0
+
+    global fake_slurm
+    fake_slurm = tempfile.mkdtemp()
+    logging.info("fake_slurm is %s", fake_slurm)
+
+    global compute_nodes
+    compute_nodes = {}
+
+    env = os.environ.copy()
+    env["PATH"] = fake_slurm + ":" + env["PATH"]
+
+    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
+    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
+
+    with open("tests/fake.cfg.template") as f:
+        with open(os.path.join(fake_slurm, "id_rsa.pub"), "w") as ssh:
+            pass
+        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
+            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
+                                      token=os.environ["ARVADOS_API_TOKEN"],
+                                      driver_class=driver_class,
+                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
+
+    timeout = time.time() + 300
+
+    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
+                         bufsize=1, stderr=subprocess.PIPE, env=env)
+    for line in p.stderr:
+        sys.stdout.write(line)
+
+        if time.time() > timeout:
+            logging.error("Exceeded timeout")
+            code = 1
+            p.terminate()
+
+        for k,v in actions.items():
+            g = re.match(k, line)
+            if g:
+                logging.info("Triggered action %s", k)
+                del actions[k]
+                code = v(actions, checks, k, g)
+                if code != 0:
+                    logging.error("Action failed")
+                    p.terminate()
+
+        for k,v in checks.items():
+            g = re.match(k, line)
+            if g:
+                logging.info("Triggered check %s", k)
+                code = v(actions, checks, k, g)
+                if code != 0:
+                    logging.error("Check failed")
+                    p.terminate()
+
+        if not actions:
+            p.terminate()
+
+    #shutil.rmtree(fake_slurm)
+
+    return code
+
+
+def main():
+    code = run_test({
+        r".*Daemon started": set_squeue,
+        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": set_sinfo_alloc,
+        r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)": set_sinfo_idle,
+        r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)": noop,
+        r".*Shutdown success": noop,
+    }, {
+        r".*Suggesting shutdown because node state is \('down', .*\)": down_fail
+    },
+    "arvnodeman.test.fake_driver.FakeDriver")
+    exit(code)
+
+main()

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list