[ARVADOS] updated: 09794d996eca79b85d3ac0c21a4a43c65a51d0d7
Git user
git at public.curoverse.com
Fri Jun 2 11:59:05 EDT 2017
Summary of changes:
services/nodemanager/arvnodeman/daemon.py | 4 +-
services/nodemanager/arvnodeman/launcher.py | 6 +-
.../nodemanager/arvnodeman/test}/__init__.py | 0
.../arvnodeman/{ => test}/fake_driver.py | 7 +-
services/nodemanager/fake_slurm/sinfo | 2 -
services/nodemanager/fake_slurm/squeue | 2 -
.../tests/{fake.azure.cfg => fake.cfg.template} | 18 +--
services/nodemanager/tests/integration_test.py | 133 +++++++++++++++++++++
8 files changed, 152 insertions(+), 20 deletions(-)
copy {sdk/cwl/tests => services/nodemanager/arvnodeman/test}/__init__.py (100%)
rename services/nodemanager/arvnodeman/{ => test}/fake_driver.py (91%)
delete mode 100755 services/nodemanager/fake_slurm/sinfo
delete mode 100755 services/nodemanager/fake_slurm/squeue
rename services/nodemanager/tests/{fake.azure.cfg => fake.cfg.template} (95%)
create mode 100755 services/nodemanager/tests/integration_test.py
via 09794d996eca79b85d3ac0c21a4a43c65a51d0d7 (commit)
from ea47e67af0e7528f0bcb23f3b34019b308eaa68a (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 09794d996eca79b85d3ac0c21a4a43c65a51d0d7
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Jun 2 11:58:55 2017 -0400
10312: Add an integration test framework for node manager. It runs the full
node manager with a fake cloud driver and monitors its logging output.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 8f9207e..029d818 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -146,8 +146,8 @@ class NodeManagerDaemonActor(actor_class):
self.last_polls[poll_key] = time.time()
def _pair_nodes(self, node_record, arvados_node):
- self._logger.info("Cloud node %s is now paired with Arvados node %s",
- node_record.cloud_node.name, arvados_node['uuid'])
+ self._logger.info("Cloud node %s is now paired with Arvados node %s with hostname %s",
+ node_record.cloud_node.name, arvados_node['uuid'], arvados_node['hostname'])
self._arvados_nodes_actor.subscribe_to(
arvados_node['uuid'], node_record.actor.update_arvados_node)
node_record.arvados_node = arvados_node
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 11d38ec..cb80bbf 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -22,6 +22,7 @@ from .timedcallback import TimedCallBackActor
from ._version import __version__
node_daemon = None
+watchdog = None
def abort(msg, code=1):
print("arvados-node-manager: " + msg)
@@ -97,6 +98,7 @@ def shutdown_signal(signal_code, frame):
pykka.ActorRegistry.stop_all()
sys.exit(-signal_code)
elif current_count == 0:
+ watchdog.stop()
node_daemon.shutdown()
elif current_count == 1:
pykka.ActorRegistry.stop_all()
@@ -104,7 +106,7 @@ def shutdown_signal(signal_code, frame):
sys.exit(-signal_code)
def main(args=None):
- global node_daemon
+ global node_daemon, watchdog
args = parse_cli(args)
config = load_config(args.config)
@@ -138,7 +140,7 @@ def main(args=None):
node_setup, node_shutdown, node_monitor,
max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
- WatchdogActor.start(config.getint('Daemon', 'watchdog'),
+ watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
arvados_node_poller.actor_ref,
job_queue_poller.actor_ref,
diff --git a/services/nodemanager/arvnodeman/test/__init__.py b/services/nodemanager/arvnodeman/test/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/services/nodemanager/arvnodeman/test/__init__.py
@@ -0,0 +1 @@
+
diff --git a/services/nodemanager/arvnodeman/fake_driver.py b/services/nodemanager/arvnodeman/test/fake_driver.py
similarity index 91%
rename from services/nodemanager/arvnodeman/fake_driver.py
rename to services/nodemanager/arvnodeman/test/fake_driver.py
index 89a3dbb..be0789e 100644
--- a/services/nodemanager/arvnodeman/fake_driver.py
+++ b/services/nodemanager/arvnodeman/test/fake_driver.py
@@ -27,18 +27,19 @@ class FakeDriver(NodeDriver):
ex_user_name=None,
ex_tags=None,
ex_network=None):
+ global all_nodes
all_nodes.append(Node(name, name, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags}))
ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0] + "&instance_id=" + name
- print(ping_url)
ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
ctx.verify_mode = ssl.CERT_NONE
f = urllib.urlopen(ping_url, "", context=ctx)
- print(f.read())
f.close()
return all_nodes[-1]
def destroy_node(self, cloud_node):
- return None
+ global all_nodes
+ all_nodes = [n for n in all_nodes if n.id != cloud_node.id]
+ return True
def get_image(self, img):
pass
diff --git a/services/nodemanager/fake_slurm/sinfo b/services/nodemanager/fake_slurm/sinfo
deleted file mode 100755
index e57d0d3..0000000
--- a/services/nodemanager/fake_slurm/sinfo
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-echo
\ No newline at end of file
diff --git a/services/nodemanager/fake_slurm/squeue b/services/nodemanager/fake_slurm/squeue
deleted file mode 100755
index dd114a0..0000000
--- a/services/nodemanager/fake_slurm/squeue
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
diff --git a/services/nodemanager/tests/fake.azure.cfg b/services/nodemanager/tests/fake.cfg.template
similarity index 95%
rename from services/nodemanager/tests/fake.azure.cfg
rename to services/nodemanager/tests/fake.cfg.template
index 7f7629f..631745a 100644
--- a/services/nodemanager/tests/fake.azure.cfg
+++ b/services/nodemanager/tests/fake.cfg.template
@@ -34,7 +34,7 @@ max_nodes = 8
max_total_price = 0
# Poll Azure nodes and Arvados for new information every N seconds.
-poll_time = 15
+poll_time = 5
# Polls have exponential backoff when services fail to respond.
# This is the longest time to wait between polls.
@@ -49,7 +49,7 @@ poll_stale_after = 600
# node before this long, assume that there was a cloud bootstrap failure and
# shut it down. Note that normal shutdown windows apply (see the Cloud
# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
+boot_fail_after = 20
# "Node stale time" affects two related behaviors.
# 1. If a compute node has been running for at least this long, but it
@@ -90,8 +90,8 @@ pykka = WARNING
apiclient = WARNING
[Arvados]
-host = 192.168.5.2:8000
-token = 2tnmn9ou33o3vk3bynzyzrc7aedhijo7ufa11j9kyv7509cygx
+host = {host}
+token = {token}
timeout = 15
# Accept an untrusted SSL certificate from the API server?
@@ -99,7 +99,7 @@ insecure = yes
[Cloud]
provider = azure
-driver_class = arvnodeman.fake_driver.FakeDriver
+driver_class = {driver_class}
# Shutdown windows define periods of time when a node may and may not be shut
# down. These are windows in full minutes, separated by commas. Counting from
@@ -110,7 +110,7 @@ driver_class = arvnodeman.fake_driver.FakeDriver
# Azure bills by the minute, so it makes sense to aggressively shut down idle
# nodes. Specify at least two windows. You can add as many as you need beyond
# that.
-shutdown_windows = 5, 999999
+shutdown_windows = 1, 999999
[Cloud Credentials]
# Use "azure account list" with the azure CLI to get these values.
@@ -123,7 +123,7 @@ subscription_id = 00000000-0000-0000-0000-000000000000
# azure config mode arm
# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
# azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/{subscriptionId}/
+# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
#
# Use <Application_Id> for "key" and the <Your_Password> for "secret"
#
@@ -142,7 +142,7 @@ ex_resource_group = ArvadosResourceGroup
image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = /home/peter/.ssh/id_rsa.pub
+ssh_key = {ssh_key}
# The account name for the admin user that will be provisioned on new nodes.
ex_user_name = arvadosuser
@@ -161,7 +161,7 @@ tag_arvados-class = dynamic-compute
tag_cluster = zyxwv
# the API server to ping
-ping_host = 192.168.5.2:8000
+ping_host = {host}
# You can define any number of Size sections to list Azure sizes you're willing
# to use. The Node Manager should boot the cheapest size(s) that can run jobs
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
new file mode 100755
index 0000000..90bf237
--- /dev/null
+++ b/services/nodemanager/tests/integration_test.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+import subprocess
+import os
+import sys
+import re
+import time
+import logging
+import stat
+import tempfile
+import shutil
+
+logging.basicConfig(level=logging.INFO)
+
+fake_slurm = None
+compute_nodes = None
+
+def update_script(path, val):
+ with open(path+"_", "w") as f:
+ f.write(val)
+ os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
+ os.rename(path+"_", path)
+
+
+def set_squeue(actions, checks, k, g):
+ update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
+echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
+""")
+ return 0
+
+def set_sinfo_alloc(actions, checks, k, g):
+ update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
+echo '%s alloc'
+""" % (g.group(3)))
+
+ update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
+echo '1|100|100|Running|34t0i-dz642-h42bg3hq4bdfpf9'
+""")
+
+ global compute_nodes
+ compute_nodes[g.group(1)] = g.group(3)
+ return 0
+
+def set_sinfo_idle(actions, checks, k, g):
+ update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
+echo '%s idle'
+""" % (compute_nodes[g.group(1)]))
+ return 0
+
+def noop(actions, checks, k, g):
+ return 0
+
+def down_fail(actions, checks, k, g):
+ return 1
+
+
+def run_test(actions, checks, driver_class):
+ code = 0
+
+ global fake_slurm
+ fake_slurm = tempfile.mkdtemp()
+ logging.info("fake_slurm is %s", fake_slurm)
+
+ global compute_nodes
+ compute_nodes = {}
+
+ env = os.environ.copy()
+ env["PATH"] = fake_slurm + ":" + env["PATH"]
+
+ update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
+ update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
+
+ with open("tests/fake.cfg.template") as f:
+ with open(os.path.join(fake_slurm, "id_rsa.pub"), "w") as ssh:
+ pass
+ with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
+ cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
+ token=os.environ["ARVADOS_API_TOKEN"],
+ driver_class=driver_class,
+ ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
+
+ timeout = time.time() + 300
+
+ p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
+ bufsize=1, stderr=subprocess.PIPE, env=env)
+ for line in p.stderr:
+ sys.stdout.write(line)
+
+ if time.time() > timeout:
+ logging.error("Exceeded timeout")
+ code = 1
+ p.terminate()
+
+ for k,v in actions.items():
+ g = re.match(k, line)
+ if g:
+ logging.info("Triggered action %s", k)
+ del actions[k]
+ code = v(actions, checks, k, g)
+ if code != 0:
+ logging.error("Action failed")
+ p.terminate()
+
+ for k,v in checks.items():
+ g = re.match(k, line)
+ if g:
+ logging.info("Triggered check %s", k)
+ code = v(actions, checks, k, g)
+ if code != 0:
+ logging.error("Check failed")
+ p.terminate()
+
+ if not actions:
+ p.terminate()
+
+ #shutil.rmtree(fake_slurm)
+
+ return code
+
+
+def main():
+ code = run_test({
+ r".*Daemon started": set_squeue,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": set_sinfo_alloc,
+ r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)": set_sinfo_idle,
+ r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)": noop,
+ r".*Shutdown success": noop,
+ }, {
+ r".*Suggesting shutdown because node state is \('down', .*\)": down_fail
+ },
+ "arvnodeman.test.fake_driver.FakeDriver")
+ exit(code)
+
+main()
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list