[ARVADOS] created: 2849771f5771c5ab3940fbd7411d4e4a0589fb32
Git user git at public.curoverse.com
Thu Jun 22 17:40:59 EDT 2017
at 2849771f5771c5ab3940fbd7411d4e4a0589fb32 (commit)
commit 2849771f5771c5ab3940fbd7411d4e4a0589fb32
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Jun 22 17:40:09 2017 -0400
11896: Add integration test coverage for gce and aws drivers. Handle disk=None from gce driver.
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index 79e43cb..9837ad9 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -67,6 +67,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
def arvados_create_kwargs(self, size, arvados_node):
name = self.create_cloud_name(arvados_node)
+
+ if size.scratch > 375000:
+ self._logger.warning("Requested %d MB scratch space, but GCE driver currently only supports attaching a single 375 GB disk.", size.scratch)
+
disks = [
{'autoDelete': True,
'boot': True,
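
(The guard above compares in MB because Arvados/SLURM node sizes are MB-based, while a GCE local SSD is a fixed 375 GB unit and the driver attaches only one. A minimal standalone sketch of the same check; the helper name and logger setup are illustrative, not part of the patch:

    import logging

    logger = logging.getLogger("arvnodeman.gce")
    MAX_SCRATCH_MB = 375 * 1000  # one 375 GB local SSD; 1000 MB/GB as elsewhere in node manager

    def warn_if_scratch_too_large(scratch_mb):
        # Larger requests cannot be satisfied, so warn rather than fail,
        # and let the node come up with the single attached disk.
        if scratch_mb > MAX_SCRATCH_MB:
            logger.warning("Requested %d MB scratch space, but GCE driver "
                           "currently only supports attaching a single "
                           "375 GB disk.", scratch_mb)
)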
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 1716a57..2237420 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -27,6 +27,8 @@ class ServerCalculator(object):
self.cores = kwargs.pop('cores')
# libcloud disk sizes are in GB, Arvados/SLURM are in MB
# multiply by 1000 instead of 1024 to err on low side
+ if self.disk is None:
+ self.disk = 0
self.scratch = self.disk * 1000
self.ram = int(self.ram * node_mem_scaling)
for name, override in kwargs.iteritems():
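
(The GCE driver can report disk=None for sizes with no bundled disk -- the fake GCE driver below mirrors this by passing None to NodeSize -- which previously made the GB-to-MB multiplication raise a TypeError. A hedged sketch of the conversion in isolation:

    def size_to_scratch_mb(disk_gb):
        # libcloud reports disk in GB; Arvados/SLURM want MB. Multiply by
        # 1000 rather than 1024 to err on the low side, and treat a
        # missing disk (None) as 0.
        if disk_gb is None:
            disk_gb = 0
        return disk_gb * 1000

    assert size_to_scratch_mb(None) == 0
    assert size_to_scratch_mb(80) == 80000
)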
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 7bc3a5e..4faa8ff 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -64,7 +64,7 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
self._calculator = server_calc
def is_common_error(self, exception):
- return self._client.is_cloud_exception(exception)
+ return isinstance(exception, config.CLOUD_ERRORS)
def _item_key(self, node):
return node.id
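
(The is_common_error() change tests exception types directly against the config module's tuple of cloud exception classes, avoiding a dependency on the client object exposing an is_cloud_exception() method. A sketch of the pattern; the classes here are an illustrative stand-in for config.CLOUD_ERRORS:

    import socket, ssl

    CLOUD_ERRORS = (socket.error, ssl.SSLError)  # illustrative subset only

    def is_common_error(exception):
        # isinstance() accepts a tuple, so one check covers every class
        # named in CLOUD_ERRORS.
        return isinstance(exception, CLOUD_ERRORS)
)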
diff --git a/services/nodemanager/arvnodeman/test/fake_driver.py b/services/nodemanager/arvnodeman/test/fake_driver.py
index 1e15002..4251680 100644
--- a/services/nodemanager/arvnodeman/test/fake_driver.py
+++ b/services/nodemanager/arvnodeman/test/fake_driver.py
@@ -5,7 +5,8 @@ import time
from arvnodeman.computenode import ARVADOS_TIMEFMT
-from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState
+from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState, NodeImage
+from libcloud.compute.drivers.gce import GCEDiskType
from libcloud.common.exceptions import BaseHTTPError
all_nodes = []
@@ -32,16 +33,21 @@ class FakeDriver(NodeDriver):
ex_resource_group=None,
ex_user_name=None,
ex_tags=None,
+ ex_metadata=None,
ex_network=None,
ex_userdata=None):
global all_nodes, create_calls
create_calls += 1
- n = Node(name, name, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})
+ nodeid = "node%i" % create_calls
+ n = Node(nodeid, nodeid, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})
all_nodes.append(n)
if ex_customdata:
- ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0] + "&instance_id=" + name
+ ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0]
if ex_userdata:
ping_url = ex_userdata
+ if ex_metadata:
+ ping_url = ex_metadata["arv-ping-url"]
+ ping_url += "&instance_id=" + nodeid
ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
ctx.verify_mode = ssl.CERT_NONE
f = urllib.urlopen(ping_url, "", context=ctx)
@@ -153,3 +159,50 @@ class FakeAwsDriver(FakeDriver):
return [NodeSize("m3.xlarge", "Extra Large Instance", 3500, 80, 0, 0, self),
NodeSize("m4.xlarge", "Extra Large Instance", 3500, 0, 0, 0, self),
NodeSize("m4.2xlarge", "Double Extra Large Instance", 7000, 0, 0, 0, self)]
+
+
+class FakeGceDriver(FakeDriver):
+
+ def create_node(self, name=None,
+ size=None,
+ image=None,
+ auth=None,
+ external_ip=None,
+ ex_metadata=None,
+ ex_tags=None,
+ ex_disks_gce_struct=None):
+ n = super(FakeGceDriver, self).create_node(name=name,
+ size=size,
+ image=image,
+ auth=auth,
+ ex_metadata=ex_metadata)
+ n.extra = {
+ "metadata": {
+ "items": [{"key": k, "value": v} for k,v in ex_metadata.iteritems()]
+ },
+ "zone": "fake"
+ }
+ return n
+
+ def list_images(self, ex_project=None):
+ return [NodeImage("fake_image_id", "fake_image_id", self)]
+
+ def list_sizes(self, **kwargs):
+ return [NodeSize("n1-standard-1", "Standard", 3750, None, 0, 0, self),
+ NodeSize("n1-standard-2", "Double standard", 7500, None, 0, 0, self)]
+
+ def ex_list_disktypes(self, zone=None):
+ return [GCEDiskType("pd-standard", "pd-standard", zone, self,
+ extra={"selfLink": "pd-standard"}),
+ GCEDiskType("local-ssd", "local-ssd", zone, self,
+ extra={"selfLink": "local-ssd"})]
+
+ def ex_get_node(self, name, zone=None):
+ global all_nodes
+ for n in all_nodes:
+ if n.id == name:
+ return n
+ return None
+
+ def ex_set_node_metadata(self, n, items):
+ n.extra["metadata"]["items"] = items
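
(FakeGceDriver stores node metadata the way GCE does: a list of {"key": ..., "value": ...} items rather than a flat dict. A self-contained sketch of the round-trip the fake supports via create_node() and ex_set_node_metadata(); the dict contents are illustrative:

    meta = {"arv-ping-url": "https://api.example/ping", "hostname": "compute1"}
    # Flatten to GCE-style metadata items, as create_node() does above.
    items = [{"key": k, "value": v} for k, v in meta.items()]
    # And back to a dict, as a consumer of node.extra["metadata"] would.
    restored = dict((i["key"], i["value"]) for i in items)
    assert restored == meta
)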
diff --git a/services/nodemanager/tests/fake.cfg.template b/services/nodemanager/tests/fake_azure.cfg.template
similarity index 100%
copy from services/nodemanager/tests/fake.cfg.template
copy to services/nodemanager/tests/fake_azure.cfg.template
diff --git a/services/nodemanager/tests/fake.cfg.template b/services/nodemanager/tests/fake_ec2.cfg.template
similarity index 78%
copy from services/nodemanager/tests/fake.cfg.template
copy to services/nodemanager/tests/fake_ec2.cfg.template
index eacd53f..168ab5c 100644
--- a/services/nodemanager/tests/fake.cfg.template
+++ b/services/nodemanager/tests/fake_ec2.cfg.template
@@ -99,7 +99,7 @@ jobs_queue = no
insecure = yes
[Cloud]
-provider = azure
+provider = ec2
driver_class = {driver_class}
# Shutdown windows define periods of time when a node may and may not be shut
@@ -114,53 +114,21 @@ driver_class = {driver_class}
shutdown_windows = 1, 999999
[Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-#
-# azure config mode arm
-# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
-# azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
-#
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-#
key = 00000000-0000-0000-0000-000000000000
secret = PASSWORD
timeout = 60
region = East US
[Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
[Cloud Create]
-# The image id, in the form "Publisher:Offer:SKU:Version"
-image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+# The image id
+image = fake_image_id
# Path to a local ssh key file that will be used to provision new nodes.
ssh_key = {ssh_key}
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-
-# Optional subnet of the virtual network.
-#ex_subnet = default
-
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-
# the API server to ping
ping_host = {host}
@@ -179,10 +147,12 @@ ping_host = {host}
# in GB) for Crunch jobs. You can also override Microsoft's provided
# data fields by setting them here.
-[Size Standard_D3]
+[Size m4.xlarge]
cores = 4
price = 0.56
+scratch = 250
-[Size Standard_D4]
+[Size m4.2xlarge]
cores = 8
price = 1.12
+scratch = 500
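
(Note that FakeAwsDriver reports disk = 0 for the m4 sizes in its list_sizes() above, so the scratch values added here are what give those sizes usable scratch space: per the comment retained in the template, [Size ...] sections can override provider-reported fields. A sketch of how such an override might be applied; the class is illustrative, the real logic lives in ServerCalculator in arvnodeman/jobqueue.py:

    class SizeOverride(object):
        def __init__(self, disk_gb, **overrides):
            self.scratch = (disk_gb or 0) * 1000  # computed from the driver
            for name, value in overrides.items():
                setattr(self, name, value)        # e.g. scratch=250 from the cfg

    s = SizeOverride(0, scratch=250, cores=4, price=0.56)
    assert s.scratch == 250
)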
diff --git a/services/nodemanager/tests/fake.cfg.template b/services/nodemanager/tests/fake_gce.cfg.template
similarity index 77%
rename from services/nodemanager/tests/fake.cfg.template
rename to services/nodemanager/tests/fake_gce.cfg.template
index eacd53f..38ac8bb 100644
--- a/services/nodemanager/tests/fake.cfg.template
+++ b/services/nodemanager/tests/fake_gce.cfg.template
@@ -99,7 +99,7 @@ jobs_queue = no
insecure = yes
[Cloud]
-provider = azure
+provider = gce
driver_class = {driver_class}
# Shutdown windows define periods of time when a node may and may not be shut
@@ -114,53 +114,20 @@ driver_class = {driver_class}
shutdown_windows = 1, 999999
[Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
-
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-#
-# azure config mode arm
-# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
-# azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
-#
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-#
key = 00000000-0000-0000-0000-000000000000
secret = PASSWORD
timeout = 60
region = East US
[Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
[Cloud Create]
-# The image id, in the form "Publisher:Offer:SKU:Version"
-image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+# The image id
+image = fake_image_id
# Path to a local ssh key file that will be used to provision new nodes.
ssh_key = {ssh_key}
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-
-# Optional subnet of the virtual network.
-#ex_subnet = default
-
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-
# the API server to ping
ping_host = {host}
@@ -179,10 +146,10 @@ ping_host = {host}
# in GB) for Crunch jobs. You can also override Microsoft's provided
# data fields by setting them here.
-[Size Standard_D3]
-cores = 4
+[Size n1-standard-1]
+cores = 1
price = 0.56
-[Size Standard_D4]
-cores = 8
-price = 1.12
+[Size n1-standard-2]
+cores = 2
+price = 1.12
\ No newline at end of file
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index c6f1827..e77a4a0 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -27,10 +27,10 @@ logger.addHandler(logging.StreamHandler(sys.stderr))
detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
-if os.environ.get("ANMTEST_LOGLEVEL"):
- detail_content = sys.stderr
-else:
- detail_content = StringIO.StringIO()
+#if os.environ.get("ANMTEST_LOGLEVEL"):
+detail_content = sys.stderr
+#else:
+# detail_content = StringIO.StringIO()
detail.addHandler(logging.StreamHandler(detail_content))
fake_slurm = None
@@ -109,7 +109,7 @@ def expect_count(count, checks, pattern, g):
checks[pattern] = partial(expect_count, count-1)
return 0
-def run_test(name, actions, checks, driver_class, jobs):
+def run_test(name, actions, checks, driver_class, jobs, provider):
code = 0
# Delete any stale node records
@@ -137,7 +137,7 @@ def run_test(name, actions, checks, driver_class, jobs):
update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
# Write configuration file for test
- with open("tests/fake.cfg.template") as f:
+ with open("tests/fake_%s.cfg.template" % provider) as f:
open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
@@ -224,7 +224,7 @@ def main():
# Test lifecycle.
tests = {
- "test_single_node": (
+ "test_single_node_azure": (
[
(r".*Daemon started", set_squeue),
(r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
@@ -237,7 +237,8 @@ def main():
r".*Setting node quota.*": fail,
},
"arvnodeman.test.fake_driver.FakeDriver",
- {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}),
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ "azure"),
"test_multiple_nodes": (
[
(r".*Daemon started", set_squeue),
@@ -261,7 +262,7 @@ def main():
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_hit_quota": (
[
(r".*Daemon started", set_squeue),
@@ -282,7 +283,7 @@ def main():
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_probe_quota": (
[
(r".*Daemon started", set_squeue),
@@ -314,7 +315,7 @@ def main():
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_no_hang_failing_node_create": (
[
(r".*Daemon started", set_squeue),
@@ -329,7 +330,7 @@ def main():
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_retry_create": (
[
(r".*Daemon started", set_squeue),
@@ -339,7 +340,37 @@ def main():
{},
"arvnodeman.test.fake_driver.RetryDriver",
{"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
- })
+ }, "azure"),
+ "test_single_node_aws": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeAwsDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ "ec2"),
+ "test_single_node_gce": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeGceDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ "gce")
}
code = 0
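
(With the provider string as the new last element of each tests entry, main() can unpack it straight into run_test(), which in turn picks tests/fake_<provider>.cfg.template. A sketch of the dispatch, based on the signatures shown in this diff; the loop body is illustrative:

    for name, (actions, checks, driver_class, jobs, provider) in sorted(tests.items()):
        code += run_test(name, actions, checks, driver_class, jobs, provider)
)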
-----------------------------------------------------------------------
hooks/post-receive
--