[ARVADOS] updated: ecc8ee4e5edabaf7d888f55a99184b1efe4a9060

Git user git at public.curoverse.com
Fri Jun 23 13:43:41 EDT 2017


Summary of changes:
 .../arvnodeman/computenode/driver/gce.py           |  4 ++
 services/nodemanager/arvnodeman/jobqueue.py        |  2 +
 services/nodemanager/arvnodeman/nodelist.py        |  2 +-
 .../nodemanager/arvnodeman/test/fake_driver.py     | 59 ++++++++++++++++++++--
 .../{fake.cfg.template => fake_azure.cfg.template} |  0
 .../{fake.cfg.template => fake_ec2.cfg.template}   | 44 +++-------------
 .../{fake.cfg.template => fake_gce.cfg.template}   | 49 +++---------------
 services/nodemanager/tests/integration_test.py     | 49 ++++++++++++++----
 8 files changed, 118 insertions(+), 91 deletions(-)
 copy services/nodemanager/tests/{fake.cfg.template => fake_azure.cfg.template} (100%)
 copy services/nodemanager/tests/{fake.cfg.template => fake_ec2.cfg.template} (78%)
 rename services/nodemanager/tests/{fake.cfg.template => fake_gce.cfg.template} (77%)

       via  ecc8ee4e5edabaf7d888f55a99184b1efe4a9060 (commit)
       via  b17b7dd25a7daea501538419fadfd79b46a72aeb (commit)
      from  2e1d7b1356e2063c2fbd6a1bf26176ebbcb4e63c (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit ecc8ee4e5edabaf7d888f55a99184b1efe4a9060
Merge: 2e1d7b1 b17b7dd
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Jun 23 13:43:34 2017 -0400

    Merge branch '11896-gce-no-disk' closes #11896
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>


commit b17b7dd25a7daea501538419fadfd79b46a72aeb
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Jun 22 17:40:09 2017 -0400

    11896: Add integration test coverage for gce and aws drivers.  Handle disk=None from gce driver.
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curoverse.com>

diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
index 79e43cb..9837ad9 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/gce.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py
@@ -67,6 +67,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
 
     def arvados_create_kwargs(self, size, arvados_node):
         name = self.create_cloud_name(arvados_node)
+
+        if size.scratch > 375000:
+            self._logger.warning("Requested %d MB scratch space, but GCE driver currently only supports attaching a single 375 GB disk.", size.scratch)
+
         disks = [
             {'autoDelete': True,
              'boot': True,
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 1716a57..2237420 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -27,6 +27,8 @@ class ServerCalculator(object):
             self.cores = kwargs.pop('cores')
             # libcloud disk sizes are in GB, Arvados/SLURM are in MB
             # multiply by 1000 instead of 1024 to err on low side
+            if self.disk is None:
+                self.disk = 0
             self.scratch = self.disk * 1000
             self.ram = int(self.ram * node_mem_scaling)
             for name, override in kwargs.iteritems():
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 7bc3a5e..4faa8ff 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -64,7 +64,7 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
         self._calculator = server_calc
 
     def is_common_error(self, exception):
-        return self._client.is_cloud_exception(exception)
+        return isinstance(exception, config.CLOUD_ERRORS)
 
     def _item_key(self, node):
         return node.id
diff --git a/services/nodemanager/arvnodeman/test/fake_driver.py b/services/nodemanager/arvnodeman/test/fake_driver.py
index 1e15002..4251680 100644
--- a/services/nodemanager/arvnodeman/test/fake_driver.py
+++ b/services/nodemanager/arvnodeman/test/fake_driver.py
@@ -5,7 +5,8 @@ import time
 
 from arvnodeman.computenode import ARVADOS_TIMEFMT
 
-from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState
+from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState, NodeImage
+from libcloud.compute.drivers.gce import GCEDiskType
 from libcloud.common.exceptions import BaseHTTPError
 
 all_nodes = []
@@ -32,16 +33,21 @@ class FakeDriver(NodeDriver):
                     ex_resource_group=None,
                     ex_user_name=None,
                     ex_tags=None,
+                    ex_metadata=None,
                     ex_network=None,
                     ex_userdata=None):
         global all_nodes, create_calls
         create_calls += 1
-        n = Node(name, name, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})
+        nodeid = "node%i" % create_calls
+        n = Node(nodeid, nodeid, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})
         all_nodes.append(n)
         if ex_customdata:
-            ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0] + "&instance_id=" + name
+            ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0]
         if ex_userdata:
             ping_url = ex_userdata
+        if ex_metadata:
+            ping_url = ex_metadata["arv-ping-url"]
+        ping_url += "&instance_id=" + nodeid
         ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
         ctx.verify_mode = ssl.CERT_NONE
         f = urllib.urlopen(ping_url, "", context=ctx)
@@ -153,3 +159,50 @@ class FakeAwsDriver(FakeDriver):
         return [NodeSize("m3.xlarge", "Extra Large Instance", 3500, 80, 0, 0, self),
                 NodeSize("m4.xlarge", "Extra Large Instance", 3500, 0, 0, 0, self),
                 NodeSize("m4.2xlarge", "Double Extra Large Instance", 7000, 0, 0, 0, self)]
+
+
+class FakeGceDriver(FakeDriver):
+
+    def create_node(self, name=None,
+                    size=None,
+                    image=None,
+                    auth=None,
+                    external_ip=None,
+                    ex_metadata=None,
+                    ex_tags=None,
+                    ex_disks_gce_struct=None):
+        n = super(FakeGceDriver, self).create_node(name=name,
+                                                   size=size,
+                                                   image=image,
+                                                   auth=auth,
+                                                   ex_metadata=ex_metadata)
+        n.extra = {
+            "metadata": {
+                "items": [{"key": k, "value": v} for k,v in ex_metadata.iteritems()]
+            },
+            "zone": "fake"
+        }
+        return n
+
+    def list_images(self, ex_project=None):
+        return [NodeImage("fake_image_id", "fake_image_id", self)]
+
+    def list_sizes(self, **kwargs):
+        return [NodeSize("n1-standard-1", "Standard", 3750, None, 0, 0, self),
+                NodeSize("n1-standard-2", "Double standard", 7500, None, 0, 0, self)]
+
+    def ex_list_disktypes(self, zone=None):
+        return [GCEDiskType("pd-standard", "pd-standard", zone, self,
+                            extra={"selfLink": "pd-standard"}),
+                GCEDiskType("local-ssd", "local-ssd", zone, self,
+                            extra={"selfLink": "local-ssd"})]
+
+    def ex_get_node(self, name, zone=None):
+        global all_nodes
+        for n in all_nodes:
+            if n.id == name:
+                return n
+        return None
+
+    def ex_set_node_metadata(self, n, items):
+        n.extra["metadata"]["items"] = items
diff --git a/services/nodemanager/tests/fake.cfg.template b/services/nodemanager/tests/fake_azure.cfg.template
similarity index 100%
copy from services/nodemanager/tests/fake.cfg.template
copy to services/nodemanager/tests/fake_azure.cfg.template
diff --git a/services/nodemanager/tests/fake.cfg.template b/services/nodemanager/tests/fake_ec2.cfg.template
similarity index 78%
copy from services/nodemanager/tests/fake.cfg.template
copy to services/nodemanager/tests/fake_ec2.cfg.template
index eacd53f..168ab5c 100644
--- a/services/nodemanager/tests/fake.cfg.template
+++ b/services/nodemanager/tests/fake_ec2.cfg.template
@@ -99,7 +99,7 @@ jobs_queue = no
 insecure = yes
 
 [Cloud]
-provider = azure
+provider = ec2
 driver_class = {driver_class}
 
 # Shutdown windows define periods of time when a node may and may not be shut
@@ -114,53 +114,21 @@ driver_class = {driver_class}
 shutdown_windows = 1, 999999
 
 [Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
 
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-#
-# azure config mode arm
-# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
-# azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
-#
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-#
 key = 00000000-0000-0000-0000-000000000000
 secret = PASSWORD
 timeout = 60
 region = East US
 
 [Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
 
 [Cloud Create]
-# The image id, in the form "Publisher:Offer:SKU:Version"
-image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+# The image id
+image = fake_image_id
 
 # Path to a local ssh key file that will be used to provision new nodes.
 ssh_key = {ssh_key}
 
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-
-# Optional subnet of the virtual network.
-#ex_subnet = default
-
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-
 # the API server to ping
 ping_host = {host}
 
@@ -179,10 +147,12 @@ ping_host = {host}
 # in GB) for Crunch jobs.  You can also override Microsoft's provided
 # data fields by setting them here.
 
-[Size Standard_D3]
+[Size m4.xlarge]
 cores = 4
 price = 0.56
+scratch = 250
 
-[Size Standard_D4]
+[Size m4.2xlarge]
 cores = 8
 price = 1.12
+scratch = 500
diff --git a/services/nodemanager/tests/fake.cfg.template b/services/nodemanager/tests/fake_gce.cfg.template
similarity index 77%
rename from services/nodemanager/tests/fake.cfg.template
rename to services/nodemanager/tests/fake_gce.cfg.template
index eacd53f..38ac8bb 100644
--- a/services/nodemanager/tests/fake.cfg.template
+++ b/services/nodemanager/tests/fake_gce.cfg.template
@@ -99,7 +99,7 @@ jobs_queue = no
 insecure = yes
 
 [Cloud]
-provider = azure
+provider = gce
 driver_class = {driver_class}
 
 # Shutdown windows define periods of time when a node may and may not be shut
@@ -114,53 +114,20 @@ driver_class = {driver_class}
 shutdown_windows = 1, 999999
 
 [Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
-
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-#
-# azure config mode arm
-# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
-# azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
-#
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-#
 key = 00000000-0000-0000-0000-000000000000
 secret = PASSWORD
 timeout = 60
 region = East US
 
 [Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
 
 [Cloud Create]
-# The image id, in the form "Publisher:Offer:SKU:Version"
-image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+# The image id
+image = fake_image_id
 
 # Path to a local ssh key file that will be used to provision new nodes.
 ssh_key = {ssh_key}
 
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-
-# Optional subnet of the virtual network.
-#ex_subnet = default
-
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-
 # the API server to ping
 ping_host = {host}
 
@@ -179,10 +146,10 @@ ping_host = {host}
 # in GB) for Crunch jobs.  You can also override Microsoft's provided
 # data fields by setting them here.
 
-[Size Standard_D3]
-cores = 4
+[Size n1-standard-1]
+cores = 1
 price = 0.56
 
-[Size Standard_D4]
-cores = 8
-price = 1.12
+[Size n1-standard-2]
+cores = 2
+price = 1.12
\ No newline at end of file
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index c6f1827..31f928a 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -109,7 +109,7 @@ def expect_count(count, checks, pattern, g):
         checks[pattern] = partial(expect_count, count-1)
         return 0
 
-def run_test(name, actions, checks, driver_class, jobs):
+def run_test(name, actions, checks, driver_class, jobs, provider):
     code = 0
 
     # Delete any stale node records
@@ -137,7 +137,7 @@ def run_test(name, actions, checks, driver_class, jobs):
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
 
     # Write configuration file for test
-    with open("tests/fake.cfg.template") as f:
+    with open("tests/fake_%s.cfg.template" % provider) as f:
         open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
         with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
             cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
@@ -224,7 +224,7 @@ def main():
     # Test lifecycle.
 
     tests = {
-        "test_single_node": (
+        "test_single_node_azure": (
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
@@ -237,7 +237,8 @@ def main():
                 r".*Setting node quota.*": fail,
             },
             "arvnodeman.test.fake_driver.FakeDriver",
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}),
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            "azure"),
         "test_multiple_nodes": (
             [
                 (r".*Daemon started", set_squeue),
@@ -261,7 +262,7 @@ def main():
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }),
+         }, "azure"),
         "test_hit_quota": (
             [
                 (r".*Daemon started", set_squeue),
@@ -282,7 +283,7 @@ def main():
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }),
+         }, "azure"),
         "test_probe_quota": (
             [
                 (r".*Daemon started", set_squeue),
@@ -314,7 +315,7 @@ def main():
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }),
+         }, "azure"),
         "test_no_hang_failing_node_create": (
             [
                 (r".*Daemon started", set_squeue),
@@ -329,7 +330,7 @@ def main():
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }),
+         }, "azure"),
         "test_retry_create": (
             [
                 (r".*Daemon started", set_squeue),
@@ -339,7 +340,37 @@ def main():
             {},
             "arvnodeman.test.fake_driver.RetryDriver",
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
-         })
+         }, "azure"),
+        "test_single_node_aws": (
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+            ], {
+                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+                r".*Setting node quota.*": fail,
+            },
+            "arvnodeman.test.fake_driver.FakeAwsDriver",
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            "ec2"),
+        "test_single_node_gce": (
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+            ], {
+                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+                r".*Setting node quota.*": fail,
+            },
+            "arvnodeman.test.fake_driver.FakeGceDriver",
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            "gce")
     }
 
     code = 0

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list