[ARVADOS] created: f5f5de0ae41e12738a380c422417d5a5e5af7f09

Git user git at public.curoverse.com
Thu Apr 27 15:35:27 EDT 2017


        at  f5f5de0ae41e12738a380c422417d5a5e5af7f09 (commit)


commit f5f5de0ae41e12738a380c422417d5a5e5af7f09
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Apr 27 14:24:43 2017 -0400

    11549: Fix container requests so it doesn't mount each file individually,
    instead mount common collections/subdirectories.

diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 9e76cf7..657d592 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -7,16 +7,16 @@ import ruamel.yaml as yaml
 
 from cwltool.errors import WorkflowException
 from cwltool.process import get_feature, UnsupportedRequirement, shortname
-from cwltool.pathmapper import adjustFiles, adjustDirObjs
+from cwltool.pathmapper import adjustFileObjs, adjustDirObjs
 from cwltool.utils import aslist
 
 import arvados.collection
 
 from .arvdocker import arv_docker_get_image
 from . import done
-from .runner import Runner, arvados_jobs_image, packed_workflow, trim_listing
+from .runner import Runner, arvados_jobs_image, packed_workflow, trim_anonymous_location
 from .fsaccess import CollectionFetcher
-from .pathmapper import NoFollowPathMapper
+from .pathmapper import NoFollowPathMapper, trim_listing
 from .perf import Perf
 
 logger = logging.getLogger('arvados.cwl-runner')
@@ -48,37 +48,39 @@ class ArvadosContainer(object):
         mounts = {
             self.outdir: {
                 "kind": "tmp"
+            },
+            self.tmpdir: {
+                "kind": "tmp"
             }
         }
         scheduling_parameters = {}
 
-        dirs = set()
-        for f in self.pathmapper.files():
-            pdh, p, tp, stg = self.pathmapper.mapper(f)
-            if tp == "Directory" and '/' not in pdh:
-                mounts[p] = {
-                    "kind": "collection",
-                    "portable_data_hash": pdh[5:]
-                }
-                dirs.add(pdh)
-
-        for f in self.pathmapper.files():
-            res, p, tp, stg = self.pathmapper.mapper(f)
-            if res.startswith("keep:"):
-                res = res[5:]
-            elif res.startswith("/keep/"):
-                res = res[6:]
-            else:
+        rf = [self.pathmapper.mapper(f) for f in self.pathmapper.referenced_files]
+        rf.sort(key=lambda k: k.resolved)
+        prevdir = None
+        for resolved, target, tp, stg in rf:
+            if not stg:
                 continue
-            sp = res.split("/", 1)
-            pdh = sp[0]
-            if pdh not in dirs:
-                mounts[p] = {
-                    "kind": "collection",
-                    "portable_data_hash": pdh
-                }
-                if len(sp) == 2:
-                    mounts[p]["path"] = urllib.unquote(sp[1])
+            if prevdir and target.startswith(prevdir):
+                continue
+            if tp == "Directory":
+                targetdir = target
+            else:
+                targetdir = os.path.dirname(target)
+            sp = resolved.split("/", 1)
+            pdh = sp[0][5:]   # remove "keep:"
+            mounts[targetdir] = {
+                "kind": "collection",
+                "portable_data_hash": pdh
+            }
+            if len(sp) == 2:
+                if tp == "Directory":
+                    path = sp[1]
+                else:
+                    path = os.path.dirname(sp[1])
+                if path and path != "/":
+                    mounts[targetdir]["path"] = path
+            prevdir = targetdir + "/"
 
         with Perf(metrics, "generatefiles %s" % self.name):
             if self.generatefiles["listing"]:
@@ -238,6 +240,8 @@ class RunnerContainer(Runner):
         """
 
         adjustDirObjs(self.job_order, trim_listing)
+        adjustFileObjs(self.job_order, trim_anonymous_location)
+        adjustDirObjs(self.job_order, trim_anonymous_location)
 
         container_req = {
             "owner_uuid": self.arvrunner.project_uuid,
diff --git a/sdk/cwl/arvados_cwl/arvjob.py b/sdk/cwl/arvados_cwl/arvjob.py
index 04a6295..e86f505 100644
--- a/sdk/cwl/arvados_cwl/arvjob.py
+++ b/sdk/cwl/arvados_cwl/arvjob.py
@@ -9,7 +9,7 @@ from cwltool.errors import WorkflowException
 from cwltool.draft2tool import revmap_file, CommandLineTool
 from cwltool.load_tool import fetch_document
 from cwltool.builder import Builder
-from cwltool.pathmapper import adjustDirObjs
+from cwltool.pathmapper import adjustFileObjs, adjustDirObjs
 
 from schema_salad.sourceline import SourceLine
 
@@ -18,8 +18,8 @@ import ruamel.yaml as yaml
 import arvados.collection
 
 from .arvdocker import arv_docker_get_image
-from .runner import Runner, arvados_jobs_image, packed_workflow, trim_listing, upload_workflow_collection
-from .pathmapper import VwdPathMapper
+from .runner import Runner, arvados_jobs_image, packed_workflow, upload_workflow_collection, trim_anonymous_location
+from .pathmapper import VwdPathMapper, trim_listing
 from .perf import Perf
 from . import done
 from ._version import __version__
@@ -254,6 +254,8 @@ class RunnerJob(Runner):
             self.job_order["cwl:tool"] = "%s/workflow.cwl#main" % wf_pdh
 
         adjustDirObjs(self.job_order, trim_listing)
+        adjustFileObjs(self.job_order, trim_anonymous_location)
+        adjustDirObjs(self.job_order, trim_anonymous_location)
 
         if self.output_name:
             self.job_order["arv:output_name"] = self.output_name
diff --git a/sdk/cwl/arvados_cwl/arvworkflow.py b/sdk/cwl/arvados_cwl/arvworkflow.py
index 4db1f4f..22f662e 100644
--- a/sdk/cwl/arvados_cwl/arvworkflow.py
+++ b/sdk/cwl/arvados_cwl/arvworkflow.py
@@ -13,7 +13,8 @@ from cwltool.pathmapper import adjustFileObjs, adjustDirObjs
 
 import ruamel.yaml as yaml
 
-from .runner import upload_dependencies, trim_listing, packed_workflow, upload_workflow_collection
+from .runner import upload_dependencies, packed_workflow, upload_workflow_collection, trim_anonymous_location
+from .pathmapper import trim_listing
 from .arvtool import ArvadosCommandTool
 from .perf import Perf
 
@@ -26,6 +27,8 @@ def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
     packed = packed_workflow(arvRunner, tool)
 
     adjustDirObjs(job_order, trim_listing)
+    adjustFileObjs(job_order, trim_anonymous_location)
+    adjustDirObjs(job_order, trim_anonymous_location)
 
     main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
     for inp in main["inputs"]:
diff --git a/sdk/cwl/arvados_cwl/pathmapper.py b/sdk/cwl/arvados_cwl/pathmapper.py
index cddb408..5e2ee46 100644
--- a/sdk/cwl/arvados_cwl/pathmapper.py
+++ b/sdk/cwl/arvados_cwl/pathmapper.py
@@ -14,6 +14,21 @@ from cwltool.workflow import WorkflowException
 
 logger = logging.getLogger('arvados.cwl-runner')
 
+def trim_listing(obj):
+    """Remove 'listing' field from Directory objects that are keep references.
+
+    When Directory objects represent Keep references, it is redundant and
+    potentially very expensive to pass fully enumerated Directory objects
+    between instances of cwl-runner (e.g. a submitting a job, or using the
+    RunInSingleContainer feature), so delete the 'listing' field when it is
+    safe to do so.
+
+    """
+
+    if obj.get("location", "").startswith("keep:") and "listing" in obj:
+        del obj["listing"]
+
+
 class ArvPathMapper(PathMapper):
     """Convert container-local paths to and from Keep collection ids."""
 
@@ -27,6 +42,7 @@ class ArvPathMapper(PathMapper):
         self.collection_pattern = collection_pattern
         self.file_pattern = file_pattern
         self.name = name
+        self.referenced_files = [r["location"] for r in referenced_files]
         super(ArvPathMapper, self).__init__(referenced_files, input_basedir, None)
 
     def visit(self, srcobj, uploadfiles):
diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py
index 57a6723..69e4f5b 100644
--- a/sdk/cwl/arvados_cwl/runner.py
+++ b/sdk/cwl/arvados_cwl/runner.py
@@ -23,24 +23,22 @@ import arvados.collection
 import ruamel.yaml as yaml
 
 from .arvdocker import arv_docker_get_image
-from .pathmapper import ArvPathMapper
+from .pathmapper import ArvPathMapper, trim_listing
 from ._version import __version__
 from . import done
 
 logger = logging.getLogger('arvados.cwl-runner')
 
-def trim_listing(obj):
-    """Remove 'listing' field from Directory objects that are keep references.
+def trim_anonymous_location(obj):
+    """Remove 'location' field from File and Directory literals.
+
+    To make internal handling easier, literals are assigned a random id for
+    'location'.  However, when writing the record back out, this can break
+    reproducibility.  Since it is valid for literals not have a 'location'
+    field, remove it.
 
-    When Directory objects represent Keep references, it redundant and
-    potentially very expensive to pass fully enumerated Directory objects
-    between instances of cwl-runner (e.g. a submitting a job, or using the
-    RunInSingleContainer feature), so delete the 'listing' field when it is
-    safe to do so.
     """
 
-    if obj.get("location", "").startswith("keep:") and "listing" in obj:
-        del obj["listing"]
     if obj.get("location", "").startswith("_:"):
         del obj["location"]
 
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
index 7786e7f..b06eae8 100644
--- a/sdk/cwl/tests/test_container.py
+++ b/sdk/cwl/tests/test_container.py
@@ -63,6 +63,7 @@ class TestContainer(unittest.TestCase):
                         'use_existing': enable_reuse,
                         'priority': 1,
                         'mounts': {
+                            '/tmp': {'kind': 'tmp'},
                             '/var/spool/cwl': {'kind': 'tmp'}
                         },
                         'state': 'Committed',
@@ -135,6 +136,7 @@ class TestContainer(unittest.TestCase):
                 'use_existing': True,
                 'priority': 1,
                 'mounts': {
+                    '/tmp': {'kind': 'tmp'},
                     '/var/spool/cwl': {'kind': 'tmp'}
                 },
                 'state': 'Committed',
@@ -240,6 +242,7 @@ class TestContainer(unittest.TestCase):
                 'use_existing': True,
                 'priority': 1,
                 'mounts': {
+                    '/tmp': {'kind': 'tmp'},
                     '/var/spool/cwl': {'kind': 'tmp'},
                     '/var/spool/cwl/foo': {
                         'kind': 'collection',
@@ -325,6 +328,7 @@ class TestContainer(unittest.TestCase):
                     'use_existing': True,
                     'priority': 1,
                     'mounts': {
+                        '/tmp': {'kind': 'tmp'},
                         '/var/spool/cwl': {'kind': 'tmp'},
                         "stderr": {
                             "kind": "file",

commit e8317521741c3814e824e209f75edf23636e32ff
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Apr 27 14:05:00 2017 -0400

    11549: Test that containers don't mount each file individually, instead mount
    common collections/subdirectories.

diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
index 08b50e0..7786e7f 100644
--- a/sdk/cwl/tests/test_container.py
+++ b/sdk/cwl/tests/test_container.py
@@ -388,3 +388,83 @@ class TestContainer(unittest.TestCase):
 
         arvjob.collect_outputs.assert_called_with("keep:abc+123")
         arvjob.output_callback.assert_called_with({"out": "stuff"}, "success")
+
+    # The test passes no builder.resources
+    # Hence the default resources will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
+    @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
+    def test_mounts(self, keepdocker):
+        arv_docker_clear_cache()
+
+        runner = mock.MagicMock()
+        runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+        runner.ignore_docker_for_reuse = False
+
+        keepdocker.return_value = [("zzzzz-4zz18-zzzzzzzzzzzzzz3", "")]
+        runner.api.collections().get().execute.return_value = {
+            "portable_data_hash": "99999999999999999999999999999993+99"}
+
+        document_loader, avsc_names, schema_metadata, metaschema_loader = cwltool.process.get_schema("v1.0")
+
+        tool = cmap({
+            "inputs": [
+                {"id": "p1",
+                 "type": "Directory"}
+            ],
+            "outputs": [],
+            "baseCommand": "ls",
+            "arguments": [{"valueFrom": "$(runtime.outdir)"}]
+        })
+        make_fs_access=functools.partial(arvados_cwl.CollectionFsAccess,
+                                     collection_cache=arvados_cwl.CollectionCache(runner.api, None, 0))
+        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, work_api="containers", avsc_names=avsc_names,
+                                                 basedir="", make_fs_access=make_fs_access, loader=Loader({}))
+        arvtool.formatgraph = None
+        job_order = {
+            "p1": {
+                "class": "Directory",
+                "location": "keep:99999999999999999999999999999994+44",
+                "listing": [
+                    {
+                        "class": "File",
+                        "location": "keep:99999999999999999999999999999994+44/file1",
+                    },
+                    {
+                        "class": "File",
+                        "location": "keep:99999999999999999999999999999994+44/file2",
+                    }
+                ]
+            }
+        }
+        for j in arvtool.job(job_order, mock.MagicMock(), basedir="", name="test_run_mounts",
+                             make_fs_access=make_fs_access, tmpdir="/tmp"):
+            j.run()
+            runner.api.container_requests().create.assert_called_with(
+                body=JsonDiffMatcher({
+                    'environment': {
+                        'HOME': '/var/spool/cwl',
+                        'TMPDIR': '/tmp'
+                    },
+                    'name': 'test_run_mounts',
+                    'runtime_constraints': {
+                        'vcpus': 1,
+                        'ram': 1073741824
+                    },
+                    'use_existing': True,
+                    'priority': 1,
+                    'mounts': {
+                        "/keep/99999999999999999999999999999994+44": {
+                            "kind": "collection",
+                            "portable_data_hash": "99999999999999999999999999999994+44"
+                        },
+                        '/tmp': {'kind': 'tmp'},
+                        '/var/spool/cwl': {'kind': 'tmp'}
+                    },
+                    'state': 'Committed',
+                    'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+                    'output_path': '/var/spool/cwl',
+                    'container_image': 'arvados/jobs',
+                    'command': ['ls', '/var/spool/cwl'],
+                    'cwd': '/var/spool/cwl',
+                    'scheduling_parameters': {},
+                    'properties': {},
+                }))

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list