[arvados] updated: 2.1.0-2724-g61c68cf08

git repository hosting git at public.arvados.org
Wed Aug 3 21:13:52 UTC 2022


Summary of changes:
 sdk/cwl/arvados_cwl/runner.py | 53 +++++++++++++++++++++++++++++++++++++------
 sdk/cwl/setup.py              |  3 ++-
 2 files changed, 48 insertions(+), 8 deletions(-)

       via  61c68cf08258d3292257b67c6b50a223b17f4bfd (commit)
       via  4e5838bd9e1a7baa5b3e53e97e308140e4b6105f (commit)
       via  b334b065b36357dd08099adad9835f4aa7075337 (commit)
       via  a8f70f5f978641afa273adcbf995423228f0c7c4 (commit)
       via  2fdb9c8541e96756604439f604b82a68e747a35a (commit)
       via  b23d2434ab8162ea67be50fc3299a0e4450e13ee (commit)
       via  275e7919b78fd9d19c8f6b62c8ba97052bba589c (commit)
      from  d791fa7adb14991c972b6166f39155ff314b7d1e (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 61c68cf08258d3292257b67c6b50a223b17f4bfd
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Wed Aug 3 17:13:36 2022 -0400

    19280: pin msgpack because 1.0.4 has install problems
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/setup.py b/sdk/cwl/setup.py
index c5eccaad6..66cda19f4 100644
--- a/sdk/cwl/setup.py
+++ b/sdk/cwl/setup.py
@@ -41,7 +41,8 @@ setup(name='arvados-cwl-runner',
           'arvados-python-client{}'.format(pysdk_dep),
           'setuptools',
           'ciso8601 >= 2.0.0',
-          'networkx < 2.6'
+          'networkx < 2.6',
+          'msgpack==1.0.3'
       ],
       data_files=[
           ('share/doc/arvados-cwl-runner', ['LICENSE-2.0.txt', 'README.rst']),

commit 4e5838bd9e1a7baa5b3e53e97e308140e4b6105f
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Aug 2 12:40:27 2022 -0400

    19280: explicitly include/exclude primary
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py
index d2486c164..d2dfcf26b 100644
--- a/sdk/cwl/arvados_cwl/runner.py
+++ b/sdk/cwl/arvados_cwl/runner.py
@@ -417,8 +417,14 @@ def upload_dependencies(arvrunner, name, document_loader,
 
     normalizeFilesDirs(sc)
 
-    if include_primary and "id" in workflowobj:
-        sc.append({"class": "File", "location": workflowobj["id"]})
+    if "id" in workflowobj:
+        defrg, _ = urllib.parse.urldefrag(workflowobj["id"])
+        if include_primary:
+            # make sure it's included
+            sc.append({"class": "File", "location": defrg})
+        else:
+            # make sure it's excluded
+            sc = [d for d in sc if d.get("location") != defrg]
 
     def visit_default(obj):
         def defaults_are_optional(f):

commit b334b065b36357dd08099adad9835f4aa7075337
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Aug 2 12:25:37 2022 -0400

    19280: Try this again
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py
index 79876008b..d2486c164 100644
--- a/sdk/cwl/arvados_cwl/runner.py
+++ b/sdk/cwl/arvados_cwl/runner.py
@@ -329,7 +329,7 @@ def upload_dependencies(arvrunner, name, document_loader,
     scanobj = workflowobj
     if "id" in workflowobj and not workflowobj["id"].startswith("_:"):
         defrg, _ = urllib.parse.urldefrag(workflowobj["id"])
-        if cache is not None and defrg in cache:
+        if cache is not None and defrg not in cache:
             # if we haven't seen this file before, want raw file
             # content (before preprocessing) to ensure that external
             # references like $include haven't already been inlined.

commit a8f70f5f978641afa273adcbf995423228f0c7c4
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Aug 2 12:13:40 2022 -0400

    19280: Only scan tools
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py
index e81f62175..79876008b 100644
--- a/sdk/cwl/arvados_cwl/runner.py
+++ b/sdk/cwl/arvados_cwl/runner.py
@@ -51,7 +51,7 @@ from schema_salad.sourceline import SourceLine, cmap
 
 from cwltool.command_line_tool import CommandLineTool
 import cwltool.workflow
-from cwltool.process import (UnsupportedRequirement, normalizeFilesDirs,
+from cwltool.process import (scandeps, UnsupportedRequirement, normalizeFilesDirs,
                              shortname, Process, fill_in_defaults)
 from cwltool.load_tool import fetch_document
 from cwltool.utils import aslist, adjustFileObjs, adjustDirObjs, visit_class
@@ -328,9 +328,12 @@ def upload_dependencies(arvrunner, name, document_loader,
 
     scanobj = workflowobj
     if "id" in workflowobj and not workflowobj["id"].startswith("_:"):
-        # Need raw file content (before preprocessing) to ensure
-        # that external references in $include and $mixin are captured.
-        scanobj = loadref("", workflowobj["id"])
+        defrg, _ = urllib.parse.urldefrag(workflowobj["id"])
+        if cache is not None and defrg in cache:
+            # if we haven't seen this file before, want raw file
+            # content (before preprocessing) to ensure that external
+            # references like $include haven't already been inlined.
+            scanobj = loadref("", workflowobj["id"])
 
     metadata = scanobj
 
@@ -888,239 +891,3 @@ class Runner(Process):
             self.arvrunner.output_callback({}, "permanentFail")
         else:
             self.arvrunner.output_callback(outputs, processStatus)
-
-
-
-
-# --- from cwltool ---
-
-
-CWL_IANA = "https://www.iana.org/assignments/media-types/application/cwl"
-
-
-def scandeps(
-    base: str,
-    doc: Union[CWLObjectType, MutableSequence[CWLObjectType]],
-    reffields: Set[str],
-    urlfields: Set[str],
-    loadref: Callable[[str, str], Union[CommentedMap, CommentedSeq, str, None]],
-    urljoin: Callable[[str, str], str] = urllib.parse.urljoin,
-    nestdirs: bool = True,
-    do_normalize: bool = True,
-) -> Optional[MutableSequence[CWLObjectType]]:
-
-    """Given a CWL document or input object, search for dependencies
-    (references to external files) of 'doc' and return them as a list
-    of File or Directory objects.
-
-    The 'base' is the base URL for relative references.
-
-    Looks for objects with 'class: File' or 'class: Directory' and
-    adds them to the list of dependencies.
-
-    Anything in 'urlfields' is also added as a File dependency.
-
-    Anything in 'reffields' (such as workflow step 'run') will be
-    added as a dependency and also loaded (using the 'loadref'
-    function) and recursively scanned for dependencies.  Those
-    dependencies will be added as secondary files to the primary file.
-
-    If "nestdirs" is true, create intermediate directory objects when
-    a file is located in a subdirectory under the starting directory.
-    This is so that if the dependencies are materialized, they will
-    produce the same relative file system locations.
-
-    """
-
-    if do_normalize:
-        import pprint
-        pprint.pprint(doc)
-
-    r: Optional[MutableSequence[CWLObjectType]] = None
-    if isinstance(doc, MutableMapping):
-        if "id" in doc:
-            if cast(str, doc["id"]).startswith("file://"):
-                df, _ = urllib.parse.urldefrag(cast(str, doc["id"]))
-                if base != df:
-                    if r is None:
-                        r = []
-                    r.append({"class": "File", "location": df, "format": CWL_IANA})
-                    base = df
-
-        if doc.get("class") in ("File", "Directory") and "location" in urlfields:
-            with Perf(metrics, "File or Directory with location"):
-                u = cast(Optional[str], doc.get("location", doc.get("path")))
-                if u and not u.startswith("_:"):
-                    deps = {
-                        "class": doc["class"],
-                        "location": urljoin(base, u),
-                    }  # type: CWLObjectType
-                    if "basename" in doc:
-                        deps["basename"] = doc["basename"]
-                    if doc["class"] == "Directory" and "listing" in doc:
-                        deps["listing"] = doc["listing"]
-                    if doc["class"] == "File" and "secondaryFiles" in doc:
-                        sd = scandeps(
-                            base,
-                            cast(
-                                Union[CWLObjectType, MutableSequence[CWLObjectType]],
-                                doc["secondaryFiles"],
-                            ),
-                            reffields,
-                            urlfields,
-                            loadref,
-                            urljoin=urljoin,
-                            nestdirs=nestdirs,
-                            do_normalize=False,
-                        )
-                        if sd:
-                            deps["secondaryFiles"] = cast(
-                                CWLOutputAtomType,
-                                sd
-                            )
-                    if nestdirs:
-                        deps = nestdir(base, deps)
-                    if r is None:
-                        r = []
-                    r.append(deps)
-                else:
-                    if doc["class"] == "Directory" and "listing" in doc:
-                        sd = scandeps(
-                                base,
-                                cast(MutableSequence[CWLObjectType], doc["listing"]),
-                                reffields,
-                                urlfields,
-                                loadref,
-                                urljoin=urljoin,
-                                nestdirs=nestdirs,
-                                do_normalize=False,
-                            )
-                        if sd:
-                            if r is None:
-                                r = []
-                            r.extend(sd)
-                    elif doc["class"] == "File" and "secondaryFiles" in doc:
-                        sd = scandeps(
-                                base,
-                                cast(MutableSequence[CWLObjectType], doc["secondaryFiles"]),
-                                reffields,
-                                urlfields,
-                                loadref,
-                                urljoin=urljoin,
-                                nestdirs=nestdirs,
-                                do_normalize=False,
-                            )
-                        if sd:
-                            if r is None:
-                                r = sd
-                            else:
-                                r.extend(sd)
-
-        for k, v in doc.items():
-            if k in reffields:
-                with Perf(metrics, "k in reffields"):
-                    for u2 in aslist(v):
-                        if isinstance(u2, MutableMapping):
-                            sd = scandeps(
-                                    base,
-                                    u2,
-                                    reffields,
-                                    urlfields,
-                                    loadref,
-                                    urljoin=urljoin,
-                                    nestdirs=nestdirs,
-                                    do_normalize=False,
-                                )
-                            if sd:
-                                if r is None:
-                                    r = sd
-                                else:
-                                    r.extend(sd)
-                        else:
-                            subid = urljoin(base, u2)
-                            basedf, _ = urllib.parse.urldefrag(base)
-                            subiddf, _ = urllib.parse.urldefrag(subid)
-                            if basedf == subiddf:
-                                continue
-                            sub = cast(
-                                Union[MutableSequence[CWLObjectType], CWLObjectType],
-                                loadref(base, u2),
-                            )
-                            deps2 = {
-                                "class": "File",
-                                "location": subid,
-                                "format": CWL_IANA,
-                            }  # type: CWLObjectType
-                            sf = scandeps(
-                                subid,
-                                sub,
-                                reffields,
-                                urlfields,
-                                loadref,
-                                urljoin=urljoin,
-                                nestdirs=nestdirs,
-                                do_normalize=False,
-                            )
-                            if sf:
-                                deps2["secondaryFiles"] = cast(
-                                    MutableSequence[CWLOutputAtomType], mergedirs(sf)
-                                )
-                            if nestdirs:
-                                deps2 = nestdir(base, deps2)
-                            if r is None:
-                                r = []
-                            r.append(deps2)
-            elif k in urlfields and k != "location":
-                with Perf(metrics, "k in urlfields"):
-                    for u3 in aslist(v):
-                        deps = {"class": "File", "location": urljoin(base, u3)}
-                        if nestdirs:
-                            deps = nestdir(base, deps)
-                        if r is None:
-                            r = []
-                        r.append(deps)
-            elif doc.get("class") in ("File", "Directory") and k in (
-                "listing",
-                "secondaryFiles",
-            ):
-                # should be handled earlier.
-                pass
-            else:
-                with Perf(metrics, "k is something else"):
-                    sd = scandeps(
-                            base,
-                            cast(Union[MutableSequence[CWLObjectType], CWLObjectType], v),
-                            reffields,
-                            urlfields,
-                            loadref,
-                            urljoin=urljoin,
-                            nestdirs=nestdirs,
-                            do_normalize=False,
-                        )
-                    if sd:
-                        if r is None:
-                            r = sd
-                        else:
-                            r.extend(sd)
-    elif isinstance(doc, MutableSequence):
-        with Perf(metrics, "d in doc"):
-            for d in doc:
-                sd = scandeps(
-                        base,
-                        d,
-                        reffields,
-                        urlfields,
-                        loadref,
-                        urljoin=urljoin,
-                        nestdirs=nestdirs,
-                        do_normalize=False,
-                    )
-                if r is None:
-                    r = sd
-                else:
-                    r.extend(sd)
-
-    if r and do_normalize:
-        normalizeFilesDirs(r)
-
-    return r

commit 2fdb9c8541e96756604439f604b82a68e747a35a
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Aug 2 11:51:53 2022 -0400

    19280: what are we scanning here actually
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py
index 6db7dfb2e..e81f62175 100644
--- a/sdk/cwl/arvados_cwl/runner.py
+++ b/sdk/cwl/arvados_cwl/runner.py
@@ -932,6 +932,10 @@ def scandeps(
 
     """
 
+    if do_normalize:
+        import pprint
+        pprint.pprint(doc)
+
     r: Optional[MutableSequence[CWLObjectType]] = None
     if isinstance(doc, MutableMapping):
         if "id" in doc:

commit b23d2434ab8162ea67be50fc3299a0e4450e13ee
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Aug 2 11:45:57 2022 -0400

    19280: Metrics on scandeps
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py
index bbf8f202d..6db7dfb2e 100644
--- a/sdk/cwl/arvados_cwl/runner.py
+++ b/sdk/cwl/arvados_cwl/runner.py
@@ -944,60 +944,24 @@ def scandeps(
                     base = df
 
         if doc.get("class") in ("File", "Directory") and "location" in urlfields:
-            u = cast(Optional[str], doc.get("location", doc.get("path")))
-            if u and not u.startswith("_:"):
-                deps = {
-                    "class": doc["class"],
-                    "location": urljoin(base, u),
-                }  # type: CWLObjectType
-                if "basename" in doc:
-                    deps["basename"] = doc["basename"]
-                if doc["class"] == "Directory" and "listing" in doc:
-                    deps["listing"] = doc["listing"]
-                if doc["class"] == "File" and "secondaryFiles" in doc:
-                    sd = scandeps(
-                        base,
-                        cast(
-                            Union[CWLObjectType, MutableSequence[CWLObjectType]],
-                            doc["secondaryFiles"],
-                        ),
-                        reffields,
-                        urlfields,
-                        loadref,
-                        urljoin=urljoin,
-                        nestdirs=nestdirs,
-                        do_normalize=False,
-                    )
-                    if sd:
-                        deps["secondaryFiles"] = cast(
-                            CWLOutputAtomType,
-                            sd
-                        )
-                if nestdirs:
-                    deps = nestdir(base, deps)
-                if r is None:
-                    r = []
-                r.append(deps)
-            else:
-                if doc["class"] == "Directory" and "listing" in doc:
-                    sd = scandeps(
-                            base,
-                            cast(MutableSequence[CWLObjectType], doc["listing"]),
-                            reffields,
-                            urlfields,
-                            loadref,
-                            urljoin=urljoin,
-                            nestdirs=nestdirs,
-                            do_normalize=False,
-                        )
-                    if sd:
-                        if r is None:
-                            r = []
-                        r.extend(sd)
-                elif doc["class"] == "File" and "secondaryFiles" in doc:
-                    sd = scandeps(
+            with Perf(metrics, "File or Directory with location"):
+                u = cast(Optional[str], doc.get("location", doc.get("path")))
+                if u and not u.startswith("_:"):
+                    deps = {
+                        "class": doc["class"],
+                        "location": urljoin(base, u),
+                    }  # type: CWLObjectType
+                    if "basename" in doc:
+                        deps["basename"] = doc["basename"]
+                    if doc["class"] == "Directory" and "listing" in doc:
+                        deps["listing"] = doc["listing"]
+                    if doc["class"] == "File" and "secondaryFiles" in doc:
+                        sd = scandeps(
                             base,
-                            cast(MutableSequence[CWLObjectType], doc["secondaryFiles"]),
+                            cast(
+                                Union[CWLObjectType, MutableSequence[CWLObjectType]],
+                                doc["secondaryFiles"],
+                            ),
                             reffields,
                             urlfields,
                             loadref,
@@ -1005,19 +969,36 @@ def scandeps(
                             nestdirs=nestdirs,
                             do_normalize=False,
                         )
-                    if sd:
-                        if r is None:
-                            r = sd
-                        else:
+                        if sd:
+                            deps["secondaryFiles"] = cast(
+                                CWLOutputAtomType,
+                                sd
+                            )
+                    if nestdirs:
+                        deps = nestdir(base, deps)
+                    if r is None:
+                        r = []
+                    r.append(deps)
+                else:
+                    if doc["class"] == "Directory" and "listing" in doc:
+                        sd = scandeps(
+                                base,
+                                cast(MutableSequence[CWLObjectType], doc["listing"]),
+                                reffields,
+                                urlfields,
+                                loadref,
+                                urljoin=urljoin,
+                                nestdirs=nestdirs,
+                                do_normalize=False,
+                            )
+                        if sd:
+                            if r is None:
+                                r = []
                             r.extend(sd)
-
-        for k, v in doc.items():
-            if k in reffields:
-                for u2 in aslist(v):
-                    if isinstance(u2, MutableMapping):
+                    elif doc["class"] == "File" and "secondaryFiles" in doc:
                         sd = scandeps(
                                 base,
-                                u2,
+                                cast(MutableSequence[CWLObjectType], doc["secondaryFiles"]),
                                 reffields,
                                 urlfields,
                                 loadref,
@@ -1030,48 +1011,70 @@ def scandeps(
                                 r = sd
                             else:
                                 r.extend(sd)
-                    else:
-                        subid = urljoin(base, u2)
-                        basedf, _ = urllib.parse.urldefrag(base)
-                        subiddf, _ = urllib.parse.urldefrag(subid)
-                        if basedf == subiddf:
-                            continue
-                        sub = cast(
-                            Union[MutableSequence[CWLObjectType], CWLObjectType],
-                            loadref(base, u2),
-                        )
-                        deps2 = {
-                            "class": "File",
-                            "location": subid,
-                            "format": CWL_IANA,
-                        }  # type: CWLObjectType
-                        sf = scandeps(
-                            subid,
-                            sub,
-                            reffields,
-                            urlfields,
-                            loadref,
-                            urljoin=urljoin,
-                            nestdirs=nestdirs,
-                            do_normalize=False,
-                        )
-                        if sf:
-                            deps2["secondaryFiles"] = cast(
-                                MutableSequence[CWLOutputAtomType], mergedirs(sf)
+
+        for k, v in doc.items():
+            if k in reffields:
+                with Perf(metrics, "k in reffields"):
+                    for u2 in aslist(v):
+                        if isinstance(u2, MutableMapping):
+                            sd = scandeps(
+                                    base,
+                                    u2,
+                                    reffields,
+                                    urlfields,
+                                    loadref,
+                                    urljoin=urljoin,
+                                    nestdirs=nestdirs,
+                                    do_normalize=False,
+                                )
+                            if sd:
+                                if r is None:
+                                    r = sd
+                                else:
+                                    r.extend(sd)
+                        else:
+                            subid = urljoin(base, u2)
+                            basedf, _ = urllib.parse.urldefrag(base)
+                            subiddf, _ = urllib.parse.urldefrag(subid)
+                            if basedf == subiddf:
+                                continue
+                            sub = cast(
+                                Union[MutableSequence[CWLObjectType], CWLObjectType],
+                                loadref(base, u2),
                             )
+                            deps2 = {
+                                "class": "File",
+                                "location": subid,
+                                "format": CWL_IANA,
+                            }  # type: CWLObjectType
+                            sf = scandeps(
+                                subid,
+                                sub,
+                                reffields,
+                                urlfields,
+                                loadref,
+                                urljoin=urljoin,
+                                nestdirs=nestdirs,
+                                do_normalize=False,
+                            )
+                            if sf:
+                                deps2["secondaryFiles"] = cast(
+                                    MutableSequence[CWLOutputAtomType], mergedirs(sf)
+                                )
+                            if nestdirs:
+                                deps2 = nestdir(base, deps2)
+                            if r is None:
+                                r = []
+                            r.append(deps2)
+            elif k in urlfields and k != "location":
+                with Perf(metrics, "k in urlfields"):
+                    for u3 in aslist(v):
+                        deps = {"class": "File", "location": urljoin(base, u3)}
                         if nestdirs:
-                            deps2 = nestdir(base, deps2)
+                            deps = nestdir(base, deps)
                         if r is None:
                             r = []
-                        r.append(deps2)
-            elif k in urlfields and k != "location":
-                for u3 in aslist(v):
-                    deps = {"class": "File", "location": urljoin(base, u3)}
-                    if nestdirs:
-                        deps = nestdir(base, deps)
-                    if r is None:
-                        r = []
-                    r.append(deps)
+                        r.append(deps)
             elif doc.get("class") in ("File", "Directory") and k in (
                 "listing",
                 "secondaryFiles",
@@ -1079,9 +1082,28 @@ def scandeps(
                 # should be handled earlier.
                 pass
             else:
+                with Perf(metrics, "k is something else"):
+                    sd = scandeps(
+                            base,
+                            cast(Union[MutableSequence[CWLObjectType], CWLObjectType], v),
+                            reffields,
+                            urlfields,
+                            loadref,
+                            urljoin=urljoin,
+                            nestdirs=nestdirs,
+                            do_normalize=False,
+                        )
+                    if sd:
+                        if r is None:
+                            r = sd
+                        else:
+                            r.extend(sd)
+    elif isinstance(doc, MutableSequence):
+        with Perf(metrics, "d in doc"):
+            for d in doc:
                 sd = scandeps(
                         base,
-                        cast(Union[MutableSequence[CWLObjectType], CWLObjectType], v),
+                        d,
                         reffields,
                         urlfields,
                         loadref,
@@ -1089,27 +1111,10 @@ def scandeps(
                         nestdirs=nestdirs,
                         do_normalize=False,
                     )
-                if sd:
-                    if r is None:
-                        r = sd
-                    else:
-                        r.extend(sd)
-    elif isinstance(doc, MutableSequence):
-        for d in doc:
-            sd = scandeps(
-                    base,
-                    d,
-                    reffields,
-                    urlfields,
-                    loadref,
-                    urljoin=urljoin,
-                    nestdirs=nestdirs,
-                    do_normalize=False,
-                )
-            if r is None:
-                r = sd
-            else:
-                r.extend(sd)
+                if r is None:
+                    r = sd
+                else:
+                    r.extend(sd)
 
     if r and do_normalize:
         normalizeFilesDirs(r)

commit 275e7919b78fd9d19c8f6b62c8ba97052bba589c
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Aug 2 11:25:00 2022 -0400

    19280: Try optimizing scandeps
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py
index 2582c0a3a..bbf8f202d 100644
--- a/sdk/cwl/arvados_cwl/runner.py
+++ b/sdk/cwl/arvados_cwl/runner.py
@@ -17,7 +17,30 @@ import json
 import copy
 from collections import namedtuple
 from io import StringIO
-from typing import Mapping, Sequence
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Mapping,
+    MutableMapping,
+    Sequence,
+    MutableSequence,
+    Optional,
+    Set,
+    Sized,
+    Tuple,
+    Type,
+    Union,
+    cast,
+)
+from cwltool.utils import (
+    CWLObjectType,
+    CWLOutputAtomType,
+    CWLOutputType,
+)
 
 if os.name == "posix" and sys.version_info[0] < 3:
     import subprocess32 as subprocess
@@ -28,7 +51,7 @@ from schema_salad.sourceline import SourceLine, cmap
 
 from cwltool.command_line_tool import CommandLineTool
 import cwltool.workflow
-from cwltool.process import (scandeps, UnsupportedRequirement, normalizeFilesDirs,
+from cwltool.process import (UnsupportedRequirement, normalizeFilesDirs,
                              shortname, Process, fill_in_defaults)
 from cwltool.load_tool import fetch_document
 from cwltool.utils import aslist, adjustFileObjs, adjustDirObjs, visit_class
@@ -325,7 +348,14 @@ def upload_dependencies(arvrunner, name, document_loader,
                                       loadref, urljoin=document_loader.fetcher.urljoin,
                                       nestdirs=False)
 
-    sc_result.extend(optional_deps)
+    if sc_result is None:
+        sc_result = []
+
+    if optional_deps is None:
+        optional_deps = []
+
+    if optional_deps:
+        sc_result.extend(optional_deps)
 
     sc = []
     uuids = {}
@@ -858,3 +888,230 @@ class Runner(Process):
             self.arvrunner.output_callback({}, "permanentFail")
         else:
             self.arvrunner.output_callback(outputs, processStatus)
+
+
+
+
+# --- from cwltool ---
+
+
+CWL_IANA = "https://www.iana.org/assignments/media-types/application/cwl"
+
+
+def scandeps(
+    base: str,
+    doc: Union[CWLObjectType, MutableSequence[CWLObjectType]],
+    reffields: Set[str],
+    urlfields: Set[str],
+    loadref: Callable[[str, str], Union[CommentedMap, CommentedSeq, str, None]],
+    urljoin: Callable[[str, str], str] = urllib.parse.urljoin,
+    nestdirs: bool = True,
+    do_normalize: bool = True,
+) -> Optional[MutableSequence[CWLObjectType]]:
+
+    """Given a CWL document or input object, search for dependencies
+    (references to external files) of 'doc' and return them as a list
+    of File or Directory objects, or None if no dependencies are found.
+
+    The 'base' is the base URL for relative references.
+
+    Looks for objects with 'class: File' or 'class: Directory' and
+    adds them to the list of dependencies.
+
+    Anything in 'urlfields' is also added as a File dependency.
+
+    Anything in 'reffields' (such as workflow step 'run') will be
+    added as a dependency and also loaded (using the 'loadref'
+    function) and recursively scanned for dependencies.  Those
+    dependencies will be added as secondary files to the primary file.
+
+    If "nestdirs" is true, create intermediate directory objects when
+    a file is located in a subdirectory under the starting directory.
+    This is so that if the dependencies are materialized, they will
+    produce the same relative file system locations.
+
+    """
+
+    r: Optional[MutableSequence[CWLObjectType]] = None
+    if isinstance(doc, MutableMapping):
+        if "id" in doc:
+            if cast(str, doc["id"]).startswith("file://"):
+                df, _ = urllib.parse.urldefrag(cast(str, doc["id"]))
+                if base != df:
+                    if r is None:
+                        r = []
+                    r.append({"class": "File", "location": df, "format": CWL_IANA})
+                    base = df
+
+        if doc.get("class") in ("File", "Directory") and "location" in urlfields:
+            u = cast(Optional[str], doc.get("location", doc.get("path")))
+            if u and not u.startswith("_:"):
+                deps = {
+                    "class": doc["class"],
+                    "location": urljoin(base, u),
+                }  # type: CWLObjectType
+                if "basename" in doc:
+                    deps["basename"] = doc["basename"]
+                if doc["class"] == "Directory" and "listing" in doc:
+                    deps["listing"] = doc["listing"]
+                if doc["class"] == "File" and "secondaryFiles" in doc:
+                    sd = scandeps(
+                        base,
+                        cast(
+                            Union[CWLObjectType, MutableSequence[CWLObjectType]],
+                            doc["secondaryFiles"],
+                        ),
+                        reffields,
+                        urlfields,
+                        loadref,
+                        urljoin=urljoin,
+                        nestdirs=nestdirs,
+                        do_normalize=False,
+                    )
+                    if sd:
+                        deps["secondaryFiles"] = cast(
+                            CWLOutputAtomType,
+                            sd
+                        )
+                if nestdirs:
+                    deps = nestdir(base, deps)
+                if r is None:
+                    r = []
+                r.append(deps)
+            else:
+                if doc["class"] == "Directory" and "listing" in doc:
+                    sd = scandeps(
+                            base,
+                            cast(MutableSequence[CWLObjectType], doc["listing"]),
+                            reffields,
+                            urlfields,
+                            loadref,
+                            urljoin=urljoin,
+                            nestdirs=nestdirs,
+                            do_normalize=False,
+                        )
+                    if sd:
+                        if r is None:
+                            r = []
+                        r.extend(sd)
+                elif doc["class"] == "File" and "secondaryFiles" in doc:
+                    sd = scandeps(
+                            base,
+                            cast(MutableSequence[CWLObjectType], doc["secondaryFiles"]),
+                            reffields,
+                            urlfields,
+                            loadref,
+                            urljoin=urljoin,
+                            nestdirs=nestdirs,
+                            do_normalize=False,
+                        )
+                    if sd:
+                        if r is None:
+                            r = sd
+                        else:
+                            r.extend(sd)
+
+        for k, v in doc.items():
+            if k in reffields:
+                for u2 in aslist(v):
+                    if isinstance(u2, MutableMapping):
+                        sd = scandeps(
+                                base,
+                                u2,
+                                reffields,
+                                urlfields,
+                                loadref,
+                                urljoin=urljoin,
+                                nestdirs=nestdirs,
+                                do_normalize=False,
+                            )
+                        if sd:
+                            if r is None:
+                                r = sd
+                            else:
+                                r.extend(sd)
+                    else:
+                        subid = urljoin(base, u2)
+                        basedf, _ = urllib.parse.urldefrag(base)
+                        subiddf, _ = urllib.parse.urldefrag(subid)
+                        if basedf == subiddf:
+                            continue
+                        sub = cast(
+                            Union[MutableSequence[CWLObjectType], CWLObjectType],
+                            loadref(base, u2),
+                        )
+                        deps2 = {
+                            "class": "File",
+                            "location": subid,
+                            "format": CWL_IANA,
+                        }  # type: CWLObjectType
+                        sf = scandeps(
+                            subid,
+                            sub,
+                            reffields,
+                            urlfields,
+                            loadref,
+                            urljoin=urljoin,
+                            nestdirs=nestdirs,
+                            do_normalize=False,
+                        )
+                        if sf:
+                            deps2["secondaryFiles"] = cast(
+                                MutableSequence[CWLOutputAtomType], mergedirs(sf)
+                            )
+                        if nestdirs:
+                            deps2 = nestdir(base, deps2)
+                        if r is None:
+                            r = []
+                        r.append(deps2)
+            elif k in urlfields and k != "location":
+                for u3 in aslist(v):
+                    deps = {"class": "File", "location": urljoin(base, u3)}
+                    if nestdirs:
+                        deps = nestdir(base, deps)
+                    if r is None:
+                        r = []
+                    r.append(deps)
+            elif doc.get("class") in ("File", "Directory") and k in (
+                "listing",
+                "secondaryFiles",
+            ):
+                # should be handled earlier.
+                pass
+            else:
+                sd = scandeps(
+                        base,
+                        cast(Union[MutableSequence[CWLObjectType], CWLObjectType], v),
+                        reffields,
+                        urlfields,
+                        loadref,
+                        urljoin=urljoin,
+                        nestdirs=nestdirs,
+                        do_normalize=False,
+                    )
+                if sd:
+                    if r is None:
+                        r = sd
+                    else:
+                        r.extend(sd)
+    elif isinstance(doc, MutableSequence):
+        for d in doc:
+            sd = scandeps(
+                    base,
+                    d,
+                    reffields,
+                    urlfields,
+                    loadref,
+                    urljoin=urljoin,
+                    nestdirs=nestdirs,
+                    do_normalize=False,
+                )
+            if r is None:
+                r = sd
+            elif sd:  # an element with no dependencies yields None; skip it
+                r.extend(sd)
+
+    if r and do_normalize:
+        normalizeFilesDirs(r)
+
+    return r

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list