[arvados] updated: 2.1.0-3014-gb19bca1fe
git repository hosting
git at public.arvados.org
Tue Nov 8 15:28:25 UTC 2022
Summary of changes:
sdk/cwl/arvados_cwl/__init__.py | 3 +++
sdk/cwl/arvados_cwl/context.py | 1 +
sdk/cwl/arvados_cwl/executor.py | 1 +
sdk/cwl/arvados_cwl/http.py | 31 ++++++++++++++++++++-----------
sdk/cwl/arvados_cwl/pathmapper.py | 2 +-
5 files changed, 26 insertions(+), 12 deletions(-)
via b19bca1fea6e45f2327929e14aac3fab26b198b9 (commit)
from 71f896e5a04fe7254f86886299c77d4c71de12d9 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit b19bca1fea6e45f2327929e14aac3fab26b198b9
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Tue Nov 8 10:08:27 2022 -0500
19699: Add --varying-url-params
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 86f49fa18..69fbd7d82 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -216,6 +216,9 @@ def arg_parser(): # type: () -> argparse.ArgumentParser
parser.add_argument("--defer-downloads", action="store_true", default=False,
help="When submitting a workflow, defer downloading HTTP URLs to workflow launch instead of downloading to Keep before submit.")
+ parser.add_argument("--varying-url-params", type=str, default="",
+ help="A comma separated list of URL query parameters that should be ignored when storing HTTP URLs in Keep.")
+
exgroup = parser.add_mutually_exclusive_group()
exgroup.add_argument("--enable-preemptible", dest="enable_preemptible", default=None, action="store_true", help="Use preemptible instances. Control individual steps with arv:UsePreemptible hint.")
exgroup.add_argument("--disable-preemptible", dest="enable_preemptible", default=None, action="store_false", help="Don't use preemptible instances.")
diff --git a/sdk/cwl/arvados_cwl/context.py b/sdk/cwl/arvados_cwl/context.py
index ec473a04f..66ccb26fa 100644
--- a/sdk/cwl/arvados_cwl/context.py
+++ b/sdk/cwl/arvados_cwl/context.py
@@ -40,6 +40,7 @@ class ArvRuntimeContext(RuntimeContext):
self.enable_preemptible = None
self.copy_deps = None
self.defer_downloads = False
+ self.varying_url_params = ""
super(ArvRuntimeContext, self).__init__(kwargs)
diff --git a/sdk/cwl/arvados_cwl/executor.py b/sdk/cwl/arvados_cwl/executor.py
index 115c962d6..88830579e 100644
--- a/sdk/cwl/arvados_cwl/executor.py
+++ b/sdk/cwl/arvados_cwl/executor.py
@@ -204,6 +204,7 @@ The 'jobs' API is no longer supported.
collection_cache=self.collection_cache)
self.defer_downloads = arvargs.submit and arvargs.defer_downloads
+ self.varying_url_params = arvargs.varying_url_params
validate_cluster_target(self, self.toplevel_runtimeContext)
diff --git a/sdk/cwl/arvados_cwl/http.py b/sdk/cwl/arvados_cwl/http.py
index 3acc06d48..34f921133 100644
--- a/sdk/cwl/arvados_cwl/http.py
+++ b/sdk/cwl/arvados_cwl/http.py
@@ -72,7 +72,7 @@ def remember_headers(url, properties, headers, now):
properties[url]["Date"] = my_formatdate(now)
-def changed(url, properties, now):
+def changed(url, clean_url, properties, now):
req = requests.head(url, allow_redirects=True)
remember_headers(url, properties, req.headers, now)
@@ -81,7 +81,7 @@ def changed(url, properties, now):
# allow GET so instead of failing here, we'll try GET If-None-Match
return True
- pr = properties[url]
+ pr = properties[clean_url]
if "ETag" in pr and "ETag" in req.headers:
if pr["ETag"] == req.headers["ETag"]:
return False
@@ -96,8 +96,17 @@ def etag_quote(etag):
# Add quotes.
return '"' + etag + '"'
-def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
- r = api.collections().list(filters=[["properties", "exists", url]]).execute()
+
+def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow, varying_url_params=""):
+ varying_params = [s.strip() for s in varying_url_params.split(",")]
+
+ parsed = urllib.parse.urlparse(url)
+ query = [q for q in urllib.parse.parse_qsl(parsed.query)
+ if q[0] not in varying_params]
+ clean_url = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params,
+ urllib.parse.urlencode(query), parsed.fragment))
+
+ r = api.collections().list(filters=[["properties", "exists", clean_url]]).execute()
now = utcnow()
@@ -105,12 +114,12 @@ def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
for item in r["items"]:
properties = item["properties"]
- if fresh_cache(url, properties, now):
+ if fresh_cache(clean_url, properties, now):
# Do nothing
cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
- if not changed(url, properties, now):
+ if not changed(url, clean_url, properties, now):
# ETag didn't change, same content, just update headers
api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
@@ -131,7 +140,7 @@ def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
if req.status_code not in (200, 304):
raise Exception("Failed to download '%s' got status %s " % (url, req.status_code))
- remember_headers(url, properties, req.headers, now)
+ remember_headers(clean_url, properties, req.headers, now)
if req.status_code == 304 and "ETag" in req.headers and req.headers["ETag"] in etags:
item = etags[req.headers["ETag"]]
@@ -140,8 +149,8 @@ def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
- if "Content-Length" in properties[url]:
- cl = int(properties[url]["Content-Length"])
+ if "Content-Length" in properties[clean_url]:
+ cl = int(properties[clean_url]["Content-Length"])
logger.info("Downloading %s (%s bytes)", url, cl)
else:
cl = None
@@ -156,7 +165,7 @@ def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
else:
name = grp.group(4)
else:
- name = urllib.parse.urlparse(url).path.split("/")[-1]
+ name = parsed.path.split("/")[-1]
count = 0
start = time.time()
@@ -179,7 +188,7 @@ def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
logger.info("Download complete")
- collectionname = "Downloaded from %s" % urllib.parse.quote(url, safe='')
+ collectionname = "Downloaded from %s" % urllib.parse.quote(clean_url, safe='')
# max length - space to add a timestamp used by ensure_unique_name
max_name_len = 254 - 28
diff --git a/sdk/cwl/arvados_cwl/pathmapper.py b/sdk/cwl/arvados_cwl/pathmapper.py
index a7f210347..700b5b5f9 100644
--- a/sdk/cwl/arvados_cwl/pathmapper.py
+++ b/sdk/cwl/arvados_cwl/pathmapper.py
@@ -109,7 +109,7 @@ class ArvPathMapper(PathMapper):
# passthrough, we'll download it later.
self._pathmap[src] = MapperEnt(src, src, srcobj["class"], True)
else:
- keepref = http_to_keep(self.arvrunner.api, self.arvrunner.project_uuid, src)
+ keepref = http_to_keep(self.arvrunner.api, self.arvrunner.project_uuid, src, varying_url_params=self.arvrunner.varying_url_params)
logger.info("%s is %s", src, keepref)
self._pathmap[src] = MapperEnt(keepref, keepref, srcobj["class"], True)
except Exception as e:
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list