[arvados] updated: 2.6.0-36-g10aaf718b
git repository hosting
git at public.arvados.org
Fri Apr 21 14:58:39 UTC 2023
Summary of changes:
sdk/cwl/arvados_cwl/__init__.py | 3 +-
sdk/cwl/arvados_cwl/pathmapper.py | 8 +--
sdk/python/arvados/{pycurl.py => _pycurlhelper.py} | 0
.../arvados/{http_import.py => http_to_keep.py} | 80 ++++++++++++----------
sdk/python/arvados/keep.py | 2 +-
sdk/python/tests/test_http.py | 20 +++---
6 files changed, 61 insertions(+), 52 deletions(-)
rename sdk/python/arvados/{pycurl.py => _pycurlhelper.py} (100%)
rename sdk/python/arvados/{http_import.py => http_to_keep.py} (80%)
via 10aaf718b0795aa37a4e74063b0206c507ddc6fe (commit)
from 08fe9bab402a591674993e6cd472570967e92dec (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 10aaf718b0795aa37a4e74063b0206c507ddc6fe
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Fri Apr 21 10:57:30 2023 -0400
20257: Mark functions and classes internal not intended for use
Adjust return value of http_to_keep
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 63df4db7a..fe27b91ab 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -354,8 +354,7 @@ def main(args=sys.argv[1:],
# Note that unless in debug mode, some stack traces related to user
# workflow errors may be suppressed.
- # For some reason if I don't set these explicitly some logs won't
- # show up.
+ # Set the logging on most modules INFO (instead of default which is WARNING)
logger.setLevel(logging.INFO)
logging.getLogger('arvados').setLevel(logging.INFO)
logging.getLogger('arvados.keep').setLevel(logging.WARNING)
diff --git a/sdk/cwl/arvados_cwl/pathmapper.py b/sdk/cwl/arvados_cwl/pathmapper.py
index 62a59a8cf..3f54a396b 100644
--- a/sdk/cwl/arvados_cwl/pathmapper.py
+++ b/sdk/cwl/arvados_cwl/pathmapper.py
@@ -26,7 +26,7 @@ from cwltool.utils import adjustFileObjs, adjustDirObjs
from cwltool.stdfsaccess import abspath
from cwltool.workflow import WorkflowException
-from arvados.http_import import http_to_keep
+from arvados.http_to_keep import http_to_keep
logger = logging.getLogger('arvados.cwl-runner')
@@ -109,9 +109,9 @@ class ArvPathMapper(PathMapper):
# passthrough, we'll download it later.
self._pathmap[src] = MapperEnt(src, src, srcobj["class"], True)
else:
- keepref = http_to_keep(self.arvrunner.api, self.arvrunner.project_uuid, src,
- varying_url_params=self.arvrunner.toplevel_runtimeContext.varying_url_params,
- prefer_cached_downloads=self.arvrunner.toplevel_runtimeContext.prefer_cached_downloads)
+ keepref = "keep:%s/%s" % http_to_keep(self.arvrunner.api, self.arvrunner.project_uuid, src,
+ varying_url_params=self.arvrunner.toplevel_runtimeContext.varying_url_params,
+ prefer_cached_downloads=self.arvrunner.toplevel_runtimeContext.prefer_cached_downloads)
logger.info("%s is %s", src, keepref)
self._pathmap[src] = MapperEnt(keepref, keepref, srcobj["class"], True)
except Exception as e:
diff --git a/sdk/python/arvados/pycurl.py b/sdk/python/arvados/_pycurlhelper.py
similarity index 100%
rename from sdk/python/arvados/pycurl.py
rename to sdk/python/arvados/_pycurlhelper.py
diff --git a/sdk/python/arvados/http_import.py b/sdk/python/arvados/http_to_keep.py
similarity index 80%
rename from sdk/python/arvados/http_import.py
rename to sdk/python/arvados/http_to_keep.py
index fef6d6719..b07409912 100644
--- a/sdk/python/arvados/http_import.py
+++ b/sdk/python/arvados/http_to_keep.py
@@ -17,15 +17,16 @@ import logging
import calendar
import urllib.parse
import pycurl
-from arvados.pycurl import PyCurlHelper
+import dataclasses
+from arvados._pycurlhelper import PyCurlHelper
logger = logging.getLogger('arvados.http_import')
-def my_formatdate(dt):
+def _my_formatdate(dt):
return email.utils.formatdate(timeval=calendar.timegm(dt.timetuple()),
localtime=False, usegmt=True)
-def my_parsedate(text):
+def _my_parsedate(text):
parsed = email.utils.parsedate_tz(text)
if parsed:
if parsed[9]:
@@ -37,7 +38,7 @@ def my_parsedate(text):
else:
return datetime.datetime(1970, 1, 1)
-def fresh_cache(url, properties, now):
+def _fresh_cache(url, properties, now):
pr = properties[url]
expires = None
@@ -49,51 +50,52 @@ def fresh_cache(url, properties, now):
g = re.match(r"(s-maxage|max-age)=(\d+)", pr["Cache-Control"])
if g:
- expires = my_parsedate(pr["Date"]) + datetime.timedelta(seconds=int(g.group(2)))
+ expires = _my_parsedate(pr["Date"]) + datetime.timedelta(seconds=int(g.group(2)))
if expires is None and "Expires" in pr:
- expires = my_parsedate(pr["Expires"])
+ expires = _my_parsedate(pr["Expires"])
if expires is None:
# Use a default cache time of 24 hours if upstream didn't set
# any cache headers, to reduce redundant downloads.
- expires = my_parsedate(pr["Date"]) + datetime.timedelta(hours=24)
+ expires = _my_parsedate(pr["Date"]) + datetime.timedelta(hours=24)
if not expires:
return False
return (now < expires)
-def remember_headers(url, properties, headers, now):
+def _remember_headers(url, properties, headers, now):
properties.setdefault(url, {})
for h in ("Cache-Control", "Etag", "Expires", "Date", "Content-Length"):
if h in headers:
properties[url][h] = headers[h]
if "Date" not in headers:
- properties[url]["Date"] = my_formatdate(now)
+ properties[url]["Date"] = _my_formatdate(now)
-class Response:
+@dataclasses.dataclass
+class _Response:
def __init__(self, status_code, headers):
self.status_code = status_code
self.headers = headers
-class CurlDownloader(PyCurlHelper):
+class _Downloader(PyCurlHelper):
# Wait up to 60 seconds for connection
# How long it can be in "low bandwidth" state before it gives up
# Low bandwidth threshold is 32 KiB/s
DOWNLOADER_TIMEOUT = (60, 300, 32768)
- def __init__(self):
- super(CurlDownloader, self).__init__(title_case_headers=True)
+ def __init__(self, apiclient):
+ super(_Downloader, self).__init__(title_case_headers=True)
self.curl = pycurl.Curl()
self.curl.setopt(pycurl.NOSIGNAL, 1)
self.curl.setopt(pycurl.OPENSOCKETFUNCTION,
lambda *args, **kwargs: self._socket_open(*args, **kwargs))
self.target = None
+ self.apiclient = apiclient
- def head(self, url, headers={}):
+ def head(self, url):
get_headers = {'Accept': 'application/octet-stream'}
- get_headers.update(headers)
self._headers = {}
self.curl.setopt(pycurl.URL, url.encode('utf-8'))
@@ -116,7 +118,7 @@ class CurlDownloader(PyCurlHelper):
self._socket.close()
self._socket = None
- return Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
+ return _Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
def download(self, url, headers):
self.count = 0
@@ -152,10 +154,10 @@ class CurlDownloader(PyCurlHelper):
self._socket.close()
self._socket = None
- return Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
+ return _Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
def headers_received(self):
- self.collection = arvados.collection.Collection()
+ self.collection = arvados.collection.Collection(api_client=self.apiclient)
if "Content-Length" in self._headers:
self.contentlength = int(self._headers["Content-Length"])
@@ -204,7 +206,7 @@ class CurlDownloader(PyCurlHelper):
self.checkpoint = loopnow
-def changed(url, clean_url, properties, now, curldownloader):
+def _changed(url, clean_url, properties, now, curldownloader):
req = curldownloader.head(url)
if req.status_code != 200:
@@ -218,7 +220,7 @@ def changed(url, clean_url, properties, now, curldownloader):
if url in properties:
del properties[url]
- remember_headers(clean_url, properties, req.headers, now)
+ _remember_headers(clean_url, properties, req.headers, now)
if "Etag" in req.headers and etag == req.headers["Etag"]:
# Didn't change
@@ -226,7 +228,7 @@ def changed(url, clean_url, properties, now, curldownloader):
return True
-def etag_quote(etag):
+def _etag_quote(etag):
# if it already has leading and trailing quotes, do nothing
if etag[0] == '"' and etag[-1] == '"':
return etag
@@ -238,6 +240,13 @@ def etag_quote(etag):
def http_to_keep(api, project_uuid, url,
utcnow=datetime.datetime.utcnow, varying_url_params="",
prefer_cached_downloads=False):
+ """Download a file over HTTP and upload it to keep, with HTTP headers as metadata.
+
+ Before downloading the URL, checks to see if the URL already
+ exists in Keep and applies HTTP caching policy, the
+ varying_url_params and prefer_cached_downloads flags in order to
decide whether to use the version in Keep or re-download it.
+ """
logger.info("Checking Keep for %s", url)
@@ -262,7 +271,7 @@ def http_to_keep(api, project_uuid, url,
etags = {}
- curldownloader = CurlDownloader()
+ curldownloader = _Downloader(api)
for item in items:
properties = item["properties"]
@@ -272,28 +281,29 @@ def http_to_keep(api, project_uuid, url,
elif url in properties:
cache_url = url
else:
- return False
+ raise Exception("Shouldn't happen, got an API result for %s that doesn't have the URL in properties" % item["uuid"])
- if prefer_cached_downloads or fresh_cache(cache_url, properties, now):
+ if prefer_cached_downloads or _fresh_cache(cache_url, properties, now):
# HTTP caching rules say we should use the cache
cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
- return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+ return (item["portable_data_hash"], list(cr.keys())[0])
- if not changed(cache_url, clean_url, properties, now, curldownloader):
+ if not _changed(cache_url, clean_url, properties, now, curldownloader):
# Etag didn't change, same content, just update headers
api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
- return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+ return (item["portable_data_hash"], list(cr.keys())[0])
- if "Etag" in properties[cache_url] and len(properties[cache_url]["Etag"]) > 2:
- etags[properties[cache_url]["Etag"]] = item
+ for etagstr in ("Etag", "ETag"):
+ if etagstr in properties[cache_url] and len(properties[cache_url][etagstr]) > 2:
+ etags[properties[cache_url][etagstr]] = item
- logger.debug("Found Etags %s", etags)
+ logger.debug("Found ETag values %s", etags)
properties = {}
headers = {}
if etags:
- headers['If-None-Match'] = ', '.join([etag_quote(k) for k,v in etags.items()])
+ headers['If-None-Match'] = ', '.join([_etag_quote(k) for k,v in etags.items()])
logger.debug("Sending GET request with headers %s", headers)
logger.info("Beginning download of %s", url)
@@ -308,14 +318,14 @@ def http_to_keep(api, project_uuid, url,
if curldownloader.target is not None:
curldownloader.target.close()
- remember_headers(clean_url, properties, req.headers, now)
+ _remember_headers(clean_url, properties, req.headers, now)
if req.status_code == 304 and "Etag" in req.headers and req.headers["Etag"] in etags:
item = etags[req.headers["Etag"]]
item["properties"].update(properties)
api.collections().update(uuid=item["uuid"], body={"collection":{"properties": item["properties"]}}).execute()
cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
- return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+ return (item["portable_data_hash"], list(cr.keys())[0])
logger.info("Download complete")
@@ -331,6 +341,6 @@ def http_to_keep(api, project_uuid, url,
c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True)
- api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}) #.execute()
+ api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()
- return "keep:%s/%s" % (c.portable_data_hash(), curldownloader.name)
+ return (c.portable_data_hash(), curldownloader.name)
diff --git a/sdk/python/arvados/keep.py b/sdk/python/arvados/keep.py
index c80111efe..6804f355a 100644
--- a/sdk/python/arvados/keep.py
+++ b/sdk/python/arvados/keep.py
@@ -44,7 +44,7 @@ import arvados.errors
import arvados.retry as retry
import arvados.util
import arvados.diskcache
-from arvados.pycurl import PyCurlHelper
+from arvados._pycurlhelper import PyCurlHelper
_logger = logging.getLogger('arvados.keep')
global_client_object = None
diff --git a/sdk/python/tests/test_http.py b/sdk/python/tests/test_http.py
index 296c1c654..381a61e2a 100644
--- a/sdk/python/tests/test_http.py
+++ b/sdk/python/tests/test_http.py
@@ -21,7 +21,7 @@ import arvados.collection
import arvados.keep
import pycurl
-from arvados.http_import import http_to_keep
+from arvados.http_to_keep import http_to_keep
import ruamel.yaml as yaml
@@ -97,7 +97,7 @@ class TestHttpToKeep(unittest.TestCase):
utcnow.return_value = datetime.datetime(2018, 5, 15)
r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
- self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+ self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
assert mockobj.url == b"http://example.com/file1.txt"
assert mockobj.perform_was_called is True
@@ -146,7 +146,7 @@ class TestHttpToKeep(unittest.TestCase):
utcnow.return_value = datetime.datetime(2018, 5, 16)
r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
- self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+ self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
assert mockobj.perform_was_called is False
@@ -185,7 +185,7 @@ class TestHttpToKeep(unittest.TestCase):
utcnow.return_value = datetime.datetime(2018, 5, 16)
r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
- self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+ self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
assert mockobj.perform_was_called is False
@@ -224,7 +224,7 @@ class TestHttpToKeep(unittest.TestCase):
utcnow.return_value = datetime.datetime(2018, 5, 17)
r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
- self.assertEqual(r, "keep:99999999999999999999999999999997+99/file1.txt")
+ self.assertEqual(r, ("99999999999999999999999999999997+99", "file1.txt"))
assert mockobj.url == b"http://example.com/file1.txt"
assert mockobj.perform_was_called is True
@@ -278,7 +278,7 @@ class TestHttpToKeep(unittest.TestCase):
utcnow.return_value = datetime.datetime(2018, 5, 17)
r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
- self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+ self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
cm.open.assert_not_called()
@@ -315,7 +315,7 @@ class TestHttpToKeep(unittest.TestCase):
utcnow.return_value = datetime.datetime(2018, 5, 15)
r = http_to_keep(api, None, "http://example.com/download?fn=/file1.txt", utcnow=utcnow)
- self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+ self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
assert mockobj.url == b"http://example.com/download?fn=/file1.txt"
@@ -369,7 +369,7 @@ class TestHttpToKeep(unittest.TestCase):
utcnow.return_value = datetime.datetime(2018, 5, 17)
r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
- self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+ self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
print(mockobj.req_headers)
assert mockobj.req_headers == ["Accept: application/octet-stream", "If-None-Match: \"123456\""]
@@ -418,7 +418,7 @@ class TestHttpToKeep(unittest.TestCase):
utcnow.return_value = datetime.datetime(2018, 5, 17)
r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow, prefer_cached_downloads=True)
- self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+ self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
assert mockobj.perform_was_called is False
cm.open.assert_not_called()
@@ -465,7 +465,7 @@ class TestHttpToKeep(unittest.TestCase):
r = http_to_keep(api, None, "http://example.com/file1.txt?KeyId=123&Signature=456&Expires=789",
utcnow=utcnow, varying_url_params="KeyId,Signature,Expires")
- self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+ self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
assert mockobj.perform_was_called is True
cm.open.assert_not_called()
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list