[arvados] updated: 2.6.0-36-g10aaf718b

git repository hosting git at public.arvados.org
Fri Apr 21 14:58:39 UTC 2023


Summary of changes:
 sdk/cwl/arvados_cwl/__init__.py                    |  3 +-
 sdk/cwl/arvados_cwl/pathmapper.py                  |  8 +--
 sdk/python/arvados/{pycurl.py => _pycurlhelper.py} |  0
 .../arvados/{http_import.py => http_to_keep.py}    | 80 ++++++++++++----------
 sdk/python/arvados/keep.py                         |  2 +-
 sdk/python/tests/test_http.py                      | 20 +++---
 6 files changed, 61 insertions(+), 52 deletions(-)
 rename sdk/python/arvados/{pycurl.py => _pycurlhelper.py} (100%)
 rename sdk/python/arvados/{http_import.py => http_to_keep.py} (80%)

       via  10aaf718b0795aa37a4e74063b0206c507ddc6fe (commit)
      from  08fe9bab402a591674993e6cd472570967e92dec (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 10aaf718b0795aa37a4e74063b0206c507ddc6fe
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Fri Apr 21 10:57:30 2023 -0400

    20257: Mark functions and classes internal not intended for use
    
    Adjust return value of http_to_keep
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 63df4db7a..fe27b91ab 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -354,8 +354,7 @@ def main(args=sys.argv[1:],
     # Note that unless in debug mode, some stack traces related to user
     # workflow errors may be suppressed.
 
-    # For some reason if I don't set these explicitly some logs won't
-    # show up.
+    # Set the logging on most modules INFO (instead of default which is WARNING)
     logger.setLevel(logging.INFO)
     logging.getLogger('arvados').setLevel(logging.INFO)
     logging.getLogger('arvados.keep').setLevel(logging.WARNING)
diff --git a/sdk/cwl/arvados_cwl/pathmapper.py b/sdk/cwl/arvados_cwl/pathmapper.py
index 62a59a8cf..3f54a396b 100644
--- a/sdk/cwl/arvados_cwl/pathmapper.py
+++ b/sdk/cwl/arvados_cwl/pathmapper.py
@@ -26,7 +26,7 @@ from cwltool.utils import adjustFileObjs, adjustDirObjs
 from cwltool.stdfsaccess import abspath
 from cwltool.workflow import WorkflowException
 
-from arvados.http_import import http_to_keep
+from arvados.http_to_keep import http_to_keep
 
 logger = logging.getLogger('arvados.cwl-runner')
 
@@ -109,9 +109,9 @@ class ArvPathMapper(PathMapper):
                         # passthrough, we'll download it later.
                         self._pathmap[src] = MapperEnt(src, src, srcobj["class"], True)
                     else:
-                        keepref = http_to_keep(self.arvrunner.api, self.arvrunner.project_uuid, src,
-                                               varying_url_params=self.arvrunner.toplevel_runtimeContext.varying_url_params,
-                                               prefer_cached_downloads=self.arvrunner.toplevel_runtimeContext.prefer_cached_downloads)
+                        keepref = "keep:%s/%s" % http_to_keep(self.arvrunner.api, self.arvrunner.project_uuid, src,
+                                                              varying_url_params=self.arvrunner.toplevel_runtimeContext.varying_url_params,
+                                                              prefer_cached_downloads=self.arvrunner.toplevel_runtimeContext.prefer_cached_downloads)
                         logger.info("%s is %s", src, keepref)
                         self._pathmap[src] = MapperEnt(keepref, keepref, srcobj["class"], True)
                 except Exception as e:
diff --git a/sdk/python/arvados/pycurl.py b/sdk/python/arvados/_pycurlhelper.py
similarity index 100%
rename from sdk/python/arvados/pycurl.py
rename to sdk/python/arvados/_pycurlhelper.py
diff --git a/sdk/python/arvados/http_import.py b/sdk/python/arvados/http_to_keep.py
similarity index 80%
rename from sdk/python/arvados/http_import.py
rename to sdk/python/arvados/http_to_keep.py
index fef6d6719..b07409912 100644
--- a/sdk/python/arvados/http_import.py
+++ b/sdk/python/arvados/http_to_keep.py
@@ -17,15 +17,16 @@ import logging
 import calendar
 import urllib.parse
 import pycurl
-from arvados.pycurl import PyCurlHelper
+import dataclasses
+from arvados._pycurlhelper import PyCurlHelper
 
 logger = logging.getLogger('arvados.http_import')
 
-def my_formatdate(dt):
+def _my_formatdate(dt):
     return email.utils.formatdate(timeval=calendar.timegm(dt.timetuple()),
                                   localtime=False, usegmt=True)
 
-def my_parsedate(text):
+def _my_parsedate(text):
     parsed = email.utils.parsedate_tz(text)
     if parsed:
         if parsed[9]:
@@ -37,7 +38,7 @@ def my_parsedate(text):
     else:
         return datetime.datetime(1970, 1, 1)
 
-def fresh_cache(url, properties, now):
+def _fresh_cache(url, properties, now):
     pr = properties[url]
     expires = None
 
@@ -49,51 +50,52 @@ def fresh_cache(url, properties, now):
 
         g = re.match(r"(s-maxage|max-age)=(\d+)", pr["Cache-Control"])
         if g:
-            expires = my_parsedate(pr["Date"]) + datetime.timedelta(seconds=int(g.group(2)))
+            expires = _my_parsedate(pr["Date"]) + datetime.timedelta(seconds=int(g.group(2)))
 
     if expires is None and "Expires" in pr:
-        expires = my_parsedate(pr["Expires"])
+        expires = _my_parsedate(pr["Expires"])
 
     if expires is None:
         # Use a default cache time of 24 hours if upstream didn't set
         # any cache headers, to reduce redundant downloads.
-        expires = my_parsedate(pr["Date"]) + datetime.timedelta(hours=24)
+        expires = _my_parsedate(pr["Date"]) + datetime.timedelta(hours=24)
 
     if not expires:
         return False
 
     return (now < expires)
 
-def remember_headers(url, properties, headers, now):
+def _remember_headers(url, properties, headers, now):
     properties.setdefault(url, {})
     for h in ("Cache-Control", "Etag", "Expires", "Date", "Content-Length"):
         if h in headers:
             properties[url][h] = headers[h]
     if "Date" not in headers:
-        properties[url]["Date"] = my_formatdate(now)
+        properties[url]["Date"] = _my_formatdate(now)
 
-class Response:
+@dataclasses.dataclass
+class _Response:
     def __init__(self, status_code, headers):
         self.status_code = status_code
         self.headers = headers
 
-class CurlDownloader(PyCurlHelper):
+class _Downloader(PyCurlHelper):
     # Wait up to 60 seconds for connection
     # How long it can be in "low bandwidth" state before it gives up
     # Low bandwidth threshold is 32 KiB/s
     DOWNLOADER_TIMEOUT = (60, 300, 32768)
 
-    def __init__(self):
-        super(CurlDownloader, self).__init__(title_case_headers=True)
+    def __init__(self, apiclient):
+        super(_Downloader, self).__init__(title_case_headers=True)
         self.curl = pycurl.Curl()
         self.curl.setopt(pycurl.NOSIGNAL, 1)
         self.curl.setopt(pycurl.OPENSOCKETFUNCTION,
                     lambda *args, **kwargs: self._socket_open(*args, **kwargs))
         self.target = None
+        self.apiclient = apiclient
 
-    def head(self, url, headers={}):
+    def head(self, url):
         get_headers = {'Accept': 'application/octet-stream'}
-        get_headers.update(headers)
         self._headers = {}
 
         self.curl.setopt(pycurl.URL, url.encode('utf-8'))
@@ -116,7 +118,7 @@ class CurlDownloader(PyCurlHelper):
                 self._socket.close()
                 self._socket = None
 
-        return Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
+        return _Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
 
     def download(self, url, headers):
         self.count = 0
@@ -152,10 +154,10 @@ class CurlDownloader(PyCurlHelper):
                 self._socket.close()
                 self._socket = None
 
-        return Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
+        return _Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
 
     def headers_received(self):
-        self.collection = arvados.collection.Collection()
+        self.collection = arvados.collection.Collection(api_client=self.apiclient)
 
         if "Content-Length" in self._headers:
             self.contentlength = int(self._headers["Content-Length"])
@@ -204,7 +206,7 @@ class CurlDownloader(PyCurlHelper):
         self.checkpoint = loopnow
 
 
-def changed(url, clean_url, properties, now, curldownloader):
+def _changed(url, clean_url, properties, now, curldownloader):
     req = curldownloader.head(url)
 
     if req.status_code != 200:
@@ -218,7 +220,7 @@ def changed(url, clean_url, properties, now, curldownloader):
 
     if url in properties:
         del properties[url]
-    remember_headers(clean_url, properties, req.headers, now)
+    _remember_headers(clean_url, properties, req.headers, now)
 
     if "Etag" in req.headers and etag == req.headers["Etag"]:
         # Didn't change
@@ -226,7 +228,7 @@ def changed(url, clean_url, properties, now, curldownloader):
 
     return True
 
-def etag_quote(etag):
+def _etag_quote(etag):
     # if it already has leading and trailing quotes, do nothing
     if etag[0] == '"' and etag[-1] == '"':
         return etag
@@ -238,6 +240,13 @@ def etag_quote(etag):
 def http_to_keep(api, project_uuid, url,
                  utcnow=datetime.datetime.utcnow, varying_url_params="",
                  prefer_cached_downloads=False):
+    """Download a file over HTTP and upload it to keep, with HTTP headers as metadata.
+
+    Before downloading the URL, checks to see if the URL already
+    exists in Keep and applies HTTP caching policy, the
+    varying_url_params and prefer_cached_downloads flags in order to
+    decide whether to use the version in Keep or re-download it.
+    """
 
     logger.info("Checking Keep for %s", url)
 
@@ -262,7 +271,7 @@ def http_to_keep(api, project_uuid, url,
 
     etags = {}
 
-    curldownloader = CurlDownloader()
+    curldownloader = _Downloader(api)
 
     for item in items:
         properties = item["properties"]
@@ -272,28 +281,29 @@ def http_to_keep(api, project_uuid, url,
         elif url in properties:
             cache_url = url
         else:
-            return False
+            raise Exception("Shouldn't happen, got an API result for %s that doesn't have the URL in properties" % item["uuid"])
 
-        if prefer_cached_downloads or fresh_cache(cache_url, properties, now):
+        if prefer_cached_downloads or _fresh_cache(cache_url, properties, now):
             # HTTP caching rules say we should use the cache
             cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
-            return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+            return (item["portable_data_hash"], list(cr.keys())[0])
 
-        if not changed(cache_url, clean_url, properties, now, curldownloader):
+        if not _changed(cache_url, clean_url, properties, now, curldownloader):
             # Etag didn't change, same content, just update headers
             api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
             cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
-            return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+            return (item["portable_data_hash"], list(cr.keys())[0])
 
-        if "Etag" in properties[cache_url] and len(properties[cache_url]["Etag"]) > 2:
-            etags[properties[cache_url]["Etag"]] = item
+        for etagstr in ("Etag", "ETag"):
+            if etagstr in properties[cache_url] and len(properties[cache_url][etagstr]) > 2:
+                etags[properties[cache_url][etagstr]] = item
 
-    logger.debug("Found Etags %s", etags)
+    logger.debug("Found ETag values %s", etags)
 
     properties = {}
     headers = {}
     if etags:
-        headers['If-None-Match'] = ', '.join([etag_quote(k) for k,v in etags.items()])
+        headers['If-None-Match'] = ', '.join([_etag_quote(k) for k,v in etags.items()])
     logger.debug("Sending GET request with headers %s", headers)
 
     logger.info("Beginning download of %s", url)
@@ -308,14 +318,14 @@ def http_to_keep(api, project_uuid, url,
     if curldownloader.target is not None:
         curldownloader.target.close()
 
-    remember_headers(clean_url, properties, req.headers, now)
+    _remember_headers(clean_url, properties, req.headers, now)
 
     if req.status_code == 304 and "Etag" in req.headers and req.headers["Etag"] in etags:
         item = etags[req.headers["Etag"]]
         item["properties"].update(properties)
         api.collections().update(uuid=item["uuid"], body={"collection":{"properties": item["properties"]}}).execute()
         cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
-        return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+        return (item["portable_data_hash"], list(cr.keys())[0])
 
     logger.info("Download complete")
 
@@ -331,6 +341,6 @@ def http_to_keep(api, project_uuid, url,
 
     c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True)
 
-    api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}) #.execute()
+    api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()
 
-    return "keep:%s/%s" % (c.portable_data_hash(), curldownloader.name)
+    return (c.portable_data_hash(), curldownloader.name)
diff --git a/sdk/python/arvados/keep.py b/sdk/python/arvados/keep.py
index c80111efe..6804f355a 100644
--- a/sdk/python/arvados/keep.py
+++ b/sdk/python/arvados/keep.py
@@ -44,7 +44,7 @@ import arvados.errors
 import arvados.retry as retry
 import arvados.util
 import arvados.diskcache
-from arvados.pycurl import PyCurlHelper
+from arvados._pycurlhelper import PyCurlHelper
 
 _logger = logging.getLogger('arvados.keep')
 global_client_object = None
diff --git a/sdk/python/tests/test_http.py b/sdk/python/tests/test_http.py
index 296c1c654..381a61e2a 100644
--- a/sdk/python/tests/test_http.py
+++ b/sdk/python/tests/test_http.py
@@ -21,7 +21,7 @@ import arvados.collection
 import arvados.keep
 import pycurl
 
-from arvados.http_import import http_to_keep
+from arvados.http_to_keep import http_to_keep
 
 import ruamel.yaml as yaml
 
@@ -97,7 +97,7 @@ class TestHttpToKeep(unittest.TestCase):
         utcnow.return_value = datetime.datetime(2018, 5, 15)
 
         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
-        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
 
         assert mockobj.url == b"http://example.com/file1.txt"
         assert mockobj.perform_was_called is True
@@ -146,7 +146,7 @@ class TestHttpToKeep(unittest.TestCase):
         utcnow.return_value = datetime.datetime(2018, 5, 16)
 
         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
-        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
 
         assert mockobj.perform_was_called is False
 
@@ -185,7 +185,7 @@ class TestHttpToKeep(unittest.TestCase):
         utcnow.return_value = datetime.datetime(2018, 5, 16)
 
         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
-        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
 
         assert mockobj.perform_was_called is False
 
@@ -224,7 +224,7 @@ class TestHttpToKeep(unittest.TestCase):
         utcnow.return_value = datetime.datetime(2018, 5, 17)
 
         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
-        self.assertEqual(r, "keep:99999999999999999999999999999997+99/file1.txt")
+        self.assertEqual(r, ("99999999999999999999999999999997+99", "file1.txt"))
 
         assert mockobj.url == b"http://example.com/file1.txt"
         assert mockobj.perform_was_called is True
@@ -278,7 +278,7 @@ class TestHttpToKeep(unittest.TestCase):
         utcnow.return_value = datetime.datetime(2018, 5, 17)
 
         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
-        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
 
         cm.open.assert_not_called()
 
@@ -315,7 +315,7 @@ class TestHttpToKeep(unittest.TestCase):
         utcnow.return_value = datetime.datetime(2018, 5, 15)
 
         r = http_to_keep(api, None, "http://example.com/download?fn=/file1.txt", utcnow=utcnow)
-        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
 
         assert mockobj.url == b"http://example.com/download?fn=/file1.txt"
 
@@ -369,7 +369,7 @@ class TestHttpToKeep(unittest.TestCase):
         utcnow.return_value = datetime.datetime(2018, 5, 17)
 
         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
-        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
 
         print(mockobj.req_headers)
         assert mockobj.req_headers == ["Accept: application/octet-stream", "If-None-Match: \"123456\""]
@@ -418,7 +418,7 @@ class TestHttpToKeep(unittest.TestCase):
         utcnow.return_value = datetime.datetime(2018, 5, 17)
 
         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow, prefer_cached_downloads=True)
-        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
 
         assert mockobj.perform_was_called is False
         cm.open.assert_not_called()
@@ -465,7 +465,7 @@ class TestHttpToKeep(unittest.TestCase):
 
             r = http_to_keep(api, None, "http://example.com/file1.txt?KeyId=123&Signature=456&Expires=789",
                                               utcnow=utcnow, varying_url_params="KeyId,Signature,Expires")
-            self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+            self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
 
             assert mockobj.perform_was_called is True
             cm.open.assert_not_called()

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list