[ARVADOS] updated: 1.1.4-327-g739b1b9

Git user git at public.curoverse.com
Fri May 25 16:47:15 EDT 2018


Summary of changes:
 sdk/cwl/arvados_cwl/http.py |  58 +++++----
 sdk/cwl/tests/test_http.py  | 286 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 321 insertions(+), 23 deletions(-)
 create mode 100644 sdk/cwl/tests/test_http.py

       via  739b1b9ec3662f988ad09509bcc933ce5c23c4e8 (commit)
      from  57e511e900aa1eb175fa7f308b09516ad83492b4 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 739b1b9ec3662f988ad09509bcc933ce5c23c4e8
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date:   Fri May 25 16:45:12 2018 -0400

    11162: Add tests for http_to_keep.
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>

diff --git a/sdk/cwl/arvados_cwl/http.py b/sdk/cwl/arvados_cwl/http.py
index 1ee1607..32fc1cf 100644
--- a/sdk/cwl/arvados_cwl/http.py
+++ b/sdk/cwl/arvados_cwl/http.py
@@ -11,7 +11,8 @@ import logging
 logger = logging.getLogger('arvados.cwl-runner')
 
 def my_formatdate(dt):
-    return email.utils.formatdate(timeval=time.mktime(now.timetuple()), localtime=False, usegmt=True)
+    return email.utils.formatdate(timeval=time.mktime(dt.timetuple()),
+                                  localtime=False, usegmt=True)
 
 def my_parsedate(text):
     parsed = email.utils.parsedate(text)
@@ -20,7 +21,7 @@ def my_parsedate(text):
     else:
         return datetime.datetime(1970, 1, 1)
 
-def fresh_cache(url, properties):
+def fresh_cache(url, properties, now):
     pr = properties[url]
     expires = None
 
@@ -45,20 +46,20 @@ def fresh_cache(url, properties):
     if not expires:
         return False
 
-    return (datetime.datetime.utcnow() < expires)
+    return (now < expires)
 
-def remember_headers(url, properties, headers):
+def remember_headers(url, properties, headers, now):
     properties.setdefault(url, {})
     for h in ("Cache-Control", "ETag", "Expires", "Date", "Content-Length"):
         if h in headers:
             properties[url][h] = headers[h]
     if "Date" not in headers:
-        properties[url]["Date"] = my_formatdate(datetime.datetime.utcnow())
+        properties[url]["Date"] = my_formatdate(now)
 
 
-def changed(url, properties):
+def changed(url, properties, now):
     req = requests.head(url, allow_redirects=True)
-    remember_headers(url, properties, req.headers)
+    remember_headers(url, properties, req.headers, now)
 
     if req.status_code != 200:
         raise Exception("Got status %s" % req.status_code)
@@ -67,19 +68,22 @@ def changed(url, properties):
     if "ETag" in pr and "ETag" in req.headers:
         if pr["ETag"] == req.headers["ETag"]:
             return False
+
     return True
 
-def http_to_keep(api, project_uuid, url):
+def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
     r = api.collections().list(filters=[["properties", "exists", url]]).execute()
 
+    now = utcnow()
+
     for item in r["items"]:
         properties = item["properties"]
-        if fresh_cache(url, properties):
+        if fresh_cache(url, properties, now):
             # Do nothing
             cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
             return "keep:%s/%s" % (item["portable_data_hash"], cr.keys()[0])
 
-        if not changed(url, properties):
+        if not changed(url, properties, now):
             # ETag didn't change, same content, just update headers
             api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
             cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
@@ -91,18 +95,23 @@ def http_to_keep(api, project_uuid, url):
     if req.status_code != 200:
         raise Exception("Failed to download '%s' got status %s " % (url, req.status_code))
 
-    remember_headers(url, properties, req.headers)
+    remember_headers(url, properties, req.headers, now)
 
-    logger.info("Downloading %s (%s bytes)", url, properties[url]["Content-Length"])
+    if "Content-Length" in properties[url]:
+        cl = int(properties[url]["Content-Length"])
+        logger.info("Downloading %s (%s bytes)", url, cl)
+    else:
+        cl = None
+        logger.info("Downloading %s (unknown size)", url)
 
     c = arvados.collection.Collection()
 
     if req.headers.get("Content-Disposition"):
         grp = re.search(r'filename=("((\"|[^"])+)"|([^][()<>@,;:\"/?={} ]+))', req.headers["Content-Disposition"])
-        if grp.groups(2):
-            name = grp.groups(2)
+        if grp.group(2):
+            name = grp.group(2)
         else:
-            name = grp.groups(3)
+            name = grp.group(4)
     else:
         name = urlparse.urlparse(url).path.split("/")[-1]
 
@@ -113,14 +122,17 @@ def http_to_keep(api, project_uuid, url):
         for chunk in req.iter_content(chunk_size=1024):
             count += len(chunk)
             f.write(chunk)
-            now = time.time()
-            if (now - checkpoint) > 20:
-                bps = (float(count)/float(now - start))
-                logger.info("%2.1f%% complete, %3.2f MiB/s, %1.0f seconds left",
-                            float(count * 100) / float(properties[url]["Content-Length"]),
-                            bps/(1024*1024),
-                            (int(properties[url]["Content-Length"])-count)/bps)
-                checkpoint = now
+            loopnow = time.time()
+            if (loopnow - checkpoint) > 20:
+                bps = (float(count)/float(loopnow - start))
+                if cl is not None:
+                    logger.info("%2.1f%% complete, %3.2f MiB/s, %1.0f seconds left",
+                                float(count * 100) / float(cl),
+                                bps/(1024*1024),
+                                (cl-count)/bps)
+                else:
+                    logger.info("%d downloaded, %3.2f MiB/s", count, bps/(1024*1024))
+                checkpoint = loopnow
 
     c.save_new(name="Downloaded from %s" % url, owner_uuid=project_uuid, ensure_unique_name=True)
 
diff --git a/sdk/cwl/tests/test_http.py b/sdk/cwl/tests/test_http.py
new file mode 100644
index 0000000..0c66c39
--- /dev/null
+++ b/sdk/cwl/tests/test_http.py
@@ -0,0 +1,286 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import copy
+import cStringIO
+import functools
+import hashlib
+import json
+import logging
+import mock
+import sys
+import unittest
+import datetime
+
+import arvados
+import arvados.collection
+import arvados_cwl
+import arvados_cwl.runner
+import arvados.keep
+
+from .matcher import JsonDiffMatcher, StripYAMLComments
+from .mock_discovery import get_rootDesc
+
+import arvados_cwl.http
+
+import ruamel.yaml as yaml
+
+
+class TestHttpToKeep(unittest.TestCase):
+
+    @mock.patch("requests.get")
+    @mock.patch("arvados.collection.Collection")
+    def test_http_get(self, collectionmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": []
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {}
+        req.iter_content.return_value = ["abc"]
+        getmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 15)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True)
+
+        cm.open.assert_called_with("file1.txt", "w")
+        cm.save_new.assert_called_with(name="Downloaded from http://example.com/file1.txt",
+                                       owner_uuid=None, ensure_unique_name=True)
+
+        api.collections().update.assert_has_calls([
+            mock.call(uuid=cm.manifest_locator(),
+                      body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Tue, 15 May 2018 00:00:00 GMT'}}}})
+        ])
+
+
+    @mock.patch("requests.get")
+    @mock.patch("arvados.collection.CollectionReader")
+    def test_http_expires(self, collectionmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": [{
+                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
+                "portable_data_hash": "99999999999999999999999999999998+99",
+                "properties": {
+                    'http://example.com/file1.txt': {
+                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
+                        'Expires': 'Tue, 17 May 2018 00:00:00 GMT'
+                    }
+                }
+            }]
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        cm.keys.return_value = ["file1.txt"]
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {}
+        req.iter_content.return_value = ["abc"]
+        getmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 16)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_not_called()
+
+
+    @mock.patch("requests.get")
+    @mock.patch("arvados.collection.CollectionReader")
+    def test_http_cache_control(self, collectionmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": [{
+                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
+                "portable_data_hash": "99999999999999999999999999999998+99",
+                "properties": {
+                    'http://example.com/file1.txt': {
+                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
+                        'Cache-Control': 'max-age=172800'
+                    }
+                }
+            }]
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        cm.keys.return_value = ["file1.txt"]
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {}
+        req.iter_content.return_value = ["abc"]
+        getmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 16)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_not_called()
+
+
+    @mock.patch("requests.get")
+    @mock.patch("requests.head")
+    @mock.patch("arvados.collection.Collection")
+    def test_http_expired(self, collectionmock, headmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": [{
+                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
+                "portable_data_hash": "99999999999999999999999999999998+99",
+                "properties": {
+                    'http://example.com/file1.txt': {
+                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
+                        'Expires': 'Tue, 16 May 2018 00:00:00 GMT'
+                    }
+                }
+            }]
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz4"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999997+99"
+        cm.keys.return_value = ["file1.txt"]
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {'Date': 'Tue, 17 May 2018 00:00:00 GMT'}
+        req.iter_content.return_value = ["def"]
+        getmock.return_value = req
+        headmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 17)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999997+99/file1.txt")
+
+        getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True)
+
+        cm.open.assert_called_with("file1.txt", "w")
+        cm.save_new.assert_called_with(name="Downloaded from http://example.com/file1.txt",
+                                       owner_uuid=None, ensure_unique_name=True)
+
+        api.collections().update.assert_has_calls([
+            mock.call(uuid=cm.manifest_locator(),
+                      body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Tue, 17 May 2018 00:00:00 GMT'}}}})
+        ])
+
+
+    @mock.patch("requests.get")
+    @mock.patch("requests.head")
+    @mock.patch("arvados.collection.CollectionReader")
+    def test_http_etag(self, collectionmock, headmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": [{
+                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
+                "portable_data_hash": "99999999999999999999999999999998+99",
+                "properties": {
+                    'http://example.com/file1.txt': {
+                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
+                        'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
+                        'ETag': '123456'
+                    }
+                }
+            }]
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        cm.keys.return_value = ["file1.txt"]
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {
+            'Date': 'Tue, 17 May 2018 00:00:00 GMT',
+            'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
+            'ETag': '123456'
+        }
+        headmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 17)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_not_called()
+        cm.open.assert_not_called()
+
+        api.collections().update.assert_has_calls([
+            mock.call(uuid=cm.manifest_locator(),
+                      body={"collection":{"properties": {'http://example.com/file1.txt': {
+                          'Date': 'Tue, 17 May 2018 00:00:00 GMT',
+                          'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
+                          'ETag': '123456'
+                      }}}})
+                      ])
+
+    @mock.patch("requests.get")
+    @mock.patch("arvados.collection.Collection")
+    def test_http_content_disp(self, collectionmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": []
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {"Content-Disposition": "attachment; filename=file1.txt"}
+        req.iter_content.return_value = ["abc"]
+        getmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 15)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/download?fn=/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_called_with("http://example.com/download?fn=/file1.txt", stream=True, allow_redirects=True)
+
+        cm.open.assert_called_with("file1.txt", "w")
+        cm.save_new.assert_called_with(name="Downloaded from http://example.com/download?fn=/file1.txt",
+                                       owner_uuid=None, ensure_unique_name=True)
+
+        api.collections().update.assert_has_calls([
+            mock.call(uuid=cm.manifest_locator(),
+                      body={"collection":{"properties": {"http://example.com/download?fn=/file1.txt": {'Date': 'Tue, 15 May 2018 00:00:00 GMT'}}}})
+        ])

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list