[arvados] created: 2.6.0-28-g8f737e306
git repository hosting
git at public.arvados.org
Wed Apr 19 18:26:58 UTC 2023
at 8f737e30667628842cb8f85b1b2d7851536099f8 (commit)
commit 8f737e30667628842cb8f85b1b2d7851536099f8
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Wed Apr 19 10:07:05 2023 -0400
20257: Refactor http-to-keep downloader
* Moved into the SDK, so that other programs besides cwl-runner can use
it
* Migrated to pyCurl
* Updated tests
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/sdk/cwl/arvados_cwl/http.py b/sdk/cwl/arvados_cwl/http.py
deleted file mode 100644
index f2415bcff..000000000
--- a/sdk/cwl/arvados_cwl/http.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import division
-from future import standard_library
-standard_library.install_aliases()
-
-import requests
-import email.utils
-import time
-import datetime
-import re
-import arvados
-import arvados.collection
-import urllib.parse
-import logging
-import calendar
-import urllib.parse
-
-logger = logging.getLogger('arvados.cwl-runner')
-
-def my_formatdate(dt):
- return email.utils.formatdate(timeval=calendar.timegm(dt.timetuple()),
- localtime=False, usegmt=True)
-
-def my_parsedate(text):
- parsed = email.utils.parsedate_tz(text)
- if parsed:
- if parsed[9]:
- # Adjust to UTC
- return datetime.datetime(*parsed[:6]) + datetime.timedelta(seconds=parsed[9])
- else:
- # TZ is zero or missing, assume UTC.
- return datetime.datetime(*parsed[:6])
- else:
- return datetime.datetime(1970, 1, 1)
-
-def fresh_cache(url, properties, now):
- pr = properties[url]
- expires = None
-
- logger.debug("Checking cache freshness for %s using %s", url, pr)
-
- if "Cache-Control" in pr:
- if re.match(r"immutable", pr["Cache-Control"]):
- return True
-
- g = re.match(r"(s-maxage|max-age)=(\d+)", pr["Cache-Control"])
- if g:
- expires = my_parsedate(pr["Date"]) + datetime.timedelta(seconds=int(g.group(2)))
-
- if expires is None and "Expires" in pr:
- expires = my_parsedate(pr["Expires"])
-
- if expires is None:
- # Use a default cache time of 24 hours if upstream didn't set
- # any cache headers, to reduce redundant downloads.
- expires = my_parsedate(pr["Date"]) + datetime.timedelta(hours=24)
-
- if not expires:
- return False
-
- return (now < expires)
-
-def remember_headers(url, properties, headers, now):
- properties.setdefault(url, {})
- for h in ("Cache-Control", "ETag", "Expires", "Date", "Content-Length"):
- if h in headers:
- properties[url][h] = headers[h]
- if "Date" not in headers:
- properties[url]["Date"] = my_formatdate(now)
-
-
-def changed(url, clean_url, properties, now):
- req = requests.head(url, allow_redirects=True)
-
- if req.status_code != 200:
- # Sometimes endpoints are misconfigured and will deny HEAD but
- # allow GET so instead of failing here, we'll try GET If-None-Match
- return True
-
- etag = properties[url].get("ETag")
-
- if url in properties:
- del properties[url]
- remember_headers(clean_url, properties, req.headers, now)
-
- if "ETag" in req.headers and etag == req.headers["ETag"]:
- # Didn't change
- return False
-
- return True
-
-def etag_quote(etag):
- # if it already has leading and trailing quotes, do nothing
- if etag[0] == '"' and etag[-1] == '"':
- return etag
- else:
- # Add quotes.
- return '"' + etag + '"'
-
-
-def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow, varying_url_params="", prefer_cached_downloads=False):
- varying_params = [s.strip() for s in varying_url_params.split(",")]
-
- parsed = urllib.parse.urlparse(url)
- query = [q for q in urllib.parse.parse_qsl(parsed.query)
- if q[0] not in varying_params]
-
- clean_url = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params,
- urllib.parse.urlencode(query, safe="/"), parsed.fragment))
-
- r1 = api.collections().list(filters=[["properties", "exists", url]]).execute()
-
- if clean_url == url:
- items = r1["items"]
- else:
- r2 = api.collections().list(filters=[["properties", "exists", clean_url]]).execute()
- items = r1["items"] + r2["items"]
-
- now = utcnow()
-
- etags = {}
-
- for item in items:
- properties = item["properties"]
-
- if clean_url in properties:
- cache_url = clean_url
- elif url in properties:
- cache_url = url
- else:
- return False
-
- if prefer_cached_downloads or fresh_cache(cache_url, properties, now):
- # HTTP caching rules say we should use the cache
- cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
- return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
-
- if not changed(cache_url, clean_url, properties, now):
- # ETag didn't change, same content, just update headers
- api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
- cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
- return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
-
- if "ETag" in properties[cache_url] and len(properties[cache_url]["ETag"]) > 2:
- etags[properties[cache_url]["ETag"]] = item
-
- logger.debug("Found ETags %s", etags)
-
- properties = {}
- headers = {}
- if etags:
- headers['If-None-Match'] = ', '.join([etag_quote(k) for k,v in etags.items()])
- logger.debug("Sending GET request with headers %s", headers)
- req = requests.get(url, stream=True, allow_redirects=True, headers=headers)
-
- if req.status_code not in (200, 304):
- raise Exception("Failed to download '%s' got status %s " % (url, req.status_code))
-
- remember_headers(clean_url, properties, req.headers, now)
-
- if req.status_code == 304 and "ETag" in req.headers and req.headers["ETag"] in etags:
- item = etags[req.headers["ETag"]]
- item["properties"].update(properties)
- api.collections().update(uuid=item["uuid"], body={"collection":{"properties": item["properties"]}}).execute()
- cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
- return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
-
- if "Content-Length" in properties[clean_url]:
- cl = int(properties[clean_url]["Content-Length"])
- logger.info("Downloading %s (%s bytes)", url, cl)
- else:
- cl = None
- logger.info("Downloading %s (unknown size)", url)
-
- c = arvados.collection.Collection()
-
- if req.headers.get("Content-Disposition"):
- grp = re.search(r'filename=("((\"|[^"])+)"|([^][()<>@,;:\"/?={} ]+))', req.headers["Content-Disposition"])
- if grp.group(2):
- name = grp.group(2)
- else:
- name = grp.group(4)
- else:
- name = parsed.path.split("/")[-1]
-
- count = 0
- start = time.time()
- checkpoint = start
- with c.open(name, "wb") as f:
- for chunk in req.iter_content(chunk_size=1024):
- count += len(chunk)
- f.write(chunk)
- loopnow = time.time()
- if (loopnow - checkpoint) > 20:
- bps = count / (loopnow - start)
- if cl is not None:
- logger.info("%2.1f%% complete, %3.2f MiB/s, %1.0f seconds left",
- ((count * 100) / cl),
- (bps // (1024*1024)),
- ((cl-count) // bps))
- else:
- logger.info("%d downloaded, %3.2f MiB/s", count, (bps / (1024*1024)))
- checkpoint = loopnow
-
- logger.info("Download complete")
-
- collectionname = "Downloaded from %s" % urllib.parse.quote(clean_url, safe='')
-
- # max length - space to add a timestamp used by ensure_unique_name
- max_name_len = 254 - 28
-
- if len(collectionname) > max_name_len:
- over = len(collectionname) - max_name_len
- split = int(max_name_len/2)
- collectionname = collectionname[0:split] + "…" + collectionname[split+over:]
-
- c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True)
-
- api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()
-
- return "keep:%s/%s" % (c.portable_data_hash(), name)
diff --git a/sdk/cwl/arvados_cwl/pathmapper.py b/sdk/cwl/arvados_cwl/pathmapper.py
index e2e287bf1..62a59a8cf 100644
--- a/sdk/cwl/arvados_cwl/pathmapper.py
+++ b/sdk/cwl/arvados_cwl/pathmapper.py
@@ -26,7 +26,7 @@ from cwltool.utils import adjustFileObjs, adjustDirObjs
from cwltool.stdfsaccess import abspath
from cwltool.workflow import WorkflowException
-from .http import http_to_keep
+from arvados.http_import import http_to_keep
logger = logging.getLogger('arvados.cwl-runner')
diff --git a/sdk/python/arvados/http_import.py b/sdk/python/arvados/http_import.py
new file mode 100644
index 000000000..d45eb7b63
--- /dev/null
+++ b/sdk/python/arvados/http_import.py
@@ -0,0 +1,326 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import division
+from future import standard_library
+standard_library.install_aliases()
+
+import email.utils
+import time
+import datetime
+import re
+import arvados
+import arvados.collection
+import arvados.errors
+import arvados.util
+import urllib.parse
+import logging
+import calendar
+import pycurl
+from arvados.pycurl import PyCurlHelper
+
+logger = logging.getLogger('arvados.http_import')
+
+def my_formatdate(dt):
+ return email.utils.formatdate(timeval=calendar.timegm(dt.timetuple()),
+ localtime=False, usegmt=True)
+
+def my_parsedate(text):
+ parsed = email.utils.parsedate_tz(text)
+ if parsed:
+ if parsed[9]:
+ # Adjust to UTC
+ return datetime.datetime(*parsed[:6]) + datetime.timedelta(seconds=parsed[9])
+ else:
+ # TZ is zero or missing, assume UTC.
+ return datetime.datetime(*parsed[:6])
+ else:
+ return datetime.datetime(1970, 1, 1)
+
+def fresh_cache(url, properties, now):
+ pr = properties[url]
+ expires = None
+
+ logger.debug("Checking cache freshness for %s using %s", url, pr)
+
+ if "Cache-Control" in pr:
+ if re.match(r"immutable", pr["Cache-Control"]):
+ return True
+
+ g = re.match(r"(s-maxage|max-age)=(\d+)", pr["Cache-Control"])
+ if g:
+ expires = my_parsedate(pr["Date"]) + datetime.timedelta(seconds=int(g.group(2)))
+
+ if expires is None and "Expires" in pr:
+ expires = my_parsedate(pr["Expires"])
+
+ if expires is None:
+ # Use a default cache time of 24 hours if upstream didn't set
+ # any cache headers, to reduce redundant downloads.
+ expires = my_parsedate(pr["Date"]) + datetime.timedelta(hours=24)
+
+ if not expires:
+ return False
+
+ return (now < expires)
+
+def remember_headers(url, properties, headers, now):
+ properties.setdefault(url, {})
+ for h in ("Cache-Control", "Etag", "Expires", "Date", "Content-Length"):
+ if h in headers:
+ properties[url][h] = headers[h]
+ if "Date" not in headers:
+ properties[url]["Date"] = my_formatdate(now)
+
+class Response:
+ def __init__(self, status_code, headers):
+ self.status_code = status_code
+ self.headers = headers
+
+class CurlDownloader(PyCurlHelper):
+ # Wait up to 60 seconds for connection
+ # How long it can be in "low bandwidth" state before it gives up
+ # Low bandwidth threshold is 32 KiB/s
+ DOWNLOADER_TIMEOUT = (60, 300, 32768)
+
+ def __init__(self):
+ super(CurlDownloader, self).__init__(title_case_headers=True)
+ self.curl = pycurl.Curl()
+ self.curl.setopt(pycurl.NOSIGNAL, 1)
+ self.curl.setopt(pycurl.OPENSOCKETFUNCTION,
+ lambda *args, **kwargs: self._socket_open(*args, **kwargs))
+ self.target = None
+
+ def head(self, url, headers={}):
+ get_headers = {'Accept': 'application/octet-stream'}
+ get_headers.update(headers)
+ self._headers = {}
+
+ self.curl.setopt(pycurl.URL, url.encode('utf-8'))
+ self.curl.setopt(pycurl.HTTPHEADER, [
+ '{}: {}'.format(k,v) for k,v in get_headers.items()])
+
+ self.curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
+ self.curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
+ self.curl.setopt(pycurl.NOBODY, True)
+ self.curl.setopt(pycurl.FOLLOWLOCATION, True)
+
+ self._setcurltimeouts(self.curl, self.DOWNLOADER_TIMEOUT, True)
+
+ try:
+ self.curl.perform()
+ except Exception as e:
+ raise arvados.errors.HttpError(0, str(e))
+ finally:
+ if self._socket:
+ self._socket.close()
+ self._socket = None
+
+ return Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
+
+ def download(self, url, headers):
+ self.count = 0
+ self.start = time.time()
+ self.checkpoint = self.start
+ self._headers = {}
+ self._first_chunk = True
+ self.collection = None
+ self.parsedurl = urllib.parse.urlparse(url)
+
+ get_headers = {'Accept': 'application/octet-stream'}
+ get_headers.update(headers)
+
+ self.curl.setopt(pycurl.URL, url.encode('utf-8'))
+ self.curl.setopt(pycurl.HTTPHEADER, [
+ '{}: {}'.format(k,v) for k,v in get_headers.items()])
+
+ self.curl.setopt(pycurl.WRITEFUNCTION, self.body_write)
+ self.curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
+
+ self.curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
+ self.curl.setopt(pycurl.HTTPGET, True)
+ self.curl.setopt(pycurl.FOLLOWLOCATION, True)
+
+ self._setcurltimeouts(self.curl, self.DOWNLOADER_TIMEOUT, False)
+
+ try:
+ self.curl.perform()
+ except Exception as e:
+ raise arvados.errors.HttpError(0, str(e))
+ finally:
+ if self._socket:
+ self._socket.close()
+ self._socket = None
+
+ return Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)
+
+ def headers_received(self):
+ self.collection = arvados.collection.Collection()
+
+ if "Content-Length" in self._headers:
+ self.contentlength = int(self._headers["Content-Length"])
+ logger.info("File size is %s bytes", self.contentlength)
+ else:
+ self.contentlength = None
+
+ if self._headers.get("Content-Disposition"):
+ grp = re.search(r'filename=("((\"|[^"])+)"|([^][()<>@,;:\"/?={} ]+))',
+ self._headers["Content-Disposition"])
+ if grp.group(2):
+ self.name = grp.group(2)
+ else:
+ self.name = grp.group(4)
+ else:
+ self.name = self.parsedurl.path.split("/")[-1]
+
+ if self.curl.getinfo(pycurl.RESPONSE_CODE) == 200:
+ self.target = self.collection.open(self.name, "wb")
+
+ def body_write(self, chunk):
+ if self._first_chunk:
+ self.headers_received()
+ self._first_chunk = False
+
+ self.count += len(chunk)
+ self.target.write(chunk)
+ loopnow = time.time()
+ if (loopnow - self.checkpoint) < 20:
+ return
+
+ bps = self.count / (loopnow - self.start)
+ if self.contentlength is not None:
+ logger.info("%2.1f%% complete, %6.2f MiB/s, %1.0f seconds left",
+ ((self.count * 100) / self.contentlength),
+ (bps / (1024.0*1024.0)),
+ ((self.contentlength-self.count) // bps))
+ else:
+ logger.info("%d downloaded, %6.2f MiB/s", self.count, (bps / (1024.0*1024.0)))
+ self.checkpoint = loopnow
+
+
+def changed(url, clean_url, properties, now, curldownloader):
+ req = curldownloader.head(url)
+
+ if req.status_code != 200:
+ # Sometimes endpoints are misconfigured and will deny HEAD but
+ # allow GET so instead of failing here, we'll try GET If-None-Match
+ return True
+
+ # previous version of this code used "ETag", now we are
+ # normalizing to "Etag", check for both.
+ etag = properties[url].get("Etag") or properties[url].get("ETag")
+
+ if url in properties:
+ del properties[url]
+ remember_headers(clean_url, properties, req.headers, now)
+
+ if "Etag" in req.headers and etag == req.headers["Etag"]:
+ # Didn't change
+ return False
+
+ return True
+
+def etag_quote(etag):
+ # if it already has leading and trailing quotes, do nothing
+ if etag[0] == '"' and etag[-1] == '"':
+ return etag
+ else:
+ # Add quotes.
+ return '"' + etag + '"'
+
+
+def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow, varying_url_params="", prefer_cached_downloads=False):
+ varying_params = [s.strip() for s in varying_url_params.split(",")]
+
+ parsed = urllib.parse.urlparse(url)
+ query = [q for q in urllib.parse.parse_qsl(parsed.query)
+ if q[0] not in varying_params]
+
+ clean_url = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params,
+ urllib.parse.urlencode(query, safe="/"), parsed.fragment))
+
+ r1 = api.collections().list(filters=[["properties", "exists", url]]).execute()
+
+ if clean_url == url:
+ items = r1["items"]
+ else:
+ r2 = api.collections().list(filters=[["properties", "exists", clean_url]]).execute()
+ items = r1["items"] + r2["items"]
+
+ now = utcnow()
+
+ etags = {}
+
+ curldownloader = CurlDownloader()
+
+ for item in items:
+ properties = item["properties"]
+
+ if clean_url in properties:
+ cache_url = clean_url
+ elif url in properties:
+ cache_url = url
+ else:
+ return False
+
+ if prefer_cached_downloads or fresh_cache(cache_url, properties, now):
+ # HTTP caching rules say we should use the cache
+ cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
+ return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+
+ if not changed(cache_url, clean_url, properties, now, curldownloader):
+ # Etag didn't change, same content, just update headers
+ api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
+ cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
+ return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+
+ if "Etag" in properties[cache_url] and len(properties[cache_url]["Etag"]) > 2:
+ etags[properties[cache_url]["Etag"]] = item
+
+ logger.debug("Found Etags %s", etags)
+
+ properties = {}
+ headers = {}
+ if etags:
+ headers['If-None-Match'] = ', '.join([etag_quote(k) for k,v in etags.items()])
+ logger.debug("Sending GET request with headers %s", headers)
+
+ logger.info("Beginning download of %s", url)
+
+ req = curldownloader.download(url, headers)
+
+ c = curldownloader.collection
+
+ if req.status_code not in (200, 304):
+ raise Exception("Failed to download '%s' got status %s " % (url, req.status_code))
+
+ if curldownloader.target is not None:
+ curldownloader.target.close()
+
+ remember_headers(clean_url, properties, req.headers, now)
+
+ if req.status_code == 304 and "Etag" in req.headers and req.headers["Etag"] in etags:
+ item = etags[req.headers["Etag"]]
+ item["properties"].update(properties)
+ api.collections().update(uuid=item["uuid"], body={"collection":{"properties": item["properties"]}}).execute()
+ cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
+ return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+
+ logger.info("Download complete")
+
+ collectionname = "Downloaded from %s" % urllib.parse.quote(clean_url, safe='')
+
+ # max length - space to add a timestamp used by ensure_unique_name
+ max_name_len = 254 - 28
+
+ if len(collectionname) > max_name_len:
+ over = len(collectionname) - max_name_len
+ split = int(max_name_len/2)
+ collectionname = collectionname[0:split] + "…" + collectionname[split+over:]
+
+ c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True)
+
+ api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()
+
+ return "keep:%s/%s" % (c.portable_data_hash(), curldownloader.name)
diff --git a/sdk/python/arvados/keep.py b/sdk/python/arvados/keep.py
index cbe96ffa2..c80111efe 100644
--- a/sdk/python/arvados/keep.py
+++ b/sdk/python/arvados/keep.py
@@ -44,6 +44,7 @@ import arvados.errors
import arvados.retry as retry
import arvados.util
import arvados.diskcache
+from arvados.pycurl import PyCurlHelper
_logger = logging.getLogger('arvados.keep')
global_client_object = None
@@ -405,18 +406,10 @@ class Counter(object):
class KeepClient(object):
+ DEFAULT_TIMEOUT = PyCurlHelper.DEFAULT_TIMEOUT
+ DEFAULT_PROXY_TIMEOUT = PyCurlHelper.DEFAULT_PROXY_TIMEOUT
- # Default Keep server connection timeout: 2 seconds
- # Default Keep server read timeout: 256 seconds
- # Default Keep server bandwidth minimum: 32768 bytes per second
- # Default Keep proxy connection timeout: 20 seconds
- # Default Keep proxy read timeout: 256 seconds
- # Default Keep proxy bandwidth minimum: 32768 bytes per second
- DEFAULT_TIMEOUT = (2, 256, 32768)
- DEFAULT_PROXY_TIMEOUT = (20, 256, 32768)
-
-
- class KeepService(object):
+ class KeepService(PyCurlHelper):
"""Make requests to a single Keep service, and track results.
A KeepService is intended to last long enough to perform one
@@ -439,6 +432,7 @@ class KeepClient(object):
download_counter=None,
headers={},
insecure=False):
+ super(KeepClient.KeepService, self).__init__()
self.root = root
self._user_agent_pool = user_agent_pool
self._result = {'error': None}
@@ -476,30 +470,6 @@ class KeepClient(object):
except:
ua.close()
- def _socket_open(self, *args, **kwargs):
- if len(args) + len(kwargs) == 2:
- return self._socket_open_pycurl_7_21_5(*args, **kwargs)
- else:
- return self._socket_open_pycurl_7_19_3(*args, **kwargs)
-
- def _socket_open_pycurl_7_19_3(self, family, socktype, protocol, address=None):
- return self._socket_open_pycurl_7_21_5(
- purpose=None,
- address=collections.namedtuple(
- 'Address', ['family', 'socktype', 'protocol', 'addr'],
- )(family, socktype, protocol, address))
-
- def _socket_open_pycurl_7_21_5(self, purpose, address):
- """Because pycurl doesn't have CURLOPT_TCP_KEEPALIVE"""
- s = socket.socket(address.family, address.socktype, address.protocol)
- s.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
- # Will throw invalid protocol error on mac. This test prevents that.
- if hasattr(socket, 'TCP_KEEPIDLE'):
- s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, 75)
- s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 75)
- self._socket = s
- return s
-
def get(self, locator, method="GET", timeout=None):
# locator is a KeepLocator object.
url = self.root + str(locator)
@@ -525,6 +495,8 @@ class KeepClient(object):
curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
if method == "HEAD":
curl.setopt(pycurl.NOBODY, True)
+ else:
+ curl.setopt(pycurl.HTTPGET, True)
self._setcurltimeouts(curl, timeout, method=="HEAD")
try:
@@ -669,43 +641,6 @@ class KeepClient(object):
self.upload_counter.add(len(body))
return True
- def _setcurltimeouts(self, curl, timeouts, ignore_bandwidth=False):
- if not timeouts:
- return
- elif isinstance(timeouts, tuple):
- if len(timeouts) == 2:
- conn_t, xfer_t = timeouts
- bandwidth_bps = KeepClient.DEFAULT_TIMEOUT[2]
- else:
- conn_t, xfer_t, bandwidth_bps = timeouts
- else:
- conn_t, xfer_t = (timeouts, timeouts)
- bandwidth_bps = KeepClient.DEFAULT_TIMEOUT[2]
- curl.setopt(pycurl.CONNECTTIMEOUT_MS, int(conn_t*1000))
- if not ignore_bandwidth:
- curl.setopt(pycurl.LOW_SPEED_TIME, int(math.ceil(xfer_t)))
- curl.setopt(pycurl.LOW_SPEED_LIMIT, int(math.ceil(bandwidth_bps)))
-
- def _headerfunction(self, header_line):
- if isinstance(header_line, bytes):
- header_line = header_line.decode('iso-8859-1')
- if ':' in header_line:
- name, value = header_line.split(':', 1)
- name = name.strip().lower()
- value = value.strip()
- elif self._headers:
- name = self._lastheadername
- value = self._headers[name] + ' ' + header_line.strip()
- elif header_line.startswith('HTTP/'):
- name = 'x-status-line'
- value = header_line
- else:
- _logger.error("Unexpected header line: %s", header_line)
- return
- self._lastheadername = name
- self._headers[name] = value
- # Returning None implies all bytes were written
-
class KeepWriterQueue(queue.Queue):
def __init__(self, copies, classes=[]):
diff --git a/sdk/python/arvados/pycurl.py b/sdk/python/arvados/pycurl.py
new file mode 100644
index 000000000..e1153ad9e
--- /dev/null
+++ b/sdk/python/arvados/pycurl.py
@@ -0,0 +1,88 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import collections
+import logging
+import math
+import pycurl
+import socket
+
+_logger = logging.getLogger('arvados.pycurl')
+
+class PyCurlHelper:
+ # Default Keep server connection timeout: 2 seconds
+ # Default Keep server read timeout: 256 seconds
+ # Default Keep server bandwidth minimum: 32768 bytes per second
+ # Default Keep proxy connection timeout: 20 seconds
+ # Default Keep proxy read timeout: 256 seconds
+ # Default Keep proxy bandwidth minimum: 32768 bytes per second
+ DEFAULT_TIMEOUT = (2, 256, 32768)
+ DEFAULT_PROXY_TIMEOUT = (20, 256, 32768)
+
+ def __init__(self, title_case_headers=False):
+ self._socket = None
+ self.title_case_headers = title_case_headers
+
+ def _socket_open(self, *args, **kwargs):
+ if len(args) + len(kwargs) == 2:
+ return self._socket_open_pycurl_7_21_5(*args, **kwargs)
+ else:
+ return self._socket_open_pycurl_7_19_3(*args, **kwargs)
+
+ def _socket_open_pycurl_7_19_3(self, family, socktype, protocol, address=None):
+ return self._socket_open_pycurl_7_21_5(
+ purpose=None,
+ address=collections.namedtuple(
+ 'Address', ['family', 'socktype', 'protocol', 'addr'],
+ )(family, socktype, protocol, address))
+
+ def _socket_open_pycurl_7_21_5(self, purpose, address):
+ """Because pycurl doesn't have CURLOPT_TCP_KEEPALIVE"""
+ s = socket.socket(address.family, address.socktype, address.protocol)
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
+ # Will throw invalid protocol error on mac. This test prevents that.
+ if hasattr(socket, 'TCP_KEEPIDLE'):
+ s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, 75)
+ s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 75)
+ self._socket = s
+ return s
+
+ def _setcurltimeouts(self, curl, timeouts, ignore_bandwidth=False):
+ if not timeouts:
+ return
+ elif isinstance(timeouts, tuple):
+ if len(timeouts) == 2:
+ conn_t, xfer_t = timeouts
+ bandwidth_bps = self.DEFAULT_TIMEOUT[2]
+ else:
+ conn_t, xfer_t, bandwidth_bps = timeouts
+ else:
+ conn_t, xfer_t = (timeouts, timeouts)
+ bandwidth_bps = self.DEFAULT_TIMEOUT[2]
+ curl.setopt(pycurl.CONNECTTIMEOUT_MS, int(conn_t*1000))
+ if not ignore_bandwidth:
+ curl.setopt(pycurl.LOW_SPEED_TIME, int(math.ceil(xfer_t)))
+ curl.setopt(pycurl.LOW_SPEED_LIMIT, int(math.ceil(bandwidth_bps)))
+
+ def _headerfunction(self, header_line):
+ if isinstance(header_line, bytes):
+ header_line = header_line.decode('iso-8859-1')
+ if ':' in header_line:
+ name, value = header_line.split(':', 1)
+ if self.title_case_headers:
+ name = name.strip().title()
+ else:
+ name = name.strip().lower()
+ value = value.strip()
+ elif self._headers:
+ name = self._lastheadername
+ value = self._headers[name] + ' ' + header_line.strip()
+ elif header_line.startswith('HTTP/'):
+ name = 'x-status-line'
+ value = header_line
+ else:
+ _logger.error("Unexpected header line: %s", header_line)
+ return
+ self._lastheadername = name
+ self._headers[name] = value
+ # Returning None implies all bytes were written
diff --git a/sdk/cwl/tests/test_http.py b/sdk/python/tests/test_http.py
similarity index 67%
rename from sdk/cwl/tests/test_http.py
rename to sdk/python/tests/test_http.py
index 5598b1f13..44d0d2599 100644
--- a/sdk/cwl/tests/test_http.py
+++ b/sdk/python/tests/test_http.py
@@ -18,23 +18,59 @@ import datetime
import arvados
import arvados.collection
-import arvados_cwl
-import arvados_cwl.runner
import arvados.keep
+import pycurl
-from .matcher import JsonDiffMatcher, StripYAMLComments
-from .mock_discovery import get_rootDesc
-
-import arvados_cwl.http
+from arvados.http_import import http_to_keep
import ruamel.yaml as yaml
+# Turns out there was already "FakeCurl" that serves the same purpose, but
+# I wrote this before I knew that. Whoops.
+class CurlMock:
+ def __init__(self, headers = {}):
+ self.perform_was_called = False
+ self.headers = headers
+ self.get_response = 200
+ self.head_response = 200
+ self.req_headers = []
+
+ def setopt(self, op, *args):
+ if op == pycurl.URL:
+ self.url = args[0]
+ if op == pycurl.WRITEFUNCTION:
+ self.writefn = args[0]
+ if op == pycurl.HEADERFUNCTION:
+ self.headerfn = args[0]
+ if op == pycurl.NOBODY:
+ self.head = True
+ if op == pycurl.HTTPGET:
+ self.head = False
+ if op == pycurl.HTTPHEADER:
+ self.req_headers = args[0]
+
+ def getinfo(self, op):
+ if op == pycurl.RESPONSE_CODE:
+ if self.head:
+ return self.head_response
+ else:
+ return self.get_response
+
+ def perform(self):
+ self.perform_was_called = True
+
+ for k,v in self.headers.items():
+ self.headerfn("%s: %s" % (k,v))
+
+ if not self.head and self.get_response == 200:
+ self.writefn(self.chunk)
+
class TestHttpToKeep(unittest.TestCase):
- @mock.patch("requests.get")
+ @mock.patch("pycurl.Curl")
@mock.patch("arvados.collection.Collection")
- def test_http_get(self, collectionmock, getmock):
+ def test_http_get(self, collectionmock, curlmock):
api = mock.MagicMock()
api.collections().list().execute.return_value = {
@@ -46,19 +82,20 @@ class TestHttpToKeep(unittest.TestCase):
cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
collectionmock.return_value = cm
- req = mock.MagicMock()
- req.status_code = 200
- req.headers = {}
- req.iter_content.return_value = ["abc"]
- getmock.return_value = req
+ mockobj = CurlMock()
+ mockobj.chunk = b'abc'
+ def init():
+ return mockobj
+ curlmock.side_effect = init
utcnow = mock.MagicMock()
utcnow.return_value = datetime.datetime(2018, 5, 15)
- r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+ r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
- getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True, headers={})
+ assert mockobj.url == b"http://example.com/file1.txt"
+ assert mockobj.perform_was_called is True
cm.open.assert_called_with("file1.txt", "wb")
cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Ffile1.txt",
@@ -70,9 +107,9 @@ class TestHttpToKeep(unittest.TestCase):
])
- @mock.patch("requests.get")
+ @mock.patch("pycurl.Curl")
@mock.patch("arvados.collection.CollectionReader")
- def test_http_expires(self, collectionmock, getmock):
+ def test_http_expires(self, collectionmock, curlmock):
api = mock.MagicMock()
api.collections().list().execute.return_value = {
@@ -94,24 +131,24 @@ class TestHttpToKeep(unittest.TestCase):
cm.keys.return_value = ["file1.txt"]
collectionmock.return_value = cm
- req = mock.MagicMock()
- req.status_code = 200
- req.headers = {}
- req.iter_content.return_value = ["abc"]
- getmock.return_value = req
+ mockobj = CurlMock()
+ mockobj.chunk = b'abc'
+ def init():
+ return mockobj
+ curlmock.side_effect = init
utcnow = mock.MagicMock()
utcnow.return_value = datetime.datetime(2018, 5, 16)
- r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+ r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
- getmock.assert_not_called()
+ assert mockobj.perform_was_called is False
- @mock.patch("requests.get")
+ @mock.patch("pycurl.Curl")
@mock.patch("arvados.collection.CollectionReader")
- def test_http_cache_control(self, collectionmock, getmock):
+ def test_http_cache_control(self, collectionmock, curlmock):
api = mock.MagicMock()
api.collections().list().execute.return_value = {
@@ -133,25 +170,24 @@ class TestHttpToKeep(unittest.TestCase):
cm.keys.return_value = ["file1.txt"]
collectionmock.return_value = cm
- req = mock.MagicMock()
- req.status_code = 200
- req.headers = {}
- req.iter_content.return_value = ["abc"]
- getmock.return_value = req
+ mockobj = CurlMock()
+ mockobj.chunk = b'abc'
+ def init():
+ return mockobj
+ curlmock.side_effect = init
utcnow = mock.MagicMock()
utcnow.return_value = datetime.datetime(2018, 5, 16)
- r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+ r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
- getmock.assert_not_called()
+ assert mockobj.perform_was_called is False
- @mock.patch("requests.get")
- @mock.patch("requests.head")
+ @mock.patch("pycurl.Curl")
@mock.patch("arvados.collection.Collection")
- def test_http_expired(self, collectionmock, headmock, getmock):
+ def test_http_expired(self, collectionmock, curlmock):
api = mock.MagicMock()
api.collections().list().execute.return_value = {
@@ -161,7 +197,7 @@ class TestHttpToKeep(unittest.TestCase):
"properties": {
'http://example.com/file1.txt': {
'Date': 'Tue, 15 May 2018 00:00:00 GMT',
- 'Expires': 'Tue, 16 May 2018 00:00:00 GMT'
+ 'Expires': 'Wed, 16 May 2018 00:00:00 GMT'
}
}
}]
@@ -173,20 +209,20 @@ class TestHttpToKeep(unittest.TestCase):
cm.keys.return_value = ["file1.txt"]
collectionmock.return_value = cm
- req = mock.MagicMock()
- req.status_code = 200
- req.headers = {'Date': 'Tue, 17 May 2018 00:00:00 GMT'}
- req.iter_content.return_value = ["def"]
- getmock.return_value = req
- headmock.return_value = req
+ mockobj = CurlMock({'Date': 'Thu, 17 May 2018 00:00:00 GMT'})
+ mockobj.chunk = b'def'
+ def init():
+ return mockobj
+ curlmock.side_effect = init
utcnow = mock.MagicMock()
utcnow.return_value = datetime.datetime(2018, 5, 17)
- r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+ r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
self.assertEqual(r, "keep:99999999999999999999999999999997+99/file1.txt")
- getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True, headers={})
+ assert mockobj.url == b"http://example.com/file1.txt"
+ assert mockobj.perform_was_called is True
cm.open.assert_called_with("file1.txt", "wb")
cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Ffile1.txt",
@@ -194,14 +230,13 @@ class TestHttpToKeep(unittest.TestCase):
api.collections().update.assert_has_calls([
mock.call(uuid=cm.manifest_locator(),
- body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Tue, 17 May 2018 00:00:00 GMT'}}}})
+ body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Thu, 17 May 2018 00:00:00 GMT'}}}})
])
- @mock.patch("requests.get")
- @mock.patch("requests.head")
+ @mock.patch("pycurl.Curl")
@mock.patch("arvados.collection.CollectionReader")
- def test_http_etag(self, collectionmock, headmock, getmock):
+ def test_http_etag(self, collectionmock, curlmock):
api = mock.MagicMock()
api.collections().list().execute.return_value = {
@@ -211,8 +246,8 @@ class TestHttpToKeep(unittest.TestCase):
"properties": {
'http://example.com/file1.txt': {
'Date': 'Tue, 15 May 2018 00:00:00 GMT',
- 'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
+ 'Expires': 'Wed, 16 May 2018 00:00:00 GMT',
+ 'Etag': '"123456"'
}
}
}]
@@ -224,36 +259,36 @@ class TestHttpToKeep(unittest.TestCase):
cm.keys.return_value = ["file1.txt"]
collectionmock.return_value = cm
- req = mock.MagicMock()
- req.status_code = 200
- req.headers = {
- 'Date': 'Tue, 17 May 2018 00:00:00 GMT',
- 'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
- }
- headmock.return_value = req
+ mockobj = CurlMock({
+ 'Date': 'Thu, 17 May 2018 00:00:00 GMT',
+ 'Expires': 'Sat, 19 May 2018 00:00:00 GMT',
+ 'Etag': '"123456"'
+ })
+ mockobj.chunk = None
+ def init():
+ return mockobj
+ curlmock.side_effect = init
utcnow = mock.MagicMock()
utcnow.return_value = datetime.datetime(2018, 5, 17)
- r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+ r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
- getmock.assert_not_called()
cm.open.assert_not_called()
api.collections().update.assert_has_calls([
mock.call(uuid=cm.manifest_locator(),
body={"collection":{"properties": {'http://example.com/file1.txt': {
- 'Date': 'Tue, 17 May 2018 00:00:00 GMT',
- 'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
+ 'Date': 'Thu, 17 May 2018 00:00:00 GMT',
+ 'Expires': 'Sat, 19 May 2018 00:00:00 GMT',
+ 'Etag': '"123456"'
}}}})
])
- @mock.patch("requests.get")
+ @mock.patch("pycurl.Curl")
@mock.patch("arvados.collection.Collection")
- def test_http_content_disp(self, collectionmock, getmock):
+ def test_http_content_disp(self, collectionmock, curlmock):
api = mock.MagicMock()
api.collections().list().execute.return_value = {
@@ -265,19 +300,19 @@ class TestHttpToKeep(unittest.TestCase):
cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
collectionmock.return_value = cm
- req = mock.MagicMock()
- req.status_code = 200
- req.headers = {"Content-Disposition": "attachment; filename=file1.txt"}
- req.iter_content.return_value = ["abc"]
- getmock.return_value = req
+ mockobj = CurlMock({"Content-Disposition": "attachment; filename=file1.txt"})
+        mockobj.chunk = b"abc"
+ def init():
+ return mockobj
+ curlmock.side_effect = init
utcnow = mock.MagicMock()
utcnow.return_value = datetime.datetime(2018, 5, 15)
- r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/download?fn=/file1.txt", utcnow=utcnow)
+ r = http_to_keep(api, None, "http://example.com/download?fn=/file1.txt", utcnow=utcnow)
self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
- getmock.assert_called_with("http://example.com/download?fn=/file1.txt", stream=True, allow_redirects=True, headers={})
+ assert mockobj.url == b"http://example.com/download?fn=/file1.txt"
cm.open.assert_called_with("file1.txt", "wb")
cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Fdownload%3Ffn%3D%2Ffile1.txt",
@@ -288,10 +323,9 @@ class TestHttpToKeep(unittest.TestCase):
body={"collection":{"properties": {"http://example.com/download?fn=/file1.txt": {'Date': 'Tue, 15 May 2018 00:00:00 GMT'}}}})
])
- @mock.patch("requests.get")
- @mock.patch("requests.head")
+ @mock.patch("pycurl.Curl")
@mock.patch("arvados.collection.CollectionReader")
- def test_http_etag_if_none_match(self, collectionmock, headmock, getmock):
+ def test_http_etag_if_none_match(self, collectionmock, curlmock):
api = mock.MagicMock()
api.collections().list().execute.return_value = {
@@ -302,7 +336,7 @@ class TestHttpToKeep(unittest.TestCase):
'http://example.com/file1.txt': {
'Date': 'Tue, 15 May 2018 00:00:00 GMT',
'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
+ 'Etag': '"123456"'
}
}
}]
@@ -314,29 +348,26 @@ class TestHttpToKeep(unittest.TestCase):
cm.keys.return_value = ["file1.txt"]
collectionmock.return_value = cm
- # Head request fails, will try a conditional GET instead
- req = mock.MagicMock()
- req.status_code = 403
- req.headers = {
- }
- headmock.return_value = req
+ mockobj = CurlMock({
+ 'Date': 'Tue, 17 May 2018 00:00:00 GMT',
+ 'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
+ 'Etag': '"123456"'
+ })
+ mockobj.chunk = None
+ mockobj.head_response = 403
+ mockobj.get_response = 304
+ def init():
+ return mockobj
+ curlmock.side_effect = init
utcnow = mock.MagicMock()
utcnow.return_value = datetime.datetime(2018, 5, 17)
- req = mock.MagicMock()
- req.status_code = 304
- req.headers = {
- 'Date': 'Tue, 17 May 2018 00:00:00 GMT',
- 'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
- }
- getmock.return_value = req
-
- r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+ r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
- getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True, headers={"If-None-Match": '"123456"'})
+ print(mockobj.req_headers)
+ assert mockobj.req_headers == ["Accept: application/octet-stream", "If-None-Match: \"123456\""]
cm.open.assert_not_called()
api.collections().update.assert_has_calls([
@@ -344,15 +375,13 @@ class TestHttpToKeep(unittest.TestCase):
body={"collection":{"properties": {'http://example.com/file1.txt': {
'Date': 'Tue, 17 May 2018 00:00:00 GMT',
'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
+ 'Etag': '"123456"'
}}}})
])
-
- @mock.patch("requests.get")
- @mock.patch("requests.head")
+ @mock.patch("pycurl.Curl")
@mock.patch("arvados.collection.CollectionReader")
- def test_http_prefer_cached_downloads(self, collectionmock, headmock, getmock):
+ def test_http_prefer_cached_downloads(self, collectionmock, curlmock):
api = mock.MagicMock()
api.collections().list().execute.return_value = {
@@ -363,7 +392,7 @@ class TestHttpToKeep(unittest.TestCase):
'http://example.com/file1.txt': {
'Date': 'Tue, 15 May 2018 00:00:00 GMT',
'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
+ 'Etag': '"123456"'
}
}
}]
@@ -375,21 +404,24 @@ class TestHttpToKeep(unittest.TestCase):
cm.keys.return_value = ["file1.txt"]
collectionmock.return_value = cm
+ mockobj = CurlMock()
+ def init():
+ return mockobj
+ curlmock.side_effect = init
+
utcnow = mock.MagicMock()
utcnow.return_value = datetime.datetime(2018, 5, 17)
- r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow, prefer_cached_downloads=True)
+ r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow, prefer_cached_downloads=True)
self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
- headmock.assert_not_called()
- getmock.assert_not_called()
+ assert mockobj.perform_was_called is False
cm.open.assert_not_called()
api.collections().update.assert_not_called()
- @mock.patch("requests.get")
- @mock.patch("requests.head")
+ @mock.patch("pycurl.Curl")
@mock.patch("arvados.collection.CollectionReader")
- def test_http_varying_url_params(self, collectionmock, headmock, getmock):
+ def test_http_varying_url_params(self, collectionmock, curlmock):
for prurl in ("http://example.com/file1.txt", "http://example.com/file1.txt?KeyId=123&Signature=456&Expires=789"):
api = mock.MagicMock()
@@ -401,7 +433,7 @@ class TestHttpToKeep(unittest.TestCase):
prurl: {
'Date': 'Tue, 15 May 2018 00:00:00 GMT',
'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
+ 'Etag': '"123456"'
}
}
}]
@@ -413,23 +445,24 @@ class TestHttpToKeep(unittest.TestCase):
cm.keys.return_value = ["file1.txt"]
collectionmock.return_value = cm
- req = mock.MagicMock()
- req.status_code = 200
- req.headers = {
+ mockobj = CurlMock({
'Date': 'Tue, 17 May 2018 00:00:00 GMT',
'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
- }
- headmock.return_value = req
+ 'Etag': '"123456"'
+ })
+ mockobj.chunk = None
+ def init():
+ return mockobj
+ curlmock.side_effect = init
utcnow = mock.MagicMock()
utcnow.return_value = datetime.datetime(2018, 5, 17)
- r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt?KeyId=123&Signature=456&Expires=789",
+ r = http_to_keep(api, None, "http://example.com/file1.txt?KeyId=123&Signature=456&Expires=789",
utcnow=utcnow, varying_url_params="KeyId,Signature,Expires")
self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
- getmock.assert_not_called()
+ assert mockobj.perform_was_called is True
cm.open.assert_not_called()
api.collections().update.assert_has_calls([
@@ -437,6 +470,6 @@ class TestHttpToKeep(unittest.TestCase):
body={"collection":{"properties": {'http://example.com/file1.txt': {
'Date': 'Tue, 17 May 2018 00:00:00 GMT',
'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
- 'ETag': '"123456"'
+ 'Etag': '"123456"'
}}}})
])
diff --git a/services/api/Gemfile b/services/api/Gemfile
index 9b401cc6a..383469007 100644
--- a/services/api/Gemfile
+++ b/services/api/Gemfile
@@ -63,6 +63,8 @@ gem 'rails-observers'
gem 'rails-perftest'
gem 'rails-controller-testing'
+gem 'mini_portile2', '~> 2.8', '>= 2.8.1'
+
# arvados-google-api-client and googleauth depend on signet, but
# signet 0.12 is incompatible with ruby 2.3.
gem 'signet', '< 0.12'
diff --git a/services/api/Gemfile.lock b/services/api/Gemfile.lock
index bdfaaf4ef..21d0ad471 100644
--- a/services/api/Gemfile.lock
+++ b/services/api/Gemfile.lock
@@ -242,6 +242,7 @@ DEPENDENCIES
listen
lograge
logstash-event
+ mini_portile2 (~> 2.8, >= 2.8.1)
minitest (= 5.10.3)
mocha
multi_json
@@ -265,4 +266,4 @@ DEPENDENCIES
themes_for_rails!
BUNDLED WITH
- 2.2.19
+ 2.3.26
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list