[ARVADOS] created: 2b6948ff7e11cd236ea26562ed76a534dcf3c0be
git at public.curoverse.com
git at public.curoverse.com
Thu Oct 30 14:13:19 EDT 2014
at 2b6948ff7e11cd236ea26562ed76a534dcf3c0be (commit)
commit 2b6948ff7e11cd236ea26562ed76a534dcf3c0be
Merge: 8323e56 846ac5d
Author: Tom Clegg <tom at curoverse.com>
Date: Mon Oct 27 22:24:35 2014 -0400
3706: Merge branch 'master' into 3706-keep-warning
commit 8323e56e9dafb1bd4aee4f548c26854ea3eb05e9
Author: Tom Clegg <tom at curoverse.com>
Date: Mon Oct 27 22:20:22 2014 -0400
3706: Remove automatic normalization. Add --normalize option to
arv-put. Add normalize() and stripped_manifest() methods to
CollectionReader. Clean up normalize code.
diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 874c38e..5b00ea0 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -68,28 +68,6 @@ def normalize_stream(s, stream):
return stream_tokens
-def normalize(collection):
- streams = {}
- for s in collection.all_streams():
- for f in s.all_files():
- filestream = s.name() + "/" + f.name()
- r = filestream.rindex("/")
- streamname = filestream[:r]
- filename = filestream[r+1:]
- if streamname not in streams:
- streams[streamname] = {}
- if filename not in streams[streamname]:
- streams[streamname][filename] = []
- for r in f.segments:
- streams[streamname][filename].extend(s.locators_and_ranges(r[0], r[1]))
-
- normalized_streams = []
- sortedstreams = list(streams.keys())
- sortedstreams.sort()
- for s in sortedstreams:
- normalized_streams.append(normalize_stream(s, streams[s]))
- return normalized_streams
-
class CollectionBase(object):
def __enter__(self):
@@ -104,6 +82,21 @@ class CollectionBase(object):
num_retries=self.num_retries)
return self._keep_client
+ def stripped_manifest(self):
+ """
+ Return the manifest for the current collection with all
+ non-portable hints (i.e., permission signatures and other
+ hints other than size hints) removed from the locators.
+ """
+ raw = self.manifest_text()
+ clean = ''
+ for line in raw.split("\n"):
+ fields = line.split()
+ if len(fields) > 0:
+ locators = [ (re.sub(r'\+[^\d][^\+]*', '', x) if re.match(util.keep_locator_pattern, x) else x)
+ for x in fields[1:-1] ]
+ clean += fields[0] + ' ' + ' '.join(locators) + ' ' + fields[-1] + "\n"
+ return clean
class CollectionReader(CollectionBase):
def __init__(self, manifest_locator_or_text, api_client=None,
@@ -144,6 +137,29 @@ class CollectionReader(CollectionBase):
"Argument to CollectionReader must be a manifest or a collection UUID")
self._streams = None
+ def _populate_from_api_server(self):
+ # As in KeepClient itself, we must wait until the last possible
+ # moment to instantiate an API client, in order to avoid
+ # tripping up clients that don't have access to an API server.
+ # If we do build one, make sure our Keep client uses it.
+ # If instantiation fails, we'll fall back to the except clause,
+ # just like any other Collection lookup failure.
+ if self._api_client is None:
+ self._api_client = arvados.api('v1')
+ self._keep_client = None # Make a new one with the new api.
+ c = self._api_client.collections().get(
+ uuid=self._manifest_locator).execute(
+ num_retries=self.num_retries)
+ self._manifest_text = c['manifest_text']
+
+ def _populate_from_keep(self):
+ # Retrieve a manifest directly from Keep. This has a chance of
+ # working if [a] the locator includes a permission signature
+ # or [b] the Keep services are operating in world-readable
+ # mode.
+ self._manifest_text = self._my_keep().get(
+ self._manifest_locator, num_retries=self.num_retries)
+
def _populate(self):
if self._streams is not None:
return
@@ -185,14 +201,35 @@ class CollectionReader(CollectionBase):
self._streams = [sline.split()
for sline in self._manifest_text.split("\n")
if sline]
- self._streams = normalize(self)
- # now regenerate the manifest text based on the normalized stream
+ def normalize(self):
+ self._populate()
- #print "normalizing", self._manifest_text
+ # Rearrange streams
+ streams = {}
+ for s in self.all_streams():
+ for f in s.all_files():
+ filestream = s.name() + "/" + f.name()
+ r = filestream.rindex("/")
+ streamname = filestream[:r]
+ filename = filestream[r+1:]
+ if streamname not in streams:
+ streams[streamname] = {}
+ if filename not in streams[streamname]:
+ streams[streamname][filename] = []
+ for r in f.segments:
+ streams[streamname][filename].extend(s.locators_and_ranges(r[0], r[1]))
+
+ self._streams = []
+ sortedstreams = list(streams.keys())
+ sortedstreams.sort()
+ for s in sortedstreams:
+ self._streams.append(normalize_stream(s, streams[s]))
+
+ # Regenerate the manifest text based on the normalized streams
self._manifest_text = ''.join([StreamReader(stream, keep=self._my_keep()).manifest_text() for stream in self._streams])
- #print "result", self._manifest_text
+ return self
def all_streams(self):
self._populate()
@@ -205,11 +242,10 @@ class CollectionReader(CollectionBase):
yield f
def manifest_text(self, strip=False):
- self._populate()
if strip:
- m = ''.join([StreamReader(stream, keep=self._my_keep()).manifest_text(strip=True) for stream in self._streams])
- return m
+ return self.stripped_manifest()
else:
+ self._populate()
return self._manifest_text
@@ -433,20 +469,9 @@ class CollectionWriter(CollectionBase):
# Store the manifest in Keep and return its locator.
return self._my_keep().put(self.manifest_text())
- def stripped_manifest(self):
- """
- Return the manifest for the current collection with all permission
- hints removed from the locators in the manifest.
- """
- raw = self.manifest_text()
- clean = ''
- for line in raw.split("\n"):
- fields = line.split()
- if len(fields) > 0:
- locators = [ re.sub(r'\+A[a-z0-9 at _-]+', '', x)
- for x in fields[1:-1] ]
- clean += fields[0] + ' ' + ' '.join(locators) + ' ' + fields[-1] + "\n"
- return clean
+ def portable_data_hash(self):
+ stripped = self.stripped_manifest()
+ return hashlib.md5(stripped).hexdigest() + '+' + str(len(stripped))
def manifest_text(self):
self.finish_current_stream()
@@ -461,7 +486,7 @@ class CollectionWriter(CollectionBase):
manifest += "\n"
if manifest:
- return CollectionReader(manifest, self._api_client).manifest_text()
+ return manifest
else:
return ""
diff --git a/sdk/python/arvados/commands/put.py b/sdk/python/arvados/commands/put.py
index 4a926c7..ac81ab8 100644
--- a/sdk/python/arvados/commands/put.py
+++ b/sdk/python/arvados/commands/put.py
@@ -102,6 +102,12 @@ Print the portable data hash instead of the Arvados UUID for the collection
created by the upload.
""")
+upload_opts.add_argument('--normalize', action='store_true',
+ help="""
+Normalize the manifest by re-ordering files and streams after writing
+data. This makes the --max-manifest-depth option ineffective.
+""")
+
run_opts = argparse.ArgumentParser(add_help=False)
run_opts.add_argument('--project-uuid', metavar='UUID', help="""
@@ -447,16 +453,21 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
if args.stream:
output = writer.manifest_text()
+ if args.normalize:
+ output = CollectionReader(output).normalize().manifest_text()
elif args.raw:
output = ','.join(writer.data_locators())
else:
try:
+ manifest_text = writer.manifest_text()
+ if args.normalize:
+ manifest_text = CollectionReader(manifest_text).normalize().manifest_text()
# Register the resulting collection in Arvados.
collection = api_client.collections().create(
body={
'owner_uuid': project_uuid,
'name': collection_name,
- 'manifest_text': writer.manifest_text()
+ 'manifest_text': manifest_text
},
ensure_unique_name=True
).execute(num_retries=args.retries)
diff --git a/sdk/python/arvados/stream.py b/sdk/python/arvados/stream.py
index 04b6b81..0e153f5 100644
--- a/sdk/python/arvados/stream.py
+++ b/sdk/python/arvados/stream.py
@@ -208,7 +208,7 @@ class StreamFileReader(object):
manifest_text = ['.']
manifest_text.extend([d[LOCATOR] for d in self._stream._data_locators])
manifest_text.extend(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], self.name().replace(' ', '\\040')) for seg in self.segments])
- return arvados.CollectionReader(' '.join(manifest_text) + '\n').manifest_text()
+ return arvados.CollectionReader(' '.join(manifest_text) + '\n').normalize().manifest_text()
class StreamReader(object):
diff --git a/sdk/python/bin/arv-get b/sdk/python/bin/arv-get
index 0059f3f..272fa84 100755
--- a/sdk/python/bin/arv-get
+++ b/sdk/python/bin/arv-get
@@ -143,6 +143,8 @@ if not get_prefix:
abort("failed to download '{}': {}".format(collection, error))
sys.exit(0)
+reader.normalize()
+
# Scan the collection. Make an array of (stream, file, local
# destination filename) tuples, and add up total size to extract.
todo = []
diff --git a/sdk/python/bin/arv-normalize b/sdk/python/bin/arv-normalize
index 478a74c..b84910e 100755
--- a/sdk/python/bin/arv-normalize
+++ b/sdk/python/bin/arv-normalize
@@ -20,6 +20,7 @@ import arvados
r = sys.stdin.read()
cr = arvados.CollectionReader(r)
+cr.normalize()
if args.extract:
i = args.extract.rfind('/')
diff --git a/sdk/python/tests/test_collections.py b/sdk/python/tests/test_collections.py
index 4889e92..45067ad 100644
--- a/sdk/python/tests/test_collections.py
+++ b/sdk/python/tests/test_collections.py
@@ -47,7 +47,12 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
cw.start_new_stream('baz')
cw.write('baz')
cw.set_current_file_name('baz.txt')
- return cw.finish()
+ self.assertEqual(cw.manifest_text(),
+ ". 3858f62230ac3c915f300c664312c63f+6 0:3:foo.txt 3:3:bar.txt\n" +
+ "./baz 73feffa4b7f6bb68e44cf984c85f6e88+3 0:3:baz.txt\n",
+ "wrong manifest: got {}".format(cw.manifest_text()))
+ cw.finish()
+ return cw.portable_data_hash()
def test_keep_local_store(self):
self.assertEqual(self.keep_client.put('foo'), 'acbd18db4cc2f85cedef654fccc4a4d8+3', 'wrong md5 hash from Keep.put')
@@ -55,19 +60,19 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
def test_local_collection_writer(self):
self.assertEqual(self.write_foo_bar_baz(),
- 'd6c3b8e571f1b81ebb150a45ed06c884+114',
- "wrong locator hash for files foo, bar, baz")
+ '23ca013983d6239e98931cc779e68426+114',
+ 'wrong locator hash: ' + self.write_foo_bar_baz())
def test_local_collection_reader(self):
- self.write_foo_bar_baz()
+ foobarbaz = self.write_foo_bar_baz()
cr = arvados.CollectionReader(
- 'd6c3b8e571f1b81ebb150a45ed06c884+114+Xzizzle', self.api_client)
+ foobarbaz + '+Xzizzle', self.api_client)
got = []
for s in cr.all_streams():
for f in s.all_files():
got += [[f.size(), f.stream_name(), f.name(), f.read(2**26)]]
- expected = [[3, '.', 'bar.txt', 'bar'],
- [3, '.', 'foo.txt', 'foo'],
+ expected = [[3, '.', 'foo.txt', 'foo'],
+ [3, '.', 'bar.txt', 'bar'],
[3, './baz', 'baz.txt', 'baz']]
self.assertEqual(got,
expected)
@@ -94,8 +99,8 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
'all_files|as_manifest did not preserve manifest contents: got %s expected %s' % (got, ex))
def test_collection_manifest_subset(self):
- self.write_foo_bar_baz()
- self._test_subset('d6c3b8e571f1b81ebb150a45ed06c884+114',
+ foobarbaz = self.write_foo_bar_baz()
+ self._test_subset(foobarbaz,
[[3, '.', 'bar.txt', 'bar'],
[3, '.', 'foo.txt', 'foo'],
[3, './baz', 'baz.txt', 'baz']])
@@ -151,7 +156,19 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
cw.start_new_stream('foo')
cw.start_new_file('zero.txt')
cw.write('')
- self.check_manifest_file_sizes(cw.manifest_text(), [1,0,0])
+ self.check_manifest_file_sizes(cw.manifest_text(), [0,1,0])
+
+ def test_no_implicit_normalize(self):
+ cw = arvados.CollectionWriter(self.api_client)
+ cw.start_new_file('b')
+ cw.write('b')
+ cw.start_new_file('a')
+ cw.write('')
+ self.check_manifest_file_sizes(cw.manifest_text(), [1,0])
+ self.check_manifest_file_sizes(
+ arvados.CollectionReader(
+ cw.manifest_text()).normalize().manifest_text(),
+ [0,1])
def check_manifest_file_sizes(self, manifest_text, expect_sizes):
cr = arvados.CollectionReader(manifest_text, self.api_client)
@@ -210,49 +227,53 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
def test_normalized_collection(self):
m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
-. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
- self.assertEqual(arvados.CollectionReader(m1, self.api_client).manifest_text(),
+. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
+"""
+ self.assertEqual(arvados.CollectionReader(m1, self.api_client).normalize().manifest_text(),
""". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt
""")
m2 = """. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
"""
- self.assertEqual(arvados.CollectionReader(m2, self.api_client).manifest_text(), m2)
+ self.assertEqual(arvados.CollectionReader(m2, self.api_client).normalize().manifest_text(), m2)
m3 = """. 5348b82a029fd9e971a811ce1f71360b+43 3:40:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
-. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
- self.assertEqual(arvados.CollectionReader(m3, self.api_client).manifest_text(),
+. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
+"""
+ self.assertEqual(arvados.CollectionReader(m3, self.api_client).normalize().manifest_text(),
""". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt
""")
m4 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
-./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar"""
- self.assertEqual(arvados.CollectionReader(m4, self.api_client).manifest_text(),
+./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar
+"""
+ self.assertEqual(arvados.CollectionReader(m4, self.api_client).normalize().manifest_text(),
"""./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")
m5 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
-./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar"""
- self.assertEqual(arvados.CollectionReader(m5, self.api_client).manifest_text(),
+./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar
+"""
+ self.assertEqual(arvados.CollectionReader(m5, self.api_client).normalize().manifest_text(),
"""./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:6:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")
with self.data_file('1000G_ref_manifest') as f6:
m6 = f6.read()
- self.assertEqual(arvados.CollectionReader(m6, self.api_client).manifest_text(), m6)
+ self.assertEqual(arvados.CollectionReader(m6, self.api_client).normalize().manifest_text(), m6)
with self.data_file('jlake_manifest') as f7:
m7 = f7.read()
- self.assertEqual(arvados.CollectionReader(m7, self.api_client).manifest_text(), m7)
+ self.assertEqual(arvados.CollectionReader(m7, self.api_client).normalize().manifest_text(), m7)
m8 = """./a\\040b\\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\\040world.txt
"""
- self.assertEqual(arvados.CollectionReader(m8, self.api_client).manifest_text(), m8)
+ self.assertEqual(arvados.CollectionReader(m8, self.api_client).normalize().manifest_text(), m8)
def test_locators_and_ranges(self):
blocks2 = [['a', 10, 0],
@@ -480,20 +501,23 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 47:80:md8sum.txt
-. 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt"""
+. 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt
+"""
- m2 = arvados.CollectionReader(m1, self.api_client)
+ m2 = arvados.CollectionReader(m1, self.api_client).normalize().manifest_text()
- self.assertEqual(m2.manifest_text(),
+ self.assertEqual(m2,
". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt 43:41:md6sum.txt 84:43:md7sum.txt 6:37:md8sum.txt 84:43:md8sum.txt 83:1:md9sum.txt 0:43:md9sum.txt 84:36:md9sum.txt\n")
+ files = arvados.CollectionReader(
+ m2, self.api_client).all_streams()[0].files()
- self.assertEqual(arvados.CollectionReader(m1, self.api_client).all_streams()[0].files()['md5sum.txt'].as_manifest(),
+ self.assertEqual(files['md5sum.txt'].as_manifest(),
". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n")
- self.assertEqual(arvados.CollectionReader(m1, self.api_client).all_streams()[0].files()['md6sum.txt'].as_manifest(),
+ self.assertEqual(files['md6sum.txt'].as_manifest(),
". 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt\n")
- self.assertEqual(arvados.CollectionReader(m1, self.api_client).all_streams()[0].files()['md7sum.txt'].as_manifest(),
+ self.assertEqual(files['md7sum.txt'].as_manifest(),
". 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt\n")
- self.assertEqual(arvados.CollectionReader(m1, self.api_client).all_streams()[0].files()['md9sum.txt'].as_manifest(),
+ self.assertEqual(files['md9sum.txt'].as_manifest(),
". 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt\n")
def test_write_directory_tree(self):
@@ -518,8 +542,7 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
cwriter.write_directory_tree(self.build_directory_tree(
['basefile', 'subdir/subfile']), max_manifest_depth=0)
self.assertEqual(cwriter.manifest_text(),
- """. 4ace875ffdc6824a04950f06858f4465+22 0:8:basefile
-./subdir 4ace875ffdc6824a04950f06858f4465+22 8:14:subfile\n""")
+ """. 4ace875ffdc6824a04950f06858f4465+22 0:8:basefile 8:14:subdir/subfile\n""")
def test_write_directory_tree_with_limited_recursion(self):
cwriter = arvados.CollectionWriter(self.api_client)
@@ -528,8 +551,7 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
max_manifest_depth=1)
self.assertEqual(cwriter.manifest_text(),
""". bd19836ddb62c11c55ab251ccaca5645+2 0:2:f1
-./d1 50170217e5b04312024aa5cd42934494+13 8:5:f2
-./d1/d2 50170217e5b04312024aa5cd42934494+13 0:8:f3\n""")
+./d1 50170217e5b04312024aa5cd42934494+13 0:8:d2/f3 8:5:f2\n""")
def test_write_one_file(self):
cwriter = arvados.CollectionWriter(self.api_client)
@@ -620,6 +642,15 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
self.assertRaises(arvados.errors.AssertionError,
cwriter.write, "badtext")
+ def test_read_arbitrary_data_with_collection_reader(self):
+ # arv-get relies on this to do "arv-get {keep-locator} -".
+ self.write_foo_bar_baz()
+ self.assertEqual(
+ 'foobar',
+ arvados.CollectionReader(
+ '3858f62230ac3c915f300c664312c63f+6'
+ ).manifest_text())
+
class CollectionTestMixin(object):
PROXY_RESPONSE = {
@@ -732,6 +763,24 @@ class CollectionReaderTestCase(unittest.TestCase, CollectionTestMixin):
self.assertEqual('foo',
''.join(f.read(9) for f in reader.all_files()))
+ def test_read_nonnormalized_manifest_with_collection_reader(self):
+ # client should be able to use CollectionReader on a manifest without normalizing it
+ client = self.api_client_mock(500)
+ nonnormal = ". acbd18db4cc2f85cedef654fccc4a4d8+3+Aabadbadbee at abeebdee 0:3:foo.txt 1:0:bar.txt 0:3:foo.txt\n"
+ self.assertEqual(
+ ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo.txt 1:0:bar.txt 0:3:foo.txt\n",
+ arvados.CollectionReader(
+ nonnormal,
+ api_client=client, num_retries=0).stripped_manifest())
+ self.assertEqual(
+ [[6, '.', 'foo.txt'],
+ [0, '.', 'bar.txt']],
+ [[f.size(), f.stream_name(), f.name()]
+ for f in
+ arvados.CollectionReader(
+ nonnormal,
+ api_client=client, num_retries=0).all_streams()[0].all_files()])
+
@tutil.skip_sleep
class CollectionWriterTestCase(unittest.TestCase, CollectionTestMixin):
commit 112d6ce9132ea6749aef115e3787483958e858fd
Author: Tom Clegg <tom at curoverse.com>
Date: Mon Oct 27 22:12:53 2014 -0400
3706: Consolidate more regular expressions into util package.
diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 1b712ea..874c38e 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -130,13 +130,13 @@ class CollectionReader(CollectionBase):
self._api_client = api_client
self._keep_client = keep_client
self.num_retries = num_retries
- if re.match(r'[a-f0-9]{32}(\+\d+)?(\+\S+)*$', manifest_locator_or_text):
+ if re.match(util.keep_locator_pattern, manifest_locator_or_text):
self._manifest_locator = manifest_locator_or_text
self._manifest_text = None
- elif re.match(r'[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}$', manifest_locator_or_text):
+ elif re.match(util.collection_uuid_pattern, manifest_locator_or_text):
self._manifest_locator = manifest_locator_or_text
self._manifest_text = None
- elif re.match(r'((\S+)( +[a-f0-9]{32}(\+\d+)(\+\S+)*)+( +\d+:\d+:\S+)+$)+', manifest_locator_or_text, re.MULTILINE):
+ elif re.match(util.manifest_pattern, manifest_locator_or_text):
self._manifest_text = manifest_locator_or_text
self._manifest_locator = None
else:
diff --git a/sdk/python/arvados/util.py b/sdk/python/arvados/util.py
index 67436ae..2532ee2 100644
--- a/sdk/python/arvados/util.py
+++ b/sdk/python/arvados/util.py
@@ -10,12 +10,14 @@ from arvados.collection import *
HEX_RE = re.compile(r'^[0-9a-fA-F]+$')
keep_locator_pattern = re.compile(r'[0-9a-f]{32}\+\d+(\+\S+)*')
+signed_locator_pattern = re.compile(r'[0-9a-f]{32}\+\d+(\+\S+)*\+A\S+(\+\S+)*')
portable_data_hash_pattern = re.compile(r'[0-9a-f]{32}\+\d+')
uuid_pattern = re.compile(r'[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}')
collection_uuid_pattern = re.compile(r'[a-z0-9]{5}-4zz18-[a-z0-9]{15}')
group_uuid_pattern = re.compile(r'[a-z0-9]{5}-j7d0g-[a-z0-9]{15}')
user_uuid_pattern = re.compile(r'[a-z0-9]{5}-tpzed-[a-z0-9]{15}')
link_uuid_pattern = re.compile(r'[a-z0-9]{5}-o0j2j-[a-z0-9]{15}')
+manifest_pattern = re.compile(r'((\S+)( +[a-f0-9]{32}(\+\d+)(\+\S+)*)+( +\d+:\d+:\S+)+$)+', flags=re.MULTILINE)
def clear_tmpdir(path=None):
"""
commit d9e2eb19dfef24c8b6af396b2b61b2dab4cf740d
Author: Tom Clegg <tom at curoverse.com>
Date: Mon Oct 27 22:08:24 2014 -0400
3706: Do not leave zero-length file segment at end of stream after finish_current_file().
diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index fcb30c5..1b712ea 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -392,6 +392,7 @@ class CollectionWriter(CollectionBase):
self._current_stream_length - self._current_file_pos,
self._current_file_name])
self._current_file_pos = self._current_stream_length
+ self._current_file_name = None
def start_new_stream(self, newstreamname='.'):
self.finish_current_stream()
commit 6817b34daf51030fd25459d349a940ff59eee690
Author: Tom Clegg <tom at curoverse.com>
Date: Mon Oct 27 22:04:54 2014 -0400
3706: In CollectionReader, try fetching manifests before/after API
server when the ID is a Keep locator with/without a permission hint
respectively. Do not display a warning if the manifest is found one
way or the other.
diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 7425d80..fcb30c5 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -147,41 +147,41 @@ class CollectionReader(CollectionBase):
def _populate(self):
if self._streams is not None:
return
+ error_via_api = None
+ error_via_keep = None
+ should_try_keep = (not self._manifest_text and
+ util.keep_locator_pattern.match(
+ self._manifest_locator))
+ if (not self._manifest_text and
+ util.signed_locator_pattern.match(self._manifest_locator)):
+ try:
+ self._populate_from_keep()
+ except e:
+ error_via_keep = e
if not self._manifest_text:
try:
- # As in KeepClient itself, we must wait until the last possible
- # moment to instantiate an API client, in order to avoid
- # tripping up clients that don't have access to an API server.
- # If we do build one, make sure our Keep client uses it.
- # If instantiation fails, we'll fall back to the except clause,
- # just like any other Collection lookup failure.
- if self._api_client is None:
- self._api_client = arvados.api('v1')
- self._keep_client = None # Make a new one with the new api.
- c = self._api_client.collections().get(
- uuid=self._manifest_locator).execute(
- num_retries=self.num_retries)
- self._manifest_text = c['manifest_text']
- except Exception as error_via_api:
- if not util.keep_locator_pattern.match(
- self._manifest_locator):
+ self._populate_from_api_server()
+ except Exception as e:
+ if not should_try_keep:
raise
- # _manifest_locator is a Keep locator. Perhaps we can
- # retrieve it directly from Keep. This has a chance of
- # working if [a] the locator includes a permission
- # signature or [b] the Keep services are operating in
- # world-readable mode.
- try:
- self._manifest_text = self._my_keep().get(
- self._manifest_locator, num_retries=self.num_retries)
- except Exception as error_via_keep:
- raise arvados.errors.NotFoundError(
- ("Failed to retrieve collection '%s' " +
- "from either API server (%s) or Keep (%s)."
- ).format(
- self._manifest_locator,
- error_via_api,
- error_via_keep))
+ error_via_api = e
+ if (not self._manifest_text and
+ not error_via_keep and
+ should_try_keep):
+ # Looks like a keep locator, and we didn't already try keep above
+ try:
+ self._populate_from_keep()
+ except Exception as e:
+ error_via_keep = e
+ if not self._manifest_text:
+ # Nothing worked!
+ raise arvados.errors.NotFoundError(
+ ("Failed to retrieve collection '{}' " +
+ "from either API server ({}) or Keep ({})."
+ ).format(
+ self._manifest_locator,
+ error_via_api,
+ error_via_keep))
self._streams = [sline.split()
for sline in self._manifest_text.split("\n")
if sline]
diff --git a/sdk/python/tests/test_collections.py b/sdk/python/tests/test_collections.py
index 98a72f6..4889e92 100644
--- a/sdk/python/tests/test_collections.py
+++ b/sdk/python/tests/test_collections.py
@@ -694,16 +694,35 @@ class CollectionReaderTestCase(unittest.TestCase, CollectionTestMixin):
api_client=client)
self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())
- def test_locator_init_falls_back_to_keep(self):
- # Reading manifests from Keep is deprecated. Feel free to
- # remove this test when we remove the fallback.
+ def test_locator_init_fallback_to_keep(self):
+ # crunch-job needs this to read manifests that have only ever
+ # been written to Keep.
client = self.api_client_mock(200)
- self.mock_get_collection(client, 404, None)
- with tutil.mock_responses(self.DEFAULT_MANIFEST, 200):
+ with tutil.mock_responses(self.DEFAULT_MANIFEST, 404, 200):
reader = arvados.CollectionReader(self.DEFAULT_DATA_HASH,
- api_client=client, num_retries=3)
+ api_client=client)
self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())
+ def test_uuid_init_no_fallback_to_keep(self):
+ # Do not look up a collection UUID in Keep.
+ client = self.api_client_mock(404)
+ reader = arvados.CollectionReader(self.DEFAULT_UUID,
+ api_client=client)
+ with tutil.mock_responses(self.DEFAULT_MANIFEST, 200):
+ with self.assertRaises(arvados.errors.ApiError):
+ reader.manifest_text()
+
+ def test_try_keep_first_if_permission_hint(self):
+ # To verify that CollectionReader tries Keep first here, we
+ # mock API server to return the wrong data.
+ client = self.api_client_mock(200)
+ with tutil.mock_responses(self.DEFAULT_MANIFEST, 200):
+ self.assertEqual(
+ self.DEFAULT_MANIFEST,
+ arvados.CollectionReader(
+ self.DEFAULT_DATA_HASH + '+Affffffffffffffffffffffffffffffffffffffff at fedcba98',
+ api_client=client).manifest_text())
+
def test_init_num_retries_propagated(self):
# More of an integration test...
client = self.api_client_mock(200)
commit 47853db92d6e1b9eca4beddf471189ee9f4e0217
Author: Tom Clegg <tom at curoverse.com>
Date: Mon Oct 27 13:35:28 2014 -0400
3706: Silence fallback-to-keep warning, show both errors (API and Keep) if both fail.
diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 782e85c..7425d80 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -162,16 +162,26 @@ class CollectionReader(CollectionBase):
uuid=self._manifest_locator).execute(
num_retries=self.num_retries)
self._manifest_text = c['manifest_text']
- except Exception as e:
- if not util.portable_data_hash_pattern.match(
+ except Exception as error_via_api:
+ if not util.keep_locator_pattern.match(
self._manifest_locator):
raise
- _logger.warning(
- "API server did not return Collection '%s'. " +
- "Trying to fetch directly from Keep (deprecated).",
- self._manifest_locator)
- self._manifest_text = self._my_keep().get(
- self._manifest_locator, num_retries=self.num_retries)
+ # _manifest_locator is a Keep locator. Perhaps we can
+ # retrieve it directly from Keep. This has a chance of
+ # working if [a] the locator includes a permission
+ # signature or [b] the Keep services are operating in
+ # world-readable mode.
+ try:
+ self._manifest_text = self._my_keep().get(
+ self._manifest_locator, num_retries=self.num_retries)
+ except Exception as error_via_keep:
+ raise arvados.errors.NotFoundError(
+ ("Failed to retrieve collection '%s' " +
+ "from either API server (%s) or Keep (%s)."
+ ).format(
+ self._manifest_locator,
+ error_via_api,
+ error_via_keep))
self._streams = [sline.split()
for sline in self._manifest_text.split("\n")
if sline]
diff --git a/sdk/python/arvados/util.py b/sdk/python/arvados/util.py
index 2609f11..67436ae 100644
--- a/sdk/python/arvados/util.py
+++ b/sdk/python/arvados/util.py
@@ -9,6 +9,7 @@ from arvados.collection import *
HEX_RE = re.compile(r'^[0-9a-fA-F]+$')
+keep_locator_pattern = re.compile(r'[0-9a-f]{32}\+\d+(\+\S+)*')
portable_data_hash_pattern = re.compile(r'[0-9a-f]{32}\+\d+')
uuid_pattern = re.compile(r'[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}')
collection_uuid_pattern = re.compile(r'[a-z0-9]{5}-4zz18-[a-z0-9]{15}')
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list