[ARVADOS] created: 2b6948ff7e11cd236ea26562ed76a534dcf3c0be

git at public.curoverse.com git at public.curoverse.com
Thu Oct 30 14:13:19 EDT 2014


        at  2b6948ff7e11cd236ea26562ed76a534dcf3c0be (commit)


commit 2b6948ff7e11cd236ea26562ed76a534dcf3c0be
Merge: 8323e56 846ac5d
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Oct 27 22:24:35 2014 -0400

    3706: Merge branch 'master' into 3706-keep-warning


commit 8323e56e9dafb1bd4aee4f548c26854ea3eb05e9
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Oct 27 22:20:22 2014 -0400

    3706: Remove automatic normalization. Add --normalize option to
    arv-put. Add normalize() and stripped_manifest() methods to
    CollectionReader. Clean up normalize code.

diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 874c38e..5b00ea0 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -68,28 +68,6 @@ def normalize_stream(s, stream):
 
     return stream_tokens
 
-def normalize(collection):
-    streams = {}
-    for s in collection.all_streams():
-        for f in s.all_files():
-            filestream = s.name() + "/" + f.name()
-            r = filestream.rindex("/")
-            streamname = filestream[:r]
-            filename = filestream[r+1:]
-            if streamname not in streams:
-                streams[streamname] = {}
-            if filename not in streams[streamname]:
-                streams[streamname][filename] = []
-            for r in f.segments:
-                streams[streamname][filename].extend(s.locators_and_ranges(r[0], r[1]))
-
-    normalized_streams = []
-    sortedstreams = list(streams.keys())
-    sortedstreams.sort()
-    for s in sortedstreams:
-        normalized_streams.append(normalize_stream(s, streams[s]))
-    return normalized_streams
-
 
 class CollectionBase(object):
     def __enter__(self):
@@ -104,6 +82,21 @@ class CollectionBase(object):
                                            num_retries=self.num_retries)
         return self._keep_client
 
+    def stripped_manifest(self):
+        """
+        Return the manifest for the current collection with all
+        non-portable hints (i.e., permission signatures and other
+        hints other than size hints) removed from the locators.
+        """
+        raw = self.manifest_text()
+        clean = ''
+        for line in raw.split("\n"):
+            fields = line.split()
+            if len(fields) > 0:
+                locators = [ (re.sub(r'\+[^\d][^\+]*', '', x) if re.match(util.keep_locator_pattern, x) else x)
+                             for x in fields[1:-1] ]
+                clean += fields[0] + ' ' + ' '.join(locators) + ' ' + fields[-1] + "\n"
+        return clean
 
 class CollectionReader(CollectionBase):
     def __init__(self, manifest_locator_or_text, api_client=None,
@@ -144,6 +137,29 @@ class CollectionReader(CollectionBase):
                 "Argument to CollectionReader must be a manifest or a collection UUID")
         self._streams = None
 
+    def _populate_from_api_server(self):
+        # As in KeepClient itself, we must wait until the last possible
+        # moment to instantiate an API client, in order to avoid
+        # tripping up clients that don't have access to an API server.
+        # If we do build one, make sure our Keep client uses it.
+        # If instantiation fails, we'll fall back to the except clause,
+        # just like any other Collection lookup failure.
+        if self._api_client is None:
+            self._api_client = arvados.api('v1')
+            self._keep_client = None  # Make a new one with the new api.
+        c = self._api_client.collections().get(
+            uuid=self._manifest_locator).execute(
+            num_retries=self.num_retries)
+        self._manifest_text = c['manifest_text']
+
+    def _populate_from_keep(self):
+        # Retrieve a manifest directly from Keep. This has a chance of
+        # working if [a] the locator includes a permission signature
+        # or [b] the Keep services are operating in world-readable
+        # mode.
+        self._manifest_text = self._my_keep().get(
+            self._manifest_locator, num_retries=self.num_retries)
+
     def _populate(self):
         if self._streams is not None:
             return
@@ -185,14 +201,35 @@ class CollectionReader(CollectionBase):
         self._streams = [sline.split()
                          for sline in self._manifest_text.split("\n")
                          if sline]
-        self._streams = normalize(self)
 
-        # now regenerate the manifest text based on the normalized stream
+    def normalize(self):
+        self._populate()
 
-        #print "normalizing", self._manifest_text
+        # Rearrange streams
+        streams = {}
+        for s in self.all_streams():
+            for f in s.all_files():
+                filestream = s.name() + "/" + f.name()
+                r = filestream.rindex("/")
+                streamname = filestream[:r]
+                filename = filestream[r+1:]
+                if streamname not in streams:
+                    streams[streamname] = {}
+                if filename not in streams[streamname]:
+                    streams[streamname][filename] = []
+                for r in f.segments:
+                    streams[streamname][filename].extend(s.locators_and_ranges(r[0], r[1]))
+
+        self._streams = []
+        sortedstreams = list(streams.keys())
+        sortedstreams.sort()
+        for s in sortedstreams:
+            self._streams.append(normalize_stream(s, streams[s]))
+
+        # Regenerate the manifest text based on the normalized streams
         self._manifest_text = ''.join([StreamReader(stream, keep=self._my_keep()).manifest_text() for stream in self._streams])
-        #print "result", self._manifest_text
 
+        return self
 
     def all_streams(self):
         self._populate()
@@ -205,11 +242,10 @@ class CollectionReader(CollectionBase):
                 yield f
 
     def manifest_text(self, strip=False):
-        self._populate()
         if strip:
-            m = ''.join([StreamReader(stream, keep=self._my_keep()).manifest_text(strip=True) for stream in self._streams])
-            return m
+            return self.stripped_manifest()
         else:
+            self._populate()
             return self._manifest_text
 
 
@@ -433,20 +469,9 @@ class CollectionWriter(CollectionBase):
         # Store the manifest in Keep and return its locator.
         return self._my_keep().put(self.manifest_text())
 
-    def stripped_manifest(self):
-        """
-        Return the manifest for the current collection with all permission
-        hints removed from the locators in the manifest.
-        """
-        raw = self.manifest_text()
-        clean = ''
-        for line in raw.split("\n"):
-            fields = line.split()
-            if len(fields) > 0:
-                locators = [ re.sub(r'\+A[a-z0-9 at _-]+', '', x)
-                             for x in fields[1:-1] ]
-                clean += fields[0] + ' ' + ' '.join(locators) + ' ' + fields[-1] + "\n"
-        return clean
+    def portable_data_hash(self):
+        stripped = self.stripped_manifest()
+        return hashlib.md5(stripped).hexdigest() + '+' + str(len(stripped))
 
     def manifest_text(self):
         self.finish_current_stream()
@@ -461,7 +486,7 @@ class CollectionWriter(CollectionBase):
             manifest += "\n"
 
         if manifest:
-            return CollectionReader(manifest, self._api_client).manifest_text()
+            return manifest
         else:
             return ""
 
diff --git a/sdk/python/arvados/commands/put.py b/sdk/python/arvados/commands/put.py
index 4a926c7..ac81ab8 100644
--- a/sdk/python/arvados/commands/put.py
+++ b/sdk/python/arvados/commands/put.py
@@ -102,6 +102,12 @@ Print the portable data hash instead of the Arvados UUID for the collection
 created by the upload.
 """)
 
+upload_opts.add_argument('--normalize', action='store_true',
+                    help="""
+Normalize the manifest by re-ordering files and streams after writing
+data. This makes the --max-manifest-depth option ineffective.
+""")
+
 run_opts = argparse.ArgumentParser(add_help=False)
 
 run_opts.add_argument('--project-uuid', metavar='UUID', help="""
@@ -447,16 +453,21 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
 
     if args.stream:
         output = writer.manifest_text()
+        if args.normalize:
+            output = CollectionReader(output).normalize().manifest_text()
     elif args.raw:
         output = ','.join(writer.data_locators())
     else:
         try:
+            manifest_text = writer.manifest_text()
+            if args.normalize:
+                manifest_text = CollectionReader(manifest_text).normalize().manifest_text()
             # Register the resulting collection in Arvados.
             collection = api_client.collections().create(
                 body={
                     'owner_uuid': project_uuid,
                     'name': collection_name,
-                    'manifest_text': writer.manifest_text()
+                    'manifest_text': manifest_text
                     },
                 ensure_unique_name=True
                 ).execute(num_retries=args.retries)
diff --git a/sdk/python/arvados/stream.py b/sdk/python/arvados/stream.py
index 04b6b81..0e153f5 100644
--- a/sdk/python/arvados/stream.py
+++ b/sdk/python/arvados/stream.py
@@ -208,7 +208,7 @@ class StreamFileReader(object):
         manifest_text = ['.']
         manifest_text.extend([d[LOCATOR] for d in self._stream._data_locators])
         manifest_text.extend(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], self.name().replace(' ', '\\040')) for seg in self.segments])
-        return arvados.CollectionReader(' '.join(manifest_text) + '\n').manifest_text()
+        return arvados.CollectionReader(' '.join(manifest_text) + '\n').normalize().manifest_text()
 
 
 class StreamReader(object):
diff --git a/sdk/python/bin/arv-get b/sdk/python/bin/arv-get
index 0059f3f..272fa84 100755
--- a/sdk/python/bin/arv-get
+++ b/sdk/python/bin/arv-get
@@ -143,6 +143,8 @@ if not get_prefix:
             abort("failed to download '{}': {}".format(collection, error))
     sys.exit(0)
 
+reader.normalize()
+
 # Scan the collection. Make an array of (stream, file, local
 # destination filename) tuples, and add up total size to extract.
 todo = []
diff --git a/sdk/python/bin/arv-normalize b/sdk/python/bin/arv-normalize
index 478a74c..b84910e 100755
--- a/sdk/python/bin/arv-normalize
+++ b/sdk/python/bin/arv-normalize
@@ -20,6 +20,7 @@ import arvados
 r = sys.stdin.read()
 
 cr = arvados.CollectionReader(r)
+cr.normalize()
 
 if args.extract:
     i = args.extract.rfind('/')
diff --git a/sdk/python/tests/test_collections.py b/sdk/python/tests/test_collections.py
index 4889e92..45067ad 100644
--- a/sdk/python/tests/test_collections.py
+++ b/sdk/python/tests/test_collections.py
@@ -47,7 +47,12 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
         cw.start_new_stream('baz')
         cw.write('baz')
         cw.set_current_file_name('baz.txt')
-        return cw.finish()
+        self.assertEqual(cw.manifest_text(),
+                         ". 3858f62230ac3c915f300c664312c63f+6 0:3:foo.txt 3:3:bar.txt\n" +
+                         "./baz 73feffa4b7f6bb68e44cf984c85f6e88+3 0:3:baz.txt\n",
+                         "wrong manifest: got {}".format(cw.manifest_text()))
+        cw.finish()
+        return cw.portable_data_hash()
 
     def test_keep_local_store(self):
         self.assertEqual(self.keep_client.put('foo'), 'acbd18db4cc2f85cedef654fccc4a4d8+3', 'wrong md5 hash from Keep.put')
@@ -55,19 +60,19 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
 
     def test_local_collection_writer(self):
         self.assertEqual(self.write_foo_bar_baz(),
-                         'd6c3b8e571f1b81ebb150a45ed06c884+114',
-                         "wrong locator hash for files foo, bar, baz")
+                         '23ca013983d6239e98931cc779e68426+114',
+                         'wrong locator hash: ' + self.write_foo_bar_baz())
 
     def test_local_collection_reader(self):
-        self.write_foo_bar_baz()
+        foobarbaz = self.write_foo_bar_baz()
         cr = arvados.CollectionReader(
-            'd6c3b8e571f1b81ebb150a45ed06c884+114+Xzizzle', self.api_client)
+            foobarbaz + '+Xzizzle', self.api_client)
         got = []
         for s in cr.all_streams():
             for f in s.all_files():
                 got += [[f.size(), f.stream_name(), f.name(), f.read(2**26)]]
-        expected = [[3, '.', 'bar.txt', 'bar'],
-                    [3, '.', 'foo.txt', 'foo'],
+        expected = [[3, '.', 'foo.txt', 'foo'],
+                    [3, '.', 'bar.txt', 'bar'],
                     [3, './baz', 'baz.txt', 'baz']]
         self.assertEqual(got,
                          expected)
@@ -94,8 +99,8 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
                                      'all_files|as_manifest did not preserve manifest contents: got %s expected %s' % (got, ex))
 
     def test_collection_manifest_subset(self):
-        self.write_foo_bar_baz()
-        self._test_subset('d6c3b8e571f1b81ebb150a45ed06c884+114',
+        foobarbaz = self.write_foo_bar_baz()
+        self._test_subset(foobarbaz,
                           [[3, '.',     'bar.txt', 'bar'],
                            [3, '.',     'foo.txt', 'foo'],
                            [3, './baz', 'baz.txt', 'baz']])
@@ -151,7 +156,19 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
         cw.start_new_stream('foo')
         cw.start_new_file('zero.txt')
         cw.write('')
-        self.check_manifest_file_sizes(cw.manifest_text(), [1,0,0])
+        self.check_manifest_file_sizes(cw.manifest_text(), [0,1,0])
+
+    def test_no_implicit_normalize(self):
+        cw = arvados.CollectionWriter(self.api_client)
+        cw.start_new_file('b')
+        cw.write('b')
+        cw.start_new_file('a')
+        cw.write('')
+        self.check_manifest_file_sizes(cw.manifest_text(), [1,0])
+        self.check_manifest_file_sizes(
+            arvados.CollectionReader(
+                cw.manifest_text()).normalize().manifest_text(),
+            [0,1])
 
     def check_manifest_file_sizes(self, manifest_text, expect_sizes):
         cr = arvados.CollectionReader(manifest_text, self.api_client)
@@ -210,49 +227,53 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
     def test_normalized_collection(self):
         m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
 . 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
-. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
-        self.assertEqual(arvados.CollectionReader(m1, self.api_client).manifest_text(),
+. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
+"""
+        self.assertEqual(arvados.CollectionReader(m1, self.api_client).normalize().manifest_text(),
                          """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt
 """)
 
         m2 = """. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
 """
-        self.assertEqual(arvados.CollectionReader(m2, self.api_client).manifest_text(), m2)
+        self.assertEqual(arvados.CollectionReader(m2, self.api_client).normalize().manifest_text(), m2)
 
         m3 = """. 5348b82a029fd9e971a811ce1f71360b+43 3:40:md5sum.txt
 . 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
-. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
-        self.assertEqual(arvados.CollectionReader(m3, self.api_client).manifest_text(),
+. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
+"""
+        self.assertEqual(arvados.CollectionReader(m3, self.api_client).normalize().manifest_text(),
                          """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt
 """)
 
         m4 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
 ./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
-./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar"""
-        self.assertEqual(arvados.CollectionReader(m4, self.api_client).manifest_text(),
+./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar
+"""
+        self.assertEqual(arvados.CollectionReader(m4, self.api_client).normalize().manifest_text(),
                          """./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
 ./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
 """)
 
         m5 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
 ./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
-./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar"""
-        self.assertEqual(arvados.CollectionReader(m5, self.api_client).manifest_text(),
+./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar
+"""
+        self.assertEqual(arvados.CollectionReader(m5, self.api_client).normalize().manifest_text(),
                          """./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:6:bar
 ./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
 """)
 
         with self.data_file('1000G_ref_manifest') as f6:
             m6 = f6.read()
-            self.assertEqual(arvados.CollectionReader(m6, self.api_client).manifest_text(), m6)
+            self.assertEqual(arvados.CollectionReader(m6, self.api_client).normalize().manifest_text(), m6)
 
         with self.data_file('jlake_manifest') as f7:
             m7 = f7.read()
-            self.assertEqual(arvados.CollectionReader(m7, self.api_client).manifest_text(), m7)
+            self.assertEqual(arvados.CollectionReader(m7, self.api_client).normalize().manifest_text(), m7)
 
         m8 = """./a\\040b\\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\\040world.txt
 """
-        self.assertEqual(arvados.CollectionReader(m8, self.api_client).manifest_text(), m8)
+        self.assertEqual(arvados.CollectionReader(m8, self.api_client).normalize().manifest_text(), m8)
 
     def test_locators_and_ranges(self):
         blocks2 = [['a', 10, 0],
@@ -480,20 +501,23 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
 . 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt
 . 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt
 . 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 47:80:md8sum.txt
-. 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt"""
+. 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt
+"""
 
-        m2 = arvados.CollectionReader(m1, self.api_client)
+        m2 = arvados.CollectionReader(m1, self.api_client).normalize().manifest_text()
 
-        self.assertEqual(m2.manifest_text(),
+        self.assertEqual(m2,
                          ". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt 43:41:md6sum.txt 84:43:md7sum.txt 6:37:md8sum.txt 84:43:md8sum.txt 83:1:md9sum.txt 0:43:md9sum.txt 84:36:md9sum.txt\n")
+        files = arvados.CollectionReader(
+            m2, self.api_client).all_streams()[0].files()
 
-        self.assertEqual(arvados.CollectionReader(m1, self.api_client).all_streams()[0].files()['md5sum.txt'].as_manifest(),
+        self.assertEqual(files['md5sum.txt'].as_manifest(),
                          ". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n")
-        self.assertEqual(arvados.CollectionReader(m1, self.api_client).all_streams()[0].files()['md6sum.txt'].as_manifest(),
+        self.assertEqual(files['md6sum.txt'].as_manifest(),
                          ". 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt\n")
-        self.assertEqual(arvados.CollectionReader(m1, self.api_client).all_streams()[0].files()['md7sum.txt'].as_manifest(),
+        self.assertEqual(files['md7sum.txt'].as_manifest(),
                          ". 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt\n")
-        self.assertEqual(arvados.CollectionReader(m1, self.api_client).all_streams()[0].files()['md9sum.txt'].as_manifest(),
+        self.assertEqual(files['md9sum.txt'].as_manifest(),
                          ". 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt\n")
 
     def test_write_directory_tree(self):
@@ -518,8 +542,7 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
         cwriter.write_directory_tree(self.build_directory_tree(
                 ['basefile', 'subdir/subfile']), max_manifest_depth=0)
         self.assertEqual(cwriter.manifest_text(),
-                         """. 4ace875ffdc6824a04950f06858f4465+22 0:8:basefile
-./subdir 4ace875ffdc6824a04950f06858f4465+22 8:14:subfile\n""")
+                         """. 4ace875ffdc6824a04950f06858f4465+22 0:8:basefile 8:14:subdir/subfile\n""")
 
     def test_write_directory_tree_with_limited_recursion(self):
         cwriter = arvados.CollectionWriter(self.api_client)
@@ -528,8 +551,7 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
             max_manifest_depth=1)
         self.assertEqual(cwriter.manifest_text(),
                          """. bd19836ddb62c11c55ab251ccaca5645+2 0:2:f1
-./d1 50170217e5b04312024aa5cd42934494+13 8:5:f2
-./d1/d2 50170217e5b04312024aa5cd42934494+13 0:8:f3\n""")
+./d1 50170217e5b04312024aa5cd42934494+13 0:8:d2/f3 8:5:f2\n""")
 
     def test_write_one_file(self):
         cwriter = arvados.CollectionWriter(self.api_client)
@@ -620,6 +642,15 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
         self.assertRaises(arvados.errors.AssertionError,
                           cwriter.write, "badtext")
 
+    def test_read_arbitrary_data_with_collection_reader(self):
+        # arv-get relies on this to do "arv-get {keep-locator} -".
+        self.write_foo_bar_baz()
+        self.assertEqual(
+            'foobar',
+            arvados.CollectionReader(
+                '3858f62230ac3c915f300c664312c63f+6'
+                ).manifest_text())
+
 
 class CollectionTestMixin(object):
     PROXY_RESPONSE = {
@@ -732,6 +763,24 @@ class CollectionReaderTestCase(unittest.TestCase, CollectionTestMixin):
             self.assertEqual('foo',
                              ''.join(f.read(9) for f in reader.all_files()))
 
+    def test_read_nonnormalized_manifest_with_collection_reader(self):
+        # client should be able to use CollectionReader on a manifest without normalizing it
+        client = self.api_client_mock(500)
+        nonnormal = ". acbd18db4cc2f85cedef654fccc4a4d8+3+Aabadbadbee at abeebdee 0:3:foo.txt 1:0:bar.txt 0:3:foo.txt\n"
+        self.assertEqual(
+            ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo.txt 1:0:bar.txt 0:3:foo.txt\n",
+            arvados.CollectionReader(
+                nonnormal,
+                api_client=client, num_retries=0).stripped_manifest())
+        self.assertEqual(
+            [[6, '.', 'foo.txt'],
+             [0, '.', 'bar.txt']],
+            [[f.size(), f.stream_name(), f.name()]
+             for f in
+             arvados.CollectionReader(
+                    nonnormal,
+                    api_client=client, num_retries=0).all_streams()[0].all_files()])
+
 
 @tutil.skip_sleep
 class CollectionWriterTestCase(unittest.TestCase, CollectionTestMixin):

commit 112d6ce9132ea6749aef115e3787483958e858fd
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Oct 27 22:12:53 2014 -0400

    3706: Consolidate more regular expressions into util package.

diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 1b712ea..874c38e 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -130,13 +130,13 @@ class CollectionReader(CollectionBase):
         self._api_client = api_client
         self._keep_client = keep_client
         self.num_retries = num_retries
-        if re.match(r'[a-f0-9]{32}(\+\d+)?(\+\S+)*$', manifest_locator_or_text):
+        if re.match(util.keep_locator_pattern, manifest_locator_or_text):
             self._manifest_locator = manifest_locator_or_text
             self._manifest_text = None
-        elif re.match(r'[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}$', manifest_locator_or_text):
+        elif re.match(util.collection_uuid_pattern, manifest_locator_or_text):
             self._manifest_locator = manifest_locator_or_text
             self._manifest_text = None
-        elif re.match(r'((\S+)( +[a-f0-9]{32}(\+\d+)(\+\S+)*)+( +\d+:\d+:\S+)+$)+', manifest_locator_or_text, re.MULTILINE):
+        elif re.match(util.manifest_pattern, manifest_locator_or_text):
             self._manifest_text = manifest_locator_or_text
             self._manifest_locator = None
         else:
diff --git a/sdk/python/arvados/util.py b/sdk/python/arvados/util.py
index 67436ae..2532ee2 100644
--- a/sdk/python/arvados/util.py
+++ b/sdk/python/arvados/util.py
@@ -10,12 +10,14 @@ from arvados.collection import *
 HEX_RE = re.compile(r'^[0-9a-fA-F]+$')
 
 keep_locator_pattern = re.compile(r'[0-9a-f]{32}\+\d+(\+\S+)*')
+signed_locator_pattern = re.compile(r'[0-9a-f]{32}\+\d+(\+\S+)*\+A\S+(\+\S+)*')
 portable_data_hash_pattern = re.compile(r'[0-9a-f]{32}\+\d+')
 uuid_pattern = re.compile(r'[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}')
 collection_uuid_pattern = re.compile(r'[a-z0-9]{5}-4zz18-[a-z0-9]{15}')
 group_uuid_pattern = re.compile(r'[a-z0-9]{5}-j7d0g-[a-z0-9]{15}')
 user_uuid_pattern = re.compile(r'[a-z0-9]{5}-tpzed-[a-z0-9]{15}')
 link_uuid_pattern = re.compile(r'[a-z0-9]{5}-o0j2j-[a-z0-9]{15}')
+manifest_pattern = re.compile(r'((\S+)( +[a-f0-9]{32}(\+\d+)(\+\S+)*)+( +\d+:\d+:\S+)+$)+', flags=re.MULTILINE)
 
 def clear_tmpdir(path=None):
     """

commit d9e2eb19dfef24c8b6af396b2b61b2dab4cf740d
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Oct 27 22:08:24 2014 -0400

    3706: Do not leave zero-length file segment at end of stream after finish_current_file().

diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index fcb30c5..1b712ea 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -392,6 +392,7 @@ class CollectionWriter(CollectionBase):
                 self._current_stream_length - self._current_file_pos,
                 self._current_file_name])
         self._current_file_pos = self._current_stream_length
+        self._current_file_name = None
 
     def start_new_stream(self, newstreamname='.'):
         self.finish_current_stream()

commit 6817b34daf51030fd25459d349a940ff59eee690
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Oct 27 22:04:54 2014 -0400

    3706: In CollectionReader, try fetching manifests before/after API
    server when the ID is a Keep locator with/without a permission hint
    respectively. Do not display a warning if the manifest is found one
    way or the other.

diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 7425d80..fcb30c5 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -147,41 +147,41 @@ class CollectionReader(CollectionBase):
     def _populate(self):
         if self._streams is not None:
             return
+        error_via_api = None
+        error_via_keep = None
+        should_try_keep = (not self._manifest_text and
+                           util.keep_locator_pattern.match(
+                self._manifest_locator))
+        if (not self._manifest_text and
+            util.signed_locator_pattern.match(self._manifest_locator)):
+            try:
+                self._populate_from_keep()
+            except e:
+                error_via_keep = e
         if not self._manifest_text:
             try:
-                # As in KeepClient itself, we must wait until the last possible
-                # moment to instantiate an API client, in order to avoid
-                # tripping up clients that don't have access to an API server.
-                # If we do build one, make sure our Keep client uses it.
-                # If instantiation fails, we'll fall back to the except clause,
-                # just like any other Collection lookup failure.
-                if self._api_client is None:
-                    self._api_client = arvados.api('v1')
-                    self._keep_client = None  # Make a new one with the new api.
-                c = self._api_client.collections().get(
-                    uuid=self._manifest_locator).execute(
-                    num_retries=self.num_retries)
-                self._manifest_text = c['manifest_text']
-            except Exception as error_via_api:
-                if not util.keep_locator_pattern.match(
-                      self._manifest_locator):
+                self._populate_from_api_server()
+            except Exception as e:
+                if not should_try_keep:
                     raise
-                # _manifest_locator is a Keep locator. Perhaps we can
-                # retrieve it directly from Keep. This has a chance of
-                # working if [a] the locator includes a permission
-                # signature or [b] the Keep services are operating in
-                # world-readable mode.
-                try:
-                    self._manifest_text = self._my_keep().get(
-                        self._manifest_locator, num_retries=self.num_retries)
-                except Exception as error_via_keep:
-                    raise arvados.errors.NotFoundError(
-                        ("Failed to retrieve collection '%s' " +
-                         "from either API server (%s) or Keep (%s)."
-                         ).format(
-                                self._manifest_locator,
-                                error_via_api,
-                                error_via_keep))
+                error_via_api = e
+        if (not self._manifest_text and
+            not error_via_keep and
+            should_try_keep):
+            # Looks like a keep locator, and we didn't already try keep above
+            try:
+                self._populate_from_keep()
+            except Exception as e:
+                error_via_keep = e
+        if not self._manifest_text:
+            # Nothing worked!
+            raise arvados.errors.NotFoundError(
+                ("Failed to retrieve collection '{}' " +
+                 "from either API server ({}) or Keep ({})."
+                 ).format(
+                    self._manifest_locator,
+                    error_via_api,
+                    error_via_keep))
         self._streams = [sline.split()
                          for sline in self._manifest_text.split("\n")
                          if sline]
diff --git a/sdk/python/tests/test_collections.py b/sdk/python/tests/test_collections.py
index 98a72f6..4889e92 100644
--- a/sdk/python/tests/test_collections.py
+++ b/sdk/python/tests/test_collections.py
@@ -694,16 +694,35 @@ class CollectionReaderTestCase(unittest.TestCase, CollectionTestMixin):
                                               api_client=client)
             self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())
 
-    def test_locator_init_falls_back_to_keep(self):
-        # Reading manifests from Keep is deprecated.  Feel free to
-        # remove this test when we remove the fallback.
+    def test_locator_init_fallback_to_keep(self):
+        # crunch-job needs this to read manifests that have only ever
+        # been written to Keep.
         client = self.api_client_mock(200)
-        self.mock_get_collection(client, 404, None)
-        with tutil.mock_responses(self.DEFAULT_MANIFEST, 200):
+        with tutil.mock_responses(self.DEFAULT_MANIFEST, 404, 200):
             reader = arvados.CollectionReader(self.DEFAULT_DATA_HASH,
-                                              api_client=client, num_retries=3)
+                                              api_client=client)
             self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())
 
+    def test_uuid_init_no_fallback_to_keep(self):
+        # Do not look up a collection UUID in Keep.
+        client = self.api_client_mock(404)
+        reader = arvados.CollectionReader(self.DEFAULT_UUID,
+                                          api_client=client)
+        with tutil.mock_responses(self.DEFAULT_MANIFEST, 200):
+            with self.assertRaises(arvados.errors.ApiError):
+                reader.manifest_text()
+
+    def test_try_keep_first_if_permission_hint(self):
+        # To verify that CollectionReader tries Keep first here, we
+        # mock API server to return the wrong data.
+        client = self.api_client_mock(200)
+        with tutil.mock_responses(self.DEFAULT_MANIFEST, 200):
+            self.assertEqual(
+                self.DEFAULT_MANIFEST,
+                arvados.CollectionReader(
+                    self.DEFAULT_DATA_HASH + '+Affffffffffffffffffffffffffffffffffffffff at fedcba98',
+                    api_client=client).manifest_text())
+
     def test_init_num_retries_propagated(self):
         # More of an integration test...
         client = self.api_client_mock(200)

commit 47853db92d6e1b9eca4beddf471189ee9f4e0217
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Oct 27 13:35:28 2014 -0400

    3706: Silence fallback-to-keep warning, show both errors (API and Keep) if both fail.

diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 782e85c..7425d80 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -162,16 +162,26 @@ class CollectionReader(CollectionBase):
                     uuid=self._manifest_locator).execute(
                     num_retries=self.num_retries)
                 self._manifest_text = c['manifest_text']
-            except Exception as e:
-                if not util.portable_data_hash_pattern.match(
+            except Exception as error_via_api:
+                if not util.keep_locator_pattern.match(
                       self._manifest_locator):
                     raise
-                _logger.warning(
-                    "API server did not return Collection '%s'. " +
-                    "Trying to fetch directly from Keep (deprecated).",
-                    self._manifest_locator)
-                self._manifest_text = self._my_keep().get(
-                    self._manifest_locator, num_retries=self.num_retries)
+                # _manifest_locator is a Keep locator. Perhaps we can
+                # retrieve it directly from Keep. This has a chance of
+                # working if [a] the locator includes a permission
+                # signature or [b] the Keep services are operating in
+                # world-readable mode.
+                try:
+                    self._manifest_text = self._my_keep().get(
+                        self._manifest_locator, num_retries=self.num_retries)
+                except Exception as error_via_keep:
+                    raise arvados.errors.NotFoundError(
+                        ("Failed to retrieve collection '%s' " +
+                         "from either API server (%s) or Keep (%s)."
+                         ).format(
+                                self._manifest_locator,
+                                error_via_api,
+                                error_via_keep))
         self._streams = [sline.split()
                          for sline in self._manifest_text.split("\n")
                          if sline]
diff --git a/sdk/python/arvados/util.py b/sdk/python/arvados/util.py
index 2609f11..67436ae 100644
--- a/sdk/python/arvados/util.py
+++ b/sdk/python/arvados/util.py
@@ -9,6 +9,7 @@ from arvados.collection import *
 
 HEX_RE = re.compile(r'^[0-9a-fA-F]+$')
 
+keep_locator_pattern = re.compile(r'[0-9a-f]{32}\+\d+(\+\S+)*')
 portable_data_hash_pattern = re.compile(r'[0-9a-f]{32}\+\d+')
 uuid_pattern = re.compile(r'[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}')
 collection_uuid_pattern = re.compile(r'[a-z0-9]{5}-4zz18-[a-z0-9]{15}')

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list