[ARVADOS] updated: 519c5922c34da64a8cce9e4d0030892e9b4bdd83

git at public.curoverse.com git at public.curoverse.com
Fri Feb 14 14:40:18 EST 2014


Summary of changes:
 sdk/python/arvados/collection.py |   36 +++++++++++++++++++++----------
 sdk/python/test_collections.py   |   43 +++++++++++++++++++------------------
 2 files changed, 46 insertions(+), 33 deletions(-)

       via  519c5922c34da64a8cce9e4d0030892e9b4bdd83 (commit)
      from  1b82cd274ecebba9302e8a06f6c9e99eaf8ec717 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 519c5922c34da64a8cce9e4d0030892e9b4bdd83
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Feb 14 14:41:22 2014 -0500

    Collections are now automatically normalized on both read and write.  Tests updated.

diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index b069e8d..2535669 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -36,15 +36,14 @@ def normalize(collection):
             if filename not in streams[streamname]:
                 streams[streamname][filename] = []
             streams[streamname][filename].extend(s.locators_and_ranges(f.stream_offset(), f.size()))
-            
-    manifest = ""
+
+    normalized_streams = []
     sortedstreams = list(streams.keys())
     sortedstreams.sort()
-    #import pprint
-    #pprint.pprint(streams)
     for s in sortedstreams:
         stream = streams[s]
-        manifest += s.replace(' ', '\\040')
+        stream_tokens = [s]
+
         sortedfiles = list(stream.keys())
         sortedfiles.sort()
 
@@ -53,7 +52,7 @@ def normalize(collection):
         for f in sortedfiles:
             for b in stream[f]:
                 if b[StreamReader.LOCATOR] not in blocks:
-                    manifest += " " + b[StreamReader.LOCATOR]
+                    stream_tokens.append(b[StreamReader.LOCATOR])
                     blocks[b[StreamReader.LOCATOR]] = streamoffset
                     streamoffset += b[StreamReader.BLOCKSIZE]
 
@@ -68,15 +67,15 @@ def normalize(collection):
                     if chunkoffset == current_span[1]:
                         current_span[1] += chunk[StreamReader.CHUNKSIZE]
                     else:
-                        manifest += " " + "{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout)
+                        stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
                         current_span = [chunkoffset, chunkoffset + chunk[StreamReader.CHUNKSIZE]]
 
             if current_span != None:
-                manifest += " " + "{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout)
+                stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
+
+        normalized_streams.append(stream_tokens)
+    return normalized_streams
 
-        manifest += "\n"
-    manifest = manifest
-    return manifest
 
 class CollectionReader(object):
     def __init__(self, manifest_locator_or_text):
@@ -111,6 +110,18 @@ class CollectionReader(object):
             if stream_line != '':
                 stream_tokens = stream_line.split()
                 self._streams += [stream_tokens]
+        self._streams = normalize(self)
+
+        # now regenerate the manifest text based on the normalized stream
+
+        #print "normalizing", self._manifest_text        
+        self._manifest_text = ''
+        for stream in self._streams:
+            self._manifest_text += stream[0].replace(' ', '\\040')
+            for t in stream[1:]:
+                self._manifest_text += (" " + t.replace(' ', '\\040'))
+            self._manifest_text += "\n"
+        #print "result     ", self._manifest_text
 
     def all_streams(self):
         self._populate()
@@ -258,6 +269,7 @@ class CollectionWriter(object):
     def finish(self):
         return Keep.put(self.manifest_text())
 
+
     def manifest_text(self):
         self.finish_current_stream()
         manifest = ''
@@ -270,7 +282,7 @@ class CollectionWriter(object):
             for sfile in stream[2]:
                 manifest += " %d:%d:%s" % (sfile[0], sfile[1], sfile[2].replace(' ', '\\040'))
             manifest += "\n"
-        return manifest
+        return CollectionReader(manifest).manifest_text()
 
     def data_locators(self):
         ret = []
diff --git a/sdk/python/test_collections.py b/sdk/python/test_collections.py
index c5b552e..a352527 100644
--- a/sdk/python/test_collections.py
+++ b/sdk/python/test_collections.py
@@ -34,8 +34,8 @@ class LocalCollectionWriterTest(unittest.TestCase):
         cw.set_current_file_name('baz.txt')
         hash = cw.finish()
         self.assertEqual(hash,
-                         '23ca013983d6239e98931cc779e68426+114',
-                         'resulting manifest hash is not what I expected')
+                         'd6c3b8e571f1b81ebb150a45ed06c884+114',
+                         "resulting manifest hash was {0}, expecting d6c3b8e571f1b81ebb150a45ed06c884+114".format(hash))
 
 class LocalCollectionReaderTest(unittest.TestCase):
     def setUp(self):
@@ -47,8 +47,8 @@ class LocalCollectionReaderTest(unittest.TestCase):
         for s in cr.all_streams():
             for f in s.all_files():
                 got += [[f.size(), f.stream_name(), f.name(), f.read(2**26)]]
-        expected = [[3, '.', 'foo.txt', 'foo'],
-                    [3, '.', 'bar.txt', 'bar'],
+        expected = [[3, '.', 'bar.txt', 'bar'],
+                    [3, '.', 'foo.txt', 'foo'],
                     [3, './baz', 'baz.txt', 'baz']]
         self.assertEqual(got,
                          expected,
@@ -73,14 +73,14 @@ class LocalCollectionManifestSubsetTest(unittest.TestCase):
         LocalCollectionWriterTest().runTest()
     def runTest(self):
         self._runTest('23ca013983d6239e98931cc779e68426+114',
-                      [[3, '.', 'foo.txt', 'foo'],
-                       [3, '.', 'bar.txt', 'bar'],
+                      [[3, '.',     'bar.txt', 'bar'],
+                       [3, '.',     'foo.txt', 'foo'],
                        [3, './baz', 'baz.txt', 'baz']])
         self._runTest((". %s %s 0:3:foo.txt 3:3:bar.txt\n" %
                        (arvados.Keep.put("foo"),
                         arvados.Keep.put("bar"))),
-                      [[3, '.', 'foo.txt', 'foo'],
-                       [3, '.', 'bar.txt', 'bar']])
+                      [[3, '.', 'bar.txt', 'bar'],
+                       [3, '.', 'foo.txt', 'foo']])
         self._runTest((". %s %s 0:2:fo.txt 2:4:obar.txt\n" %
                        (arvados.Keep.put("foo"),
                         arvados.Keep.put("bar"))),
@@ -89,10 +89,11 @@ class LocalCollectionManifestSubsetTest(unittest.TestCase):
         self._runTest((". %s %s 0:2:fo.txt 2:0:zero.txt 2:2:ob.txt 4:2:ar.txt\n" %
                        (arvados.Keep.put("foo"),
                         arvados.Keep.put("bar"))),
-                      [[2, '.', 'fo.txt', 'fo'],
-                       [0, '.', 'zero.txt', ''],
-                       [2, '.', 'ob.txt', 'ob'],
-                       [2, '.', 'ar.txt', 'ar']])
+                      [[2, '.', 'ar.txt', 'ar'],
+                       [2, '.', 'fo.txt', 'fo'],                       
+                       [1, '.', 'ob.txt', 'o'],
+                       [1, '.', 'ob.txt', 'b'],
+                       [0, '.', 'zero.txt', ''],])
     def _runTest(self, collection, expected):
         cr = arvados.CollectionReader(collection)
         manifest_subsets = []
@@ -146,7 +147,7 @@ class LocalCollectionEmptyFileTest(unittest.TestCase):
         cw.start_new_stream('foo')
         cw.start_new_file('zero.txt')
         cw.write('')
-        self.check_manifest_file_sizes(cw.manifest_text(), [0,1,0])
+        self.check_manifest_file_sizes(cw.manifest_text(), [1,0,0])
     def check_manifest_file_sizes(self, manifest_text, expect_sizes):
         cr = arvados.CollectionReader(manifest_text)
         got_sizes = []
@@ -214,25 +215,25 @@ class NormalizedCollectionTest(unittest.TestCase):
         m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
 . 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
 . 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
-        self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m1)),
+        self.assertEqual(arvados.CollectionReader(m1).manifest_text(),
                          """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt
 """)
 
         m2 = """. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
 """
-        self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m2)), m2)
+        self.assertEqual(arvados.CollectionReader(m2).manifest_text(), m2)
 
         m3 = """. 5348b82a029fd9e971a811ce1f71360b+43 3:40:md5sum.txt
 . 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
 . 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
-        self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m3)),
+        self.assertEqual(arvados.CollectionReader(m3).manifest_text(),
                          """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt
 """)
 
         m4 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
 ./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
 ./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar"""
-        self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m4)),
+        self.assertEqual(arvados.CollectionReader(m4).manifest_text(),
                          """./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
 ./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
 """)
@@ -240,19 +241,19 @@ class NormalizedCollectionTest(unittest.TestCase):
         m5 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
 ./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
 ./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar"""
-        self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m5)),
+        self.assertEqual(arvados.CollectionReader(m5).manifest_text(),
                          """./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:6:bar
 ./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
 """)
 
         with open('testdata/1000G_ref_manifest') as f6:
             m6 = f6.read()
-            self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m6)), m6)
+            self.assertEqual(arvados.CollectionReader(m6).manifest_text(), m6)
 
         with open('testdata/jlake_manifest') as f7:
             m7 = f7.read()
-            self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m7)), m7)
+            self.assertEqual(arvados.CollectionReader(m7).manifest_text(), m7)
 
         m8 = """./a\\040b\\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\\040world.txt
 """
-        self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m8)), m8)
+        self.assertEqual(arvados.CollectionReader(m8).manifest_text(), m8)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list