[ARVADOS] updated: 519c5922c34da64a8cce9e4d0030892e9b4bdd83
git at public.curoverse.com
git at public.curoverse.com
Fri Feb 14 14:40:18 EST 2014
Summary of changes:
sdk/python/arvados/collection.py | 36 +++++++++++++++++++++----------
sdk/python/test_collections.py | 43 +++++++++++++++++++------------------
2 files changed, 46 insertions(+), 33 deletions(-)
via 519c5922c34da64a8cce9e4d0030892e9b4bdd83 (commit)
from 1b82cd274ecebba9302e8a06f6c9e99eaf8ec717 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 519c5922c34da64a8cce9e4d0030892e9b4bdd83
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Feb 14 14:41:22 2014 -0500
Collections are now automatically normalized on both read and write. Tests updated.
diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index b069e8d..2535669 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -36,15 +36,14 @@ def normalize(collection):
if filename not in streams[streamname]:
streams[streamname][filename] = []
streams[streamname][filename].extend(s.locators_and_ranges(f.stream_offset(), f.size()))
-
- manifest = ""
+
+ normalized_streams = []
sortedstreams = list(streams.keys())
sortedstreams.sort()
- #import pprint
- #pprint.pprint(streams)
for s in sortedstreams:
stream = streams[s]
- manifest += s.replace(' ', '\\040')
+ stream_tokens = [s]
+
sortedfiles = list(stream.keys())
sortedfiles.sort()
@@ -53,7 +52,7 @@ def normalize(collection):
for f in sortedfiles:
for b in stream[f]:
if b[StreamReader.LOCATOR] not in blocks:
- manifest += " " + b[StreamReader.LOCATOR]
+ stream_tokens.append(b[StreamReader.LOCATOR])
blocks[b[StreamReader.LOCATOR]] = streamoffset
streamoffset += b[StreamReader.BLOCKSIZE]
@@ -68,15 +67,15 @@ def normalize(collection):
if chunkoffset == current_span[1]:
current_span[1] += chunk[StreamReader.CHUNKSIZE]
else:
- manifest += " " + "{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout)
+ stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
current_span = [chunkoffset, chunkoffset + chunk[StreamReader.CHUNKSIZE]]
if current_span != None:
- manifest += " " + "{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout)
+ stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
+
+ normalized_streams.append(stream_tokens)
+ return normalized_streams
- manifest += "\n"
- manifest = manifest
- return manifest
class CollectionReader(object):
def __init__(self, manifest_locator_or_text):
@@ -111,6 +110,18 @@ class CollectionReader(object):
if stream_line != '':
stream_tokens = stream_line.split()
self._streams += [stream_tokens]
+ self._streams = normalize(self)
+
+ # now regenerate the manifest text based on the normalized stream
+
+ #print "normalizing", self._manifest_text
+ self._manifest_text = ''
+ for stream in self._streams:
+ self._manifest_text += stream[0].replace(' ', '\\040')
+ for t in stream[1:]:
+ self._manifest_text += (" " + t.replace(' ', '\\040'))
+ self._manifest_text += "\n"
+ #print "result ", self._manifest_text
def all_streams(self):
self._populate()
@@ -258,6 +269,7 @@ class CollectionWriter(object):
def finish(self):
return Keep.put(self.manifest_text())
+
def manifest_text(self):
self.finish_current_stream()
manifest = ''
@@ -270,7 +282,7 @@ class CollectionWriter(object):
for sfile in stream[2]:
manifest += " %d:%d:%s" % (sfile[0], sfile[1], sfile[2].replace(' ', '\\040'))
manifest += "\n"
- return manifest
+ return CollectionReader(manifest).manifest_text()
def data_locators(self):
ret = []
diff --git a/sdk/python/test_collections.py b/sdk/python/test_collections.py
index c5b552e..a352527 100644
--- a/sdk/python/test_collections.py
+++ b/sdk/python/test_collections.py
@@ -34,8 +34,8 @@ class LocalCollectionWriterTest(unittest.TestCase):
cw.set_current_file_name('baz.txt')
hash = cw.finish()
self.assertEqual(hash,
- '23ca013983d6239e98931cc779e68426+114',
- 'resulting manifest hash is not what I expected')
+ 'd6c3b8e571f1b81ebb150a45ed06c884+114',
+ "resulting manifest hash was {0}, expecting d6c3b8e571f1b81ebb150a45ed06c884+114".format(hash))
class LocalCollectionReaderTest(unittest.TestCase):
def setUp(self):
@@ -47,8 +47,8 @@ class LocalCollectionReaderTest(unittest.TestCase):
for s in cr.all_streams():
for f in s.all_files():
got += [[f.size(), f.stream_name(), f.name(), f.read(2**26)]]
- expected = [[3, '.', 'foo.txt', 'foo'],
- [3, '.', 'bar.txt', 'bar'],
+ expected = [[3, '.', 'bar.txt', 'bar'],
+ [3, '.', 'foo.txt', 'foo'],
[3, './baz', 'baz.txt', 'baz']]
self.assertEqual(got,
expected,
@@ -73,14 +73,14 @@ class LocalCollectionManifestSubsetTest(unittest.TestCase):
LocalCollectionWriterTest().runTest()
def runTest(self):
self._runTest('23ca013983d6239e98931cc779e68426+114',
- [[3, '.', 'foo.txt', 'foo'],
- [3, '.', 'bar.txt', 'bar'],
+ [[3, '.', 'bar.txt', 'bar'],
+ [3, '.', 'foo.txt', 'foo'],
[3, './baz', 'baz.txt', 'baz']])
self._runTest((". %s %s 0:3:foo.txt 3:3:bar.txt\n" %
(arvados.Keep.put("foo"),
arvados.Keep.put("bar"))),
- [[3, '.', 'foo.txt', 'foo'],
- [3, '.', 'bar.txt', 'bar']])
+ [[3, '.', 'bar.txt', 'bar'],
+ [3, '.', 'foo.txt', 'foo']])
self._runTest((". %s %s 0:2:fo.txt 2:4:obar.txt\n" %
(arvados.Keep.put("foo"),
arvados.Keep.put("bar"))),
@@ -89,10 +89,11 @@ class LocalCollectionManifestSubsetTest(unittest.TestCase):
self._runTest((". %s %s 0:2:fo.txt 2:0:zero.txt 2:2:ob.txt 4:2:ar.txt\n" %
(arvados.Keep.put("foo"),
arvados.Keep.put("bar"))),
- [[2, '.', 'fo.txt', 'fo'],
- [0, '.', 'zero.txt', ''],
- [2, '.', 'ob.txt', 'ob'],
- [2, '.', 'ar.txt', 'ar']])
+ [[2, '.', 'ar.txt', 'ar'],
+ [2, '.', 'fo.txt', 'fo'],
+ [1, '.', 'ob.txt', 'o'],
+ [1, '.', 'ob.txt', 'b'],
+ [0, '.', 'zero.txt', ''],])
def _runTest(self, collection, expected):
cr = arvados.CollectionReader(collection)
manifest_subsets = []
@@ -146,7 +147,7 @@ class LocalCollectionEmptyFileTest(unittest.TestCase):
cw.start_new_stream('foo')
cw.start_new_file('zero.txt')
cw.write('')
- self.check_manifest_file_sizes(cw.manifest_text(), [0,1,0])
+ self.check_manifest_file_sizes(cw.manifest_text(), [1,0,0])
def check_manifest_file_sizes(self, manifest_text, expect_sizes):
cr = arvados.CollectionReader(manifest_text)
got_sizes = []
@@ -214,25 +215,25 @@ class NormalizedCollectionTest(unittest.TestCase):
m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
- self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m1)),
+ self.assertEqual(arvados.CollectionReader(m1).manifest_text(),
""". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt
""")
m2 = """. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
"""
- self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m2)), m2)
+ self.assertEqual(arvados.CollectionReader(m2).manifest_text(), m2)
m3 = """. 5348b82a029fd9e971a811ce1f71360b+43 3:40:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
- self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m3)),
+ self.assertEqual(arvados.CollectionReader(m3).manifest_text(),
""". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt
""")
m4 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar"""
- self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m4)),
+ self.assertEqual(arvados.CollectionReader(m4).manifest_text(),
"""./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")
@@ -240,19 +241,19 @@ class NormalizedCollectionTest(unittest.TestCase):
m5 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar"""
- self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m5)),
+ self.assertEqual(arvados.CollectionReader(m5).manifest_text(),
"""./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:6:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")
with open('testdata/1000G_ref_manifest') as f6:
m6 = f6.read()
- self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m6)), m6)
+ self.assertEqual(arvados.CollectionReader(m6).manifest_text(), m6)
with open('testdata/jlake_manifest') as f7:
m7 = f7.read()
- self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m7)), m7)
+ self.assertEqual(arvados.CollectionReader(m7).manifest_text(), m7)
m8 = """./a\\040b\\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\\040world.txt
"""
- self.assertEqual(arvados.collection.normalize(arvados.CollectionReader(m8)), m8)
+ self.assertEqual(arvados.CollectionReader(m8).manifest_text(), m8)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list