[ARVADOS] updated: 67c0bb9abe2f9069761a271145ae48368ee3d7d6

Git user git@public.curoverse.com
Fri Jul 8 18:25:13 EDT 2016


Summary of changes:
 sdk/python/arvados/arvfile.py      |  13 ++-
 sdk/python/arvados/collection.py   |  21 +++-
 sdk/python/arvados/commands/put.py | 192 +++++++++++++++++--------------------
 sdk/python/tests/test_arv_put.py   | 184 +++++++++++++++++++++--------------
 sdk/python/tests/test_arvfile.py   |  28 ++++--
 5 files changed, 244 insertions(+), 194 deletions(-)
 mode change 100644 => 100755 sdk/python/tests/test_arvfile.py

       via  67c0bb9abe2f9069761a271145ae48368ee3d7d6 (commit)
       via  44c2e41fec0beeab4544103d0a4afb9a775ed706 (commit)
       via  5139cff2d5425debd948131e0f60242845ea0b61 (commit)
       via  82a01df9d9f4f8961978a1383f4b6c09e73fe28d (commit)
      from  dcb4db28681b6949a56a1de579891cb375c423fe (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 67c0bb9abe2f9069761a271145ae48368ee3d7d6
Author: Lucas Di Pentima <lucas@curoverse.com>
Date:   Fri Jul 8 19:23:54 2016 -0300

    9463: Finished first draft of the arv-put command's use of the Collection class. Also, resuming of partial uploads is working; tests were written to validate this. Refs #9463 #8910
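
    For reference, the resume path added to ArvPutCollection.write_file()
    boils down to the following sketch (illustrative only, condensed from
    the diff below; error handling and cache bookkeeping omitted):

        resume_offset = 0
        try:
            collection_file = self.collection.find(filename)
        except IOError:
            collection_file = None              # not in the collection yet
        if collection_file:
            if os.path.getsize(source) == collection_file.size():
                self.bytes_skipped += os.path.getsize(source)
                return                          # fully uploaded, skip it
            elif os.path.getsize(source) > collection_file.size():
                resume_offset = collection_file.size()  # partial upload
                self.bytes_skipped += resume_offset
        with open(source, 'r') as source_fd:
            output = self.collection.open(filename, 'a' if resume_offset else 'w')
            source_fd.seek(resume_offset)
            self._write(source_fd, output)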

diff --git a/sdk/python/arvados/commands/put.py b/sdk/python/arvados/commands/put.py
index 6bb1a0b..b3e1c5f 100755
--- a/sdk/python/arvados/commands/put.py
+++ b/sdk/python/arvados/commands/put.py
@@ -283,6 +283,7 @@ class ArvPutCollectionCache(object):
         md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
         realpaths = sorted(os.path.realpath(path) for path in paths)
         self.files = {}
+        self.bytes_written = 0 # Approximate number of bytes already uploaded (partially uploaded files are counted in full)
         for path in realpaths:
             self._get_file_data(path)
         # Only hash args paths
@@ -295,6 +296,8 @@ class ArvPutCollectionCache(object):
         self._lock_file(self.cache_file)
         self.filename = self.cache_file.name
         self.data = self._load()
+        for f in self.data['uploaded'].values():
+            self.bytes_written += f['size']
     
     def _load(self):
         try:
@@ -312,14 +315,14 @@ class ArvPutCollectionCache(object):
         """
         Atomically save
         """
-        # TODO: Should be a good idea to avoid _save() spamming? when writing 
-        # lots of small files.
         try:
             new_cache_fd, new_cache_name = tempfile.mkstemp(
                 dir=os.path.dirname(self.filename))
             self._lock_file(new_cache_fd)
             new_cache = os.fdopen(new_cache_fd, 'r+')
             json.dump(self.data, new_cache)
+            new_cache.flush()
+            os.fsync(new_cache)
             os.rename(new_cache_name, self.filename)
         except (IOError, OSError, ResumeCacheConflict) as error:
             try:
@@ -331,11 +334,9 @@ class ArvPutCollectionCache(object):
             self.cache_file = new_cache
     
     def file_uploaded(self, path):
-        print "About to register an uploaded file: %s" % path
         if path in self.files.keys():
             self.data['uploaded'][path] = self.files[path]
             self._save()
-            print "Already registered the uploaded file!"
     
     def set_collection(self, loc):
         self.data['col_locator'] = loc
@@ -413,73 +414,51 @@ class ArvPutCollectionCache(object):
                 raise
         self.close()
 
-class ArvPutUploader(object):
-    def __init__(self, paths, reporter=None):
-        expected_bytes = expected_bytes_for(paths)
-        self.cache = ArvPutCollectionCache(paths)
-        self.paths = paths
-        self.already_uploaded = False
-        # if self.cache.collection() is not None:
-        #     self.collection = ArvPutCollection(
-        #                         locator=self.cache.collection(),
-        #                         cache=self.cache,
-        #                         reporter=reporter,
-        #                         bytes_expected=expected_bytes)
-        # else:
-        self.collection = ArvPutCollection(
-                            cache=self.cache, 
-                            reporter=reporter,
-                            bytes_expected=expected_bytes)
-            # self.cache.set_collection(self.collection.manifest_locator())
-    
-    def do_upload(self):
-        if not self.already_uploaded:
-            for p in paths:
-                if os.path.isdir(p):
-                    self.collection.write_directory_tree(p)
-                elif os.path.isfile(p):
-                    self.collection.write_file(p, os.path.basename(p))
-            self.cache.destroy()
-            self.already_uploaded = True
-    
-    def manifest(self):
-        return self.collection.manifest()
-    
-    def bytes_written(self):
-        return self.collection.bytes_written
-
 
 class ArvPutCollection(object):
     def __init__(self, cache=None, reporter=None, bytes_expected=None, 
                 name=None, owner_uuid=None, ensure_unique_name=False, 
-                num_retries=None, replication=None):
+                num_retries=None, write_copies=None, replication=None,
+                should_save=True):
         self.collection_flush_time = 60 # Secs
         self.bytes_written = 0
+        self.bytes_skipped = 0
         self.cache = cache
         self.reporter = reporter
-        self.num_retries=num_retries
+        self.num_retries = num_retries
+        self.write_copies = write_copies
+        self.replication = replication
         self.bytes_expected = bytes_expected
+        self.should_save = should_save
         
         locator = self.cache.collection() if self.cache else None
         
         if locator is None:
-            self.collection = arvados.collection.Collection()
-            self.collection.save_new(name=name, owner_uuid=owner_uuid, 
-                                        ensure_unique_name=ensure_unique_name,
-                                        num_retries=num_retries)
-            if self.cache:
-                self.cache.set_collection(self.collection.manifest_locator())
+            self.collection = arvados.collection.Collection(
+                                        num_write_copies=self.write_copies)
+            if self.should_save:
+                self.collection.save_new(name=name, owner_uuid=owner_uuid, 
+                                            ensure_unique_name=ensure_unique_name,
+                                            num_retries=num_retries,
+                                            replication_desired=self.replication)
+                if self.cache:
+                    self.cache.set_collection(self.collection.manifest_locator())
         else:
-            self.collection = arvados.collection.Collection(locator)
+            self.collection = arvados.collection.Collection(locator,
+                                        num_write_copies=self.write_copies)
+    
+    def name(self):
+        return self.collection.api_response()['name'] if self.collection.api_response() else None
     
     def save(self):
-        self.collection.save(num_retries=self.num_retries)
+        if self.should_save:
+            self.collection.save(num_retries=self.num_retries)
     
     def manifest_locator(self):
         return self.collection.manifest_locator()
     
     def portable_data_hash(self):
-        return self.collectin.portable_data_hash()
+        return self.collection.portable_data_hash()
     
     def manifest_text(self, stream_name=".", strip=False, normalize=False):
         return self.collection.manifest_text(stream_name, strip, normalize)
@@ -494,15 +473,14 @@ class ArvPutCollection(object):
             output.flush() # Commit block to Keep
             self.bytes_written += len(data)
             # Is it time to update the collection?
-            if (time.time() - start_time) > self.collection_flush_time:
+            if self.should_save and ((time.time() - start_time) > self.collection_flush_time):
                 self.collection.save(num_retries=self.num_retries)
                 start_time = time.time()
             # Once a block is written on each file, mark it as uploaded on the cache
-            if first_block:
+            if self.should_save and first_block:
                 if self.cache:
                     self.cache.file_uploaded(source_fd.name)
                     self.collection.save(num_retries=self.num_retries)
-                    print "FLUSHED COLLECTION!!!"
                 first_block = False
             self.report_progress()
     
@@ -511,35 +489,31 @@ class ArvPutCollection(object):
             output = c.open(filename, 'w')
             self._write(sys.stdin, output)
             output.close()
-            self.collection.save()
+            if self.should_save:
+                self.collection.save()
     
     def write_file(self, source, filename):
         if self.cache and source in self.cache.dirty_files():
-            print "DIRTY: Removing file %s from collection to be uploaded again" % source
             self.collection.remove(filename)
         
         resume_offset = 0
         resume_upload = False
         try:
-            print "FIND file %s" % filename
             collection_file = self.collection.find(filename)
         except IOError:
             # Not found
             collection_file = None
         
         if collection_file:
-            print "File %s already in the collection, checking!" % source
             if os.path.getsize(source) == collection_file.size():
-                print "WARNING: file %s already uploaded, skipping!" % source
                 # File already there, skip it.
-                self.bytes_written += os.path.getsize(source)
+                self.bytes_skipped += os.path.getsize(source)
                 return
             elif os.path.getsize(source) > collection_file.size():
-                print "WARNING: RESUMING file %s" % source
                 # File partially uploaded, resume!
                 resume_upload = True
                 resume_offset = collection_file.size()
-                self.bytes_written += resume_offset
+                self.bytes_skipped += resume_offset
             else:
                 # Source file smaller than uploaded file, what happened here?
                 # TODO: Raise exception of some kind?
@@ -548,38 +522,62 @@ class ArvPutCollection(object):
         with open(source, 'r') as source_fd:
             with self.collection as c:
                 if resume_upload:
-                    print "Resuming file, source: %s, filename: %s" % (source, filename)
                     output = c.open(filename, 'a')
                     source_fd.seek(resume_offset)
                     first_block = False
                 else:
-                    print "Writing file, source: %s, filename: %s" % (source, filename)
                     output = c.open(filename, 'w')
                     first_block = True
                 
                 self._write(source_fd, output, first_block)
                 output.close()
-                self.collection.save() # One last save...
+                if self.should_save:
+                    self.collection.save() # One last save...
 
     def write_directory_tree(self, path, stream_name='.'):
-        if os.path.isdir(path):
-            for item in os.listdir(path):
-                print "Checking path: '%s' - stream_name: '%s'" % (path, stream_name)
-                if os.path.isdir(os.path.join(path, item)):
-                    self.write_directory_tree(os.path.join(path, item), 
-                                    os.path.join(stream_name, item))
-                else:
-                    self.write_file(os.path.join(path, item), 
-                                    os.path.join(stream_name, item))
+        # TODO: Check what happens when multiple directories are passed as arguments
+        # If the code below is uncommented, the integration test
+        # test_ArvPutSignedManifest (tests.test_arv_put.ArvPutIntegrationTest) fails,
+        # presumably because the manifest_uuid changes when the directory name is
+        # added to stream_name.
+        #
+        # if stream_name == '.':
+        #     stream_name = os.path.join('.', os.path.basename(path))
+        for item in os.listdir(path):
+            if os.path.isdir(os.path.join(path, item)):
+                self.write_directory_tree(os.path.join(path, item), 
+                                os.path.join(stream_name, item))
+            else:
+                self.write_file(os.path.join(path, item), 
+                                os.path.join(stream_name, item))
 
-    def manifest(self):
-        print "BLOCK SIZE: %d" % arvados.config.KEEP_BLOCK_SIZE
-        print "MANIFEST Locator:\n%s\nMANIFEST TEXT:\n%s" % (self.manifest_locator(), self.collection.manifest_text())
-        return True
-    
     def report_progress(self):
         if self.reporter is not None:
-            self.reporter(self.bytes_written, self.bytes_expected)
+            self.reporter(self.bytes_written+self.bytes_skipped, self.bytes_expected)
+    
+    def _datablocks_on_item(self, item):
+        """
+        Return a list of datablock locators, recursively navigating
+        through subcollections
+        """
+        if isinstance(item, arvados.arvfile.ArvadosFile):
+            locators = []
+            for segment in item.segments():
+                loc = segment.locator
+                if loc.startswith("bufferblock"):
+                    loc = self._bufferblocks[loc].calculate_locator()
+                locators.append(loc)
+            return locators
+        elif isinstance(item, arvados.collection.Collection) or isinstance(item, arvados.collection.Subcollection):
+            l = [self._datablocks_on_item(x) for x in item.values()]
+            # Fast list flattener method taken from: 
+            # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
+            return [loc for sublist in l for loc in sublist]
+        else:
+            return None
+    
+    def data_locators(self):
+        return self._datablocks_on_item(self.collection)
 
 
 class ArvPutCollectionWriter(arvados.ResumableCollectionWriter):
@@ -774,13 +772,16 @@ def main_new(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
                         reporter=reporter,
                         bytes_expected=bytes_expected,
                         num_retries=args.retries,
-                        replication=write_copies)
+                        write_copies=write_copies,
+                        replication=args.replication,
+                        should_save=False)
     else:
         writer = ArvPutCollection(cache=resume_cache, 
                         reporter=reporter,
                         bytes_expected=bytes_expected,
                         num_retries=args.retries,
-                        replication=write_copies,
+                        write_copies=write_copies,
+                        replication=args.replication,
                         name=collection_name,
                         owner_uuid=project_uuid,
                         ensure_unique_name=True)
@@ -790,7 +791,7 @@ def main_new(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     orig_signal_handlers = {sigcode: signal.signal(sigcode, exit_signal_handler)
                             for sigcode in CAUGHT_SIGNALS}
 
-    if writer.bytes_written > 0:  # We're resuming a previous upload. TODO
+    if resume_cache and resume_cache.bytes_written > 0:
         print >>stderr, "\n".join([
                 "arv-put: Resuming previous upload from last checkpoint.",
                 "         Use the --no-resume option to start over."])
@@ -800,7 +801,7 @@ def main_new(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
         if path == '-':
             writer.write_stdin(args.filename)
         elif os.path.isdir(path):
-            writer.write_directory_tree(path)#, os.path.join('.', os.path.basename(path))) TODO: Check what happens with multiple directories params
+            writer.write_directory_tree(path)
         else:
             writer.write_file(path, args.filename or os.path.basename(path))
 
@@ -814,36 +815,15 @@ def main_new(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
         else:
             output = writer.manifest_text()
     elif args.raw:
-        output = ','.join(writer.data_locators()) # TODO
+        output = ','.join(writer.data_locators())
     else:
         try:
-    #         manifest_text = writer.manifest_text()
-    #         if args.normalize:
-    #             manifest_text = arvados.collection.CollectionReader(manifest_text).manifest_text(normalize=True)
-    #         replication_attr = 'replication_desired'
-    #         if api_client._schema.schemas['Collection']['properties'].get(replication_attr, None) is None:
-    #             # API called it 'redundancy' before #3410.
-    #             replication_attr = 'redundancy'
-    #         # Register the resulting collection in Arvados.
-    #         collection = api_client.collections().create(
-    #             body={
-    #                 'owner_uuid': project_uuid,
-    #                 'name': collection_name,
-    #                 'manifest_text': manifest_text,
-    #                 replication_attr: args.replication,
-    #                 },
-    #             ensure_unique_name=True
-    #             ).execute(num_retries=args.retries)
-    #
-    #         print >>stderr, "Collection saved as '%s'" % collection['name']
-    #
             writer.save()
+            print >>stderr, "Collection saved as '%s'" % writer.name()
             if args.portable_data_hash:
                 output = writer.portable_data_hash()
             else:
                 output = writer.manifest_locator()
-            with open('/tmp/lucas.txt', 'w') as f:
-                f.write(output)
 
         except apiclient_errors.Error as error:
             print >>stderr, (
diff --git a/sdk/python/tests/test_arv_put.py b/sdk/python/tests/test_arv_put.py
index 0990075..a373de9 100755
--- a/sdk/python/tests/test_arv_put.py
+++ b/sdk/python/tests/test_arv_put.py
@@ -8,12 +8,15 @@ import pwd
 import re
 import shutil
 import subprocess
-import multiprocessing
 import sys
 import tempfile
 import time
 import unittest
 import yaml
+import multiprocessing
+import shutil
+import hashlib
+import random
 
 from cStringIO import StringIO
 
@@ -238,81 +241,116 @@ class ArvadosPutResumeCacheTest(ArvadosBaseTestCase):
 class ArvadosPutCollectionTest(run_test_server.TestCaseWithServers):
     MAIN_SERVER = {}
     KEEP_SERVER = {}
-    import shutil
-        
-    # def test_write_files(self):
-    #     c = arv_put.ArvPutCollection()
-    #     data = 'a' * 1024 * 1024 # 1 MB
-    #     tmpdir = tempfile.mkdtemp()
-    #     for size in [1, 10, 64, 128]:
-    #         with open(os.path.join(tmpdir, 'file_%d' % size), 'w') as f:
-    #             for _ in range(size):
-    #                 f.write(data)
-    #         c.write_file(f.name, os.path.basename(f.name))
-    #     shutil.rmtree(tmpdir)
-    #     self.assertEqual(True, c.manifest())
-    #
-    # def test_write_directory(self):
-    #     data = 'b' * 1024 * 1024
-    #     tmpdir = tempfile.mkdtemp()
-    #     for size in [1, 5, 10, 70]:
-    #         with open(os.path.join(tmpdir, 'file_%d' % size), 'w') as f:
-    #             for _ in range(size):
-    #                 f.write(data)
-    #     os.mkdir(os.path.join(tmpdir, 'subdir1'))
-    #     for size in [2, 4, 6]:
-    #         with open(os.path.join(tmpdir, 'subdir1', 'file_%d' % size), 'w') as f:
-    #             for _ in range(size):
-    #                 f.write(data)
-    #     c = arv_put.ArvPutUploader([tmpdir])
-    #     shutil.rmtree(tmpdir)
-    #     self.assertEqual(True, c.manifest())
+    
+    def setUp(self):
+        self.lock = multiprocessing.Lock()
 
     def fake_reporter(self, written, expected):
-        # Use this callback as a intra-block pause to be able to simulate an interruption
-        print "Written %d / %d bytes" % (written, expected)
-        time.sleep(10)
+        self.lock.release() # Allow caller process to terminate() us...
     
-    def bg_uploader(self, paths):
-        return arv_put.ArvPutUploader(paths, reporter=self.fake_reporter)
-
-    # def test_resume_large_file_upload(self):
-    #     import multiprocessing
-    #     data = 'x' * 1024 * 1024 # 1 MB
-    #     _, filename = tempfile.mkstemp()
-    #     fileobj = open(filename, 'w')
-    #     for _ in range(200):
-    #         fileobj.write(data)
-    #     fileobj.close()
-    #     uploader = multiprocessing.Process(target=self.bg_uploader, args=([filename],))
-    #     uploader.start()
-    #     time.sleep(5)
-    #     uploader.terminate()
-    #     time.sleep(1)
-    #     # cache = arv_put.ArvPutCollectionCache([filename])
-    #     # print "Collection detected: %s" % cache.collection()
-    #     # c = arv_put.ArvPutCollection(locator=cache.collection(), cache=cache)
-    #     # print "UPLOADED: %d" % c.collection[os.path.basename(filename)].size()
-    #     # self.assertLess(c.collection[os.path.basename(filename)].size(), os.path.getsize(filename))
-    #     os.unlink(filename)
-
-    # def test_write_directory_twice(self):
-    #     data = 'b' * 1024 * 1024
-    #     tmpdir = tempfile.mkdtemp()
-    #     for size in [1, 5, 10, 70]:
-    #         with open(os.path.join(tmpdir, 'file_%d' % size), 'w') as f:
-    #             for _ in range(size):
-    #                 f.write(data)
-    #     os.mkdir(os.path.join(tmpdir, 'subdir1'))
-    #     for size in [2, 4, 6]:
-    #         with open(os.path.join(tmpdir, 'subdir1', 'file_%d' % size), 'w') as f:
-    #             for _ in range(size):
-    #                 f.write(data)
-    #     c = arv_put.ArvPutUploader([tmpdir])
-    #     d = arv_put.ArvPutUploader([tmpdir])
-    #     print "ESCRIBIERON: c: %d, d: %d" % (c.bytes_written(), d.bytes_written())
-    #     shutil.rmtree(tmpdir)
-    #     self.assertEqual(0, d.bytes_written())
+    def bg_uploader(self, filename):
+        cache = arv_put.ArvPutCollectionCache([filename])
+        c = arv_put.ArvPutCollection(reporter=self.fake_reporter, cache=cache)
+        c.collection_flush_time = 0 # flush collection on every block flush, just for this test
+        c.write_file(filename, os.path.basename(filename))
+
+    def test_write_collection_with_name(self):
+        name = 'This is a collection'
+        c = arv_put.ArvPutCollection(name=name)
+        self.assertEqual(name, c.name())
+
+    def test_write_file_on_collection_without_save(self):
+        c = arv_put.ArvPutCollection(should_save=False)
+        with tempfile.NamedTemporaryFile(delete=False) as f:
+            f.write("The quick brown fox jumped over the lazy dog")
+        c.write_file(f.name, os.path.basename(f.name))
+        self.assertEqual(None, c.manifest_locator())
+        os.unlink(f.name)
+
+    def test_write_file_and_check_data_locators(self):
+        c = arv_put.ArvPutCollection(should_save=False)
+        with tempfile.NamedTemporaryFile(delete=False) as f:
+            # Writing ~90 MB, so that it writes 2 data blocks
+            for _ in range(2 * 1024 * 1024):
+                f.write("The quick brown fox jumped over the lazy dog\n")
+        c.write_file(f.name, os.path.basename(f.name))
+        self.assertEqual(2, len(c.data_locators()))
+        os.unlink(f.name)
+        
+    def test_write_directory_and_check_data_locators(self):
+        data = 'b' * 1024 * 1024 # 1 MB
+        tmpdir = tempfile.mkdtemp()
+        for size in [1, 5, 10, 70]:
+            with open(os.path.join(tmpdir, 'file_%d' % size), 'w') as f:
+                for _ in range(size):
+                    f.write(data)
+        os.mkdir(os.path.join(tmpdir, 'subdir1'))
+        for size in [2, 4, 6]:
+            with open(os.path.join(tmpdir, 'subdir1', 'file_%d' % size), 'w') as f:
+                for _ in range(size):
+                    f.write(data)
+        c = arv_put.ArvPutCollection()
+        c.write_directory_tree(tmpdir)
+        shutil.rmtree(tmpdir)
+        self.assertEqual(8, len(c.data_locators()))
+
+    def test_resume_large_file_upload(self):
+        _, filename = tempfile.mkstemp()
+        md5_original = hashlib.md5()
+        md5_uploaded = hashlib.md5()
+        fileobj = open(filename, 'w')
+        for _ in range(70):
+            data = random.choice(['x', 'y', 'z']) * 1024 * 1024 # 1 MB
+            fileobj.write(data)
+            md5_original.update(data)
+        fileobj.close()
+        self.lock.acquire()
+        uploader = multiprocessing.Process(target=self.bg_uploader, args=(filename,))
+        uploader.start()
+        self.lock.acquire() # Returns once one block (and the collection) has been flush()ed
+        self.lock.release()
+        uploader.terminate()
+        uploader.join()
+        cache = arv_put.ArvPutCollectionCache([filename])
+        c = arv_put.ArvPutCollection(cache=cache)
+        self.assertLess(c.collection[os.path.basename(filename)].size(), os.path.getsize(filename))
+        c.write_file(filename, os.path.basename(filename))
+        uploaded = c.collection.open(os.path.basename(filename), 'r')
+        while True:
+            data = uploaded.read(1024*1024)
+            if not data:
+                break
+            md5_uploaded.update(data)
+        os.unlink(filename)
+        cache.destroy()
+        self.assertEqual(md5_original.hexdigest(), md5_uploaded.hexdigest())
+
+    def test_write_directory_twice(self):
+        data = 'b' * 1024 * 1024
+        tmpdir = tempfile.mkdtemp()
+        for size in [1, 5, 10, 70]:
+            with open(os.path.join(tmpdir, 'file_%d' % size), 'w') as f:
+                for _ in range(size):
+                    f.write(data)
+        os.mkdir(os.path.join(tmpdir, 'subdir1'))
+        for size in [2, 4, 6]:
+            with open(os.path.join(tmpdir, 'subdir1', 'file_%d' % size), 'w') as f:
+                for _ in range(size):
+                    f.write(data)
+        c_cache = arv_put.ArvPutCollectionCache([tmpdir])
+        c = arv_put.ArvPutCollection(cache=c_cache)
+        c.write_directory_tree(tmpdir)
+        c_cache.close()
+        d_cache = arv_put.ArvPutCollectionCache([tmpdir])
+        d = arv_put.ArvPutCollection(cache=d_cache)
+        d.write_directory_tree(tmpdir)
+        d_cache.close()
+        c_cache.destroy()
+        d_cache.destroy()
+        shutil.rmtree(tmpdir)
+        self.assertNotEqual(c.bytes_written, d.bytes_written)
+        # self.assertGreater(c.bytes_written, 0)
+        # self.assertEqual(d.bytes_written, 0)
         
 
 class ArvadosPutCollectionWriterTest(run_test_server.TestCaseWithServers,

commit 44c2e41fec0beeab4544103d0a4afb9a775ed706
Author: Lucas Di Pentima <lucas@curoverse.com>
Date:   Thu Jul 7 18:32:50 2016 -0300

    9463: Fixed some integration tests broken because of the replication_desired addition to Collection
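
    Context: Collection.save_new() now probes the API discovery schema for
    a replication_desired property (see commit 82a01df below), so a mock
    API object without a _schema attribute raises AttributeError. A rough
    illustration of the lookup the stubbed MockSchema has to satisfy
    (body and response stand in for the test fixtures):

        api = ArvadosFileWriterTestCase.MockApi(body, response)
        props = api._schema.schemas['Collection']['properties']
        # save_new() only uses the attribute if the schema declares it:
        assert props.get('replication_desired', None) is not None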

diff --git a/sdk/python/tests/test_arvfile.py b/sdk/python/tests/test_arvfile.py
old mode 100644
new mode 100755
index ea86614..da8bf68
--- a/sdk/python/tests/test_arvfile.py
+++ b/sdk/python/tests/test_arvfile.py
@@ -37,6 +37,10 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
         def __init__(self, b, r):
             self.body = b
             self.response = r
+            self._schema = ArvadosFileWriterTestCase.MockApi.MockSchema()
+        class MockSchema(object):
+            def __init__(self):
+                self.schemas = {'Collection': {'properties': {'replication_desired': 2}}}
         class MockCollections(object):
             def __init__(self, b, r):
                 self.body = b
@@ -59,7 +63,8 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
     def test_truncate(self):
         keep = ArvadosFileWriterTestCase.MockKeep({"781e5e245d69b566979b86e28d23f2c7+10": "0123456789"})
         api = ArvadosFileWriterTestCase.MockApi({"name":"test_truncate",
-                                                 "manifest_text":". 781e5e245d69b566979b86e28d23f2c7+10 0:8:count.txt\n"},
+                                                 "manifest_text":". 781e5e245d69b566979b86e28d23f2c7+10 0:8:count.txt\n",
+                                                 "replication_desired":None},
                                                 {"uuid":"zzzzz-4zz18-mockcollection0",
                                                  "manifest_text":". 781e5e245d69b566979b86e28d23f2c7+10 0:8:count.txt\n"})
         with Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count.txt\n',
@@ -86,7 +91,8 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
     def test_write_to_end(self):
         keep = ArvadosFileWriterTestCase.MockKeep({"781e5e245d69b566979b86e28d23f2c7+10": "0123456789"})
         api = ArvadosFileWriterTestCase.MockApi({"name":"test_append",
-                                                 "manifest_text": ". 781e5e245d69b566979b86e28d23f2c7+10 acbd18db4cc2f85cedef654fccc4a4d8+3 0:13:count.txt\n"},
+                                                 "manifest_text": ". 781e5e245d69b566979b86e28d23f2c7+10 acbd18db4cc2f85cedef654fccc4a4d8+3 0:13:count.txt\n",
+                                                 "replication_desired":None},
                                                 {"uuid":"zzzzz-4zz18-mockcollection0",
                                                  "manifest_text": ". 781e5e245d69b566979b86e28d23f2c7+10 acbd18db4cc2f85cedef654fccc4a4d8+3 0:13:count.txt\n"})
         with Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count.txt\n',
@@ -222,7 +228,8 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
     def test_write_large(self):
         keep = ArvadosFileWriterTestCase.MockKeep({})
         api = ArvadosFileWriterTestCase.MockApi({"name":"test_write_large",
-                                                 "manifest_text": ". a5de24f4417cfba9d5825eadc2f4ca49+67108000 598cc1a4ccaef8ab6e4724d87e675d78+32892000 0:100000000:count.txt\n"},
+                                                 "manifest_text": ". a5de24f4417cfba9d5825eadc2f4ca49+67108000 598cc1a4ccaef8ab6e4724d87e675d78+32892000 0:100000000:count.txt\n",
+                                                 "replication_desired":None},
                                                 {"uuid":"zzzzz-4zz18-mockcollection0",
                                                  "manifest_text": ". a5de24f4417cfba9d5825eadc2f4ca49+67108000 598cc1a4ccaef8ab6e4724d87e675d78+32892000 0:100000000:count.txt\n"})
         with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',
@@ -313,7 +320,8 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
     def test_write_large_rewrite(self):
         keep = ArvadosFileWriterTestCase.MockKeep({})
         api = ArvadosFileWriterTestCase.MockApi({"name":"test_write_large",
-                                                 "manifest_text": ". 37400a68af9abdd76ca5bf13e819e42a+32892003 a5de24f4417cfba9d5825eadc2f4ca49+67108000 32892000:3:count.txt 32892006:67107997:count.txt 0:32892000:count.txt\n"},
+                                                 "manifest_text": ". 37400a68af9abdd76ca5bf13e819e42a+32892003 a5de24f4417cfba9d5825eadc2f4ca49+67108000 32892000:3:count.txt 32892006:67107997:count.txt 0:32892000:count.txt\n",
+                                                 "replication_desired":None},
                                                 {"uuid":"zzzzz-4zz18-mockcollection0",
                                                  "manifest_text": ". 37400a68af9abdd76ca5bf13e819e42a+32892003 a5de24f4417cfba9d5825eadc2f4ca49+67108000 32892000:3:count.txt 32892006:67107997:count.txt 0:32892000:count.txt\n"})
         with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',
@@ -335,7 +343,8 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
     def test_create(self):
         keep = ArvadosFileWriterTestCase.MockKeep({})
         api = ArvadosFileWriterTestCase.MockApi({"name":"test_create",
-                                                 "manifest_text":". 2e9ec317e197819358fbc43afca7d837+8 0:8:count.txt\n"},
+                                                 "manifest_text":". 2e9ec317e197819358fbc43afca7d837+8 0:8:count.txt\n",
+                                                 "replication_desired":None},
                                                 {"uuid":"zzzzz-4zz18-mockcollection0",
                                                  "manifest_text":". 2e9ec317e197819358fbc43afca7d837+8 0:8:count.txt\n"})
         with Collection(api_client=api, keep_client=keep) as c:
@@ -356,7 +365,8 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
     def test_create_subdir(self):
         keep = ArvadosFileWriterTestCase.MockKeep({})
         api = ArvadosFileWriterTestCase.MockApi({"name":"test_create",
-                                                 "manifest_text":"./foo/bar 2e9ec317e197819358fbc43afca7d837+8 0:8:count.txt\n"},
+                                                 "manifest_text":"./foo/bar 2e9ec317e197819358fbc43afca7d837+8 0:8:count.txt\n",
+                                                 "replication_desired":None},
                                                 {"uuid":"zzzzz-4zz18-mockcollection0",
                                                  "manifest_text":"./foo/bar 2e9ec317e197819358fbc43afca7d837+8 0:8:count.txt\n"})
         with Collection(api_client=api, keep_client=keep) as c:
@@ -371,7 +381,8 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
     def test_overwrite(self):
         keep = ArvadosFileWriterTestCase.MockKeep({"781e5e245d69b566979b86e28d23f2c7+10": "0123456789"})
         api = ArvadosFileWriterTestCase.MockApi({"name":"test_overwrite",
-                                                 "manifest_text":". 2e9ec317e197819358fbc43afca7d837+8 0:8:count.txt\n"},
+                                                 "manifest_text":". 2e9ec317e197819358fbc43afca7d837+8 0:8:count.txt\n",
+                                                 "replication_desired":None},
                                                 {"uuid":"zzzzz-4zz18-mockcollection0",
                                                  "manifest_text":". 2e9ec317e197819358fbc43afca7d837+8 0:8:count.txt\n"})
         with Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count.txt\n',
@@ -400,7 +411,8 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
     def test_create_multiple(self):
         keep = ArvadosFileWriterTestCase.MockKeep({})
         api = ArvadosFileWriterTestCase.MockApi({"name":"test_create_multiple",
-                                                 "manifest_text":". 2e9ec317e197819358fbc43afca7d837+8 e8dc4081b13434b45189a720b77b6818+8 0:8:count1.txt 8:8:count2.txt\n"},
+                                                 "manifest_text":". 2e9ec317e197819358fbc43afca7d837+8 e8dc4081b13434b45189a720b77b6818+8 0:8:count1.txt 8:8:count2.txt\n",
+                                                 "replication_desired":None},
                                                 {"uuid":"zzzzz-4zz18-mockcollection0",
                                                  "manifest_text":". 2e9ec317e197819358fbc43afca7d837+8 e8dc4081b13434b45189a720b77b6818+8 0:8:count1.txt 8:8:count2.txt\n"})
         with Collection(api_client=api, keep_client=keep) as c:

commit 5139cff2d5425debd948131e0f60242845ea0b61
Author: Lucas Di Pentima <lucas@curoverse.com>
Date:   Thu Jul 7 15:44:16 2016 -0300

    9463: Added a way for Collection to tell _BlockManager how many copies to write when uploading blocks to Keep
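
    Reduced to its essentials, the new parameter flows as in this sketch
    (keep_client and data are placeholders, not the committed code):

        bm = _BlockManager(keep_client, copies=3)
        # ...later, when a buffer block is committed to Keep:
        if bm.num_put_copies is None:
            loc = keep_client.put(data)         # Keep client default
        else:
            loc = keep_client.put(data, copies=bm.num_put_copies)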

diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index b78c63e..90db6ce 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -402,7 +402,7 @@ class _BlockManager(object):
     DEFAULT_PUT_THREADS = 2
     DEFAULT_GET_THREADS = 2
 
-    def __init__(self, keep):
+    def __init__(self, keep, copies=None):
         """keep: KeepClient object to use"""
         self._keep = keep
         self._bufferblocks = {}
@@ -414,6 +414,7 @@ class _BlockManager(object):
         self.prefetch_enabled = True
         self.num_put_threads = _BlockManager.DEFAULT_PUT_THREADS
         self.num_get_threads = _BlockManager.DEFAULT_GET_THREADS
+        self.num_put_copies = copies
 
     @synchronized
     def alloc_bufferblock(self, blockid=None, starting_capacity=2**14, owner=None):
@@ -464,7 +465,10 @@ class _BlockManager(object):
                 if bufferblock is None:
                     return
 
-                loc = self._keep.put(bufferblock.buffer_view[0:bufferblock.write_pointer].tobytes())
+                if self.num_put_copies is None:
+                    loc = self._keep.put(bufferblock.buffer_view[0:bufferblock.write_pointer].tobytes())
+                else:
+                    loc = self._keep.put(bufferblock.buffer_view[0:bufferblock.write_pointer].tobytes(), copies=self.num_put_copies)
                 bufferblock.set_state(_BufferBlock.COMMITTED, loc)
 
             except Exception as e:
@@ -577,7 +581,10 @@ class _BlockManager(object):
 
         if sync:
             try:
-                loc = self._keep.put(block.buffer_view[0:block.write_pointer].tobytes())
+                if self.num_put_copies is None:
+                    loc = self._keep.put(block.buffer_view[0:block.write_pointer].tobytes())
+                else:
+                    loc = self._keep.put(block.buffer_view[0:block.write_pointer].tobytes(), copies=self.num_put_copies)
                 block.set_state(_BufferBlock.COMMITTED, loc)
             except Exception as e:
                 block.set_state(_BufferBlock.ERROR, e)
diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 1047f53..098427c 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -1135,7 +1135,8 @@ class Collection(RichCollectionBase):
                  num_retries=None,
                  parent=None,
                  apiconfig=None,
-                 block_manager=None):
+                 block_manager=None,
+                 num_write_copies=None):
         """Collection constructor.
 
         :manifest_locator_or_text:
@@ -1168,6 +1169,7 @@ class Collection(RichCollectionBase):
             self._config = config.settings()
 
         self.num_retries = num_retries if num_retries is not None else 0
+        self.num_write_copies = num_write_copies
         self._manifest_locator = None
         self._manifest_text = None
         self._api_response = None
@@ -1247,7 +1249,7 @@ class Collection(RichCollectionBase):
     @synchronized
     def _my_block_manager(self):
         if self._block_manager is None:
-            self._block_manager = _BlockManager(self._my_keep())
+            self._block_manager = _BlockManager(self._my_keep(), copies=self.num_write_copies)
         return self._block_manager
 
     def _remember_api_response(self, response):

commit 82a01df9d9f4f8961978a1383f4b6c09e73fe28d
Author: Lucas Di Pentima <lucas@curoverse.com>
Date:   Thu Jul 7 13:19:24 2016 -0300

    9463: Added the desired replication level to the collection record on the API server when using the Collection class
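
    In short, save_new() now picks the attribute name the API server
    understands and includes the desired replication in the create request
    (sketch condensed from the diff below; api, text, name and
    replication_desired are the surrounding locals):

        replication_attr = 'replication_desired'
        if api._schema.schemas['Collection']['properties'].get(replication_attr, None) is None:
            # API servers predating #3410 called it 'redundancy'.
            replication_attr = 'redundancy'
        body = {'manifest_text': text,
                'name': name,
                replication_attr: replication_desired}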

diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 8450bd1..1047f53 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -1440,7 +1440,8 @@ class Collection(RichCollectionBase):
                  create_collection_record=True,
                  owner_uuid=None,
                  ensure_unique_name=False,
-                 num_retries=None):
+                 num_retries=None,
+                 replication_desired=None):
         """Save collection to a new collection record.
 
         Commit pending buffer blocks to Keep and, when create_collection_record
@@ -1467,17 +1468,27 @@ class Collection(RichCollectionBase):
         :num_retries:
           Retry count on API calls (if None,  use the collection default)
 
+        :replication_desired:
+          How many copies Arvados should maintain. If None, the API server's
+          default configuration applies.
+
         """
         self._my_block_manager().commit_all()
         text = self.manifest_text(strip=False)
 
         if create_collection_record:
+            replication_attr = 'replication_desired'
+            if self._my_api()._schema.schemas['Collection']['properties'].get(replication_attr, None) is None:
+                # API called it 'redundancy' before #3410.
+                replication_attr = 'redundancy'
+
             if name is None:
                 name = "New collection"
                 ensure_unique_name = True
 
             body = {"manifest_text": text,
-                    "name": name}
+                    "name": name,
+                    replication_attr: replication_desired}
             if owner_uuid:
                 body["owner_uuid"] = owner_uuid
 

-----------------------------------------------------------------------


hooks/post-receive
-- 



