[ARVADOS] created: bfc9660a8b2467893baf131b20e83e76c41ae438

Git user git at public.curoverse.com
Thu Apr 20 11:17:01 EDT 2017


        at  bfc9660a8b2467893baf131b20e83e76c41ae438 (commit)


commit bfc9660a8b2467893baf131b20e83e76c41ae438
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Apr 20 10:41:05 2017 -0400

    11510: Repack writes any time there's more than one segment referencing the same bufferblock.

diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index 33e55ad..ebefc5b 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -985,23 +985,28 @@ class ArvadosFile(object):
         return ''.join(data)
 
     def _repack_writes(self, num_retries):
-        """Test if the buffer block has more data than actual segments.
+        """Optimize buffer block by repacking segments in file sequence.
 
-        This happens when a buffered write over-writes a file range written in
-        a previous buffered write.  Re-pack the buffer block for efficiency
-        and to avoid leaking information.
+        When the client makes random writes, they appear in the buffer block in
+        the sequence they were written rather than the sequence they appear in
+        the file.  This makes for inefficient, fragmented manifests.  Attempt
+        to optimize by repacking writes in file sequence.
 
         """
         segs = self._segments
 
-        # Sum up the segments to get the total bytes of the file referencing
-        # into the buffer block.
+        # Collect the segments that reference the buffer block.
         bufferblock_segs = [s for s in segs if s.locator == self._current_bblock.blockid]
-        write_total = sum([s.range_size for s in bufferblock_segs])
 
-        if write_total < self._current_bblock.size():
-            # There is more data in the buffer block than is actually accounted for by segments, so
-            # re-pack into a new buffer by copying over to a new buffer block.
+        if len(bufferblock_segs) > 1:
+            # Collect total data referenced by segments (could be smaller than
+            # bufferblock size if a portion of the file was written and
+            # then overwritten).
+            write_total = sum([s.range_size for s in bufferblock_segs])
+
+            # If there's more than one segment referencing this block, it is
+            # due to out-of-order writes and will produce a fragmented
+            # manifest, so try to optimize by re-packing into a new buffer.
             contents = self.parent._my_block_manager().get_block_contents(self._current_bblock.blockid, num_retries)
             new_bb = self.parent._my_block_manager().alloc_bufferblock(self._current_bblock.blockid, starting_capacity=write_total, owner=self)
             for t in bufferblock_segs:

commit aed7702a67426dfd9d24b512c90df8e909162179
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Apr 18 17:54:57 2017 -0400

    11510: Fix longstanding bug in replace_range() when appending data from a buffer that was written out of order.

diff --git a/sdk/python/arvados/_ranges.py b/sdk/python/arvados/_ranges.py
index 8387416..5532ea0 100644
--- a/sdk/python/arvados/_ranges.py
+++ b/sdk/python/arvados/_ranges.py
@@ -109,7 +109,7 @@ def locators_and_ranges(data_locators, range_start, range_size, limit=None):
         block_size = dl.range_size
         block_end = block_start + block_size
         _logger.log(RANGES_SPAM,
-            "%s range_start %s block_start %s range_end %s block_end %s",
+            "L&R %s range_start %s block_start %s range_end %s block_end %s",
             dl.locator, range_start, block_start, range_end, block_end)
         if range_end <= block_start:
             # range ends before this block starts, so don't look at any more locators
@@ -165,7 +165,7 @@ def replace_range(data_locators, new_range_start, new_range_size, new_locator, n
 
     last = data_locators[-1]
     if (last.range_start+last.range_size) == new_range_start:
-        if last.locator == new_locator:
+        if last.locator == new_locator and (last.segment_offset+last.range_size) == new_segment_offset:
             # extend last segment
             last.range_size += new_range_size
         else:
@@ -183,7 +183,7 @@ def replace_range(data_locators, new_range_start, new_range_size, new_locator, n
         old_segment_start = dl.range_start
         old_segment_end = old_segment_start + dl.range_size
         _logger.log(RANGES_SPAM,
-            "%s range_start %s segment_start %s range_end %s segment_end %s",
+            "RR %s range_start %s segment_start %s range_end %s segment_end %s",
             dl, new_range_start, old_segment_start, new_range_end,
             old_segment_end)
         if new_range_end <= old_segment_start:
diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index d337785..33e55ad 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -1022,12 +1022,8 @@ class ArvadosFile(object):
         if len(data) == 0:
             return
 
-        print "Writing", len(data), "bytes to offset", offset, "current size is", self.size()
-
         if offset > self.size():
-            print "Need to extend to", offset
             self.truncate(offset)
-            print "Size is now", self.size()
 
         if len(data) > config.KEEP_BLOCK_SIZE:
             # Chunk it up into smaller writes
diff --git a/sdk/python/tests/test_arvfile.py b/sdk/python/tests/test_arvfile.py
index 95106a7..1a2c034 100644
--- a/sdk/python/tests/test_arvfile.py
+++ b/sdk/python/tests/test_arvfile.py
@@ -330,6 +330,23 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
 
             self.assertEqual(c.manifest_text(), ". 7f614da9329cd3aebf59b91aadc30bf0+67108864 781e5e245d69b566979b86e28d23f2c7+10 0:67108864:count.txt 0:67108864:count.txt 0:2:count.txt 67108864:10:count.txt\n")
 
+
+    def test_sparse_write3(self):
+        keep = ArvadosFileWriterTestCase.MockKeep({})
+        api = ArvadosFileWriterTestCase.MockApi({}, {})
+        for r in [[0, 1, 2, 3, 4], [4, 3, 2, 1, 0], [3, 2, 0, 4, 1]]:
+            with Collection() as c:
+                writer = c.open("count.txt", "r+")
+                self.assertEqual(writer.size(), 0)
+
+                for i in r:
+                    w = ("%s" % i) * 10
+                    writer.seek(i*10)
+                    writer.write(w)
+                writer.seek(0)
+                self.assertEqual(writer.read(), "00000000001111111111222222222233333333334444444444")
+
+
     def test_rewrite_on_empty_file(self):
         keep = ArvadosFileWriterTestCase.MockKeep({})
         with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',

commit 04951581a941697d68cdaf9af6661c3c412f1bce
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Apr 18 16:40:51 2017 -0400

    11510: Add support for sparse writes to ArvadosFile.
    
    Allow seeking past end of file.  Allow using truncate() to extend file size.  Add zero padding to fill in gaps.

diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index 9db19b0..d337785 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -96,7 +96,9 @@ class ArvadosFileReaderBase(_FileLikeObjectBase):
             pos += self._filepos
         elif whence == os.SEEK_END:
             pos += self.size()
-        self._filepos = min(max(pos, 0L), self.size())
+        if pos < 0L:
+            raise IOError(errno.EINVAL, "Tried to seek to negative file offset.")
+        self._filepos = pos
 
     def tell(self):
         return self._filepos
@@ -428,6 +430,7 @@ class _BlockManager(object):
         self.copies = copies
         self._pending_write_size = 0
         self.threads_lock = threading.Lock()
+        self.padding_block = None
 
     @synchronized
     def alloc_bufferblock(self, blockid=None, starting_capacity=2**14, owner=None):
@@ -654,6 +657,17 @@ class _BlockManager(object):
         return self._bufferblocks.get(locator)
 
     @synchronized
+    def get_padding_block(self):
+        """Get a bufferblock 64 MB in size consisting of all zeros, used as padding
+        when using truncate() to extend the size of a file."""
+
+        if self.padding_block is None:
+            self.padding_block = self._alloc_bufferblock(starting_capacity=config.KEEP_BLOCK_SIZE)
+            self.padding_block.write_pointer = config.KEEP_BLOCK_SIZE
+            self.commit_bufferblock(self.padding_block, False)
+        return self.padding_block
+
+    @synchronized
     def delete_bufferblock(self, locator):
         self._delete_bufferblock(locator)
 
@@ -900,11 +914,11 @@ class ArvadosFile(object):
     @must_be_writable
     @synchronized
     def truncate(self, size):
-        """Shrink the size of the file.
+        """Shrink or expand the size of the file.
 
         If `size` is less than the size of the file, the file contents after
         `size` will be discarded.  If `size` is greater than the current size
-        of the file, an IOError will be raised.
+        of the file, it will be filled with zero bytes.
 
         """
         if size < self.size():
@@ -925,7 +939,17 @@ class ArvadosFile(object):
             self._segments = new_segs
             self.set_committed(False)
         elif size > self.size():
-            raise IOError(errno.EINVAL, "truncate() does not support extending the file size")
+            padding = self.parent._my_block_manager().get_padding_block()
+            diff = size - self.size()
+            while diff > config.KEEP_BLOCK_SIZE:
+                self._segments.append(Range(padding.blockid, self.size(), config.KEEP_BLOCK_SIZE, 0))
+                diff -= config.KEEP_BLOCK_SIZE
+            if diff > 0:
+                self._segments.append(Range(padding.blockid, self.size(), diff, 0))
+            self.set_committed(False)
+        else:
+            # size == self.size()
+            pass
 
     def readfrom(self, offset, size, num_retries, exact=False):
         """Read up to `size` bytes from the file starting at `offset`.
@@ -998,8 +1022,12 @@ class ArvadosFile(object):
         if len(data) == 0:
             return
 
+        print "Writing", len(data), "bytes to offset", offset, "current size is", self.size()
+
         if offset > self.size():
-            raise ArgumentError("Offset is past the end of the file")
+            print "Need to extend to", offset
+            self.truncate(offset)
+            print "Size is now", self.size()
 
         if len(data) > config.KEEP_BLOCK_SIZE:
             # Chunk it up into smaller writes
diff --git a/sdk/python/arvados/config.py b/sdk/python/arvados/config.py
index 8f2b265..0c4ccb0 100644
--- a/sdk/python/arvados/config.py
+++ b/sdk/python/arvados/config.py
@@ -15,6 +15,9 @@ else:
 KEEP_BLOCK_SIZE = 2**26
 EMPTY_BLOCK_LOCATOR = 'd41d8cd98f00b204e9800998ecf8427e+0'
 
+# This is the 64 MiB block consisting entirely of zeros
+PADDING_BLOCK_LOCATOR = '7f614da9329cd3aebf59b91aadc30bf0+67108864'
+
 def initialize(config_file=default_config_file):
     global _settings
     _settings = {}
diff --git a/sdk/python/tests/test_arvfile.py b/sdk/python/tests/test_arvfile.py
index 1b66935..95106a7 100644
--- a/sdk/python/tests/test_arvfile.py
+++ b/sdk/python/tests/test_arvfile.py
@@ -91,6 +91,34 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
             self.assertEqual("zzzzz-4zz18-mockcollection0", c.manifest_locator())
             self.assertFalse(c.modified())
 
+
+    def test_truncate2(self):
+        keep = ArvadosFileWriterTestCase.MockKeep({"781e5e245d69b566979b86e28d23f2c7+10": "0123456789"})
+        api = ArvadosFileWriterTestCase.MockApi({"name":"test_truncate2",
+                                                 "manifest_text":". 781e5e245d69b566979b86e28d23f2c7+10 7f614da9329cd3aebf59b91aadc30bf0+67108864 0:12:count.txt\n",
+                                                 "replication_desired":None},
+                                                {"uuid":"zzzzz-4zz18-mockcollection0",
+                                                 "manifest_text":". 781e5e245d69b566979b86e28d23f2c7+10 7f614da9329cd3aebf59b91aadc30bf0+67108864 0:12:count.txt\n",
+                                                 "portable_data_hash":"272da898abdf86ddc71994835e3155f8+95"})
+        with Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count.txt\n',
+                             api_client=api, keep_client=keep) as c:
+            writer = c.open("count.txt", "r+")
+            self.assertEqual(writer.size(), 10)
+            self.assertEqual("0123456789", writer.read(12))
+
+            # extend file size
+            writer.truncate(12)
+
+            self.assertEqual(writer.size(), 12)
+            writer.seek(0, os.SEEK_SET)
+            self.assertEqual(b"0123456789\x00\x00", writer.read(12))
+
+            self.assertIsNone(c.manifest_locator())
+            self.assertTrue(c.modified())
+            c.save_new("test_truncate2")
+            self.assertEqual("zzzzz-4zz18-mockcollection0", c.manifest_locator())
+            self.assertFalse(c.modified())
+
     def test_write_to_end(self):
         keep = ArvadosFileWriterTestCase.MockKeep({"781e5e245d69b566979b86e28d23f2c7+10": "0123456789"})
         api = ArvadosFileWriterTestCase.MockApi({"name":"test_append",
@@ -268,6 +296,40 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
 
             self.assertEqual(c.manifest_text(), ". 781e5e245d69b566979b86e28d23f2c7+10 48dd23ea1645fd47d789804d71b5bb8e+67108864 77c57dc6ac5a10bb2205caaa73187994+32891126 0:100000000:count.txt\n")
 
+    def test_sparse_write(self):
+        keep = ArvadosFileWriterTestCase.MockKeep({})
+        api = ArvadosFileWriterTestCase.MockApi({}, {})
+        with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',
+                             api_client=api, keep_client=keep) as c:
+            writer = c.open("count.txt", "r+")
+            self.assertEqual(writer.size(), 0)
+
+            text = "0123456789"
+            writer.seek(2)
+            writer.write(text)
+            self.assertEqual(writer.size(), 12)
+            writer.seek(0, os.SEEK_SET)
+            self.assertEqual(writer.read(), b"\x00\x00"+text)
+
+            self.assertEqual(c.manifest_text(), ". 7f614da9329cd3aebf59b91aadc30bf0+67108864 781e5e245d69b566979b86e28d23f2c7+10 0:2:count.txt 67108864:10:count.txt\n")
+
+
+    def test_sparse_write2(self):
+        keep = ArvadosFileWriterTestCase.MockKeep({})
+        api = ArvadosFileWriterTestCase.MockApi({}, {})
+        with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',
+                             api_client=api, keep_client=keep) as c:
+            writer = c.open("count.txt", "r+")
+            self.assertEqual(writer.size(), 0)
+
+            text = "0123456789"
+            writer.seek((arvados.config.KEEP_BLOCK_SIZE*2) + 2)
+            writer.write(text)
+            self.assertEqual(writer.size(), (arvados.config.KEEP_BLOCK_SIZE*2) + 12)
+            writer.seek(0, os.SEEK_SET)
+
+            self.assertEqual(c.manifest_text(), ". 7f614da9329cd3aebf59b91aadc30bf0+67108864 781e5e245d69b566979b86e28d23f2c7+10 0:67108864:count.txt 0:67108864:count.txt 0:2:count.txt 67108864:10:count.txt\n")
+
     def test_rewrite_on_empty_file(self):
         keep = ArvadosFileWriterTestCase.MockKeep({})
         with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',
diff --git a/sdk/python/tests/test_stream.py b/sdk/python/tests/test_stream.py
index 624f1b8..d4b2207 100644
--- a/sdk/python/tests/test_stream.py
+++ b/sdk/python/tests/test_stream.py
@@ -76,7 +76,8 @@ class StreamFileReaderTestCase(unittest.TestCase):
     def test_seek_max_size(self):
         sfile = self.make_count_reader()
         sfile.seek(2, os.SEEK_END)
-        self.assertEqual(9, sfile.tell())
+        # POSIX permits seeking past end of file.
+        self.assertEqual(11, sfile.tell())
 
     def test_size(self):
         self.assertEqual(9, self.make_count_reader().size())

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list