[ARVADOS] updated: 1d61934faad7093807fd3024096ca54d3bc24ea3

Thu Apr 20 10:41:29 EDT 2017

Summary of changes:
 sdk/python/arvados/arvfile.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

       via  1d61934faad7093807fd3024096ca54d3bc24ea3 (commit)
      from  aed7702a67426dfd9d24b512c90df8e909162179 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 1d61934faad7093807fd3024096ca54d3bc24ea3
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Apr 20 10:41:05 2017 -0400

    11510: Repack writes any time there's more than one segment referencing the same bufferblock.

diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index 33e55ad..4129a15 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -994,14 +994,18 @@ class ArvadosFile(object):
         """
         segs = self._segments
 
-        # Sum up the segments to get the total bytes of the file referencing
-        # into the buffer block.
+        # Collect the segments that reference the buffer block.
         bufferblock_segs = [s for s in segs if s.locator == self._current_bblock.blockid]
-        write_total = sum([s.range_size for s in bufferblock_segs])
 
-        if write_total < self._current_bblock.size():
-            # There is more data in the buffer block than is actually accounted for by segments, so
-            # re-pack into a new buffer by copying over to a new buffer block.
+        if len(bufferblock_segs) > 1:
+            # Collect total data referenced by segments (could be smaller than
+            # bufferblock size if a portion of the file was written and
+            # then overwritten).
+            write_total = sum([s.range_size for s in bufferblock_segs])
+
+            # If there's more than one segment referencing this block, it is
+            # due to out-of-order writes and will produce a fragmented
+            # manifest, so try to optimize by re-packing into a new buffer.
             contents = self.parent._my_block_manager().get_block_contents(self._current_bblock.blockid, num_retries)
             new_bb = self.parent._my_block_manager().alloc_bufferblock(self._current_bblock.blockid, starting_capacity=write_total, owner=self)
             for t in bufferblock_segs:

-----------------------------------------------------------------------


hooks/post-receive
--