[ARVADOS] created: bfc9660a8b2467893baf131b20e83e76c41ae438
Git user
git at public.curoverse.com
Thu Apr 20 11:17:01 EDT 2017
at bfc9660a8b2467893baf131b20e83e76c41ae438 (commit)
commit bfc9660a8b2467893baf131b20e83e76c41ae438
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Apr 20 10:41:05 2017 -0400
11510: Repack writes any time there's more than one segment referencing the same bufferblock.
diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index 33e55ad..ebefc5b 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -985,23 +985,28 @@ class ArvadosFile(object):
return ''.join(data)
def _repack_writes(self, num_retries):
- """Test if the buffer block has more data than actual segments.
+ """Optimize buffer block by repacking segments in file sequence.
- This happens when a buffered write over-writes a file range written in
- a previous buffered write. Re-pack the buffer block for efficiency
- and to avoid leaking information.
+ When the client makes random writes, they appear in the buffer block in
+ the sequence they were written rather than the sequence they appear in
+ the file. This makes for inefficient, fragmented manifests. Attempt
+ to optimize by repacking writes in file sequence.
"""
segs = self._segments
- # Sum up the segments to get the total bytes of the file referencing
- # into the buffer block.
+ # Collect the segments that reference the buffer block.
bufferblock_segs = [s for s in segs if s.locator == self._current_bblock.blockid]
- write_total = sum([s.range_size for s in bufferblock_segs])
- if write_total < self._current_bblock.size():
- # There is more data in the buffer block than is actually accounted for by segments, so
- # re-pack into a new buffer by copying over to a new buffer block.
+ if len(bufferblock_segs) > 1:
+ # Collect total data referenced by segments (could be smaller than
+ # bufferblock size if a portion of the file was written and
+ # then overwritten).
+ write_total = sum([s.range_size for s in bufferblock_segs])
+
+ # If there's more than one segment referencing this block, it is
+ # due to out-of-order writes and will produce a fragmented
+ # manifest, so try to optimize by re-packing into a new buffer.
contents = self.parent._my_block_manager().get_block_contents(self._current_bblock.blockid, num_retries)
new_bb = self.parent._my_block_manager().alloc_bufferblock(self._current_bblock.blockid, starting_capacity=write_total, owner=self)
for t in bufferblock_segs:
commit aed7702a67426dfd9d24b512c90df8e909162179
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Apr 18 17:54:57 2017 -0400
11510: Fix longstanding bug in replace_range() when appending data from a buffer that was written out of order.
diff --git a/sdk/python/arvados/_ranges.py b/sdk/python/arvados/_ranges.py
index 8387416..5532ea0 100644
--- a/sdk/python/arvados/_ranges.py
+++ b/sdk/python/arvados/_ranges.py
@@ -109,7 +109,7 @@ def locators_and_ranges(data_locators, range_start, range_size, limit=None):
block_size = dl.range_size
block_end = block_start + block_size
_logger.log(RANGES_SPAM,
- "%s range_start %s block_start %s range_end %s block_end %s",
+ "L&R %s range_start %s block_start %s range_end %s block_end %s",
dl.locator, range_start, block_start, range_end, block_end)
if range_end <= block_start:
# range ends before this block starts, so don't look at any more locators
@@ -165,7 +165,7 @@ def replace_range(data_locators, new_range_start, new_range_size, new_locator, n
last = data_locators[-1]
if (last.range_start+last.range_size) == new_range_start:
- if last.locator == new_locator:
+ if last.locator == new_locator and (last.segment_offset+last.range_size) == new_segment_offset:
# extend last segment
last.range_size += new_range_size
else:
@@ -183,7 +183,7 @@ def replace_range(data_locators, new_range_start, new_range_size, new_locator, n
old_segment_start = dl.range_start
old_segment_end = old_segment_start + dl.range_size
_logger.log(RANGES_SPAM,
- "%s range_start %s segment_start %s range_end %s segment_end %s",
+ "RR %s range_start %s segment_start %s range_end %s segment_end %s",
dl, new_range_start, old_segment_start, new_range_end,
old_segment_end)
if new_range_end <= old_segment_start:
diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index d337785..33e55ad 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -1022,12 +1022,8 @@ class ArvadosFile(object):
if len(data) == 0:
return
- print "Writing", len(data), "bytes to offset", offset, "current size is", self.size()
-
if offset > self.size():
- print "Need to extend to", offset
self.truncate(offset)
- print "Size is now", self.size()
if len(data) > config.KEEP_BLOCK_SIZE:
# Chunk it up into smaller writes
diff --git a/sdk/python/tests/test_arvfile.py b/sdk/python/tests/test_arvfile.py
index 95106a7..1a2c034 100644
--- a/sdk/python/tests/test_arvfile.py
+++ b/sdk/python/tests/test_arvfile.py
@@ -330,6 +330,23 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
self.assertEqual(c.manifest_text(), ". 7f614da9329cd3aebf59b91aadc30bf0+67108864 781e5e245d69b566979b86e28d23f2c7+10 0:67108864:count.txt 0:67108864:count.txt 0:2:count.txt 67108864:10:count.txt\n")
+
+ def test_sparse_write3(self):
+ keep = ArvadosFileWriterTestCase.MockKeep({})
+ api = ArvadosFileWriterTestCase.MockApi({}, {})
+ for r in [[0, 1, 2, 3, 4], [4, 3, 2, 1, 0], [3, 2, 0, 4, 1]]:
+ with Collection() as c:
+ writer = c.open("count.txt", "r+")
+ self.assertEqual(writer.size(), 0)
+
+ for i in r:
+ w = ("%s" % i) * 10
+ writer.seek(i*10)
+ writer.write(w)
+ writer.seek(0)
+ self.assertEqual(writer.read(), "00000000001111111111222222222233333333334444444444")
+
+
def test_rewrite_on_empty_file(self):
keep = ArvadosFileWriterTestCase.MockKeep({})
with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',
commit 04951581a941697d68cdaf9af6661c3c412f1bce
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Apr 18 16:40:51 2017 -0400
11510: Add support for sparse writes to ArvadosFile.
Allow seeking past end of file. Allow using truncate() to extend file size. Add zero padding to fill in gaps.
diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index 9db19b0..d337785 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -96,7 +96,9 @@ class ArvadosFileReaderBase(_FileLikeObjectBase):
pos += self._filepos
elif whence == os.SEEK_END:
pos += self.size()
- self._filepos = min(max(pos, 0L), self.size())
+ if pos < 0L:
+ raise IOError(errno.EINVAL, "Tried to seek to negative file offset.")
+ self._filepos = pos
def tell(self):
return self._filepos
@@ -428,6 +430,7 @@ class _BlockManager(object):
self.copies = copies
self._pending_write_size = 0
self.threads_lock = threading.Lock()
+ self.padding_block = None
@synchronized
def alloc_bufferblock(self, blockid=None, starting_capacity=2**14, owner=None):
@@ -654,6 +657,17 @@ class _BlockManager(object):
return self._bufferblocks.get(locator)
@synchronized
+ def get_padding_block(self):
+ """Get a bufferblock 64 MB in size consisting of all zeros, used as padding
+ when using truncate() to extend the size of a file."""
+
+ if self.padding_block is None:
+ self.padding_block = self._alloc_bufferblock(starting_capacity=config.KEEP_BLOCK_SIZE)
+ self.padding_block.write_pointer = config.KEEP_BLOCK_SIZE
+ self.commit_bufferblock(self.padding_block, False)
+ return self.padding_block
+
+ @synchronized
def delete_bufferblock(self, locator):
self._delete_bufferblock(locator)
@@ -900,11 +914,11 @@ class ArvadosFile(object):
@must_be_writable
@synchronized
def truncate(self, size):
- """Shrink the size of the file.
+ """Shrink or expand the size of the file.
If `size` is less than the size of the file, the file contents after
`size` will be discarded. If `size` is greater than the current size
- of the file, an IOError will be raised.
+ of the file, it will be filled with zero bytes.
"""
if size < self.size():
@@ -925,7 +939,17 @@ class ArvadosFile(object):
self._segments = new_segs
self.set_committed(False)
elif size > self.size():
- raise IOError(errno.EINVAL, "truncate() does not support extending the file size")
+ padding = self.parent._my_block_manager().get_padding_block()
+ diff = size - self.size()
+ while diff > config.KEEP_BLOCK_SIZE:
+ self._segments.append(Range(padding.blockid, self.size(), config.KEEP_BLOCK_SIZE, 0))
+ diff -= config.KEEP_BLOCK_SIZE
+ if diff > 0:
+ self._segments.append(Range(padding.blockid, self.size(), diff, 0))
+ self.set_committed(False)
+ else:
+ # size == self.size()
+ pass
def readfrom(self, offset, size, num_retries, exact=False):
"""Read up to `size` bytes from the file starting at `offset`.
@@ -998,8 +1022,12 @@ class ArvadosFile(object):
if len(data) == 0:
return
+ print "Writing", len(data), "bytes to offset", offset, "current size is", self.size()
+
if offset > self.size():
- raise ArgumentError("Offset is past the end of the file")
+ print "Need to extend to", offset
+ self.truncate(offset)
+ print "Size is now", self.size()
if len(data) > config.KEEP_BLOCK_SIZE:
# Chunk it up into smaller writes
diff --git a/sdk/python/arvados/config.py b/sdk/python/arvados/config.py
index 8f2b265..0c4ccb0 100644
--- a/sdk/python/arvados/config.py
+++ b/sdk/python/arvados/config.py
@@ -15,6 +15,9 @@ else:
KEEP_BLOCK_SIZE = 2**26
EMPTY_BLOCK_LOCATOR = 'd41d8cd98f00b204e9800998ecf8427e+0'
+# This is the 64 MiB block consisting entirely of zeros
+PADDING_BLOCK_LOCATOR = '7f614da9329cd3aebf59b91aadc30bf0+67108864'
+
def initialize(config_file=default_config_file):
global _settings
_settings = {}
diff --git a/sdk/python/tests/test_arvfile.py b/sdk/python/tests/test_arvfile.py
index 1b66935..95106a7 100644
--- a/sdk/python/tests/test_arvfile.py
+++ b/sdk/python/tests/test_arvfile.py
@@ -91,6 +91,34 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
self.assertEqual("zzzzz-4zz18-mockcollection0", c.manifest_locator())
self.assertFalse(c.modified())
+
+ def test_truncate2(self):
+ keep = ArvadosFileWriterTestCase.MockKeep({"781e5e245d69b566979b86e28d23f2c7+10": "0123456789"})
+ api = ArvadosFileWriterTestCase.MockApi({"name":"test_truncate2",
+ "manifest_text":". 781e5e245d69b566979b86e28d23f2c7+10 7f614da9329cd3aebf59b91aadc30bf0+67108864 0:12:count.txt\n",
+ "replication_desired":None},
+ {"uuid":"zzzzz-4zz18-mockcollection0",
+ "manifest_text":". 781e5e245d69b566979b86e28d23f2c7+10 7f614da9329cd3aebf59b91aadc30bf0+67108864 0:12:count.txt\n",
+ "portable_data_hash":"272da898abdf86ddc71994835e3155f8+95"})
+ with Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count.txt\n',
+ api_client=api, keep_client=keep) as c:
+ writer = c.open("count.txt", "r+")
+ self.assertEqual(writer.size(), 10)
+ self.assertEqual("0123456789", writer.read(12))
+
+ # extend file size
+ writer.truncate(12)
+
+ self.assertEqual(writer.size(), 12)
+ writer.seek(0, os.SEEK_SET)
+ self.assertEqual(b"0123456789\x00\x00", writer.read(12))
+
+ self.assertIsNone(c.manifest_locator())
+ self.assertTrue(c.modified())
+ c.save_new("test_truncate2")
+ self.assertEqual("zzzzz-4zz18-mockcollection0", c.manifest_locator())
+ self.assertFalse(c.modified())
+
def test_write_to_end(self):
keep = ArvadosFileWriterTestCase.MockKeep({"781e5e245d69b566979b86e28d23f2c7+10": "0123456789"})
api = ArvadosFileWriterTestCase.MockApi({"name":"test_append",
@@ -268,6 +296,40 @@ class ArvadosFileWriterTestCase(unittest.TestCase):
self.assertEqual(c.manifest_text(), ". 781e5e245d69b566979b86e28d23f2c7+10 48dd23ea1645fd47d789804d71b5bb8e+67108864 77c57dc6ac5a10bb2205caaa73187994+32891126 0:100000000:count.txt\n")
+ def test_sparse_write(self):
+ keep = ArvadosFileWriterTestCase.MockKeep({})
+ api = ArvadosFileWriterTestCase.MockApi({}, {})
+ with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',
+ api_client=api, keep_client=keep) as c:
+ writer = c.open("count.txt", "r+")
+ self.assertEqual(writer.size(), 0)
+
+ text = "0123456789"
+ writer.seek(2)
+ writer.write(text)
+ self.assertEqual(writer.size(), 12)
+ writer.seek(0, os.SEEK_SET)
+ self.assertEqual(writer.read(), b"\x00\x00"+text)
+
+ self.assertEqual(c.manifest_text(), ". 7f614da9329cd3aebf59b91aadc30bf0+67108864 781e5e245d69b566979b86e28d23f2c7+10 0:2:count.txt 67108864:10:count.txt\n")
+
+
+ def test_sparse_write2(self):
+ keep = ArvadosFileWriterTestCase.MockKeep({})
+ api = ArvadosFileWriterTestCase.MockApi({}, {})
+ with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',
+ api_client=api, keep_client=keep) as c:
+ writer = c.open("count.txt", "r+")
+ self.assertEqual(writer.size(), 0)
+
+ text = "0123456789"
+ writer.seek((arvados.config.KEEP_BLOCK_SIZE*2) + 2)
+ writer.write(text)
+ self.assertEqual(writer.size(), (arvados.config.KEEP_BLOCK_SIZE*2) + 12)
+ writer.seek(0, os.SEEK_SET)
+
+ self.assertEqual(c.manifest_text(), ". 7f614da9329cd3aebf59b91aadc30bf0+67108864 781e5e245d69b566979b86e28d23f2c7+10 0:67108864:count.txt 0:67108864:count.txt 0:2:count.txt 67108864:10:count.txt\n")
+
def test_rewrite_on_empty_file(self):
keep = ArvadosFileWriterTestCase.MockKeep({})
with Collection('. ' + arvados.config.EMPTY_BLOCK_LOCATOR + ' 0:0:count.txt',
diff --git a/sdk/python/tests/test_stream.py b/sdk/python/tests/test_stream.py
index 624f1b8..d4b2207 100644
--- a/sdk/python/tests/test_stream.py
+++ b/sdk/python/tests/test_stream.py
@@ -76,7 +76,8 @@ class StreamFileReaderTestCase(unittest.TestCase):
def test_seek_max_size(self):
sfile = self.make_count_reader()
sfile.seek(2, os.SEEK_END)
- self.assertEqual(9, sfile.tell())
+ # POSIX permits seeking past end of file.
+ self.assertEqual(11, sfile.tell())
def test_size(self):
self.assertEqual(9, self.make_count_reader().size())
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list