[arvados] updated: 2.7.1-52-g864d721d5c
git repository hosting
git at public.arvados.org
Sat Apr 6 01:26:21 UTC 2024
Summary of changes:
sdk/python/arvados/arvfile.py | 30 ++++++++++++++++++++++++------
sdk/python/arvados/keep.py | 2 ++
sdk/python/tests/test_arvfile.py | 1 +
services/fuse/arvados_fuse/command.py | 8 --------
4 files changed, 27 insertions(+), 14 deletions(-)
via 864d721d5c32b3fdb260fbc1e3ddb5170bd670c5 (commit)
from c06d01222a65a21674b31d76f0bfbcc4facb253a (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 864d721d5c32b3fdb260fbc1e3ddb5170bd670c5
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Fri Apr 5 18:53:37 2024 -0400
21639: Reenable prefetch, but not on every read()
Only do prefetch every 128 invocations of read().
This should dramatically reduce the overhead of computing prefetch ranges
while still getting some or most of the benefits of prefetching.
Indeed, benchmarking suggests that this prefetching strategy, by
advising the kernel to map blocks into RAM, may actually improve
throughput on the high end.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
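For context while reading the diff below, here is a minimal standalone sketch of
the throttling pattern this commit introduces: a per-file counter gates the
relatively expensive prefetch-range computation so it runs on only one out of
every 128 read() calls (e.g. with 128 KiB reads, roughly once per 16 MiB). The
class and method names are illustrative, not part of the Arvados SDK; only the
counter trick mirrors the real change.

PREFETCH_INTERVAL = 128       # recompute prefetch ranges once per 128 reads
KEEP_BLOCK_SIZE = 64 << 20    # Keep blocks are at most 64 MiB

class ThrottledPrefetchReader:
    """Illustrative reader that only schedules prefetch on every
    PREFETCH_INTERVAL-th call to readfrom()."""

    def __init__(self, prefetch_lookahead=2):
        self.prefetch_lookahead = prefetch_lookahead
        self._read_counter = 0

    def readfrom(self, offset, size):
        want_prefetch = False
        if self.prefetch_lookahead:
            # Same modulo trick as ArvadosFile.readfrom() in the diff below.
            self._read_counter = (self._read_counter + 1) % PREFETCH_INTERVAL
            want_prefetch = (self._read_counter == 1)
        if want_prefetch:
            # Look ahead up to prefetch_lookahead whole Keep blocks past the
            # byte range being read right now.
            self._schedule_prefetch(offset + size,
                                    KEEP_BLOCK_SIZE * self.prefetch_lookahead)
        return self._fetch(offset, size)

    def _schedule_prefetch(self, start, length):
        pass  # hypothetical hook: hand the window to background download threads

    def _fetch(self, offset, size):
        return b''  # hypothetical hook: serve the bytes from the block cache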
diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index 0cc7d25a33..e0e972b5c1 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -491,7 +491,7 @@ class _BlockManager(object):
self._put_queue = None
self._put_threads = None
self.lock = threading.Lock()
- self.prefetch_enabled = True
+ self.prefetch_lookahead = self._keep.num_prefetch_threads
self.num_put_threads = put_threads or _BlockManager.DEFAULT_PUT_THREADS
self.copies = copies
self.storage_classes = storage_classes_func or (lambda: [])
@@ -803,7 +803,7 @@ class _BlockManager(object):
"""Initiate a background download of a block.
"""
- if not self.prefetch_enabled:
+ if not self.prefetch_lookahead:
return
with self.lock:
@@ -825,7 +825,7 @@ class ArvadosFile(object):
"""
__slots__ = ('parent', 'name', '_writers', '_committed',
- '_segments', 'lock', '_current_bblock', 'fuse_entry')
+ '_segments', 'lock', '_current_bblock', 'fuse_entry', '_read_counter')
def __init__(self, parent, name, stream=[], segments=[]):
"""
@@ -846,6 +846,7 @@ class ArvadosFile(object):
for s in segments:
self._add_segment(stream, s.locator, s.range_size)
self._current_bblock = None
+ self._read_counter = 0
def writable(self):
return self.parent.writable()
@@ -1060,8 +1061,25 @@ class ArvadosFile(object):
if size == 0 or offset >= self.size():
return b''
readsegs = locators_and_ranges(self._segments, offset, size)
- if self.parent._my_block_manager()._keep.num_prefetch_threads > 0:
- prefetch = locators_and_ranges(self._segments, offset + size, config.KEEP_BLOCK_SIZE * self.parent._my_block_manager()._keep.num_prefetch_threads, limit=32)
+
+ prefetch = None
+ prefetch_lookahead = self.parent._my_block_manager().prefetch_lookahead
+ if prefetch_lookahead:
+ # Doing prefetch on every read() call is surprisingly expensive
+ # when we're trying to deliver data at 600+ MiBps and want
+ # the read() fast path to be as lightweight as possible.
+ #
+ # Only prefetching every 128 read operations
+ # dramatically reduces the overhead while still
+ # getting the benefit of prefetching (e.g. when
+ # reading 128 KiB at a time, it checks for prefetch
+ # every 16 MiB).
+ self._read_counter = (self._read_counter+1) % 128
+ if self._read_counter == 1:
+ prefetch = locators_and_ranges(self._segments,
+ offset + size,
+ config.KEEP_BLOCK_SIZE * prefetch_lookahead,
+ limit=(1+prefetch_lookahead))
locs = set()
data = []
@@ -1074,7 +1092,7 @@ class ArvadosFile(object):
else:
break
- if self.parent._my_block_manager()._keep.num_prefetch_threads > 0:
+ if prefetch:
for lr in prefetch:
if lr.locator not in locs:
self.parent._my_block_manager().block_prefetch(lr.locator)
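A quick illustration of the window arithmetic in the hunk above (the helper
below is hypothetical, not the SDK's locators_and_ranges; it only shows the
numbers that call produces, assuming Keep's 64 MiB block size):

def prefetch_window(offset, size, prefetch_lookahead, block_size=64 << 20):
    """Mirror the arguments passed to locators_and_ranges() above: start just
    past the current read, span prefetch_lookahead whole blocks, and cap the
    number of ranges returned via limit."""
    start = offset + size
    length = block_size * prefetch_lookahead
    limit = 1 + prefetch_lookahead
    return start, length, limit

# Example: a 128 KiB read at offset 0 with a lookahead of 2 blocks
# yields (131072, 134217728, 3).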
diff --git a/sdk/python/arvados/keep.py b/sdk/python/arvados/keep.py
index a824621079..d1be6b931e 100644
--- a/sdk/python/arvados/keep.py
+++ b/sdk/python/arvados/keep.py
@@ -1181,6 +1181,8 @@ class KeepClient(object):
# result, so if it is already in flight return
# immediately. Clear 'slot' to prevent
# finally block from calling slot.set()
+ if slot.ready.is_set():
+ slot.get()
slot = None
return None
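The two added lines in keep.py sit on the path where a block request finds
another request for the same block already in flight: if that other request has
already finished, the caller now retrieves the slot's content before returning
instead of simply dropping its reference. A minimal sketch of that reservation
pattern, assuming a cache slot built around a threading.Event (CacheSlot and
reserve_cache here are illustrative stand-ins, not necessarily the SDK's exact
classes):

import threading

class CacheSlot:
    """Illustrative stand-in for the block cache's per-locator slot."""
    def __init__(self, locator):
        self.locator = locator
        self.ready = threading.Event()
        self._content = None

    def set(self, content):
        self._content = content
        self.ready.set()

    def get(self):
        self.ready.wait()          # block until some thread calls set()
        return self._content

def prefetch_block(cache, locator, fetch):
    slot, first = cache.reserve_cache(locator)  # (slot, True) if we own the fetch
    if not first:
        # Another thread is already fetching this block; if its result is
        # already in, claim it (this is what the two added lines do), then
        # skip the duplicate download.
        if slot.ready.is_set():
            slot.get()
        return None
    slot.set(fetch(locator))
    return slot.get()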
diff --git a/sdk/python/tests/test_arvfile.py b/sdk/python/tests/test_arvfile.py
index cf6dec1a55..3517d78e17 100644
--- a/sdk/python/tests/test_arvfile.py
+++ b/sdk/python/tests/test_arvfile.py
@@ -631,6 +631,7 @@ class ArvadosFileReaderTestCase(StreamFileReaderTestCase):
self.blocks = blocks
self.nocache = nocache
self._keep = ArvadosFileWriterTestCase.MockKeep({})
+ self.prefetch_lookahead = 0
def block_prefetch(self, loc):
pass
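The one-line test change exists because ArvadosFile.readfrom() now consults
parent._my_block_manager().prefetch_lookahead, so any test double standing in
for _BlockManager has to define that attribute. A sketch of such a stand-in
(the class name is illustrative):

class NoPrefetchBlockManager:
    """Minimal test double for _BlockManager with prefetching disabled."""
    def __init__(self, keep_client):
        self._keep = keep_client
        self.prefetch_lookahead = 0   # readfrom() skips prefetch when this is falsy

    def block_prefetch(self, loc):
        pass  # never reached while prefetch_lookahead is 0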
diff --git a/services/fuse/arvados_fuse/command.py b/services/fuse/arvados_fuse/command.py
index 1398b92e87..f52121d862 100644
--- a/services/fuse/arvados_fuse/command.py
+++ b/services/fuse/arvados_fuse/command.py
@@ -490,13 +490,6 @@ class Mount(object):
disk_cache=self.args.disk_cache,
disk_cache_dir=self.args.disk_cache_dir)
- # Profiling indicates that prefetching has more of a
- # negative impact on the read() fast path (by requiring it
- # to do more work and take additional locks) than benefit.
- # Also, the kernel does some readahead itself, which has a
- # similar effect.
- prefetch_threads = 0
-
self.api = arvados.safeapi.ThreadSafeApiCache(
apiconfig=arvados.config.settings(),
api_params={
@@ -504,7 +497,6 @@ class Mount(object):
},
keep_params={
'block_cache': block_cache,
- 'num_prefetch_threads': prefetch_threads,
'num_retries': self.args.retries,
},
version='v1',
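With the override removed above, arv-mount simply inherits the SDK's default
prefetch behavior. For benchmarking, an explicit value can presumably still be
supplied the same way the deleted lines did; a hedged, standalone sketch
(num_prefetch_threads and its forwarding to KeepClient are assumptions carried
over from the removed code, not documented arv-mount options):

import arvados
import arvados.safeapi

# Construct a thread-safe API client the way arv-mount does, but with an
# explicit prefetch thread count passed through keep_params, mirroring the
# mechanism the deleted lines used (they passed 0 to disable prefetching).
api = arvados.safeapi.ThreadSafeApiCache(
    apiconfig=arvados.config.settings(),
    keep_params={
        'num_prefetch_threads': 2,
        'num_retries': 3,
    },
    version='v1',
)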
-----------------------------------------------------------------------
hooks/post-receive
--