[arvados] created: 2.7.0-6602-g9876a78e50

git repository hosting git at public.arvados.org
Mon May 20 13:43:21 UTC 2024


        at  9876a78e50d1b3ad2571b6e77d92bcb2465d996e (commit)


commit 9876a78e50d1b3ad2571b6e77d92bcb2465d996e
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Mon May 20 09:39:43 2024 -0400

    21718: Add return_bytes_only flag (default true) to restore behavior
    
    Returning memoryview objects from ArvFile.read() produced unexpected
    regressions in arvados-cwl-runner.  Make the new behavior optional and
    restore the old behavior by default.  FUSE requests the new behavior
    explicitly.
    
    This is a stopgap for 2.7.3, we should plan to flip the default from
    old behavior to new behavior for 3.0.
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/python/arvados/arvfile.py b/sdk/python/arvados/arvfile.py
index de20254351..b21f3b406c 100644
--- a/sdk/python/arvados/arvfile.py
+++ b/sdk/python/arvados/arvfile.py
@@ -1092,8 +1092,16 @@ class ArvadosFile(object):
                     locs.add(lr.locator)
 
         if len(data) == 1:
-            return data[0]
+            if self.parent.return_bytes_only():
+                # return bytes (API behavior prior to 2.7.2)
+                return data[0].tobytes()
+            else:
+                # return memoryview (save a copy, significant
+                # performance improvement for FUSE)
+                return data[0]
         else:
+            # need to join multiple segments, will be copied and
+            # returned as bytes
             return b''.join(data)
 
     @must_be_writable
diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 1050d4c093..1084220ef8 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -177,6 +177,9 @@ class RichCollectionBase(CollectionBase):
     def _my_block_manager(self):
         raise NotImplementedError()
 
+    def return_bytes_only(self) -> bool:
+        raise NotImplementedError()
+
     def writable(self) -> bool:
         """Indicate whether this collection object can be modified
 
@@ -1042,7 +1045,8 @@ class Collection(RichCollectionBase):
                  block_manager: Optional['arvados.arvfile._BlockManager']=None,
                  replication_desired: Optional[int]=None,
                  storage_classes_desired: Optional[List[str]]=None,
-                 put_threads: Optional[int]=None):
+                 put_threads: Optional[int]=None,
+                 return_bytes_only: bool=True):
         """Initialize a Collection object
 
         Arguments:
@@ -1103,6 +1107,13 @@ class Collection(RichCollectionBase):
           simultaneously to upload data blocks to Keep. This value is used when
           building a new `block_manager`. It is unused when a `block_manager`
           is provided.
+
+        * return_bytes_only: bool --- If True, ArvFile read() will
+        only return actual 'bytes' objects. If False, allow ArvFile
+        read() to return a bytes-like object (memoryview) for better
+        efficiency, but with slightly reduced compatibility with code
+        expecting plain 'bytes' objects.
+
         """
 
         if storage_classes_desired and type(storage_classes_desired) is not list:
@@ -1111,6 +1122,7 @@ class Collection(RichCollectionBase):
         super(Collection, self).__init__(parent)
         self._api_client = api_client
         self._keep_client = keep_client
+        self._return_bytes_only = return_bytes_only
 
         # Use the keep client from ThreadSafeApiCache
         if self._keep_client is None and isinstance(self._api_client, ThreadSafeApiCache):
@@ -1163,6 +1175,9 @@ class Collection(RichCollectionBase):
     def root_collection(self) -> 'Collection':
         return self
 
+    def return_bytes_only(self) -> bool:
+        return self._return_bytes_only  # flag stored by __init__; without `return` this yielded None
+
     def get_properties(self) -> Properties:
         """Get this collection's properties
 
@@ -1788,6 +1803,9 @@ class Subcollection(RichCollectionBase):
     def stream_name(self) -> str:
         return os.path.join(self.parent.stream_name(), self.name)
 
+    def return_bytes_only(self) -> bool:
+        return self.root_collection().return_bytes_only()
+
     @synchronized
     def clone(
             self,
diff --git a/services/fuse/arvados_fuse/fusedir.py b/services/fuse/arvados_fuse/fusedir.py
index 9c78805107..4f4d148a35 100644
--- a/services/fuse/arvados_fuse/fusedir.py
+++ b/services/fuse/arvados_fuse/fusedir.py
@@ -568,11 +568,13 @@ class CollectionDirectory(CollectionDirectoryBase):
                         if uuid_pattern.match(self.collection_locator):
                             coll_reader = arvados.collection.Collection(
                                 self.collection_locator, self.api, self.api.keep,
-                                num_retries=self.num_retries)
+                                num_retries=self.num_retries,
+                                return_bytes_only=False)
                         else:
                             coll_reader = arvados.collection.CollectionReader(
                                 self.collection_locator, self.api, self.api.keep,
-                                num_retries=self.num_retries)
+                                num_retries=self.num_retries,
+                                return_bytes_only=False)
                         new_collection_record = coll_reader.api_response() or {}
                         # If the Collection only exists in Keep, there will be no API
                         # response.  Fill in the fields we need.

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list