[ARVADOS] created: 9c9b52038aa8b9c15f02567d186539fd8794d0f2

Git user git at public.curoverse.com
Thu Feb 2 17:14:04 EST 2017


        at  9c9b52038aa8b9c15f02567d186539fd8794d0f2 (commit)


commit 9c9b52038aa8b9c15f02567d186539fd8794d0f2
Author: Lucas Di Pentima <lucas at curoverse.com>
Date:   Thu Feb 2 19:10:45 2017 -0300

    10932: Replaced the list with a set when checking whether files in the local collection are on the local file list, so that the resume start time is greatly reduced.
    Also, the save_state method was spending too much time on two operations: deepcopy() and json.dump(). Replaced both with a single call to json.dumps(), which is a lot faster than json.dump().
    This will improve overall performance on uploads of big file collections.

diff --git a/sdk/python/arvados/commands/put.py b/sdk/python/arvados/commands/put.py
index 5b46ba7..2fbac22 100644
--- a/sdk/python/arvados/commands/put.py
+++ b/sdk/python/arvados/commands/put.py
@@ -436,9 +436,11 @@ class ArvPutUploadJob(object):
                 raise ArvPutUploadNotPending()
             # Remove local_collection's files that don't exist locally anymore, so the
             # bytes_written count is correct.
+            # Using a set because it is a lot faster than a list in this case
+            file_paths = set(self._file_paths)
             for f in self.collection_file_paths(self._local_collection,
                                                 path_prefix=""):
-                if f != 'stdin' and f != self.filename and not f in self._file_paths:
+                if f != 'stdin' and f != self.filename and not f in file_paths:
                     self._local_collection.remove(f)
             # Update bytes_written from current local collection and
             # report initial progress.
@@ -703,12 +705,12 @@ class ArvPutUploadJob(object):
         """
         try:
             with self._state_lock:
-                state = copy.deepcopy(self._state)
+                state = json.dumps(self._state)
             new_cache_fd, new_cache_name = tempfile.mkstemp(
                 dir=os.path.dirname(self._cache_filename))
             self._lock_file(new_cache_fd)
             new_cache = os.fdopen(new_cache_fd, 'r+')
-            json.dump(state, new_cache)
+            new_cache.write(state)
             new_cache.flush()
             os.fsync(new_cache)
             os.rename(new_cache_name, self._cache_filename)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list