[ARVADOS] created: 1.1.4-257-g490286e

Git user git at public.curoverse.com
Thu May 10 15:23:35 EDT 2018


        at  490286e1fc9266b79967ba71b27801415e631ab0 (commit)


commit 490286e1fc9266b79967ba71b27801415e631ab0
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date:   Thu May 10 14:39:54 2018 -0400

    11907: Sort files for deterministic upload in a-c-r
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>

diff --git a/sdk/python/arvados/commands/run.py b/sdk/python/arvados/commands/run.py
index 831e496..22c345b 100644
--- a/sdk/python/arvados/commands/run.py
+++ b/sdk/python/arvados/commands/run.py
@@ -185,9 +185,12 @@ def uploadfiles(files, api, dry_run=False, num_retries=0,
         logger.info("$(input) is %s", pathprefix.rstrip('/'))
         pdh = "$(input)"
     else:
-        files = sorted(files, key=lambda x: x.fn)
+        files.sort(key=lambda f: f.fn)
+
         if collection is None:
             collection = arvados.collection.Collection(api_client=api, num_retries=num_retries)
+
+        filtered = []
         prev = ""
         for f in files:
             localpath = os.path.join(pathprefix, f.fn)
@@ -198,10 +201,26 @@ def uploadfiles(files, api, dry_run=False, num_retries=0,
                 # skip it because it starts with "/tmp/foo/"
                 continue
             prev = localpath
+            filtered.append(f)
+
+        # Sort such that for each directory, files are uploaded
+        # first, then subdirectories.
+        def keyfunc(f):
+            localpath = os.path.join(pathprefix, f.fn)
+            dn, fn = os.path.split(localpath)
+            return (dn, os.path.isdir(localpath), fn)
+
+        filtered.sort(key=keyfunc)
+
+        for f in filtered:
+            localpath = os.path.join(pathprefix, f.fn)
             if os.path.isfile(localpath):
                 write_file(collection, pathprefix, f.fn)
             elif os.path.isdir(localpath):
                 for root, dirs, iterfiles in os.walk(localpath):
+                    # Make the directory traversal deterministic
+                    dirs.sort()
+                    iterfiles.sort()
                     root = root[len(pathprefix):]
                     for src in iterfiles:
                         write_file(collection, pathprefix, os.path.join(root, src))
diff --git a/sdk/python/tests/test_arv_run.py b/sdk/python/tests/test_arv_run.py
index 1afc120..62a6701 100644
--- a/sdk/python/tests/test_arv_run.py
+++ b/sdk/python/tests/test_arv_run.py
@@ -7,6 +7,8 @@ import os
 import sys
 import tempfile
 import unittest
+import random
+import mock
 
 import arvados.commands.run as arv_run
 from . import arvados_testutil as tutil
@@ -26,3 +28,16 @@ class ArvRunTestCase(unittest.TestCase, tutil.VersionChecker):
             with self.assertRaises(SystemExit):
                 self.run_arv_run(['--version'])
         self.assertVersionOutput(out, err)
+
+    @mock.patch('arvados.commands.run.write_file')
+    def test_uploadfiles(self, write_file_mock):
+        path = os.getcwd()
+        files = [arv_run.statfile('', 'tests/upf/'+s) for s in ('a', 'b', 'b/y', 'c/x', 'd')]
+        random.shuffle(files)
+        mockcol = mock.MagicMock()
+        arv_run.uploadfiles(files, mock.MagicMock(), collection=mockcol)
+        write_file_mock.assert_has_calls([mock.call(mockcol, path+"/tests/upf/", 'a'),
+                                          mock.call(mockcol, path+"/tests/upf/", 'd'),
+                                          mock.call(mockcol, path+"/tests/upf/", 'b/x'),
+                                          mock.call(mockcol, path+"/tests/upf/", 'b/y'),
+                                          mock.call(mockcol, path+"/tests/upf/", 'c/x')])

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list