[ARVADOS] updated: 0a87aad48d7fccfc4d7d56a8628370cb7370d792

git at public.curoverse.com git at public.curoverse.com
Fri May 1 14:43:45 EDT 2015


Summary of changes:
 crunch_scripts/crunchutil/vwd.py                | 70 +++++++++++++++----------
 crunch_scripts/run-command                      |  2 +-
 doc/user/topics/run-command.html.textile.liquid |  8 ++-
 3 files changed, 49 insertions(+), 31 deletions(-)

       via  0a87aad48d7fccfc4d7d56a8628370cb7370d792 (commit)
      from  f6fcf9f51d326fa0a42e9ebdd1343a6a6ef6d8e8 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 0a87aad48d7fccfc4d7d56a8628370cb7370d792
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri May 1 14:43:41 2015 -0400

    5787: Write files that are pointed to by symlinks that are outside of Keep.
    Catch exceptions.  Update documentation.

diff --git a/crunch_scripts/crunchutil/vwd.py b/crunch_scripts/crunchutil/vwd.py
index 4a97546..5b9edf5 100644
--- a/crunch_scripts/crunchutil/vwd.py
+++ b/crunch_scripts/crunchutil/vwd.py
@@ -1,8 +1,8 @@
 import arvados
 import os
-import robust_put
 import stat
 import arvados.commands.run
+import logging
 
 # Implements "Virtual Working Directory"
 # Provides a way of emulating a shared writable directory in Keep based
@@ -34,19 +34,19 @@ def checkout(source_collection, target_dir, keepmount=None):
             os.symlink(os.path.join(root, f), os.path.join(target_dir, rel, f))
 
 def checkin(target_dir):
-    """Write files in the target_dir to Keep.
+    """Write files in `target_dir` to Keep.
 
-    Symlinks into the keep mount in the output dir are efficiently added to the
-    collection with no data copying.
+    Regular files or symlinks to files outside the keep mount are written to
+    Keep as normal files (Keep does not support symlinks).
+
+    Symlinks to files in the keep mount will result in files in the new
+    collection which reference existing Keep blocks, no data copying necessary.
 
     Returns a new Collection object, with data flushed but the collection record
     not saved to the API.
 
     """
 
-    # delete symlinks, commit directory, merge manifests and return combined
-    # collection.
-
     outputcollection = arvados.collection.Collection(num_retries=5)
 
     if target_dir[-1:] != '/':
@@ -54,30 +54,42 @@ def checkin(target_dir):
 
     collections = {}
 
+    logger = logging.getLogger("arvados")
+
     for root, dirs, files in os.walk(target_dir):
         for f in files:
-            s = os.lstat(os.path.join(root, f))
-            if stat.S_ISLNK(s.st_mode):
-                # 1. check if it is a link into a collection
-                real = os.path.split(os.path.realpath(os.path.join(root, f)))
-                (pdh, branch) = arvados.commands.run.is_in_collection(real[0], real[1])
-                if pdh is not None:
-                    # 2. load collection
-                    if pdh not in collections:
-                        collections[pdh] = arvados.collection.CollectionReader(pdh,
-                                                                               api_client=outputcollection._my_api(),
-                                                                               keep_client=outputcollection._my_keep(),
-                                                                               num_retries=5)
-                    # 3. copy arvfile to new collection
-                    outputcollection.copy(branch, os.path.join(root[len(target_dir):], f), source_collection=collections[pdh])
-
-            elif stat.S_ISREG(s.st_mode):
-                reldir = root[len(target_dir):]
-                with outputcollection.open(os.path.join(reldir, f), "wb") as writer:
-                    with open(os.path.join(root, f), "rb") as reader:
-                        dat = reader.read(64*1024)
-                        while dat:
-                            writer.write(dat)
+            try:
+                s = os.lstat(os.path.join(root, f))
+
+                writeIt = False
+
+                if stat.S_ISREG(s.st_mode):
+                    writeIt = True
+                elif stat.S_ISLNK(s.st_mode):
+                    # 1. check if it is a link into a collection
+                    real = os.path.split(os.path.realpath(os.path.join(root, f)))
+                    (pdh, branch) = arvados.commands.run.is_in_collection(real[0], real[1])
+                    if pdh is not None:
+                        # 2. load collection
+                        if pdh not in collections:
+                            collections[pdh] = arvados.collection.CollectionReader(pdh,
+                                                                                   api_client=outputcollection._my_api(),
+                                                                                   keep_client=outputcollection._my_keep(),
+                                                                                   num_retries=5)
+                        # 3. copy arvfile to new collection
+                        outputcollection.copy(branch, os.path.join(root[len(target_dir):], f), source_collection=collections[pdh])
+                    else:
+                        writeIt = True
+
+                if writeIt:
+                    reldir = root[len(target_dir):]
+                    with outputcollection.open(os.path.join(reldir, f), "wb") as writer:
+                        with open(os.path.join(root, f), "rb") as reader:
                             dat = reader.read(64*1024)
+                            while dat:
+                                writer.write(dat)
+                                dat = reader.read(64*1024)
+            except (IOError, OSError) as e:
+                logger.error(e)
 
     return outputcollection
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index ae2233e..682ebfc 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -415,7 +415,7 @@ signal.signal(signal.SIGQUIT, signal.SIG_DFL)
 
 logger.info("the following output files will be saved to keep:")
 
-subprocess.call(["find", ".", "-printf", "run-command: %12.12s %h/%f\\n"], stdout=sys.stderr, cwd=outdir)
+subprocess.call(["find", "-L", ".", "-type", "f", "-printf", "run-command: %12.12s %h/%f\\n"], stdout=sys.stderr, cwd=outdir)
 
 logger.info("start writing output to keep")
 
diff --git a/doc/user/topics/run-command.html.textile.liquid b/doc/user/topics/run-command.html.textile.liquid
index f1d42ad..64f7f79 100644
--- a/doc/user/topics/run-command.html.textile.liquid
+++ b/doc/user/topics/run-command.html.textile.liquid
@@ -73,6 +73,12 @@ table(table table-bordered table-condensed).
 |$(dir ...)        | Takes a reference to an Arvados collection or directory within an Arvados collection and evaluates to a directory path on the local file system where that directory can be accessed by your command.  The path may include a file name, in which case it will evaluate to the parent directory of the file.  Uses Python's os.path.dirname(), so "/foo/bar" will evaluate to "/foo" but "/foo/bar/" will evaluate to "/foo/bar".  Will raise an error if the directory is not accessible. |
 |$(basename ...)   | Strip leading directory and trailing file extension from the path provided.  For example, $(basename /foo/bar.baz.txt) will evaluate to "bar.baz".|
 |$(glob ...)       | Take a Unix shell path pattern (supports @*@ @?@ and @[]@) and search the local filesystem, returning the first match found.  Use together with $(dir ...) to get a local filesystem path for Arvados collections.  For example: $(glob $(dir $(mycollection)/*.bam)) will find the first .bam file in the collection specified by the user parameter "mycollection".  If there is more than one match, which one is returned is undefined.  Will raise an error if no matches are found.|
+|$(task.tmpdir)|Get the designated temporary directory.  This directory will be discarded when the job completes.|
+|$(task.outdir)|Get the designated output directory.  The contents of this directory will be saved to Keep when the job completes.  A symlink to a file in the keep mount will reference existing Keep blocks in your job output collection, with no data copying or duplication.|
+|$(job.srcdir)|Get the path to your git checkout.|
+|$(node.cores)|Get the number of CPU cores on the node.|
+|$(job.uuid)|Get current job uuid.|
+|$(task.uuid)|Get current task uuid.|
 
 h3. Escape sequences
 
@@ -235,7 +241,7 @@ h3. task.vwd
 
 Background: because Keep collections are read-only, this does not play well with certain tools that expect to be able to write their outputs alongside their inputs (such as tools that generate indexes that are closely associated with the original file.)  The run-command's solution to this is the "virtual working directory".
 
-@task.vwd@ specifies a Keep collection with the starting contents of the directory.  @run-command@ will then populate @task.outdir@ with directories and symlinks to mirror the contents of the @task.vwd@ collection.  Your command will then be able to both access its input files and write its output files in @task.outdir@.  When the command completes, the output collection will merge the output of your command with the contents of the starting collection.  Note that files in the starting collection remain read-only and cannot be altered or deleted.
+@task.vwd@ specifies a Keep collection with the starting contents of the directory.  @run-command@ will then populate @task.outdir@ with directories and symlinks to mirror the contents of the @task.vwd@ collection.  Your command will then be able to both access its input files and write its output files in @task.outdir@.  When the command completes, the output collection will write both the output of your command and the contents of the starting collection.  Note that files from the starting collection remain read-only and cannot be altered, but may be deleted or renamed.
 
 h3. task.foreach
 

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list