[ARVADOS] updated: b1365ce7c1ccc74f479b1ebdf31b5da52028da84

git at public.curoverse.com
Fri Aug 8 15:46:56 EDT 2014


Summary of changes:
 .../app/controllers/application_controller.rb      |  6 +-
 .../workbench/app/controllers/groups_controller.rb |  4 +-
 apps/workbench/app/helpers/application_helper.rb   |  7 +-
 apps/workbench/app/models/group.rb                 |  2 +-
 apps/workbench/app/models/pipeline_instance.rb     |  9 ++-
 .../app/views/application/_choose.html.erb         |  8 +-
 .../app/views/application/_content_layout.html.erb |  2 +-
 .../views/pipeline_instances/_show_inputs.html.erb |  3 +-
 .../app/views/projects/_show_sharing.html.erb      |  6 ++
 .../app/views/users/_choose_rows.html.erb          |  2 +-
 .../test/integration/pipeline_instances_test.rb    |  2 +
 apps/workbench/test/integration/projects_test.rb   | 28 ++++++-
 crunch_scripts/decompress-all.py                   | 35 +++++----
 crunch_scripts/split-fastq.py                      | 89 +++++++++++-----------
 doc/api/schema/Group.html.textile.liquid           |  2 +-
 .../schema/PipelineTemplate.html.textile.liquid    |  6 +-
 sdk/cli/bin/crunch-job                             | 18 ++---
 sdk/python/arvados/commands/put.py                 |  1 -
 .../controllers/arvados/v1/groups_controller.rb    |  2 +-
 services/api/app/models/link.rb                    |  2 +-
 services/api/db/structure.sql                      |  1 +
 .../arvados/v1/groups_controller_test.rb           |  4 +-
 22 files changed, 145 insertions(+), 94 deletions(-)

       via  b1365ce7c1ccc74f479b1ebdf31b5da52028da84 (commit)
       via  6f231c2537c196644f15eb7cac6f6861ea24e429 (commit)
       via  f00ee875d99ba65aaac178e762fbd3e35ddc5f87 (commit)
       via  1d1069684c35353f9f0ee4020b76f5dfc7406a47 (commit)
       via  1865158987c87ef6e6b1c53db65615b17b6324af (commit)
       via  862a1fd23f072b7107595755996fc2ec1c62162b (commit)
       via  572400db0f322f365e2bf6849ceebc432a69191a (commit)
       via  b82d80addf80b9ead7c63f85f72613042b3b2c51 (commit)
       via  7962de491af28d00a9c88412ad4d1e42be83432a (commit)
       via  c29fc69a571bc2e4a4e450a09d94adbb305633bd (commit)
       via  3af6ef2d42e3e6efb3ea8e1f32d34293f9b1011e (commit)
       via  fca9bd08944a78cf0b6843e06598a5378cb31581 (commit)
       via  fc68033c06f6f4320a9f901d3e79cc57d8097a21 (commit)
       via  5ce1225be2b74fa8b2025b0f52e72b86629fecc1 (commit)
       via  2bf927358c5956c5296009948c45403c54787021 (commit)
       via  9ea365275a363d634908f2a1eb5926024e8e8803 (commit)
      from  2d8508738cc5ae65818f5e6a0ca4e0af15fb6b0c (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit b1365ce7c1ccc74f479b1ebdf31b5da52028da84
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Aug 8 15:46:49 2014 -0400

    3373: split-fastq: merge loops to capture both single and paired fastq files.
    Will now error out if there are fastq files in subdirectories instead of
    ignoring them silently.  Added comment about splitfastq() function.
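
For illustration (a sketch, not part of the commit): the merged loop keys off a single filename pattern, shown in the diff below, so one pass can classify paired and single fastq names:

    import re

    prog = re.compile(r'(.*?)(_[12])?\.fastq(\.gz)?$')

    # group(1) = base name, group(2) = "_1"/"_2" pair tag or None,
    # group(3) = ".gz" or None
    for name in ["reads_1.fastq", "reads_2.fastq.gz", "solo.fastq", "notes.txt"]:
        m = prog.match(name)
        if m is None:
            print "%s: not a fastq" % name
        else:
            print "%s: base=%r pair=%r gz=%r" % (name, m.group(1), m.group(2), m.group(3))

    # reads_1.fastq: base='reads' pair='_1' gz=None
    # reads_2.fastq.gz: base='reads' pair='_2' gz='.gz'
    # solo.fastq: base='solo' pair=None gz=None
    # notes.txt: not a fastq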

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index ece593d..8382eb1 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -14,8 +14,6 @@ manifest_text = ""
 
 inp = arvados.CollectionReader(arvados.getjobparam('reads'))
 
-prog = re.compile(r'(.*?)_1.fastq(.gz)?$')
-
 manifest_list = []
 
 chunking = False #arvados.getjobparam('chunking')
@@ -33,6 +31,17 @@ def nextline(reader, start):
             start += 128
     return n
 
+# Chunk a fastq into approximately 64 MiB chunks.  Requires that the input data
+# be decompressed ahead of time, such as using decompress-all.py.  Generates a
+# new manifest, but doesn't actually move any data around.  Handles paired
+# reads by ensuring that each chunk of a pair gets the same number of records.
+#
+# This works, but in practice is so slow that potential gains in alignment
+# performance are lost in the prep time, which is why it is currently disabled.
+#
+# A better algorithm would seek to a file position a bit less than the desired
+# chunk size and then scan ahead for the next record, making sure that record
+# was matched by the read pair.
 def splitfastq(p):
     for i in xrange(0, len(p)):
         p[i]["start"] = 0
@@ -64,17 +73,12 @@ def splitfastq(p):
         if splitnow:
             for i in xrange(0, len(p)):
                 global manifest_list
-                print "Finish piece ./_%s/%s (%s %s)" % (piece, p[i]["reader"].name(), p[i]["start"], p[i]["end"])
+                print >>sys.stderr, "Finish piece ./_%s/%s (%s %s)" % (piece, p[i]["reader"].name(), p[i]["start"], p[i]["end"])
                 manifest = []
                 manifest.extend(["./_" + str(piece)])
                 manifest.extend([d[arvados.LOCATOR] for d in p[i]["reader"]._stream._data_locators])
-
-                print p[i]
-                print arvados.locators_and_ranges(p[i]["reader"].segments, p[i]["start"], p[i]["end"] - p[i]["start"])
-
                 manifest.extend(["{}:{}:{}".format(seg[arvados.LOCATOR]+seg[arvados.OFFSET], seg[arvados.SEGMENTSIZE], p[i]["reader"].name().replace(' ', '\\040')) for seg in arvados.locators_and_ranges(p[i]["reader"].segments, p[i]["start"], p[i]["end"] - p[i]["start"])])
                 manifest_list.append(manifest)
-                print "Finish piece %s" % (" ".join(manifest))
                 p[i]["start"] = p[i]["end"]
             piece += 1
         else:
@@ -82,44 +86,43 @@ def splitfastq(p):
                 p[i]["end"] += recordsize[i]
             count += 1
             if count % 10000 == 0:
-                print "Record %s at %s" % (count, p[i]["end"])
+                print >>sys.stderr, "Record %s at %s" % (count, p[i]["end"])
+
+prog = re.compile(r'(.*?)(_[12])?\.fastq(\.gz)?$')
 
+# Look for fastq files
 for s in inp.all_streams():
-    if s.name() == ".":
-        for f in s.all_files():
-            result = prog.match(f.name())
-            if result != None:
-                p = [{}, {}]
-                p[0]["reader"] = s.files()[result.group(0)]
-                if result.group(2) != None:
-                    p[1]["reader"] = s.files()[result.group(1) + "_2.fastq" + result.group(2)]
-                else:
-                    p[1]["reader"] = s.files()[result.group(1) + "_2.fastq"]
-                if chunking:
-                    splitfastq(p)
-                else:
-                    m0 = p[0]["reader"].as_manifest()[1:]
-                    m1 = p[1]["reader"].as_manifest()[1:]
-                    manifest_list.append(["./_" + str(piece), m0[:-1]])
-                    manifest_list.append(["./_" + str(piece), m1[:-1]])
-                    piece += 1
-
-# No pairs found so just put each fastq file into a separate directory
-if len(manifest_list) == 0:
-    for s in inp.all_streams():
-        prog = re.compile("(.*?).fastq(.gz)?$")
-        if s.name() == ".":
-            for f in s.all_files():
-                result = prog.match(f.name())
-                if result != None:
-                    p = [{}]
-                    p[0]["reader"] = s.files()[result.group(0)]
-                    if chunking:
-                        splitfastq(p)
+    for f in s.all_files():
+        name_pieces = prog.match(f.name())
+        if name_pieces != None:
+            if s.name() != ".":
+                # The downstream tool (run-command) only iterates over the top
+                # level of directories so if there are fastq files in
+                # directories in the input, the choice is either to forget
+                # there are directories (which might lead to name conflicts) or
+                # just fail.
+                print >>sys.stderr, "fastq must be at the root of the collection")
+                sys.exit(1)
+
+            if name_pieces.group(2) != None:
+                if name_pieces.group(2) == "_1":
+                    p = [{}, {}]
+                    p[0]["reader"] = s.files()[name_pieces.group(0)]
+                    if name_pieces.group(3) != None:
+                        p[1]["reader"] = s.files()[name_pieces.group(1) + "_2.fastq" + name_pieces.group(3)]
                     else:
-                        m0 = p[0]["reader"].as_manifest()[1:]
-                        manifest_list.append(["./_" + str(piece), m0])
-                        piece += 1
+                        p[1]["reader"] = s.files()[name_pieces.group(1) + "_2.fastq"]
+                else:
+                    # A "_2" file: skip it here, it is handled as the mate of
+                    # the corresponding "_1" file above.
+                    continue
+            else:
+                p = [{}]
+                p[0]["reader"] = s.files()[name_pieces.group(0)]
+
+            if chunking:
+                splitfastq(p)
+            else:
+                for i in xrange(0, len(p)):
+                    m = p[i]["reader"].as_manifest()[1:]
+                    manifest_list.append(["./_" + str(piece), m[:-1]])
+                piece += 1
 
 manifest_text = "\n".join(" ".join(m) for m in manifest_list)
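
The splitfastq() comment above outlines a faster chunking approach: seek to just under the desired chunk size, then scan forward to the next record boundary. A rough sketch of that idea (a hypothetical helper, not part of the commit; it assumes an uncompressed fastq in a seekable file object and leaves out the paired-read constraint the comment mentions):

    def next_record_offset(f, target):
        # Jump near the target offset, discard the partial line there, then
        # look for a plausible fastq record start: an "@" header line whose
        # third following line is the "+" separator.
        f.seek(target)
        f.readline()
        while True:
            pos = f.tell()
            rec = [f.readline() for _ in xrange(4)]
            if not rec[0]:
                return None               # EOF before a full record
            if rec[0].startswith("@") and rec[2].startswith("+"):
                return pos                # plausible record boundary
            f.seek(pos)
            f.readline()                  # slide forward one line and retry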
 

commit 6f231c2537c196644f15eb7cac6f6861ea24e429
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Aug 8 15:44:51 2014 -0400

    3373: decompress-all: cleaned up code a bit.  Fail properly with error message on bad input.

diff --git a/crunch_scripts/decompress-all.py b/crunch_scripts/decompress-all.py
index 8fa49f5..460425f 100755
--- a/crunch_scripts/decompress-all.py
+++ b/crunch_scripts/decompress-all.py
@@ -17,6 +17,7 @@ import arvados
 import re
 import subprocess
 import os
+import sys
 
 arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
                                           input_as_path=True)
@@ -25,27 +26,31 @@ task = arvados.current_task()
 
 input_file = task['parameters']['input']
 
-result = re.match(r"(^[a-f0-9]{32}\+\d+)(\+\S+)*(/.*)(/[^/]+)$", input_file)
+infile_parts = re.match(r"(^[a-f0-9]{32}\+\d+)(\+\S+)*(/.*)?(/[^/]+)$", input_file)
 
 outdir = os.path.join(task.tmpdir, "output")
 os.makedirs(outdir)
 os.chdir(outdir)
 
-if result != None:
-    cr = arvados.CollectionReader(result.group(1))
-    streamname = result.group(3)[1:]
-    filename = result.group(4)[1:]
+if infile_parts == None:
+    print >>sys.stderr, "Failed to parse input filename '%s' as a Keep file\n" % input_file
+    sys.exit(1)
+
+cr = arvados.CollectionReader(infile_parts.group(1))
+streamname = infile_parts.group(3)[1:] if infile_parts.group(3) != None else None
+filename = infile_parts.group(4)[1:]
 
+if streamname != None:
     subprocess.call(["mkdir", "-p", streamname])
     os.chdir(streamname)
-    streamreader = filter(lambda s: s.name() == streamname, cr.all_streams())[0]
-    filereader = streamreader.files()[filename]
-    rc = subprocess.call(["dtrx", "-r", "-n", "-q", arvados.get_task_param_mount('input')])
-    if rc == 0:
-        out = arvados.CollectionWriter()
-        out.write_directory_tree(outdir, max_manifest_depth=0)
-        task.set_output(out.finish())
-    else:
-        task.set_output(streamname + filereader.as_manifest()[1:])
 else:
-    sys.exit(1)
+    streamname = '.'
+streamreader = filter(lambda s: s.name() == streamname, cr.all_streams())[0]
+filereader = streamreader.files()[filename]
+rc = subprocess.call(["dtrx", "-r", "-n", "-q", arvados.get_task_param_mount('input')])
+if rc == 0:
+    out = arvados.CollectionWriter()
+    out.write_directory_tree(outdir, max_manifest_depth=0)
+    task.set_output(out.finish())
+else:
+    task.set_output(streamname + filereader.as_manifest()[1:])
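
For illustration (not part of the commit), here is what the new infile_parts pattern extracts from a Keep path; the locator below is made up (the MD5 of the empty string):

    import re

    pat = r"(^[a-f0-9]{32}\+\d+)(\+\S+)*(/.*)?(/[^/]+)$"

    m = re.match(pat, "d41d8cd98f00b204e9800998ecf8427e+0/subdir/reads.fastq.gz")
    print m.group(1)   # d41d8cd98f00b204e9800998ecf8427e+0 (collection locator)
    print m.group(3)   # /subdir (stream directory)
    print m.group(4)   # /reads.fastq.gz (file name)

    # With no stream directory, group(3) is None, which is why the script
    # must not blindly strip its leading "/".
    m = re.match(pat, "d41d8cd98f00b204e9800998ecf8427e+0/reads.fastq.gz")
    print m.group(3)   # None
    print m.group(4)   # /reads.fastq.gz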

commit f00ee875d99ba65aaac178e762fbd3e35ddc5f87
Merge: 2d85087 1d10696
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Aug 7 10:49:50 2014 -0400

    Merge branch 'master' into 3373-improve-gatk3-snv-pipeline


-----------------------------------------------------------------------


hooks/post-receive
-- 



