[ARVADOS] updated: d94d24f806debac76b29896bf486067be3cb084c

git at public.curoverse.com git at public.curoverse.com
Tue Aug 5 13:29:31 EDT 2014


Summary of changes:
 crunch_scripts/split-fastq.py | 62 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 6 deletions(-)

       via  d94d24f806debac76b29896bf486067be3cb084c (commit)
      from  9d476c8e51784747298293864026252852480df0 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit d94d24f806debac76b29896bf486067be3cb084c
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Aug 5 13:29:28 2014 -0400

    Working on split-fastq to actually split the fastq file.

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 42b8e61..fc61142 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -17,6 +17,54 @@ prog = re.compile("(.*?)_1.fastq(.gz)?$")
 
 manifest_text = ""
 
+def readline(reader, start):
+    line = ""
+    n = -1
+    while n == -1:
+        r = reader.readfrom(start, 1024)
+        if r == '':
+            break
+        n = string.find(r, "\n")
+        line += r[0:n]
+        start += len(r)
+    return line
+
+def splitfastq(p):
+    for i in xrange(0, len(p)):
+        p[i]["start"] = 0
+        p[i]["end"] = 0
+
+    while True:
+        recordsize = [0, 0]
+
+        # read 4 lines starting at "start"
+        for ln in xrange(0, 4):
+            for i in xrange(0, len(p)):
+                r = readline(p[i]["reader"], p[i]["start"])
+                if r == '':
+                    return
+                recordsize[i] += len(r)
+
+        splitnow = False
+        for i in xrange(0, len(p)):
+            if ((p[i]["end"] - p[i]["start"]) + recordsize[i]) >= arvados.BLOCKSIZE:
+                splitnow = True
+
+        if splitnow:
+            for i in xrange(0, len(p)):
+                global piece
+                global manifest_text
+                manifest = []
+                manifest.extend("./_" + str(piece))
+                manifest.extend([d[LOCATOR] for d in p["reader"]._stream._data_locators])
+                manifest.extend(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], self.name().replace(' ', '\\040')) for seg in arvados.locators_and_ranges(p[i]["reader"].segments, p[i]["start"], p[i]["end"] - p[i]["start"])])
+                manifest_text += manifest.join(" ") + "\n"
+                p[i]["start"] = p[i]["end"]
+        else:
+            for i in xrange(0, len(p)):
+                p[i]["end"] += recordsize[i]
+
+
 for s in inp.all_streams():
     if s.name() == ".":
         for f in s.all_files():
@@ -25,10 +73,11 @@ for s in inp.all_streams():
                 p = [{}, {}]
                 p[0]["reader"] = s.files()[result.group(0)]
                 p[1]["reader"] = s.files()[result.group(1) + "_2.fastq" + result.group(2)]
-                m0 = p[0]["reader"].as_manifest()[1:]
-                m1 = p[1]["reader"].as_manifest()[1:]
-                manifest_text += "./_" + str(piece) + m0
-                manifest_text += "./_" + str(piece) + m1
+                splitfastq(p)
+                #m0 = p[0]["reader"].as_manifest()[1:]
+                #m1 = p[1]["reader"].as_manifest()[1:]
+                #manifest_text += "./_" + str(piece) + m0
+                #manifest_text += "./_" + str(piece) + m1
                 piece += 1
 
 # No pairs found so just put each fastq file into a separate directory
@@ -41,8 +90,9 @@ if manifest_text == "":
                 if result != None:
                     p = [{}]
                     p[0]["reader"] = s.files()[result.group(0)]
-                    m0 = p[0]["reader"].as_manifest()[1:]
-                    manifest_text += "./_" + str(piece) + m0
+                    splitfastq(p)
+                    #m0 = p[0]["reader"].as_manifest()[1:]
+                    #manifest_text += "./_" + str(piece) + m0
                     piece += 1
 
 arvados.current_task().set_output(manifest_text)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list