[ARVADOS] updated: d94d24f806debac76b29896bf486067be3cb084c
git at public.curoverse.com
git at public.curoverse.com
Tue Aug 5 13:29:31 EDT 2014
Summary of changes:
crunch_scripts/split-fastq.py | 62 ++++++++++++++++++++++++++++++++++++++-----
1 file changed, 56 insertions(+), 6 deletions(-)
via d94d24f806debac76b29896bf486067be3cb084c (commit)
from 9d476c8e51784747298293864026252852480df0 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit d94d24f806debac76b29896bf486067be3cb084c
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Aug 5 13:29:28 2014 -0400
Working on split-fastq to actually split the fastq file.
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 42b8e61..fc61142 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -17,6 +17,54 @@ prog = re.compile("(.*?)_1.fastq(.gz)?$")
manifest_text = ""
def readline(reader, start):
    """Return the line of text that begins at offset `start` in `reader`.

    `reader` is any object with a ``readfrom(offset, size)`` method that
    returns up to `size` characters beginning at `offset`, and an empty
    string once there is nothing left to read.

    The returned string includes its terminating newline (when the data
    has one), so that ``len(result)`` is exactly the amount of data
    consumed and an empty result always means end-of-data rather than a
    blank line.  A final line with no trailing newline is returned as-is.
    """
    chunks = []
    while True:
        chunk = reader.readfrom(start, 1024)
        if not chunk:
            # End of data: hand back whatever was gathered so far.
            break
        newline_at = chunk.find("\n")
        if newline_at != -1:
            chunks.append(chunk[:newline_at + 1])
            break
        # No newline in this chunk: keep all of it and read on.  (Slicing
        # with the -1 "not found" index would silently drop a character.)
        chunks.append(chunk)
        start += len(chunk)
    return "".join(chunks)


def _emit_piece(p):
    """Append one manifest line per file in `p` for its [start, end) range.

    Each line is written under the stream name ``./_<piece>`` and, once
    written, the file's "start" is moved up to its "end" so the next piece
    begins where this one stopped.
    """
    global manifest_text
    for entry in p:
        reader = entry["reader"]
        length = entry["end"] - entry["start"]
        tokens = ["./_" + str(piece)]
        tokens.extend(d[arvados.LOCATOR] for d in reader._stream._data_locators)
        # NOTE(review): LOCATOR and BLOCKSIZE were unqualified names in the
        # original; they are qualified here to match the arvados.BLOCKSIZE
        # use in splitfastq().  Confirm these are the right tuple indices
        # for the position and size fields of a manifest file token.
        tokens.extend(
            "{}:{}:{}".format(seg[arvados.LOCATOR],
                              seg[arvados.BLOCKSIZE],
                              reader.name().replace(' ', '\\040'))
            for seg in arvados.locators_and_ranges(reader.segments,
                                                   entry["start"],
                                                   length))
        manifest_text += " ".join(tokens) + "\n"
        entry["start"] = entry["end"]


def splitfastq(p):
    """Split the fastq file(s) described by `p` into pieces of whole records.

    `p` is a list of dicts, one per file, each holding a file reader under
    the key "reader".  The files are walked in lock step, one four-line
    record at a time, so that paired files are always cut at the same
    record.  Whenever adding the next record would bring any file's
    current piece up to arvados.BLOCKSIZE, the piece gathered so far is
    appended to the global `manifest_text` and the global `piece` counter
    is advanced.  The last piece is written when the input runs out, but
    `piece` is left alone at that point because the caller increments it
    after this function returns.

    Each dict in `p` gains "start" and "end" keys (offsets of the piece
    currently being gathered).  A truncated trailing record (fewer than
    four lines in any file) is not included in the output.
    """
    global piece

    for entry in p:
        entry["start"] = 0
        entry["end"] = 0

    while True:
        recordsize = [0] * len(p)

        # Read the next 4 lines of every file, starting just past the
        # records already accepted plus the lines of this record read so far.
        at_eof = False
        for ln in range(4):
            for i, entry in enumerate(p):
                r = readline(entry["reader"], entry["end"] + recordsize[i])
                if r == '':
                    at_eof = True
                    break
                recordsize[i] += len(r)
            if at_eof:
                break

        pending = any(entry["end"] > entry["start"] for entry in p)

        if at_eof:
            # Out of input: flush whatever complete records are pending.
            if pending:
                _emit_piece(p)
            return

        # NOTE(review): arvados.BLOCKSIZE is used here as a size threshold
        # but is also used as a tuple index when building file tokens;
        # confirm it really is the intended piece size.
        splitnow = any(
            (entry["end"] - entry["start"]) + recordsize[i] >= arvados.BLOCKSIZE
            for i, entry in enumerate(p))

        # Only cut when there is something to emit; otherwise a single
        # record at or above the threshold would never make progress.
        if splitnow and pending:
            _emit_piece(p)
            piece += 1

        # The record just read becomes part of the current piece.
        for i, entry in enumerate(p):
            entry["end"] += recordsize[i]
+
+
for s in inp.all_streams():
if s.name() == ".":
for f in s.all_files():
@@ -25,10 +73,11 @@ for s in inp.all_streams():
p = [{}, {}]
p[0]["reader"] = s.files()[result.group(0)]
p[1]["reader"] = s.files()[result.group(1) + "_2.fastq" + result.group(2)]
- m0 = p[0]["reader"].as_manifest()[1:]
- m1 = p[1]["reader"].as_manifest()[1:]
- manifest_text += "./_" + str(piece) + m0
- manifest_text += "./_" + str(piece) + m1
+ splitfastq(p)
+ #m0 = p[0]["reader"].as_manifest()[1:]
+ #m1 = p[1]["reader"].as_manifest()[1:]
+ #manifest_text += "./_" + str(piece) + m0
+ #manifest_text += "./_" + str(piece) + m1
piece += 1
# No pairs found so just put each fastq file into a separate directory
@@ -41,8 +90,9 @@ if manifest_text == "":
if result != None:
p = [{}]
p[0]["reader"] = s.files()[result.group(0)]
- m0 = p[0]["reader"].as_manifest()[1:]
- manifest_text += "./_" + str(piece) + m0
+ splitfastq(p)
+ #m0 = p[0]["reader"].as_manifest()[1:]
+ #manifest_text += "./_" + str(piece) + m0
piece += 1
arvados.current_task().set_output(manifest_text)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list