[ARVADOS] created: cd8865242a72db92cb2c6a269ca209e06b0a31ef
git at public.curoverse.com
git at public.curoverse.com
Wed Jul 30 13:51:10 EDT 2014
at cd8865242a72db92cb2c6a269ca209e06b0a31ef (commit)
commit cd8865242a72db92cb2c6a269ca209e06b0a31ef
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 13:37:50 2014 -0400
str or unicode
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index b915ef9..a92329a 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -85,7 +85,7 @@ def expand_item(p, c):
return r
elif isinstance(c, list):
return expand_list(p, c)
- elif isinstance(c, str):
+ elif isinstance(c, str) or isinstance(c, unicode):
return [subst.do_substitution(p, c)]
return []
commit 629ff557c7cb0e94080d3faed8b5ed4c53119f29
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 13:36:08 2014 -0400
extra / after dir
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index f73410c..b915ef9 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -102,7 +102,7 @@ def get_items(p, value):
prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
if mode != None:
if stat.S_ISDIR(mode):
- items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
+ items = ["$(dir %s/%s/)" % (prefix, l) for l in os.listdir(fn)]
elif stat.S_ISREG(mode):
with open(fn) as f:
items = [line for line in f]
commit 2b18c1a62315bf7eb407d75923d8d3e10b4c7cd2
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 13:32:07 2014 -0400
Better error from glob
diff --git a/crunch_scripts/subst.py b/crunch_scripts/subst.py
index 2598e1c..8154d0e 100644
--- a/crunch_scripts/subst.py
+++ b/crunch_scripts/subst.py
@@ -44,7 +44,11 @@ def sub_basename(v):
return os.path.splitext(os.path.basename(v))[0]
def sub_glob(v):
- return glob.glob(v)[0]
+ l = glob.glob(v)
+ if len(l) == 0:
+ raise Exception("$(glob): No match on '%s'" % v)
+ else:
+ return l[0]
default_subs = {"file ": sub_file,
"dir ": sub_dir,
commit b0dff193d231456b36bed0bca7eac7f78997a8ae
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 13:25:51 2014 -0400
Pretty print
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 4708bd8..f73410c 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -13,6 +13,7 @@ import signal
import stat
import copy
import traceback
+import pprint
os.umask(0077)
@@ -147,7 +148,8 @@ try:
except Exception as e:
print("run-command: caught exception:")
traceback.print_exc(file=sys.stdout)
- print("run-command: parameters is %s" % p)
+ print("run-command: task parameters was:")
+ pprint.pprint(p)
sys.exit(1)
try:
commit 529efd199afb26cdc7b8a422ab8b69bcee1b3ec8
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 13:23:15 2014 -0400
Rearranged exception handling a little more
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 30f00f3..4708bd8 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -109,6 +109,9 @@ def get_items(p, value):
else:
return None
+stdoutname = None
+stdoutfile = None
+
try:
if "task.foreach" in jobp:
if arvados.current_task()['sequence'] == 0:
@@ -135,6 +138,12 @@ try:
cmd = expand_list(p, p["command"])
+ if "save.stdout" in p:
+ stdoutname = subst.do_substitution(p, p["save.stdout"])
+ stdoutfile = open(stdoutname, "wb")
+
+ print("run-command: {}{}".format(' '.join(cmd), (" > " + stdoutname) if stdoutname != None else ""))
+
except Exception as e:
print("run-command: caught exception:")
traceback.print_exc(file=sys.stdout)
@@ -142,14 +151,6 @@ except Exception as e:
sys.exit(1)
try:
- stdoutname = None
- stdoutfile = None
- if "save.stdout" in p:
- stdoutname = subst.do_substitution(p, p["save.stdout"])
- stdoutfile = open(stdoutname, "wb")
-
- print("run-command: {}{}".format(' '.join(cmd), (" > " + stdoutname) if stdoutname != None else ""))
-
sp = subprocess.Popen(cmd, shell=False, stdout=stdoutfile)
sig = SigHandler()
commit 528280155a4a963187736364f74835f2873fc073
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 13:20:52 2014 -0400
More logging
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 840b4b6..30f00f3 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -109,33 +109,39 @@ def get_items(p, value):
else:
return None
-if "task.foreach" in jobp:
- if arvados.current_task()['sequence'] == 0:
- var = jobp["task.foreach"]
- items = get_items(jobp, jobp[var])
- if items != None:
+try:
+ if "task.foreach" in jobp:
+ if arvados.current_task()['sequence'] == 0:
+ var = jobp["task.foreach"]
+ items = get_items(jobp, jobp[var])
print("run-command: parallelizing on %s with items %s" % (var, items))
+ if items != None:
+ for i in items:
+ params = copy.copy(jobp)
+ params[var] = i
+ arvados.api().job_tasks().create(body={
+ 'job_uuid': arvados.current_job()['uuid'],
+ 'created_by_job_task_uuid': arvados.current_task()['uuid'],
+ 'sequence': 1,
+ 'parameters': params
+ }
+ ).execute()
+ arvados.current_task().set_output(None)
+ sys.exit(0)
+ else:
+ sys.exit(1)
+ else:
+ p = jobp
- for i in items:
- params = copy.copy(jobp)
- params[var] = i
- arvados.api().job_tasks().create(body={
- 'job_uuid': arvados.current_job()['uuid'],
- 'created_by_job_task_uuid': arvados.current_task()['uuid'],
- 'sequence': 1,
- 'parameters': params
- }
- ).execute()
- arvados.current_task().set_output(None)
- sys.exit(0)
- else:
- sys.exit(1)
-else:
- p = jobp
-
-try:
cmd = expand_list(p, p["command"])
+except Exception as e:
+ print("run-command: caught exception:")
+ traceback.print_exc(file=sys.stdout)
+ print("run-command: parameters is %s" % p)
+ sys.exit(1)
+
+try:
stdoutname = None
stdoutfile = None
if "save.stdout" in p:
commit 74fb942ca46cf6f039b961b572f9c0f160a24b93
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 13:15:51 2014 -0400
Print traceback on exception
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 1ff985c..840b4b6 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -12,6 +12,7 @@ import arvados.commands.put as put
import signal
import stat
import copy
+import traceback
os.umask(0077)
@@ -161,7 +162,8 @@ try:
print("run-command: completed with exit code %i (%s)" % (rcode, "success" if rcode == 0 else "failed"))
except Exception as e:
- print("run-command: caught exception: {}".format(e))
+ print("run-command: caught exception:")
+ traceback.print_exc(file=sys.stdout)
# restore default signal handlers.
signal.signal(signal.SIGINT, signal.SIG_DFL)
@@ -198,7 +200,8 @@ while not done:
print("run-command: terminating on signal 2")
sys.exit(2)
except Exception as e:
- print("run-command: caught exception: {}".format(e))
+ print("run-command: caught exception:")
+ traceback.print_exc(file=sys.stdout)
time.sleep(5)
sys.exit(rcode)
commit adcf46d59e7da4f08fbc9a94dcae52bc573c24ad
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 11:40:29 2014 -0400
Set first task to success
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 1056925..1ff985c 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -125,6 +125,7 @@ if "task.foreach" in jobp:
'parameters': params
}
).execute()
+ arvados.current_task().set_output(None)
sys.exit(0)
else:
sys.exit(1)
commit 577c61793d8d9f2452b6b53d3791822d001e339c
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 11:36:05 2014 -0400
import copy
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 64d9c6c..1056925 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -11,6 +11,7 @@ import time
import arvados.commands.put as put
import signal
import stat
+import copy
os.umask(0077)
commit c3b19359bd21c0b2115c4169102327762e48a93b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 11:34:38 2014 -0400
.st_mode
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index aa30d2d..64d9c6c 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -95,7 +95,7 @@ def get_items(p, value):
return expand_list(p, value)
fn = subst.do_substitution(p, value)
- mode = os.stat(fn)
+ mode = os.stat(fn).st_mode
prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
if mode != None:
if stat.S_ISDIR(mode):
commit baf2c86b3af2d0b8db22aff5d6f427f025d767b7
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 11:32:41 2014 -0400
import stat
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 826cbb5..aa30d2d 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -10,6 +10,7 @@ import subst
import time
import arvados.commands.put as put
import signal
+import stat
os.umask(0077)
commit 1d13f6a6edd0f101d11edc55663d83ac40f6fbfe
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 11:30:47 2014 -0400
Fix syntax error
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 27ce822..826cbb5 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -99,7 +99,7 @@ def get_items(p, value):
if mode != None:
if stat.S_ISDIR(mode):
items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
- else stat.S_ISREG(mode):
+ elif stat.S_ISREG(mode):
with open(fn) as f:
items = [line for line in f]
return items
@@ -108,7 +108,7 @@ def get_items(p, value):
if "task.foreach" in jobp:
if arvados.current_task()['sequence'] == 0:
- var = jobp["foreach"]
+ var = jobp["task.foreach"]
items = get_items(jobp, jobp[var])
if items != None:
print("run-command: parallelizing on %s with items %s" % (var, items))
@@ -134,8 +134,8 @@ try:
stdoutname = None
stdoutfile = None
- if "stdout" in p:
- stdoutname = subst.do_substitution(p, p["stdout"])
+ if "save.stdout" in p:
+ stdoutname = subst.do_substitution(p, p["save.stdout"])
stdoutfile = open(stdoutname, "wb")
print("run-command: {}{}".format(' '.join(cmd), (" > " + stdoutname) if stdoutname != None else ""))
commit 99d5b548b50c573281d9780ecd631ecf83f4eefe
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 10:58:08 2014 -0400
Refactored and generalized list expansion, can use "foreach" to repeat portions
of the command line.
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 186b1bc..27ce822 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -68,19 +68,49 @@ class SigHandler(object):
sp.send_signal(signum)
self.sig = signum
-if "foreach" in jobp:
+def expand_item(p, c):
+ if isinstance(c, dict):
+ if "foreach" in c and "command" in c:
+ var = c["foreach"]
+ items = get_items(p, p[var])
+ r = []
+ for i in items:
+ params = copy.copy(p)
+ params[var] = i
+ r.extend(expand_list(params, c["command"]))
+ return r
+ elif isinstance(c, list):
+ return expand_list(p, c)
+ elif isinstance(c, str):
+ return [subst.do_substitution(p, c)]
+
+ return []
+
+def expand_list(p, l):
+ return [exp for arg in l for exp in expand_item(p, arg)]
+
+def get_items(p, value):
+ if isinstance(value, list):
+ return expand_list(p, value)
+
+ fn = subst.do_substitution(p, value)
+ mode = os.stat(fn)
+ prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
+ if mode != None:
+ if stat.S_ISDIR(mode):
+ items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
+ else stat.S_ISREG(mode):
+ with open(fn) as f:
+ items = [line for line in f]
+ return items
+ else:
+ return None
+
+if "task.foreach" in jobp:
if arvados.current_task()['sequence'] == 0:
var = jobp["foreach"]
- fn = subst.do_substitution(jobp, jobp[var])
- mode = os.stat(fn)
- prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
- if mode != None:
- if stat.S_ISDIR(mode):
- items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
- else stat.S_ISREG(mode):
- with open(fn) as f:
- items = [line for line in f]
-
+ items = get_items(jobp, jobp[var])
+ if items != None:
print("run-command: parallelizing on %s with items %s" % (var, items))
for i in items:
@@ -100,9 +130,7 @@ else:
p = jobp
try:
- cmd = []
- for c in p["command"]:
- cmd.append(subst.do_substitution(p, c))
+ cmd = expand_list(p, p["command"])
stdoutname = None
stdoutfile = None
commit 06cc4285949e8203a80f1b506b3551b731494dcf
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jul 30 10:54:41 2014 -0400
Simpler split-fastq, doesn't try to upload the collection, assume crunch-job
will create the collection for us.
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 065c396..42b8e61 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -9,36 +9,6 @@ api = arvados.api('v1')
piece = 0
manifest_text = ""
-def put_manifest(manifest_text, sources=[]):
- crm = arvados.CollectionReader(manifest_text)
-
- combined = crm.manifest_text(strip=True)
-
- m = hashlib.new('md5')
- m.update(combined)
-
- print combined
-
- uuid = "{}+{}".format(m.hexdigest(), len(combined))
-
- collection = arvados.api().collections().create(
- body={'collection':{
- 'uuid': uuid,
- 'manifest_text': crm.manifest_text()
- }
- }).execute()
-
- for s in sources:
- l = arvados.api().links().create(body={
- "link": {
- "tail_uuid": s,
- "head_uuid": uuid,
- "link_class": "provenance",
- "name": "provided"
- }}).execute()
-
- return uuid
-
# Look for paired reads
inp = arvados.CollectionReader(arvados.getjobparam('reads'))
@@ -75,6 +45,4 @@ if manifest_text == "":
manifest_text += "./_" + str(piece) + m0
piece += 1
-print manifest_text
-
-arvados.current_task().set_output(put_manifest(manifest_text, [arvados.getjobparam('reads')]))
+arvados.current_task().set_output(manifest_text)
commit 1774a688181d771790c3c406ae45c99dd26df023
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 16:53:06 2014 -0400
Change collection create request
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index b51a334..065c396 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -22,9 +22,10 @@ def put_manifest(manifest_text, sources=[]):
uuid = "{}+{}".format(m.hexdigest(), len(combined))
collection = arvados.api().collections().create(
- body={
+ body={'collection':{
'uuid': uuid,
- 'manifest_text': crm.manifest_text(),
+ 'manifest_text': crm.manifest_text()
+ }
}).execute()
for s in sources:
commit d1332738de33f6e5c50b8d444590f4add9663853
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 16:48:13 2014 -0400
Debug manifest format to see why api server is rejecting it
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 55e7fcc..b51a334 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -17,6 +17,8 @@ def put_manifest(manifest_text, sources=[]):
m = hashlib.new('md5')
m.update(combined)
+ print combined
+
uuid = "{}+{}".format(m.hexdigest(), len(combined))
collection = arvados.api().collections().create(
@@ -54,8 +56,8 @@ for s in inp.all_streams():
p[1]["reader"] = s.files()[result.group(1) + "_2.fastq" + result.group(2)]
m0 = p[0]["reader"].as_manifest()[1:]
m1 = p[1]["reader"].as_manifest()[1:]
- manifest_text += "_" + str(piece) + m0
- manifest_text += "_" + str(piece) + m1
+ manifest_text += "./_" + str(piece) + m0
+ manifest_text += "./_" + str(piece) + m1
piece += 1
# No pairs found so just put each fastq file into a separate directory
@@ -69,7 +71,7 @@ if manifest_text == "":
p = [{}]
p[0]["reader"] = s.files()[result.group(0)]
m0 = p[0]["reader"].as_manifest()[1:]
- manifest_text += "_" + str(piece) + m0
+ manifest_text += "./_" + str(piece) + m0
piece += 1
print manifest_text
commit 6755f71e4b5e9474f24f8d79739feb775d91496b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 16:43:16 2014 -0400
debug print manifest_text
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 1c0fae5..55e7fcc 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -72,4 +72,6 @@ if manifest_text == "":
manifest_text += "_" + str(piece) + m0
piece += 1
+print manifest_text
+
arvados.current_task().set_output(put_manifest(manifest_text, [arvados.getjobparam('reads')]))
commit a8e5a5ee592e9e71c1ce80247212e33d61ae2381
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 16:41:01 2014 -0400
Left debugging in by accident
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index d3faa9b..1c0fae5 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -40,9 +40,6 @@ def put_manifest(manifest_text, sources=[]):
inp = arvados.CollectionReader(arvados.getjobparam('reads'))
-with open("/home/peter/manifest") as f:
- inp = arvados.CollectionReader(f.read())
-
prog = re.compile("(.*?)_1.fastq(.gz)?$")
manifest_text = ""
commit e4f893184833d43c4aed29a1825f4db08cb14d0e
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 16:39:45 2014 -0400
Same bug, different place
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 4c112fb..d3faa9b 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -75,4 +75,4 @@ if manifest_text == "":
manifest_text += "_" + str(piece) + m0
piece += 1
-arvados.current_task().set_output(put_manifest(manifest_text, [arvados.get_job_param('input')]))
+arvados.current_task().set_output(put_manifest(manifest_text, [arvados.getjobparam('reads')]))
commit 0cf8e41ee88c6cd6e2546958926d7a59991dab4e
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 16:38:16 2014 -0400
Input is "reads" not "input"
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index a4444d0..4c112fb 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -38,7 +38,7 @@ def put_manifest(manifest_text, sources=[]):
# Look for paired reads
-inp = arvados.CollectionReader(arvados.getjobparam('input'))
+inp = arvados.CollectionReader(arvados.getjobparam('reads'))
with open("/home/peter/manifest") as f:
inp = arvados.CollectionReader(f.read())
commit 2c7b8f8bee023c438c47aff2f35848bbf6c1ae5f
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 16:37:03 2014 -0400
typo fix get_job_param() -> getjobparam()
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 96ebfbd..a4444d0 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -38,7 +38,7 @@ def put_manifest(manifest_text, sources=[]):
# Look for paired reads
-inp = arvados.CollectionReader(arvados.get_job_param('input'))
+inp = arvados.CollectionReader(arvados.getjobparam('input'))
with open("/home/peter/manifest") as f:
inp = arvados.CollectionReader(f.read())
commit 5990ac0ad45146fad60d151b24a39c15e72a1d87
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 16:32:50 2014 -0400
foreach just refers to other parameter to use as iterator variable.
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index a7dcd25..186b1bc 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -70,9 +70,8 @@ class SigHandler(object):
if "foreach" in jobp:
if arvados.current_task()['sequence'] == 0:
- var = jobp["foreach"]["var"]
- values = jobp["foreach"]["values"]
- fn = subst.do_substitution(jobp, values)
+ var = jobp["foreach"]
+ fn = subst.do_substitution(jobp, jobp[var])
mode = os.stat(fn)
prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
if mode != None:
commit 925a830167cb2af6ededf9a0f6d83a2e672452b8
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 16:10:21 2014 -0400
Added "foreach" to run-command wrapper. Needs testing.
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index e6ec889..a7dcd25 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -23,10 +23,10 @@ os.mkdir("output")
os.chdir("output")
+taskp = None
+jobp = arvados.current_job()['script_parameters']
if len(arvados.current_task()['parameters']) > 0:
p = arvados.current_task()['parameters']
-else:
- p = arvados.current_job()['script_parameters']
links = []
@@ -42,9 +42,17 @@ def sub_tmpdir(v):
def sub_cores(v):
return os.environ['CRUNCH_NODE_SLOTS']
+def sub_jobid(v):
+ return os.environ['JOB_UUID']
+
+def sub_taskid(v):
+ return os.environ['TASK_UUID']
+
subst.default_subs["link "] = sub_link
-subst.default_subs["tmpdir"] = sub_tmpdir
+subst.default_subs["task.tmpdir"] = sub_tmpdir
subst.default_subs["node.cores"] = sub_cores
+subst.default_subs["job.id"] = sub_jobid
+subst.default_subs["task.id"] = sub_taskid
rcode = 1
@@ -60,6 +68,38 @@ class SigHandler(object):
sp.send_signal(signum)
self.sig = signum
+if "foreach" in jobp:
+ if arvados.current_task()['sequence'] == 0:
+ var = jobp["foreach"]["var"]
+ values = jobp["foreach"]["values"]
+ fn = subst.do_substitution(jobp, values)
+ mode = os.stat(fn)
+ prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
+ if mode != None:
+ if stat.S_ISDIR(mode):
+ items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
+ else stat.S_ISREG(mode):
+ with open(fn) as f:
+ items = [line for line in f]
+
+ print("run-command: parallelizing on %s with items %s" % (var, items))
+
+ for i in items:
+ params = copy.copy(jobp)
+ params[var] = i
+ arvados.api().job_tasks().create(body={
+ 'job_uuid': arvados.current_job()['uuid'],
+ 'created_by_job_task_uuid': arvados.current_task()['uuid'],
+ 'sequence': 1,
+ 'parameters': params
+ }
+ ).execute()
+ sys.exit(0)
+ else:
+ sys.exit(1)
+else:
+ p = jobp
+
try:
cmd = []
for c in p["command"]:
@@ -101,7 +141,7 @@ signal.signal(signal.SIGQUIT, signal.SIG_DFL)
for l in links:
os.unlink(l)
-print("run-command: the follow output files will be saved to keep:")
+print("run-command: the following output files will be saved to keep:")
subprocess.call(["find", ".", "-type", "f", "-printf", "run-command: %12.12s %h/%f\\n"])
commit 95ec380b85dec7a9004bcfd05e613eeb773c9d60
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 15:32:16 2014 -0400
Don't split fastq, just organize into directories.
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index f42b2b0..96ebfbd 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -4,57 +4,11 @@ import arvados
import re
import hashlib
-#api = arvados.api('v1')
+api = arvados.api('v1')
piece = 0
manifest_text = ""
-def readline(reader, start):
- line = ""
- n = -1
- while n == -1:
- r = reader.readfrom(start, 1024)
- if r == '':
- break
- n = string.find(r, "\n")
- line += r[0:n]
- start += len(r)
- return line
-
-def splitfastq(p):
- for i in xrange(0, len(p)):
- p[i]["start"] = 0
- p[i]["end"] = 0
-
- while True:
- recordsize = [0, 0]
-
- # read 4 lines starting at "start"
- for ln in xrange(0, 4):
- for i in xrange(0, len(p)):
- r = readline(p[i]["reader"], p[i]["start"])
- if r == '':
- return
- recordsize[i] += len(r)
-
- splitnow = False
- for i in xrange(0, len(p)):
- if ((p[i]["end"] - p[i]["start"]) + recordsize[i]) >= arvados.BLOCKSIZE:
- splitnow = True
-
- if splitnow:
- for i in xrange(0, len(p)):
- manifest = []
- manifest.extend(str(piece))
- manifest.extend([d[LOCATOR] for d in p["reader"]._stream._data_locators])
- manifest.extend(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], self.name().replace(' ', '\\040')) for seg in arvados.locators_and_ranges(p[i]["reader"].segments, p[i]["start"], p[i]["end"] - p[i]["start"])])
- global manifest_text
- manifest_text += manifest.join(" ") + "\n"
- p[i]["start"] = p[i]["end"]
- else:
- for i in xrange(0, len(p)):
- p[i]["end"] += recordsize[i]
-
def put_manifest(manifest_text, sources=[]):
crm = arvados.CollectionReader(manifest_text)
@@ -82,26 +36,43 @@ def put_manifest(manifest_text, sources=[]):
return uuid
+# Look for paired reads
-# Look for pairs
-
-#inp = arvados.CollectionReader(arvados.get_job_param('input'))
+inp = arvados.CollectionReader(arvados.get_job_param('input'))
with open("/home/peter/manifest") as f:
inp = arvados.CollectionReader(f.read())
-prog = re.compile("(.*?)_1.fastq$")
+prog = re.compile("(.*?)_1.fastq(.gz)?$")
+
+manifest_text = ""
for s in inp.all_streams():
if s.name() == ".":
for f in s.all_files():
result = prog.match(f.name())
- print f.name()
if result != None:
p = [{}, {}]
- p[0]["reader"] = s.file(f)
- p[1]["reader"] = s.file(prog.group(1) + "_2.fastq")
-
-print manifest_text
-
-#arvados.current_task().set_output(put_manifest(manifest_text))
+ p[0]["reader"] = s.files()[result.group(0)]
+ p[1]["reader"] = s.files()[result.group(1) + "_2.fastq" + result.group(2)]
+ m0 = p[0]["reader"].as_manifest()[1:]
+ m1 = p[1]["reader"].as_manifest()[1:]
+ manifest_text += "_" + str(piece) + m0
+ manifest_text += "_" + str(piece) + m1
+ piece += 1
+
+# No pairs found so just put each fastq file into a separate directory
+if manifest_text == "":
+ for s in inp.all_streams():
+ prog = re.compile("(.*?).fastq(.gz)?$")
+ if s.name() == ".":
+ for f in s.all_files():
+ result = prog.match(f.name())
+ if result != None:
+ p = [{}]
+ p[0]["reader"] = s.files()[result.group(0)]
+ m0 = p[0]["reader"].as_manifest()[1:]
+ manifest_text += "_" + str(piece) + m0
+ piece += 1
+
+arvados.current_task().set_output(put_manifest(manifest_text, [arvados.get_job_param('input')]))
commit 783aff1187eba225008cd7db6fdd8307e46bc59b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Jul 29 14:59:56 2014 -0400
Full split-fastq wip
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
new file mode 100755
index 0000000..f42b2b0
--- /dev/null
+++ b/crunch_scripts/split-fastq.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python
+
+import arvados
+import re
+import hashlib
+
+#api = arvados.api('v1')
+
+piece = 0
+manifest_text = ""
+
+def readline(reader, start):
+ line = ""
+ n = -1
+ while n == -1:
+ r = reader.readfrom(start, 1024)
+ if r == '':
+ break
+ n = string.find(r, "\n")
+ line += r[0:n]
+ start += len(r)
+ return line
+
+def splitfastq(p):
+ for i in xrange(0, len(p)):
+ p[i]["start"] = 0
+ p[i]["end"] = 0
+
+ while True:
+ recordsize = [0, 0]
+
+ # read 4 lines starting at "start"
+ for ln in xrange(0, 4):
+ for i in xrange(0, len(p)):
+ r = readline(p[i]["reader"], p[i]["start"])
+ if r == '':
+ return
+ recordsize[i] += len(r)
+
+ splitnow = False
+ for i in xrange(0, len(p)):
+ if ((p[i]["end"] - p[i]["start"]) + recordsize[i]) >= arvados.BLOCKSIZE:
+ splitnow = True
+
+ if splitnow:
+ for i in xrange(0, len(p)):
+ manifest = []
+ manifest.extend(str(piece))
+ manifest.extend([d[LOCATOR] for d in p["reader"]._stream._data_locators])
+ manifest.extend(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], self.name().replace(' ', '\\040')) for seg in arvados.locators_and_ranges(p[i]["reader"].segments, p[i]["start"], p[i]["end"] - p[i]["start"])])
+ global manifest_text
+ manifest_text += manifest.join(" ") + "\n"
+ p[i]["start"] = p[i]["end"]
+ else:
+ for i in xrange(0, len(p)):
+ p[i]["end"] += recordsize[i]
+
+def put_manifest(manifest_text, sources=[]):
+ crm = arvados.CollectionReader(manifest_text)
+
+ combined = crm.manifest_text(strip=True)
+
+ m = hashlib.new('md5')
+ m.update(combined)
+
+ uuid = "{}+{}".format(m.hexdigest(), len(combined))
+
+ collection = arvados.api().collections().create(
+ body={
+ 'uuid': uuid,
+ 'manifest_text': crm.manifest_text(),
+ }).execute()
+
+ for s in sources:
+ l = arvados.api().links().create(body={
+ "link": {
+ "tail_uuid": s,
+ "head_uuid": uuid,
+ "link_class": "provenance",
+ "name": "provided"
+ }}).execute()
+
+ return uuid
+
+
+# Look for pairs
+
+#inp = arvados.CollectionReader(arvados.get_job_param('input'))
+
+with open("/home/peter/manifest") as f:
+ inp = arvados.CollectionReader(f.read())
+
+prog = re.compile("(.*?)_1.fastq$")
+
+for s in inp.all_streams():
+ if s.name() == ".":
+ for f in s.all_files():
+ result = prog.match(f.name())
+ print f.name()
+ if result != None:
+ p = [{}, {}]
+ p[0]["reader"] = s.file(f)
+ p[1]["reader"] = s.file(prog.group(1) + "_2.fastq")
+
+print manifest_text
+
+#arvados.current_task().set_output(put_manifest(manifest_text))
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list