[ARVADOS] created: cd8865242a72db92cb2c6a269ca209e06b0a31ef

git at public.curoverse.com git at public.curoverse.com
Wed Jul 30 13:51:10 EDT 2014


        at  cd8865242a72db92cb2c6a269ca209e06b0a31ef (commit)


commit cd8865242a72db92cb2c6a269ca209e06b0a31ef
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 13:37:50 2014 -0400

    str or unicode

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index b915ef9..a92329a 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -85,7 +85,7 @@ def expand_item(p, c):
             return r
     elif isinstance(c, list):
         return expand_list(p, c)
-    elif isinstance(c, str):
+    elif isinstance(c, str) or isinstance(c, unicode):
         return [subst.do_substitution(p, c)]
 
     return []

commit 629ff557c7cb0e94080d3faed8b5ed4c53119f29
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 13:36:08 2014 -0400

    extra / after dir

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index f73410c..b915ef9 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -102,7 +102,7 @@ def get_items(p, value):
     prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
     if mode != None:
         if stat.S_ISDIR(mode):
-            items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
+            items = ["$(dir %s/%s/)" % (prefix, l) for l in os.listdir(fn)]
         elif stat.S_ISREG(mode):
             with open(fn) as f:
                 items = [line for line in f]

commit 2b18c1a62315bf7eb407d75923d8d3e10b4c7cd2
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 13:32:07 2014 -0400

    Better error from glob

diff --git a/crunch_scripts/subst.py b/crunch_scripts/subst.py
index 2598e1c..8154d0e 100644
--- a/crunch_scripts/subst.py
+++ b/crunch_scripts/subst.py
@@ -44,7 +44,11 @@ def sub_basename(v):
     return os.path.splitext(os.path.basename(v))[0]
 
 def sub_glob(v):
-    return glob.glob(v)[0]
+    l = glob.glob(v)
+    if len(l) == 0:
+        raise Exception("$(glob): No match on '%s'" % v)
+    else:
+        return l[0]
 
 default_subs = {"file ": sub_file,
                 "dir ": sub_dir,

commit b0dff193d231456b36bed0bca7eac7f78997a8ae
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 13:25:51 2014 -0400

    Pretty print

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 4708bd8..f73410c 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -13,6 +13,7 @@ import signal
 import stat
 import copy
 import traceback
+import pprint
 
 os.umask(0077)
 
@@ -147,7 +148,8 @@ try:
 except Exception as e:
     print("run-command: caught exception:")
     traceback.print_exc(file=sys.stdout)
-    print("run-command: parameters is %s" % p)
+    print("run-command: task parameters was:")
+    pprint.pprint(p)
     sys.exit(1)
 
 try:

commit 529efd199afb26cdc7b8a422ab8b69bcee1b3ec8
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 13:23:15 2014 -0400

    Rearranged exception handling a little more

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 30f00f3..4708bd8 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -109,6 +109,9 @@ def get_items(p, value):
     else:
         return None
 
+stdoutname = None
+stdoutfile = None
+
 try:
     if "task.foreach" in jobp:
         if arvados.current_task()['sequence'] == 0:
@@ -135,6 +138,12 @@ try:
 
     cmd = expand_list(p, p["command"])
 
+    if "save.stdout" in p:
+        stdoutname = subst.do_substitution(p, p["save.stdout"])
+        stdoutfile = open(stdoutname, "wb")
+
+    print("run-command: {}{}".format(' '.join(cmd), (" > " + stdoutname) if stdoutname != None else ""))
+
 except Exception as e:
     print("run-command: caught exception:")
     traceback.print_exc(file=sys.stdout)
@@ -142,14 +151,6 @@ except Exception as e:
     sys.exit(1)
 
 try:
-    stdoutname = None
-    stdoutfile = None
-    if "save.stdout" in p:
-        stdoutname = subst.do_substitution(p, p["save.stdout"])
-        stdoutfile = open(stdoutname, "wb")
-
-    print("run-command: {}{}".format(' '.join(cmd), (" > " + stdoutname) if stdoutname != None else ""))
-
     sp = subprocess.Popen(cmd, shell=False, stdout=stdoutfile)
     sig = SigHandler()
 

commit 528280155a4a963187736364f74835f2873fc073
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 13:20:52 2014 -0400

    More logging

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 840b4b6..30f00f3 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -109,33 +109,39 @@ def get_items(p, value):
     else:
         return None
 
-if "task.foreach" in jobp:
-    if arvados.current_task()['sequence'] == 0:
-        var = jobp["task.foreach"]
-        items = get_items(jobp, jobp[var])
-        if items != None:
+try:
+    if "task.foreach" in jobp:
+        if arvados.current_task()['sequence'] == 0:
+            var = jobp["task.foreach"]
+            items = get_items(jobp, jobp[var])
             print("run-command: parallelizing on %s with items %s" % (var, items))
+            if items != None:
+                for i in items:
+                    params = copy.copy(jobp)
+                    params[var] = i
+                    arvados.api().job_tasks().create(body={
+                        'job_uuid': arvados.current_job()['uuid'],
+                        'created_by_job_task_uuid': arvados.current_task()['uuid'],
+                        'sequence': 1,
+                        'parameters': params
+                        }
+                    ).execute()
+                arvados.current_task().set_output(None)
+                sys.exit(0)
+            else:
+                sys.exit(1)
+    else:
+        p = jobp
 
-            for i in items:
-                params = copy.copy(jobp)
-                params[var] = i
-                arvados.api().job_tasks().create(body={
-                    'job_uuid': arvados.current_job()['uuid'],
-                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
-                    'sequence': 1,
-                    'parameters': params
-                    }
-                ).execute()
-            arvados.current_task().set_output(None)
-            sys.exit(0)
-        else:
-            sys.exit(1)
-else:
-    p = jobp
-
-try:
     cmd = expand_list(p, p["command"])
 
+except Exception as e:
+    print("run-command: caught exception:")
+    traceback.print_exc(file=sys.stdout)
+    print("run-command: parameters is %s" % p)
+    sys.exit(1)
+
+try:
     stdoutname = None
     stdoutfile = None
     if "save.stdout" in p:

commit 74fb942ca46cf6f039b961b572f9c0f160a24b93
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 13:15:51 2014 -0400

    Print traceback on exception

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 1ff985c..840b4b6 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -12,6 +12,7 @@ import arvados.commands.put as put
 import signal
 import stat
 import copy
+import traceback
 
 os.umask(0077)
 
@@ -161,7 +162,8 @@ try:
         print("run-command: completed with exit code %i (%s)" % (rcode, "success" if rcode == 0 else "failed"))
 
 except Exception as e:
-    print("run-command: caught exception: {}".format(e))
+    print("run-command: caught exception:")
+    traceback.print_exc(file=sys.stdout)
 
 # restore default signal handlers.
 signal.signal(signal.SIGINT, signal.SIG_DFL)
@@ -198,7 +200,8 @@ while not done:
         print("run-command: terminating on signal 2")
         sys.exit(2)
     except Exception as e:
-        print("run-command: caught exception: {}".format(e))
+        print("run-command: caught exception:")
+        traceback.print_exc(file=sys.stdout)
         time.sleep(5)
 
 sys.exit(rcode)

commit adcf46d59e7da4f08fbc9a94dcae52bc573c24ad
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 11:40:29 2014 -0400

    Set first task to success

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 1056925..1ff985c 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -125,6 +125,7 @@ if "task.foreach" in jobp:
                     'parameters': params
                     }
                 ).execute()
+            arvados.current_task().set_output(None)
             sys.exit(0)
         else:
             sys.exit(1)

commit 577c61793d8d9f2452b6b53d3791822d001e339c
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 11:36:05 2014 -0400

    import copy

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 64d9c6c..1056925 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -11,6 +11,7 @@ import time
 import arvados.commands.put as put
 import signal
 import stat
+import copy
 
 os.umask(0077)
 

commit c3b19359bd21c0b2115c4169102327762e48a93b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 11:34:38 2014 -0400

    .st_mode

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index aa30d2d..64d9c6c 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -95,7 +95,7 @@ def get_items(p, value):
         return expand_list(p, value)
 
     fn = subst.do_substitution(p, value)
-    mode = os.stat(fn)
+    mode = os.stat(fn).st_mode
     prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
     if mode != None:
         if stat.S_ISDIR(mode):

commit baf2c86b3af2d0b8db22aff5d6f427f025d767b7
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 11:32:41 2014 -0400

    import stat

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 826cbb5..aa30d2d 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -10,6 +10,7 @@ import subst
 import time
 import arvados.commands.put as put
 import signal
+import stat
 
 os.umask(0077)
 

commit 1d13f6a6edd0f101d11edc55663d83ac40f6fbfe
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 11:30:47 2014 -0400

    Fix syntax error

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 27ce822..826cbb5 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -99,7 +99,7 @@ def get_items(p, value):
     if mode != None:
         if stat.S_ISDIR(mode):
             items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
-        else stat.S_ISREG(mode):
+        elif stat.S_ISREG(mode):
             with open(fn) as f:
                 items = [line for line in f]
         return items
@@ -108,7 +108,7 @@ def get_items(p, value):
 
 if "task.foreach" in jobp:
     if arvados.current_task()['sequence'] == 0:
-        var = jobp["foreach"]
+        var = jobp["task.foreach"]
         items = get_items(jobp, jobp[var])
         if items != None:
             print("run-command: parallelizing on %s with items %s" % (var, items))
@@ -134,8 +134,8 @@ try:
 
     stdoutname = None
     stdoutfile = None
-    if "stdout" in p:
-        stdoutname = subst.do_substitution(p, p["stdout"])
+    if "save.stdout" in p:
+        stdoutname = subst.do_substitution(p, p["save.stdout"])
         stdoutfile = open(stdoutname, "wb")
 
     print("run-command: {}{}".format(' '.join(cmd), (" > " + stdoutname) if stdoutname != None else ""))

commit 99d5b548b50c573281d9780ecd631ecf83f4eefe
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 10:58:08 2014 -0400

    Refactored and generalized list expansion, can use "foreach" to repeat portions
    of the command line.

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index 186b1bc..27ce822 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -68,19 +68,49 @@ class SigHandler(object):
         sp.send_signal(signum)
         self.sig = signum
 
-if "foreach" in jobp:
+def expand_item(p, c):
+    if isinstance(c, dict):
+        if "foreach" in c and "command" in c:
+            var = c["foreach"]
+            items = get_items(p, p[var])
+            r = []
+            for i in items:
+                params = copy.copy(p)
+                params[var] = i
+                r.extend(expand_list(params, c["command"]))
+            return r
+    elif isinstance(c, list):
+        return expand_list(p, c)
+    elif isinstance(c, str):
+        return [subst.do_substitution(p, c)]
+
+    return []
+
+def expand_list(p, l):
+    return [exp for arg in l for exp in expand_item(p, arg)]
+
+def get_items(p, value):
+    if isinstance(value, list):
+        return expand_list(p, value)
+
+    fn = subst.do_substitution(p, value)
+    mode = os.stat(fn)
+    prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
+    if mode != None:
+        if stat.S_ISDIR(mode):
+            items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
+        else stat.S_ISREG(mode):
+            with open(fn) as f:
+                items = [line for line in f]
+        return items
+    else:
+        return None
+
+if "task.foreach" in jobp:
     if arvados.current_task()['sequence'] == 0:
         var = jobp["foreach"]
-        fn = subst.do_substitution(jobp, jobp[var])
-        mode = os.stat(fn)
-        prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
-        if mode != None:
-            if stat.S_ISDIR(mode):
-                items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
-            else stat.S_ISREG(mode):
-                with open(fn) as f:
-                    items = [line for line in f]
-
+        items = get_items(jobp, jobp[var])
+        if items != None:
             print("run-command: parallelizing on %s with items %s" % (var, items))
 
             for i in items:
@@ -100,9 +130,7 @@ else:
     p = jobp
 
 try:
-    cmd = []
-    for c in p["command"]:
-        cmd.append(subst.do_substitution(p, c))
+    cmd = expand_list(p, p["command"])
 
     stdoutname = None
     stdoutfile = None

commit 06cc4285949e8203a80f1b506b3551b731494dcf
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Jul 30 10:54:41 2014 -0400

    Simpler split-fastq, doesn't try to upload the collection, assume crunch-job
    will create the collection for us.

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 065c396..42b8e61 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -9,36 +9,6 @@ api = arvados.api('v1')
 piece = 0
 manifest_text = ""
 
-def put_manifest(manifest_text, sources=[]):
-    crm = arvados.CollectionReader(manifest_text)
-
-    combined = crm.manifest_text(strip=True)
-
-    m = hashlib.new('md5')
-    m.update(combined)
-
-    print combined
-
-    uuid = "{}+{}".format(m.hexdigest(), len(combined))
-
-    collection = arvados.api().collections().create(
-        body={'collection':{
-            'uuid': uuid,
-            'manifest_text': crm.manifest_text()
-        }
-        }).execute()
-
-    for s in sources:
-        l = arvados.api().links().create(body={
-            "link": {
-                "tail_uuid": s,
-                "head_uuid": uuid,
-                "link_class": "provenance",
-                "name": "provided"
-            }}).execute()
-
-    return uuid
-
 # Look for paired reads
 
 inp = arvados.CollectionReader(arvados.getjobparam('reads'))
@@ -75,6 +45,4 @@ if manifest_text == "":
                     manifest_text += "./_" + str(piece) + m0
                     piece += 1
 
-print manifest_text
-
-arvados.current_task().set_output(put_manifest(manifest_text, [arvados.getjobparam('reads')]))
+arvados.current_task().set_output(manifest_text)

commit 1774a688181d771790c3c406ae45c99dd26df023
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 16:53:06 2014 -0400

    Change collection create request

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index b51a334..065c396 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -22,9 +22,10 @@ def put_manifest(manifest_text, sources=[]):
     uuid = "{}+{}".format(m.hexdigest(), len(combined))
 
     collection = arvados.api().collections().create(
-        body={
+        body={'collection':{
             'uuid': uuid,
-            'manifest_text': crm.manifest_text(),
+            'manifest_text': crm.manifest_text()
+        }
         }).execute()
 
     for s in sources:

commit d1332738de33f6e5c50b8d444590f4add9663853
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 16:48:13 2014 -0400

    Debug manifest format to see why api server is rejecting it

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 55e7fcc..b51a334 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -17,6 +17,8 @@ def put_manifest(manifest_text, sources=[]):
     m = hashlib.new('md5')
     m.update(combined)
 
+    print combined
+
     uuid = "{}+{}".format(m.hexdigest(), len(combined))
 
     collection = arvados.api().collections().create(
@@ -54,8 +56,8 @@ for s in inp.all_streams():
                 p[1]["reader"] = s.files()[result.group(1) + "_2.fastq" + result.group(2)]
                 m0 = p[0]["reader"].as_manifest()[1:]
                 m1 = p[1]["reader"].as_manifest()[1:]
-                manifest_text += "_" + str(piece) + m0
-                manifest_text += "_" + str(piece) + m1
+                manifest_text += "./_" + str(piece) + m0
+                manifest_text += "./_" + str(piece) + m1
                 piece += 1
 
 # No pairs found so just put each fastq file into a separate directory
@@ -69,7 +71,7 @@ if manifest_text == "":
                     p = [{}]
                     p[0]["reader"] = s.files()[result.group(0)]
                     m0 = p[0]["reader"].as_manifest()[1:]
-                    manifest_text += "_" + str(piece) + m0
+                    manifest_text += "./_" + str(piece) + m0
                     piece += 1
 
 print manifest_text

commit 6755f71e4b5e9474f24f8d79739feb775d91496b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 16:43:16 2014 -0400

    debug print manifest_text

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 1c0fae5..55e7fcc 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -72,4 +72,6 @@ if manifest_text == "":
                     manifest_text += "_" + str(piece) + m0
                     piece += 1
 
+print manifest_text
+
 arvados.current_task().set_output(put_manifest(manifest_text, [arvados.getjobparam('reads')]))

commit a8e5a5ee592e9e71c1ce80247212e33d61ae2381
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 16:41:01 2014 -0400

    Left debugging in by accident

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index d3faa9b..1c0fae5 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -40,9 +40,6 @@ def put_manifest(manifest_text, sources=[]):
 
 inp = arvados.CollectionReader(arvados.getjobparam('reads'))
 
-with open("/home/peter/manifest") as f:
-    inp = arvados.CollectionReader(f.read())
-
 prog = re.compile("(.*?)_1.fastq(.gz)?$")
 
 manifest_text = ""

commit e4f893184833d43c4aed29a1825f4db08cb14d0e
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 16:39:45 2014 -0400

    Same bug, different place

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 4c112fb..d3faa9b 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -75,4 +75,4 @@ if manifest_text == "":
                     manifest_text += "_" + str(piece) + m0
                     piece += 1
 
-arvados.current_task().set_output(put_manifest(manifest_text, [arvados.get_job_param('input')]))
+arvados.current_task().set_output(put_manifest(manifest_text, [arvados.getjobparam('reads')]))

commit 0cf8e41ee88c6cd6e2546958926d7a59991dab4e
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 16:38:16 2014 -0400

    Input is "reads" not "input"

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index a4444d0..4c112fb 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -38,7 +38,7 @@ def put_manifest(manifest_text, sources=[]):
 
 # Look for paired reads
 
-inp = arvados.CollectionReader(arvados.getjobparam('input'))
+inp = arvados.CollectionReader(arvados.getjobparam('reads'))
 
 with open("/home/peter/manifest") as f:
     inp = arvados.CollectionReader(f.read())

commit 2c7b8f8bee023c438c47aff2f35848bbf6c1ae5f
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 16:37:03 2014 -0400

    typo fix get_job_param() -> getjobparam()

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index 96ebfbd..a4444d0 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -38,7 +38,7 @@ def put_manifest(manifest_text, sources=[]):
 
 # Look for paired reads
 
-inp = arvados.CollectionReader(arvados.get_job_param('input'))
+inp = arvados.CollectionReader(arvados.getjobparam('input'))
 
 with open("/home/peter/manifest") as f:
     inp = arvados.CollectionReader(f.read())

commit 5990ac0ad45146fad60d151b24a39c15e72a1d87
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 16:32:50 2014 -0400

    foreach just refers to other parameter to use as iterator variable.

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index a7dcd25..186b1bc 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -70,9 +70,8 @@ class SigHandler(object):
 
 if "foreach" in jobp:
     if arvados.current_task()['sequence'] == 0:
-        var = jobp["foreach"]["var"]
-        values = jobp["foreach"]["values"]
-        fn = subst.do_substitution(jobp, values)
+        var = jobp["foreach"]
+        fn = subst.do_substitution(jobp, jobp[var])
         mode = os.stat(fn)
         prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
         if mode != None:

commit 925a830167cb2af6ededf9a0f6d83a2e672452b8
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 16:10:21 2014 -0400

    Added "foreach" to run-command wrapper.  Needs testing.

diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
index e6ec889..a7dcd25 100755
--- a/crunch_scripts/run-command
+++ b/crunch_scripts/run-command
@@ -23,10 +23,10 @@ os.mkdir("output")
 
 os.chdir("output")
 
+taskp = None
+jobp = arvados.current_job()['script_parameters']
 if len(arvados.current_task()['parameters']) > 0:
     p = arvados.current_task()['parameters']
-else:
-    p = arvados.current_job()['script_parameters']
 
 links = []
 
@@ -42,9 +42,17 @@ def sub_tmpdir(v):
 def sub_cores(v):
      return os.environ['CRUNCH_NODE_SLOTS']
 
+def sub_jobid(v):
+     return os.environ['JOB_UUID']
+
+def sub_taskid(v):
+     return os.environ['TASK_UUID']
+
 subst.default_subs["link "] = sub_link
-subst.default_subs["tmpdir"] = sub_tmpdir
+subst.default_subs["task.tmpdir"] = sub_tmpdir
 subst.default_subs["node.cores"] = sub_cores
+subst.default_subs["job.id"] = sub_jobid
+subst.default_subs["task.id"] = sub_taskid
 
 rcode = 1
 
@@ -60,6 +68,38 @@ class SigHandler(object):
         sp.send_signal(signum)
         self.sig = signum
 
+if "foreach" in jobp:
+    if arvados.current_task()['sequence'] == 0:
+        var = jobp["foreach"]["var"]
+        values = jobp["foreach"]["values"]
+        fn = subst.do_substitution(jobp, values)
+        mode = os.stat(fn)
+        prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
+        if mode != None:
+            if stat.S_ISDIR(mode):
+                items = ["$(dir %s/%s)" % (prefix, l) for l in os.listdir(fn)]
+            else stat.S_ISREG(mode):
+                with open(fn) as f:
+                    items = [line for line in f]
+
+            print("run-command: parallelizing on %s with items %s" % (var, items))
+
+            for i in items:
+                params = copy.copy(jobp)
+                params[var] = i
+                arvados.api().job_tasks().create(body={
+                    'job_uuid': arvados.current_job()['uuid'],
+                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
+                    'sequence': 1,
+                    'parameters': params
+                    }
+                ).execute()
+            sys.exit(0)
+        else:
+            sys.exit(1)
+else:
+    p = jobp
+
 try:
     cmd = []
     for c in p["command"]:
@@ -101,7 +141,7 @@ signal.signal(signal.SIGQUIT, signal.SIG_DFL)
 for l in links:
     os.unlink(l)
 
-print("run-command: the follow output files will be saved to keep:")
+print("run-command: the following output files will be saved to keep:")
 
 subprocess.call(["find", ".", "-type", "f", "-printf", "run-command: %12.12s %h/%f\\n"])
 

commit 95ec380b85dec7a9004bcfd05e613eeb773c9d60
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 15:32:16 2014 -0400

    Don't split fastq, just organize into directories.

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
index f42b2b0..96ebfbd 100755
--- a/crunch_scripts/split-fastq.py
+++ b/crunch_scripts/split-fastq.py
@@ -4,57 +4,11 @@ import arvados
 import re
 import hashlib
 
-#api = arvados.api('v1')
+api = arvados.api('v1')
 
 piece = 0
 manifest_text = ""
 
-def readline(reader, start):
-    line = ""
-    n = -1
-    while n == -1:
-        r = reader.readfrom(start, 1024)
-        if r == '':
-            break
-        n = string.find(r, "\n")
-        line += r[0:n]
-        start += len(r)
-    return line
-
-def splitfastq(p):
-    for i in xrange(0, len(p)):
-        p[i]["start"] = 0
-        p[i]["end"] = 0
-
-    while True:
-        recordsize = [0, 0]
-
-        # read 4 lines starting at "start"
-        for ln in xrange(0, 4):
-            for i in xrange(0, len(p)):
-                r = readline(p[i]["reader"], p[i]["start"])
-                if r == '':
-                    return
-                recordsize[i] += len(r)
-
-        splitnow = False
-        for i in xrange(0, len(p)):
-            if ((p[i]["end"] - p[i]["start"]) + recordsize[i]) >= arvados.BLOCKSIZE:
-                splitnow = True
-
-        if splitnow:
-            for i in xrange(0, len(p)):
-                manifest = []
-                manifest.extend(str(piece))
-                manifest.extend([d[LOCATOR] for d in p["reader"]._stream._data_locators])
-                manifest.extend(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], self.name().replace(' ', '\\040')) for seg in arvados.locators_and_ranges(p[i]["reader"].segments, p[i]["start"], p[i]["end"] - p[i]["start"])])
-                global manifest_text
-                manifest_text += manifest.join(" ") + "\n"
-                p[i]["start"] = p[i]["end"]
-        else:
-            for i in xrange(0, len(p)):
-                p[i]["end"] += recordsize[i]
-
 def put_manifest(manifest_text, sources=[]):
     crm = arvados.CollectionReader(manifest_text)
 
@@ -82,26 +36,43 @@ def put_manifest(manifest_text, sources=[]):
 
     return uuid
 
+# Look for paired reads
 
-# Look for pairs
-
-#inp = arvados.CollectionReader(arvados.get_job_param('input'))
+inp = arvados.CollectionReader(arvados.get_job_param('input'))
 
 with open("/home/peter/manifest") as f:
     inp = arvados.CollectionReader(f.read())
 
-prog = re.compile("(.*?)_1.fastq$")
+prog = re.compile("(.*?)_1.fastq(.gz)?$")
+
+manifest_text = ""
 
 for s in inp.all_streams():
     if s.name() == ".":
         for f in s.all_files():
             result = prog.match(f.name())
-            print f.name()
             if result != None:
                 p = [{}, {}]
-                p[0]["reader"] = s.file(f)
-                p[1]["reader"] = s.file(prog.group(1) + "_2.fastq")
-
-print manifest_text
-
-#arvados.current_task().set_output(put_manifest(manifest_text))
+                p[0]["reader"] = s.files()[result.group(0)]
+                p[1]["reader"] = s.files()[result.group(1) + "_2.fastq" + result.group(2)]
+                m0 = p[0]["reader"].as_manifest()[1:]
+                m1 = p[1]["reader"].as_manifest()[1:]
+                manifest_text += "_" + str(piece) + m0
+                manifest_text += "_" + str(piece) + m1
+                piece += 1
+
+# No pairs found so just put each fastq file into a separate directory
+if manifest_text == "":
+    for s in inp.all_streams():
+        prog = re.compile("(.*?).fastq(.gz)?$")
+        if s.name() == ".":
+            for f in s.all_files():
+                result = prog.match(f.name())
+                if result != None:
+                    p = [{}]
+                    p[0]["reader"] = s.files()[result.group(0)]
+                    m0 = p[0]["reader"].as_manifest()[1:]
+                    manifest_text += "_" + str(piece) + m0
+                    piece += 1
+
+arvados.current_task().set_output(put_manifest(manifest_text, [arvados.get_job_param('input')]))

commit 783aff1187eba225008cd7db6fdd8307e46bc59b
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Jul 29 14:59:56 2014 -0400

    Full split-fastq wip

diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
new file mode 100755
index 0000000..f42b2b0
--- /dev/null
+++ b/crunch_scripts/split-fastq.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python
+
+import arvados
+import re
+import hashlib
+
+#api = arvados.api('v1')
+
+piece = 0
+manifest_text = ""
+
+def readline(reader, start):
+    line = ""
+    n = -1
+    while n == -1:
+        r = reader.readfrom(start, 1024)
+        if r == '':
+            break
+        n = string.find(r, "\n")
+        line += r[0:n]
+        start += len(r)
+    return line
+
+def splitfastq(p):
+    for i in xrange(0, len(p)):
+        p[i]["start"] = 0
+        p[i]["end"] = 0
+
+    while True:
+        recordsize = [0, 0]
+
+        # read 4 lines starting at "start"
+        for ln in xrange(0, 4):
+            for i in xrange(0, len(p)):
+                r = readline(p[i]["reader"], p[i]["start"])
+                if r == '':
+                    return
+                recordsize[i] += len(r)
+
+        splitnow = False
+        for i in xrange(0, len(p)):
+            if ((p[i]["end"] - p[i]["start"]) + recordsize[i]) >= arvados.BLOCKSIZE:
+                splitnow = True
+
+        if splitnow:
+            for i in xrange(0, len(p)):
+                manifest = []
+                manifest.extend(str(piece))
+                manifest.extend([d[LOCATOR] for d in p["reader"]._stream._data_locators])
+                manifest.extend(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], self.name().replace(' ', '\\040')) for seg in arvados.locators_and_ranges(p[i]["reader"].segments, p[i]["start"], p[i]["end"] - p[i]["start"])])
+                global manifest_text
+                manifest_text += manifest.join(" ") + "\n"
+                p[i]["start"] = p[i]["end"]
+        else:
+            for i in xrange(0, len(p)):
+                p[i]["end"] += recordsize[i]
+
+def put_manifest(manifest_text, sources=[]):
+    crm = arvados.CollectionReader(manifest_text)
+
+    combined = crm.manifest_text(strip=True)
+
+    m = hashlib.new('md5')
+    m.update(combined)
+
+    uuid = "{}+{}".format(m.hexdigest(), len(combined))
+
+    collection = arvados.api().collections().create(
+        body={
+            'uuid': uuid,
+            'manifest_text': crm.manifest_text(),
+        }).execute()
+
+    for s in sources:
+        l = arvados.api().links().create(body={
+            "link": {
+                "tail_uuid": s,
+                "head_uuid": uuid,
+                "link_class": "provenance",
+                "name": "provided"
+            }}).execute()
+
+    return uuid
+
+
+# Look for pairs
+
+#inp = arvados.CollectionReader(arvados.get_job_param('input'))
+
+with open("/home/peter/manifest") as f:
+    inp = arvados.CollectionReader(f.read())
+
+prog = re.compile("(.*?)_1.fastq$")
+
+for s in inp.all_streams():
+    if s.name() == ".":
+        for f in s.all_files():
+            result = prog.match(f.name())
+            print f.name()
+            if result != None:
+                p = [{}, {}]
+                p[0]["reader"] = s.file(f)
+                p[1]["reader"] = s.file(prog.group(1) + "_2.fastq")
+
+print manifest_text
+
+#arvados.current_task().set_output(put_manifest(manifest_text))

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list