[ARVADOS] updated: 5590c9ac669f2d74858e6c994afe1a2e9df8d104

git at public.curoverse.com git at public.curoverse.com
Thu Dec 3 11:53:59 EST 2015


Summary of changes:
 crunch_scripts/test/task_output_dir | 16 ++++++++++++++++
 sdk/cli/bin/crunch-job              | 29 +++++++++++++++++++----------
 sdk/python/arvados/crunch.py        | 27 +++++++++++++++++++++++++++
 sdk/python/tests/test_crunch.py     | 27 +++++++++++++++++++++++++++
 4 files changed, 89 insertions(+), 10 deletions(-)
 create mode 100755 crunch_scripts/test/task_output_dir
 create mode 100644 sdk/python/arvados/crunch.py
 create mode 100644 sdk/python/tests/test_crunch.py

       via  5590c9ac669f2d74858e6c994afe1a2e9df8d104 (commit)
       via  66658d762ff7b8a6ef42cb592ad2d677802f4e18 (commit)
       via  ab5df30a207bfaa3f2163602bca538ed37163d15 (commit)
      from  de57addc345d228d8b1ebf0965fd5e98e01b9842 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 5590c9ac669f2d74858e6c994afe1a2e9df8d104
Merge: de57add 66658d7
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Dec 3 12:03:16 2015 -0500

    Merge branch '7751-crunch-fuse-output' closes #7751


commit 66658d762ff7b8a6ef42cb592ad2d677802f4e18
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Nov 26 21:11:16 2015 -0500

    7751: Add convenience class for staging task output in $TASK_KEEPMOUNT_TMP.

diff --git a/crunch_scripts/test/task_output_dir b/crunch_scripts/test/task_output_dir
new file mode 100755
index 0000000..b177892
--- /dev/null
+++ b/crunch_scripts/test/task_output_dir
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+
+import arvados
+import arvados.crunch
+import hashlib
+import os
+
+out = arvados.crunch.TaskOutputDir()  # output staging dir, taken from $TASK_KEEPMOUNT_TMP
+with open(__file__) as f:  # read this script's own source, closing the handle promptly
+    string = f.read()
+with open(os.path.join(out.path, 'example.out'), 'w') as f:
+    f.write(string)
+with open(os.path.join(out.path, 'example.out.SHA1'), 'w') as f:
+    f.write(hashlib.sha1(string).hexdigest() + "\n")
+
+arvados.current_task().set_output(out.manifest_text())
diff --git a/sdk/python/arvados/crunch.py b/sdk/python/arvados/crunch.py
new file mode 100644
index 0000000..c184e6a
--- /dev/null
+++ b/sdk/python/arvados/crunch.py
@@ -0,0 +1,27 @@
+import json
+import os
+
+class TaskOutputDir(object):
+    """Keep-backed directory for staging outputs of Crunch tasks.
+
+    Example, in a crunch task whose output is a file called "out.txt"
+    containing "42":
+
+        import arvados
+        import arvados.crunch
+        import os
+
+        out = arvados.crunch.TaskOutputDir()
+        with open(os.path.join(out.path, 'out.txt'), 'w') as f:
+            f.write('42')
+        arvados.current_task().set_output(out.manifest_text())
+    """
+    def __init__(self):  # raises KeyError if TASK_KEEPMOUNT_TMP is not set
+        self.path = os.environ['TASK_KEEPMOUNT_TMP']
+
+    def __str__(self):  # str(out) is the staging directory path
+        return self.path
+
+    def manifest_text(self):  # parsed afresh from the snapshot file on every call
+        with open(os.path.join(self.path, '.arvados#collection')) as f:
+            return json.load(f)['manifest_text']
diff --git a/sdk/python/tests/test_crunch.py b/sdk/python/tests/test_crunch.py
new file mode 100644
index 0000000..431390b
--- /dev/null
+++ b/sdk/python/tests/test_crunch.py
@@ -0,0 +1,27 @@
+import arvados.crunch
+import os
+import shutil
+import tempfile
+import unittest
+
+class TaskOutputDirTest(unittest.TestCase):
+    def setUp(self):
+        self.tmp = tempfile.mkdtemp()  # throwaway dir standing in for the task's staging dir
+        os.environ['TASK_KEEPMOUNT_TMP'] = self.tmp
+
+    def tearDown(self):
+        os.environ.pop('TASK_KEEPMOUNT_TMP')  # removed outright; an earlier value is not restored
+        shutil.rmtree(self.tmp)
+
+    def test_env_var(self):
+        out = arvados.crunch.TaskOutputDir()  # reads its path from TASK_KEEPMOUNT_TMP
+        self.assertEqual(out.path, self.tmp)
+
+        with open(os.path.join(self.tmp, '.arvados#collection'), 'w') as f:
+            f.write('{\n  "manifest_text":"",\n  "uuid":null\n}\n')
+        self.assertEqual(out.manifest_text(), '')
+
+        # Special file must be re-read on each call to manifest_text().
+        with open(os.path.join(self.tmp, '.arvados#collection'), 'w') as f:
+            f.write(r'{"manifest_text":". unparsed 0:3:foo\n","uuid":null}')  # raw string: \n stays a JSON escape
+        self.assertEqual(out.manifest_text(), ". unparsed 0:3:foo\n")

commit ab5df30a207bfaa3f2163602bca538ed37163d15
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Nov 26 20:52:10 2015 -0500

    7751: Set up an arv-mount scratch directory for each task, and put its path in TASK_KEEPMOUNT_TMP.

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index b9365fe..70d05f0 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -872,11 +872,12 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
     $ENV{"TASK_SLOT_NUMBER"} = $slot[$childslot]->{cpu};
     $ENV{"TASK_WORK"} = $ENV{"CRUNCH_TMP"}."/task/$childslotname";
     $ENV{"HOME"} = $ENV{"TASK_WORK"};
-    $ENV{"TASK_KEEPMOUNT"} = $ENV{"TASK_WORK"}.".keep";
     $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
     $ENV{"CRUNCH_NODE_SLOTS"} = $round_max_slots{$ENV{TASK_SLOT_NODE}};
     $ENV{"PATH"} = $ENV{"CRUNCH_INSTALL"} . "/bin:" . $ENV{"PATH"};
 
+    my $keep_mnt = $ENV{"TASK_WORK"}.".keep";
+
     $ENV{"GZIP"} = "-n";
 
     my @srunargs = (
@@ -894,16 +895,20 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
     }
 
     my $command =
-	"if [ -e $ENV{TASK_WORK} ]; then rm -rf $ENV{TASK_WORK}; fi; "
-        ."mkdir -p $ENV{CRUNCH_TMP} $ENV{JOB_WORK} $ENV{TASK_WORK} $ENV{TASK_KEEPMOUNT} "
-	."&& cd $ENV{CRUNCH_TMP} "
+	"if [ -e \Q$ENV{TASK_WORK}\E ]; then rm -rf \Q$ENV{TASK_WORK}\E; fi; "
+        ."mkdir -p \Q$ENV{CRUNCH_TMP}\E \Q$ENV{JOB_WORK}\E \Q$ENV{TASK_WORK}\E \Q$keep_mnt\E "
+	."&& cd \Q$ENV{CRUNCH_TMP}\E "
         # These environment variables get used explicitly later in
         # $command.  No tool is expected to read these values directly.
         .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
         .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
         ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
         ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP ";
-    $command .= "&& exec arv-mount --by-pdh --crunchstat-interval=10 --allow-other $arv_file_cache $ENV{TASK_KEEPMOUNT} --exec ";
+
+    $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
+    $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
+    $ENV{TASK_KEEPMOUNT_TMP} = "$keep_mnt/tmp";
+
     if ($docker_hash)
     {
       my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
@@ -924,14 +929,18 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
       $command .= "--volume=\Q$ENV{CRUNCH_SRC}:$ENV{CRUNCH_SRC}:ro\E ";
       $command .= "--volume=\Q$ENV{CRUNCH_INSTALL}:$ENV{CRUNCH_INSTALL}:ro\E ";
 
-      # Currently, we make arv-mount's mount point appear at /keep
-      # inside the container (instead of using the same path as the
-      # host like we do with CRUNCH_SRC and CRUNCH_INSTALL). However,
-      # crunch scripts and utilities must not rely on this. They must
-      # use $TASK_KEEPMOUNT.
+      # Currently, we make the "by_pdh" directory in arv-mount's mount
+      # point appear at /keep inside the container (instead of using
+      # the same path as the host like we do with CRUNCH_SRC and
+      # CRUNCH_INSTALL). However, crunch scripts and utilities must
+      # not rely on this. They must use $TASK_KEEPMOUNT.
       $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT}:/keep:ro\E ";
       $ENV{TASK_KEEPMOUNT} = "/keep";
 
+      # Ditto TASK_KEEPMOUNT_TMP, as /keep_tmp.
+      $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT_TMP}:/keep_tmp\E ";
+      $ENV{TASK_KEEPMOUNT_TMP} = "/keep_tmp";
+
       # TASK_WORK is almost exactly like a docker data volume: it
       # starts out empty, is writable, and persists until no
       # containers use it any more. We don't use --volumes-from to

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list