[ARVADOS] updated: 5590c9ac669f2d74858e6c994afe1a2e9df8d104
git at public.curoverse.com
git at public.curoverse.com
Thu Dec 3 11:53:59 EST 2015
Summary of changes:
crunch_scripts/test/task_output_dir | 16 ++++++++++++++++
sdk/cli/bin/crunch-job | 29 +++++++++++++++++++----------
sdk/python/arvados/crunch.py | 27 +++++++++++++++++++++++++++
sdk/python/tests/test_crunch.py | 27 +++++++++++++++++++++++++++
4 files changed, 89 insertions(+), 10 deletions(-)
create mode 100755 crunch_scripts/test/task_output_dir
create mode 100644 sdk/python/arvados/crunch.py
create mode 100644 sdk/python/tests/test_crunch.py
via 5590c9ac669f2d74858e6c994afe1a2e9df8d104 (commit)
via 66658d762ff7b8a6ef42cb592ad2d677802f4e18 (commit)
via ab5df30a207bfaa3f2163602bca538ed37163d15 (commit)
from de57addc345d228d8b1ebf0965fd5e98e01b9842 (commit)
The revisions listed above that are new to this repository have
not appeared in any other notification email, so they are listed
in full below.
commit 5590c9ac669f2d74858e6c994afe1a2e9df8d104
Merge: de57add 66658d7
Author: Tom Clegg <tom at curoverse.com>
Date: Thu Dec 3 12:03:16 2015 -0500
Merge branch '7751-crunch-fuse-output' closes #7751
commit 66658d762ff7b8a6ef42cb592ad2d677802f4e18
Author: Tom Clegg <tom at curoverse.com>
Date: Thu Nov 26 21:11:16 2015 -0500
7751: Add convenience class for staging task output in $TASK_KEEPMOUNT_TMP.
diff --git a/crunch_scripts/test/task_output_dir b/crunch_scripts/test/task_output_dir
new file mode 100755
index 0000000..b177892
--- /dev/null
+++ b/crunch_scripts/test/task_output_dir
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+
+import arvados
+import arvados.crunch
+import hashlib
+import os
+
+out = arvados.crunch.TaskOutputDir()
+
+string = open(__file__).read()
+with open(os.path.join(out.path, 'example.out'), 'w') as f:
+ f.write(string)
+with open(os.path.join(out.path, 'example.out.SHA1'), 'w') as f:
+ f.write(hashlib.sha1(string).hexdigest() + "\n")
+
+arvados.current_task().set_output(out.manifest_text())
diff --git a/sdk/python/arvados/crunch.py b/sdk/python/arvados/crunch.py
new file mode 100644
index 0000000..c184e6a
--- /dev/null
+++ b/sdk/python/arvados/crunch.py
@@ -0,0 +1,27 @@
+import json
+import os
+
+class TaskOutputDir(object):
+ """Keep-backed directory for staging outputs of Crunch tasks.
+
+ Example, in a crunch task whose output is a file called "out.txt"
+ containing "42":
+
+ import arvados
+ import arvados.crunch
+ import os
+
+ out = arvados.crunch.TaskOutputDir()
+ with open(os.path.join(out.path, 'out.txt'), 'w') as f:
+ f.write('42')
+ arvados.current_task().set_output(out.manifest_text())
+ """
+ def __init__(self):
+ self.path = os.environ['TASK_KEEPMOUNT_TMP']
+
+ def __str__(self):
+ return self.path
+
+ def manifest_text(self):
+ snapshot = os.path.join(self.path, '.arvados#collection')
+ return json.load(open(snapshot))['manifest_text']
diff --git a/sdk/python/tests/test_crunch.py b/sdk/python/tests/test_crunch.py
new file mode 100644
index 0000000..431390b
--- /dev/null
+++ b/sdk/python/tests/test_crunch.py
@@ -0,0 +1,27 @@
+import arvados.crunch
+import os
+import shutil
+import tempfile
+import unittest
+
+class TaskOutputDirTest(unittest.TestCase):
+ def setUp(self):
+ self.tmp = tempfile.mkdtemp()
+ os.environ['TASK_KEEPMOUNT_TMP'] = self.tmp
+
+ def tearDown(self):
+ os.environ.pop('TASK_KEEPMOUNT_TMP')
+ shutil.rmtree(self.tmp)
+
+ def test_env_var(self):
+ out = arvados.crunch.TaskOutputDir()
+ self.assertEqual(out.path, self.tmp)
+
+ with open(os.path.join(self.tmp, '.arvados#collection'), 'w') as f:
+ f.write('{\n "manifest_text":"",\n "uuid":null\n}\n')
+ self.assertEqual(out.manifest_text(), '')
+
+ # Special file must be re-read on each call to manifest_text().
+ with open(os.path.join(self.tmp, '.arvados#collection'), 'w') as f:
+ f.write(r'{"manifest_text":". unparsed 0:3:foo\n","uuid":null}')
+ self.assertEqual(out.manifest_text(), ". unparsed 0:3:foo\n")
commit ab5df30a207bfaa3f2163602bca538ed37163d15
Author: Tom Clegg <tom at curoverse.com>
Date: Thu Nov 26 20:52:10 2015 -0500
7751: Set up an arv-mount scratch directory for each task, and put its path in TASK_KEEPMOUNT_TMP.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index b9365fe..70d05f0 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -872,11 +872,12 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
$ENV{"TASK_SLOT_NUMBER"} = $slot[$childslot]->{cpu};
$ENV{"TASK_WORK"} = $ENV{"CRUNCH_TMP"}."/task/$childslotname";
$ENV{"HOME"} = $ENV{"TASK_WORK"};
- $ENV{"TASK_KEEPMOUNT"} = $ENV{"TASK_WORK"}.".keep";
$ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
$ENV{"CRUNCH_NODE_SLOTS"} = $round_max_slots{$ENV{TASK_SLOT_NODE}};
$ENV{"PATH"} = $ENV{"CRUNCH_INSTALL"} . "/bin:" . $ENV{"PATH"};
+ my $keep_mnt = $ENV{"TASK_WORK"}.".keep";
+
$ENV{"GZIP"} = "-n";
my @srunargs = (
@@ -894,16 +895,20 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
}
my $command =
- "if [ -e $ENV{TASK_WORK} ]; then rm -rf $ENV{TASK_WORK}; fi; "
- ."mkdir -p $ENV{CRUNCH_TMP} $ENV{JOB_WORK} $ENV{TASK_WORK} $ENV{TASK_KEEPMOUNT} "
- ."&& cd $ENV{CRUNCH_TMP} "
+ "if [ -e \Q$ENV{TASK_WORK}\E ]; then rm -rf \Q$ENV{TASK_WORK}\E; fi; "
+ ."mkdir -p \Q$ENV{CRUNCH_TMP}\E \Q$ENV{JOB_WORK}\E \Q$ENV{TASK_WORK}\E \Q$keep_mnt\E "
+ ."&& cd \Q$ENV{CRUNCH_TMP}\E "
# These environment variables get used explicitly later in
# $command. No tool is expected to read these values directly.
.q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
.q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP ";
- $command .= "&& exec arv-mount --by-pdh --crunchstat-interval=10 --allow-other $arv_file_cache $ENV{TASK_KEEPMOUNT} --exec ";
+
+ $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
+ $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
+ $ENV{TASK_KEEPMOUNT_TMP} = "$keep_mnt/tmp";
+
if ($docker_hash)
{
my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
@@ -924,14 +929,18 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
$command .= "--volume=\Q$ENV{CRUNCH_SRC}:$ENV{CRUNCH_SRC}:ro\E ";
$command .= "--volume=\Q$ENV{CRUNCH_INSTALL}:$ENV{CRUNCH_INSTALL}:ro\E ";
- # Currently, we make arv-mount's mount point appear at /keep
- # inside the container (instead of using the same path as the
- # host like we do with CRUNCH_SRC and CRUNCH_INSTALL). However,
- # crunch scripts and utilities must not rely on this. They must
- # use $TASK_KEEPMOUNT.
+ # Currently, we make the "by_pdh" directory in arv-mount's mount
+ # point appear at /keep inside the container (instead of using
+ # the same path as the host like we do with CRUNCH_SRC and
+ # CRUNCH_INSTALL). However, crunch scripts and utilities must
+ # not rely on this. They must use $TASK_KEEPMOUNT.
$command .= "--volume=\Q$ENV{TASK_KEEPMOUNT}:/keep:ro\E ";
$ENV{TASK_KEEPMOUNT} = "/keep";
+ # Ditto TASK_KEEPMOUNT_TMP, as /keep_tmp.
+ $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT_TMP}:/keep_tmp\E ";
+ $ENV{TASK_KEEPMOUNT_TMP} = "/keep_tmp";
+
# TASK_WORK is almost exactly like a docker data volume: it
# starts out empty, is writable, and persists until no
# containers use it any more. We don't use --volumes-from to
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list