[ARVADOS] updated: 50b8fd6c6dd1137a06b7849665d8ba4276f62ba2
git at public.curoverse.com
git at public.curoverse.com
Mon Dec 15 13:00:30 EST 2014
Summary of changes:
doc/_includes/_0_filter_py.liquid | 29 ++++----
doc/_includes/_concurrent_hash_script_py.liquid | 33 ++++-----
doc/_includes/_run_md5sum_py.liquid | 7 +-
doc/_includes/_tutorial_hash_script_py.liquid | 44 ++++++------
.../tutorial-firstscript.html.textile.liquid | 15 ++--
sdk/python/arvados/commands/ls.py | 52 ++++++++++++++
sdk/python/bin/arv-ls | 23 +------
sdk/python/tests/test_arv_ls.py | 80 ++++++++++++++++++++++
8 files changed, 195 insertions(+), 88 deletions(-)
create mode 100755 sdk/python/arvados/commands/ls.py
create mode 100644 sdk/python/tests/test_arv_ls.py
via 50b8fd6c6dd1137a06b7849665d8ba4276f62ba2 (commit)
via a6b1b78864353e3dcce2a1c5bf4afa2b4c88b036 (commit)
via 955f8c26064c9c70b3a4ce33e04eda97c70a1787 (commit)
via 812993586e7dc31ead38075e07616b7ec47fd347 (commit)
via 04700fca9ad96c23119b474ba1f472b9d1da2b20 (commit)
via b4a26f3454b98eb4cdc52ebbf8d4d00c67f5e49a (commit)
from d28d5446be0427609b6a288f39dac383ba83075f (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 50b8fd6c6dd1137a06b7849665d8ba4276f62ba2
Merge: 04700fc a6b1b78
Author: Brett Smith <brett at curoverse.com>
Date: Mon Dec 15 12:58:52 2014 -0500
Merge branch '4481-update-user-docs-TC'
Closes #4741, #4790.
commit a6b1b78864353e3dcce2a1c5bf4afa2b4c88b036
Author: Tom Clegg <tom at curoverse.com>
Date: Mon Dec 15 12:18:33 2014 -0500
4481: Fix ambiguous "Keep id" -> "locator" in example scripts.
diff --git a/doc/_includes/_0_filter_py.liquid b/doc/_includes/_0_filter_py.liquid
index ef89e13..831e1b8 100644
--- a/doc/_includes/_0_filter_py.liquid
+++ b/doc/_includes/_0_filter_py.liquid
@@ -21,10 +21,10 @@ with out.open('0-filter.txt') as out_file:
# Output every line in the file that starts with '0'
out_file.writelines(line for line in input_file if line.startswith('0'))
-# Commit the output to keep. This returns a Keep id.
-output_id = out.finish()
+# Commit the output to Keep.
+output_locator = out.finish()
-# Set the output for this task to the Keep id
-this_task.set_output(output_id)
+# Use the resulting locator as the output for this task.
+this_task.set_output(output_locator)
# Done!
diff --git a/doc/_includes/_tutorial_hash_script_py.liquid b/doc/_includes/_tutorial_hash_script_py.liquid
index 3fb48f8..ede2809 100644
--- a/doc/_includes/_tutorial_hash_script_py.liquid
+++ b/doc/_includes/_tutorial_hash_script_py.liquid
@@ -36,10 +36,10 @@ with out.open('md5sum.txt') as out_file:
out_file.write("{} {}/{}\n".format(digestor.hexdigest(), input_id,
os.path.normpath(input_path)))
-# Commit the output to keep. This returns a Keep id.
-output_id = out.finish()
+# Commit the output to Keep.
+output_locator = out.finish()
-# Set the output for this task to the Keep id
-this_task.set_output(output_id)
+# Use the resulting locator as the output for this task.
+this_task.set_output(output_locator)
# Done!
commit 955f8c26064c9c70b3a4ce33e04eda97c70a1787
Author: Brett Smith <brett at curoverse.com>
Date: Wed Dec 10 16:40:13 2014 -0500
4481: Refresh Crunch script tutorial page.
* The script now normalizes the output path, for consistency with
other scripts, and it looks nicer.
* Modernize the job log output slightly, and adjust text to match.
diff --git a/doc/user/tutorials/tutorial-firstscript.html.textile.liquid b/doc/user/tutorials/tutorial-firstscript.html.textile.liquid
index b8b90ca..6fe88fe 100644
--- a/doc/user/tutorials/tutorial-firstscript.html.textile.liquid
+++ b/doc/user/tutorials/tutorial-firstscript.html.textile.liquid
@@ -76,25 +76,28 @@ You can now run your script on your local workstation or VM using @arv-crunch-jo
2014-08-06_15:16:26 qr1hi-8i9sb-qyrat80ef927lam 14473 1 stderr crunchstat: Running [stdbuf --output=0 --error=0 /home/$USER/tutorial/crunch_scripts/hash.py]
2014-08-06_15:16:35 qr1hi-8i9sb-qyrat80ef927lam 14473 1 child 14504 on localhost.1 exit 0 signal 0 success=true
2014-08-06_15:16:35 qr1hi-8i9sb-qyrat80ef927lam 14473 1 success in 10 seconds
-2014-08-06_15:16:35 qr1hi-8i9sb-qyrat80ef927lam 14473 1 output 50cafdb29cc21dd6eaec85ba9e0c6134+56+Aef0f991b80fa0b75f802e58e70b207aa184d24ff@53f4bbd3
+2014-08-06_15:16:35 qr1hi-8i9sb-qyrat80ef927lam 14473 1 output 8c20281b9840f624a486e4f1a78a1da8+105+A234be74ceb5ea31db6e11b6be26f3eb76d288ad0@54987018
2014-08-06_15:16:35 qr1hi-8i9sb-qyrat80ef927lam 14473 wait for last 0 children to finish
2014-08-06_15:16:35 qr1hi-8i9sb-qyrat80ef927lam 14473 status: 2 done, 0 running, 0 todo
+2014-08-06_15:16:35 qr1hi-8i9sb-qyrat80ef927lam 14473 release job allocation
2014-08-06_15:16:35 qr1hi-8i9sb-qyrat80ef927lam 14473 Freeze not implemented
2014-08-06_15:16:35 qr1hi-8i9sb-qyrat80ef927lam 14473 collate
-2014-08-06_15:16:36 qr1hi-8i9sb-qyrat80ef927lam 14473 output d6338df28d6b8e5d14929833b417e20e+107+Adf1ce81222b6992ce5d33d8bfb28a6b5a1497898@53f4bbd4
+2014-08-06_15:16:36 qr1hi-8i9sb-qyrat80ef927lam 14473 output uuid qr1hi-4zz18-n91qrqfp3zivexo
+2014-08-06_15:16:36 qr1hi-8i9sb-qyrat80ef927lam 14473 output hash c1b44b6dc41ef334cf1136033ca950e6+54
2014-08-06_15:16:37 qr1hi-8i9sb-qyrat80ef927lam 14473 finish
2014-08-06_15:16:38 qr1hi-8i9sb-qyrat80ef927lam 14473 log manifest is 7fe8cf1d45d438a3ca3ac4a184b7aff4+83
</code></pre>
</notextile>
-Although the job runs locally, the output of the job has been saved to Keep, the Arvados file store. The "output" line (third from the bottom) provides the "Keep locator":{{site.baseurl}}/user/tutorials/tutorial-keep-get.html to which the script's output has been saved. Copy the output identifier and use @arv-ls@ to list the contents of your output collection, and @arv-get@ to download it to the current directory:
+Although the job runs locally, the output of the job has been saved to Keep, the Arvados file store. The "output uuid" line (fourth from the bottom) provides the UUID of the Arvados collection where the script's output has been saved. Copy the output identifier and use @arv-ls@ to list the contents of your output collection, and @arv-get@ to download it to the current directory:
<notextile>
-<pre><code>~/tutorial/crunch_scripts$ <span class="userinput">arv-ls d6338df28d6b8e5d14929833b417e20e+107+Adf1ce81222b6992ce5d33d8bfb28a6b5a1497898@53f4bbd4</span>
+<pre><code>~/tutorial/crunch_scripts$ <span class="userinput">arv-ls qr1hi-4zz18-n91qrqfp3zivexo</span>
./md5sum.txt
-~/tutorial/crunch_scripts$ <span class="userinput">arv-get d6338df28d6b8e5d14929833b417e20e+107+Adf1ce81222b6992ce5d33d8bfb28a6b5a1497898@53f4bbd4/ .</span>
+~/tutorial/crunch_scripts$ <span class="userinput">arv-get qr1hi-4zz18-n91qrqfp3zivexo/ .</span>
+0 MiB / 0 MiB 100.0%
~/tutorial/crunch_scripts$ <span class="userinput">cat md5sum.txt</span>
-44b8ae3fde7a8a88d2f7ebd237625b4f c1bad4b39ca5a924e481008009d94e32+210/./var-GS000016015-ASM.tsv.bz2
+44b8ae3fde7a8a88d2f7ebd237625b4f c1bad4b39ca5a924e481008009d94e32+210/var-GS000016015-ASM.tsv.bz2
</code></pre>
</notextile>
commit 812993586e7dc31ead38075e07616b7ec47fd347
Author: Brett Smith <brett at curoverse.com>
Date: Mon Dec 8 14:37:23 2014 -0500
4481: Update tutorial Crunch scripts to use newer PySDK methods.
Most of the focus is on the Collection file methods added in #3603.
diff --git a/doc/_includes/_0_filter_py.liquid b/doc/_includes/_0_filter_py.liquid
index 035c481..ef89e13 100644
--- a/doc/_includes/_0_filter_py.liquid
+++ b/doc/_includes/_0_filter_py.liquid
@@ -14,17 +14,12 @@ collection = arvados.CollectionReader(this_task_input)
# Create an object to write a new collection as output
out = arvados.CollectionWriter()
-# Set the name of output file within the collection
-out.set_current_file_name("0-filter.txt")
-
-# Get an iterator over the files listed in the collection
-all_files = collection.all_files()
-
-# Iterate over each file
-for input_file in all_files:
- for ln in input_file.readlines():
- if ln[0] == '0':
- out.write(ln)
+# Create a new file in the output collection
+with out.open('0-filter.txt') as out_file:
+ # Iterate over every input file in the input collection
+ for input_file in collection.all_files():
+ # Output every line in the file that starts with '0'
+ out_file.writelines(line for line in input_file if line.startswith('0'))
# Commit the output to keep. This returns a Keep id.
output_id = out.finish()
diff --git a/doc/_includes/_concurrent_hash_script_py.liquid b/doc/_includes/_concurrent_hash_script_py.liquid
index a914e04..691ed56 100644
--- a/doc/_includes/_concurrent_hash_script_py.liquid
+++ b/doc/_includes/_concurrent_hash_script_py.liquid
@@ -1,6 +1,7 @@
#!/usr/bin/env python
import hashlib
+import os
import arvados
# Jobs consist of one or more tasks. A task is a single invocation of
@@ -11,7 +12,7 @@ this_task = arvados.current_task()
# Tasks have a sequence number for ordering. All tasks
# with the current sequence number must finish successfully
-# before tasks in the next sequence are started.
+# before tasks in the next sequence are started.
# The first task has sequence number 0
if this_task['sequence'] == 0:
# Get the "input" field from "script_parameters" on the task object
@@ -21,7 +22,7 @@ if this_task['sequence'] == 0:
cr = arvados.CollectionReader(job_input)
# Loop over each stream in the collection (a stream is a subset of
- # files that logically represents a directory
+ # files that logically represents a directory)
for s in cr.all_streams():
# Loop over each file in the stream
@@ -62,29 +63,21 @@ else:
collection = arvados.CollectionReader(this_task_input)
- out = arvados.CollectionWriter()
- out.set_current_file_name("md5sum.txt")
-
# There should only be one file in the collection, so get the
- # first one. collection.all_files() returns an iterator so we
- # need to make it into a list for indexed access.
- input_file = list(collection.all_files())[0]
+ # first one from the all files iterator.
+ input_file = next(collection.all_files())
+ output_path = os.path.normpath(os.path.join(input_file.stream_name(),
+ input_file.name))
# Everything after this is the same as the first tutorial.
digestor = hashlib.new('md5')
-
- while True:
- buf = input_file.read(2**20)
- if len(buf) == 0:
- break
+ for buf in input_file.readall():
digestor.update(buf)
- hexdigest = digestor.hexdigest()
- file_name = input_file.name()
- if input_file.stream_name() != '.':
- file_name = os.join(input_file.stream_name(), file_name)
- out.write("%s %s\n" % (hexdigest, file_name))
- output_id = out.finish()
- this_task.set_output(output_id)
+ out = arvados.CollectionWriter()
+ with out.open('md5sum.txt') as out_file:
+ out_file.write("{} {}\n".format(digestor.hexdigest(), output_path))
+
+ this_task.set_output(out.finish())
# Done!
diff --git a/doc/_includes/_run_md5sum_py.liquid b/doc/_includes/_run_md5sum_py.liquid
index 16516a8..46152f1 100644
--- a/doc/_includes/_run_md5sum_py.liquid
+++ b/doc/_includes/_run_md5sum_py.liquid
@@ -3,7 +3,8 @@
import arvados
# Automatically parallelize this job by running one task per file.
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, input_as_path=True)
+arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
+ input_as_path=True)
# Get the input file for the task
input_file = arvados.get_task_param_mount('input')
@@ -13,6 +14,6 @@ stdoutdata, stderrdata = arvados.util.run_command(['md5sum', input_file])
# Save the standard output (stdoutdata) to "md5sum.txt" in the output collection
out = arvados.CollectionWriter()
-out.set_current_file_name("md5sum.txt")
-out.write(stdoutdata)
+with out.open('md5sum.txt') as out_file:
+ out_file.write(stdoutdata)
arvados.current_task().set_output(out.finish())
diff --git a/doc/_includes/_tutorial_hash_script_py.liquid b/doc/_includes/_tutorial_hash_script_py.liquid
index b9c7f31..3fb48f8 100644
--- a/doc/_includes/_tutorial_hash_script_py.liquid
+++ b/doc/_includes/_tutorial_hash_script_py.liquid
@@ -1,45 +1,45 @@
#!/usr/bin/env python
import hashlib # Import the hashlib module to compute MD5.
+import os # Import the os module for basic path manipulation
import arvados # Import the Arvados sdk module
# Automatically parallelize this job by running one task per file.
# This means that if the input consists of many files, each file will
-# be processed in parallel on different nodes enabling the job to
+# be processed in parallel on different nodes enabling the job to
# be completed quicker.
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
+arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
input_as_path=True)
+# Get object representing the current task
+this_task = arvados.current_task()
+
# Create the message digest object that will compute the MD5 hash
digestor = hashlib.new('md5')
# Get the input file for the task
-input_file = arvados.get_task_param_mount('input')
+input_id, input_path = this_task['parameters']['input'].split('/', 1)
-# Open the input file for reading
-with open(input_file) as f:
- while True:
- buf = f.read(2**20) # read a 1 megabyte block from the file
- if len(buf) == 0: # break when there is no more data left
- break
- digestor.update(buf) # update the MD5 hash object
+# Open the input collection
+input_collection = arvados.CollectionReader(input_id)
-# Get object representing the current task
-this_task = arvados.current_task()
+# Open the input file for reading
+with input_collection.open(input_path) as input_file:
+ for buf in input_file.readall(): # Iterate the file's data blocks
+ digestor.update(buf) # Update the MD5 hash object
- # Write a new collection as output
+# Write a new collection as output
out = arvados.CollectionWriter()
- # Set output file within the collection
-out.set_current_file_name("md5sum.txt")
-
-# Write an output line with the MD5 value and input
-out.write("%s %s\n" % (digestor.hexdigest(), this_task['parameters']['input']))
+# Write an output file with one line: the MD5 value and input path
+with out.open('md5sum.txt') as out_file:
+ out_file.write("{} {}/{}\n".format(digestor.hexdigest(), input_id,
+ os.path.normpath(input_path)))
- # Commit the output to keep. This returns a Keep id.
+# Commit the output to keep. This returns a Keep id.
output_id = out.finish()
# Set the output for this task to the Keep id
-this_task.set_output(output_id)
+this_task.set_output(output_id)
# Done!
commit 04700fca9ad96c23119b474ba1f472b9d1da2b20
Merge: d28d544 b4a26f3
Author: Brett Smith <brett at curoverse.com>
Date: Mon Dec 15 12:57:25 2014 -0500
Merge branch '4792-arv-ls-normalize-wip'
Closes #4792, #4813.
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list