[ARVADOS] created: 618e8ec538a19b9f907e7e0ac0e78c30869bdfd4
git at public.curoverse.com
git at public.curoverse.com
Mon Dec 8 14:37:30 EST 2014
at 618e8ec538a19b9f907e7e0ac0e78c30869bdfd4 (commit)
commit 618e8ec538a19b9f907e7e0ac0e78c30869bdfd4
Author: Brett Smith <brett at curoverse.com>
Date: Mon Dec 8 14:37:23 2014 -0500
4481: Update tutorial Crunch scripts to use newer PySDK methods.
Most focus is on the Collection file methods added in #3603.
diff --git a/doc/_includes/_0_filter_py.liquid b/doc/_includes/_0_filter_py.liquid
index 035c481..ef89e13 100644
--- a/doc/_includes/_0_filter_py.liquid
+++ b/doc/_includes/_0_filter_py.liquid
@@ -14,17 +14,12 @@ collection = arvados.CollectionReader(this_task_input)
# Create an object to write a new collection as output
out = arvados.CollectionWriter()
-# Set the name of output file within the collection
-out.set_current_file_name("0-filter.txt")
-
-# Get an iterator over the files listed in the collection
-all_files = collection.all_files()
-
-# Iterate over each file
-for input_file in all_files:
- for ln in input_file.readlines():
- if ln[0] == '0':
- out.write(ln)
+# Create a new file in the output collection
+with out.open('0-filter.txt') as out_file:
+ # Iterate over every input file in the input collection
+ for input_file in collection.all_files():
+ # Output every line in the file that starts with '0'
+ out_file.writelines(line for line in input_file if line.startswith('0'))
# Commit the output to keep. This returns a Keep id.
output_id = out.finish()
diff --git a/doc/_includes/_concurrent_hash_script_py.liquid b/doc/_includes/_concurrent_hash_script_py.liquid
index a914e04..691ed56 100644
--- a/doc/_includes/_concurrent_hash_script_py.liquid
+++ b/doc/_includes/_concurrent_hash_script_py.liquid
@@ -1,6 +1,7 @@
#!/usr/bin/env python
import hashlib
+import os
import arvados
# Jobs consist of one or more tasks. A task is a single invocation of
@@ -11,7 +12,7 @@ this_task = arvados.current_task()
# Tasks have a sequence number for ordering. All tasks
# with the current sequence number must finish successfully
-# before tasks in the next sequence are started.
+# before tasks in the next sequence are started.
# The first task has sequence number 0
if this_task['sequence'] == 0:
# Get the "input" field from "script_parameters" on the task object
@@ -21,7 +22,7 @@ if this_task['sequence'] == 0:
cr = arvados.CollectionReader(job_input)
# Loop over each stream in the collection (a stream is a subset of
- # files that logically represents a directory
+ # files that logically represents a directory)
for s in cr.all_streams():
# Loop over each file in the stream
@@ -62,29 +63,21 @@ else:
collection = arvados.CollectionReader(this_task_input)
- out = arvados.CollectionWriter()
- out.set_current_file_name("md5sum.txt")
-
# There should only be one file in the collection, so get the
- # first one. collection.all_files() returns an iterator so we
- # need to make it into a list for indexed access.
- input_file = list(collection.all_files())[0]
+ # first one from the all files iterator.
+ input_file = next(collection.all_files())
+ output_path = os.path.normpath(os.path.join(input_file.stream_name(),
+ input_file.name))
# Everything after this is the same as the first tutorial.
digestor = hashlib.new('md5')
-
- while True:
- buf = input_file.read(2**20)
- if len(buf) == 0:
- break
+ for buf in input_file.readall():
digestor.update(buf)
- hexdigest = digestor.hexdigest()
- file_name = input_file.name()
- if input_file.stream_name() != '.':
- file_name = os.join(input_file.stream_name(), file_name)
- out.write("%s %s\n" % (hexdigest, file_name))
- output_id = out.finish()
- this_task.set_output(output_id)
+ out = arvados.CollectionWriter()
+ with out.open('md5sum.txt') as out_file:
+ out_file.write("{} {}\n".format(digestor.hexdigest(), output_path))
+
+ this_task.set_output(out.finish())
# Done!
diff --git a/doc/_includes/_run_md5sum_py.liquid b/doc/_includes/_run_md5sum_py.liquid
index 16516a8..46152f1 100644
--- a/doc/_includes/_run_md5sum_py.liquid
+++ b/doc/_includes/_run_md5sum_py.liquid
@@ -3,7 +3,8 @@
import arvados
# Automatically parallelize this job by running one task per file.
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, input_as_path=True)
+arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
+ input_as_path=True)
# Get the input file for the task
input_file = arvados.get_task_param_mount('input')
@@ -13,6 +14,6 @@ stdoutdata, stderrdata = arvados.util.run_command(['md5sum', input_file])
# Save the standard output (stdoutdata) to "md5sum.txt" in the output collection
out = arvados.CollectionWriter()
-out.set_current_file_name("md5sum.txt")
-out.write(stdoutdata)
+with out.open('md5sum.txt') as out_file:
+ out_file.write(stdoutdata)
arvados.current_task().set_output(out.finish())
diff --git a/doc/_includes/_tutorial_hash_script_py.liquid b/doc/_includes/_tutorial_hash_script_py.liquid
index b9c7f31..3fb48f8 100644
--- a/doc/_includes/_tutorial_hash_script_py.liquid
+++ b/doc/_includes/_tutorial_hash_script_py.liquid
@@ -1,45 +1,45 @@
#!/usr/bin/env python
import hashlib # Import the hashlib module to compute MD5.
+import os # Import the os module for basic path manipulation
import arvados # Import the Arvados sdk module
# Automatically parallelize this job by running one task per file.
# This means that if the input consists of many files, each file will
-# be processed in parallel on different nodes enabling the job to
+# be processed in parallel on different nodes enabling the job to
# be completed quicker.
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
+arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
input_as_path=True)
+# Get object representing the current task
+this_task = arvados.current_task()
+
# Create the message digest object that will compute the MD5 hash
digestor = hashlib.new('md5')
# Get the input file for the task
-input_file = arvados.get_task_param_mount('input')
+input_id, input_path = this_task['parameters']['input'].split('/', 1)
-# Open the input file for reading
-with open(input_file) as f:
- while True:
- buf = f.read(2**20) # read a 1 megabyte block from the file
- if len(buf) == 0: # break when there is no more data left
- break
- digestor.update(buf) # update the MD5 hash object
+# Open the input collection
+input_collection = arvados.CollectionReader(input_id)
-# Get object representing the current task
-this_task = arvados.current_task()
+# Open the input file for reading
+with input_collection.open(input_path) as input_file:
+ for buf in input_file.readall(): # Iterate the file's data blocks
+ digestor.update(buf) # Update the MD5 hash object
- # Write a new collection as output
+# Write a new collection as output
out = arvados.CollectionWriter()
- # Set output file within the collection
-out.set_current_file_name("md5sum.txt")
-
-# Write an output line with the MD5 value and input
-out.write("%s %s\n" % (digestor.hexdigest(), this_task['parameters']['input']))
+# Write an output file with one line: the MD5 value and input path
+with out.open('md5sum.txt') as out_file:
+ out_file.write("{} {}/{}\n".format(digestor.hexdigest(), input_id,
+ os.path.normpath(input_path)))
- # Commit the output to keep. This returns a Keep id.
+# Commit the output to keep. This returns a Keep id.
output_id = out.finish()
# Set the output for this task to the Keep id
-this_task.set_output(output_id)
+this_task.set_output(output_id)
# Done!
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list