[ARVADOS] updated: 1.3.0-1084-gd33dab9c7

Git user git at public.curoverse.com
Thu Jun 13 19:15:01 UTC 2019


Summary of changes:
 .licenseignore                                     |  1 +
 doc/_includes/_federated_cwl.liquid                |  2 +-
 .../cwl/federated-workflows.html.textile.liquid    |  2 +-
 doc/user/cwl/federated/FileOnCluster.yml           |  5 ++
 doc/user/cwl/federated/cat.cwl                     | 14 -----
 doc/user/cwl/federated/colors_to_select.txt        |  2 +
 doc/user/cwl/federated/extract.cwl                 | 22 +++++++
 doc/user/cwl/federated/extract.py                  | 31 ++++++++++
 .../cwl/federated/{federated.cwl => feddemo.cwl}   | 68 +++++++++++-----------
 doc/user/cwl/federated/file-on-clsr1.dat           |  1 -
 doc/user/cwl/federated/file-on-clsr2.dat           |  1 -
 doc/user/cwl/federated/file-on-clsr3.dat           |  1 -
 doc/user/cwl/federated/items1.csv                  | 29 +++++++++
 doc/user/cwl/federated/items2.csv                  | 33 +++++++++++
 doc/user/cwl/federated/items3.csv                  | 41 +++++++++++++
 doc/user/cwl/federated/md5sum.cwl                  | 21 -------
 doc/user/cwl/federated/merge.cwl                   | 23 ++++++++
 doc/user/cwl/federated/merge.py                    | 15 +++++
 doc/user/cwl/federated/shards.yml                  | 21 ++++---
 19 files changed, 251 insertions(+), 82 deletions(-)
 create mode 100644 doc/user/cwl/federated/FileOnCluster.yml
 delete mode 100644 doc/user/cwl/federated/cat.cwl
 create mode 100644 doc/user/cwl/federated/colors_to_select.txt
 create mode 100644 doc/user/cwl/federated/extract.cwl
 create mode 100644 doc/user/cwl/federated/extract.py
 rename doc/user/cwl/federated/{federated.cwl => feddemo.cwl} (51%)
 delete mode 100644 doc/user/cwl/federated/file-on-clsr1.dat
 delete mode 100644 doc/user/cwl/federated/file-on-clsr2.dat
 delete mode 100644 doc/user/cwl/federated/file-on-clsr3.dat
 create mode 100644 doc/user/cwl/federated/items1.csv
 create mode 100644 doc/user/cwl/federated/items2.csv
 create mode 100644 doc/user/cwl/federated/items3.csv
 delete mode 100644 doc/user/cwl/federated/md5sum.cwl
 create mode 100644 doc/user/cwl/federated/merge.cwl
 create mode 100644 doc/user/cwl/federated/merge.py

       via  d33dab9c7baafe56cdcd0e1266f03750531720b4 (commit)
       via  7ffc69693f4fd7f5db67fba3102d99cfba4b8fef (commit)
      from  21dfd2339fb0a4f501b43beedf3207c5d30aae1b (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit d33dab9c7baafe56cdcd0e1266f03750531720b4
Merge: 21dfd2339 7ffc69693
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date:   Thu Jun 13 15:14:46 2019 -0400

    Merge branch '15332-fed-demo' refs #15332
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>


commit 7ffc69693f4fd7f5db67fba3102d99cfba4b8fef
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date:   Mon Jun 10 17:25:17 2019 -0400

    15332: Updated federation demo to do something a little more interesting.
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>

diff --git a/.licenseignore b/.licenseignore
index a9b6f5f6c..28ddf9c29 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -15,6 +15,7 @@ build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados
 doc/fonts/*
 doc/_includes/_config_default_yml.liquid
 doc/user/cwl/federated/*
+doc/_includes/_federated_cwl.liquid
 */docker_image
 docker/jobs/apt.arvados.org*.list
 docker/jobs/1078ECD7.key
diff --git a/doc/_includes/_federated_cwl.liquid b/doc/_includes/_federated_cwl.liquid
index 59a629c5a..cfe8407e2 120000
--- a/doc/_includes/_federated_cwl.liquid
+++ b/doc/_includes/_federated_cwl.liquid
@@ -1 +1 @@
-../user/cwl/federated/federated.cwl
\ No newline at end of file
+../user/cwl/federated/feddemo.cwl
\ No newline at end of file
diff --git a/doc/user/cwl/federated-workflows.html.textile.liquid b/doc/user/cwl/federated-workflows.html.textile.liquid
index 7e2150dcc..01d656dd1 100644
--- a/doc/user/cwl/federated-workflows.html.textile.liquid
+++ b/doc/user/cwl/federated-workflows.html.textile.liquid
@@ -36,7 +36,7 @@ At this time, remote steps of a workflow on Workbench are not displayed.  As a w
 Run it like any other workflow:
 
 <notextile>
-<pre><code>~$ <span class="userinput">arvados-cwl-runner federated.cwl shards.cwl</span>
+<pre><code>~$ <span class="userinput">arvados-cwl-runner feddemo.cwl shards.yml</span>
 </code></pre>
 </notextile>
 
diff --git a/doc/user/cwl/federated/FileOnCluster.yml b/doc/user/cwl/federated/FileOnCluster.yml
new file mode 100644
index 000000000..363d0717a
--- /dev/null
+++ b/doc/user/cwl/federated/FileOnCluster.yml
@@ -0,0 +1,5 @@
+name: FileOnCluster
+type: record
+fields:
+  file: File
+  cluster: string
\ No newline at end of file
diff --git a/doc/user/cwl/federated/cat.cwl b/doc/user/cwl/federated/cat.cwl
deleted file mode 100644
index 17132fe61..000000000
--- a/doc/user/cwl/federated/cat.cwl
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-cwlVersion: v1.0
-class: CommandLineTool
-inputs:
-  inp:
-    type: File[]
-    inputBinding: {}
-outputs:
-  joined: stdout
-stdout: joined.txt
-baseCommand: cat
diff --git a/doc/user/cwl/federated/colors_to_select.txt b/doc/user/cwl/federated/colors_to_select.txt
new file mode 100644
index 000000000..620b0084b
--- /dev/null
+++ b/doc/user/cwl/federated/colors_to_select.txt
@@ -0,0 +1,2 @@
+green
+blue
diff --git a/doc/user/cwl/federated/extract.cwl b/doc/user/cwl/federated/extract.cwl
new file mode 100644
index 000000000..f8fdedbd8
--- /dev/null
+++ b/doc/user/cwl/federated/extract.cwl
@@ -0,0 +1,22 @@
+cwlVersion: v1.0
+class: CommandLineTool
+requirements:
+  SchemaDefRequirement:
+    types:
+      - $import: FileOnCluster.yml
+inputs:
+  select_column: string
+  select_values: File
+  dataset: 'FileOnCluster.yml#FileOnCluster'
+  extract_py:
+    type: File
+    default:
+      class: File
+      location: extract.py
+outputs:
+  out:
+    type: File
+    outputBinding:
+      glob: extracted.csv
+
+arguments: [python, $(inputs.extract_py), $(inputs.select_column), $(inputs.select_values), $(inputs.dataset.file), $(inputs.dataset.cluster)]
diff --git a/doc/user/cwl/federated/extract.py b/doc/user/cwl/federated/extract.py
new file mode 100644
index 000000000..2d2c49dcb
--- /dev/null
+++ b/doc/user/cwl/federated/extract.py
@@ -0,0 +1,31 @@
+import csv
+import sys
+
+select_column = sys.argv[1]
+select_values = sys.argv[2]
+dataset = sys.argv[3]
+cluster = sys.argv[4]
+
+sv = open(select_values, "rt")
+selectvals = [s.strip() for s in sv]
+
+print("selectvals", selectvals)
+
+ds = csv.reader(open(dataset, "rt"))
+header = next(ds)
+print("header is", header)
+columnindex = None
+for i,v in enumerate(header):
+    if v == select_column:
+        columnindex = i
+if columnindex is None:
+    raise Exception("Column %s not found" % select_column)
+
+print("column index", columnindex)
+
+ex = csv.writer(open("extracted.csv", "wt"))
+ex.writerow(["cluster"]+list(header))
+
+for row in ds:
+    if row[columnindex] in selectvals:
+        ex.writerow([cluster]+list(row))
diff --git a/doc/user/cwl/federated/federated.cwl b/doc/user/cwl/federated/feddemo.cwl
similarity index 51%
rename from doc/user/cwl/federated/federated.cwl
rename to doc/user/cwl/federated/feddemo.cwl
index 5314a7675..a68ff444a 100644
--- a/doc/user/cwl/federated/federated.cwl
+++ b/doc/user/cwl/federated/feddemo.cwl
@@ -1,8 +1,11 @@
-#
-# Demonstrate Arvados federation features.  This performs a parallel
-# scatter over some arbitrary number of files and federated clusters,
-# then joins the results.
-#
+# Demonstrate Arvados federation features.  This example searches a
+# list of CSV files that are hosted on different Arvados clusters.
+# For each file, send a task to the remote cluster which will scan
+# the file and extract the rows where the column "select_column" has one
+# of the values appearing in the "select_values" file.  The home
+# cluster then runs a task which pulls the results from the remote
+# clusters and merges the results to produce a final report.
+
 cwlVersion: v1.0
 class: Workflow
 $namespaces:
@@ -19,50 +22,45 @@ requirements:
     dockerPull: arvados/jobs
 
   # Define a record type so we can conveniently associate the input
-  # file, the cluster on which the file lives, and the project on that
-  # cluster that will own the container requests and intermediate
-  # outputs.
+  # file and the cluster where the task should run.
   SchemaDefRequirement:
     types:
-      - name: FileOnCluster
-        type: record
-        fields:
-          file: File
-          cluster: string
-          project: string
+      - $import: FileOnCluster.yml
 
 inputs:
-  # Expect an array of FileOnCluster records (defined above)
-  # as our input.
-  shards:
+  select_column: string
+  select_values: File
+
+  datasets:
     type:
       type: array
-      items: FileOnCluster
+      items: FileOnCluster.yml#FileOnCluster
+
+  intermediate_projects: string[]
 
 outputs:
   # Will produce an output file with the results of the distributed
-  # analysis jobs joined together.
+  # analysis jobs merged together.
   joined:
     type: File
-    outputSource: gather-results/joined
+    outputSource: gather-results/out
 
 steps:
   distributed-analysis:
     in:
-      # Take "shards" array as input, we scatter over it below.
-      shard: shards
-
-      # Use an expression to extract the "file" field to assign to the
-      # "inp" parameter of the tool.
-      inp: {valueFrom: $(inputs.shard.file)}
+      select_column: select_column
+      select_values: select_values
+      dataset: datasets
+      intermediate_projects: intermediate_projects
 
     # Scatter over shards, this means creating a parallel job for each
     # element in the "shards" array.  Expressions are evaluated for
     # each element.
-    scatter: shard
+    scatter: [dataset, intermediate_projects]
+    scatterMethod: dotproduct
 
-    # Specify the cluster target for this job.  This means each
-    # separate scatter job will execute on the cluster that was
+    # Specify the cluster target for this task.  This means each
+    # separate scatter task will execute on the cluster that was
     # specified in the "cluster" field.
     #
     # Arvados handles streaming data between clusters, for example,
@@ -71,17 +69,17 @@ steps:
     # the federation.
     hints:
       arv:ClusterTarget:
-        cluster_id: $(inputs.shard.cluster)
-        project_uuid: $(inputs.shard.project)
+        cluster_id: $(inputs.dataset.cluster)
+        project_uuid: $(inputs.intermediate_projects)
 
     out: [out]
-    run: md5sum.cwl
+    run: extract.cwl
 
   # Collect the results of the distributed step and join them into a
   # single output file.  Arvados handles streaming inputs,
   # intermediate results, and outputs between clusters on demand.
   gather-results:
     in:
-      inp: distributed-analysis/out
-    out: [joined]
-    run: cat.cwl
+      dataset: distributed-analysis/out
+    out: [out]
+    run: merge.cwl
diff --git a/doc/user/cwl/federated/file-on-clsr1.dat b/doc/user/cwl/federated/file-on-clsr1.dat
deleted file mode 100644
index e79f1526c..000000000
--- a/doc/user/cwl/federated/file-on-clsr1.dat
+++ /dev/null
@@ -1 +0,0 @@
-file-on-clsr1.dat
diff --git a/doc/user/cwl/federated/file-on-clsr2.dat b/doc/user/cwl/federated/file-on-clsr2.dat
deleted file mode 100644
index 9179dc8a5..000000000
--- a/doc/user/cwl/federated/file-on-clsr2.dat
+++ /dev/null
@@ -1 +0,0 @@
-file-on-clsr2.dat
diff --git a/doc/user/cwl/federated/file-on-clsr3.dat b/doc/user/cwl/federated/file-on-clsr3.dat
deleted file mode 100644
index 58b590233..000000000
--- a/doc/user/cwl/federated/file-on-clsr3.dat
+++ /dev/null
@@ -1 +0,0 @@
-file-on-clsr3.dat
diff --git a/doc/user/cwl/federated/items1.csv b/doc/user/cwl/federated/items1.csv
new file mode 100644
index 000000000..59d2d322b
--- /dev/null
+++ b/doc/user/cwl/federated/items1.csv
@@ -0,0 +1,29 @@
+color,item
+blue,ball
+yellow,ball
+red,ball
+green,book
+purple,book
+red,book
+yellow,flower
+purple,flower
+red,bicycle
+red,ball
+green,picture
+yellow,ball
+purple,flower
+yellow,ball
+green,bicycle
+orange,book
+green,book
+orange,picture
+blue,book
+orange,car
+yellow,flower
+purple,ball
+blue,book
+orange,book
+orange,book
+yellow,book
+orange,car
+yellow,car
diff --git a/doc/user/cwl/federated/items2.csv b/doc/user/cwl/federated/items2.csv
new file mode 100644
index 000000000..566dab775
--- /dev/null
+++ b/doc/user/cwl/federated/items2.csv
@@ -0,0 +1,33 @@
+color,item
+green,bicycle
+red,flower
+blue,bicycle
+yellow,flower
+green,ball
+red,book
+red,bicycle
+yellow,ball
+blue,picture
+green,book
+orange,flower
+blue,ball
+orange,car
+green,book
+yellow,car
+orange,picture
+orange,car
+yellow,flower
+green,ball
+orange,car
+purple,book
+green,ball
+red,flower
+blue,car
+orange,flower
+blue,book
+blue,bicycle
+red,picture
+orange,flower
+orange,book
+blue,flower
+orange,book
diff --git a/doc/user/cwl/federated/items3.csv b/doc/user/cwl/federated/items3.csv
new file mode 100644
index 000000000..e820e4537
--- /dev/null
+++ b/doc/user/cwl/federated/items3.csv
@@ -0,0 +1,41 @@
+color,item
+purple,book
+green,book
+red,bicycle
+yellow,book
+orange,book
+green,car
+green,car
+blue,ball
+yellow,bicycle
+orange,book
+green,bicycle
+blue,flower
+red,bicycle
+purple,bicycle
+green,bicycle
+orange,ball
+yellow,car
+orange,ball
+red,ball
+red,car
+green,picture
+green,flower
+blue,picture
+green,car
+yellow,flower
+purple,flower
+green,ball
+yellow,bicycle
+orange,bicycle
+orange,flower
+yellow,picture
+purple,flower
+green,picture
+orange,car
+orange,picture
+yellow,car
+yellow,picture
+purple,picture
+purple,picture
+purple,flower
diff --git a/doc/user/cwl/federated/md5sum.cwl b/doc/user/cwl/federated/md5sum.cwl
deleted file mode 100644
index 9c78dc268..000000000
--- a/doc/user/cwl/federated/md5sum.cwl
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-cwlVersion: v1.0
-class: CommandLineTool
-$namespaces:
-  arv: "http://arvados.org/cwl#"
-requirements:
-  InlineJavascriptRequirement: {}
-inputs:
-  inp:
-    type: File
-outputs:
-  out:
-    type: File
-    outputBinding:
-      glob: out.txt
-stdin: $(inputs.inp.path)
-stdout: out.txt
-arguments: ["md5sum", "-"]
diff --git a/doc/user/cwl/federated/merge.cwl b/doc/user/cwl/federated/merge.cwl
new file mode 100644
index 000000000..a60d619f9
--- /dev/null
+++ b/doc/user/cwl/federated/merge.cwl
@@ -0,0 +1,23 @@
+cwlVersion: v1.0
+class: CommandLineTool
+requirements:
+  SchemaDefRequirement:
+    types:
+      - $import: FileOnCluster.yml
+inputs:
+  dataset:
+    type: File[]
+    inputBinding:
+      position: 1
+  merge_py:
+    type: File
+    default:
+      class: File
+      location: merge.py
+outputs:
+  out:
+    type: File
+    outputBinding:
+      glob: merged.csv
+
+arguments: [python, $(inputs.merge_py)]
diff --git a/doc/user/cwl/federated/merge.py b/doc/user/cwl/federated/merge.py
new file mode 100644
index 000000000..03c79f23c
--- /dev/null
+++ b/doc/user/cwl/federated/merge.py
@@ -0,0 +1,15 @@
+import sys
+import csv
+
+merged = open("merged.csv", "wt")
+
+wroteheader = False
+for s in sys.argv[1:]:
+    f = open(s, "rt")
+    header = next(f)
+    if not wroteheader:
+        merged.write(header)
+        wroteheader = True
+    for l in f:
+        merged.write(l)
+    f.close()
diff --git a/doc/user/cwl/federated/shards.yml b/doc/user/cwl/federated/shards.yml
index ed8a83ab3..14e346248 100644
--- a/doc/user/cwl/federated/shards.yml
+++ b/doc/user/cwl/federated/shards.yml
@@ -1,18 +1,25 @@
-shards:
+select_column: color
+select_values:
+  class: File
+  location: colors_to_select.txt
+
+datasets:
   - cluster: clsr1
-    project: clsr1-j7d0g-qxc4jcji7n4lafx
     file:
       class: File
-      location: keep:485df2c5cec3207a32f49c42f1cdcca9+61/file-on-clsr1.dat
+      location: keep:0dcf9310e5bf0c07270416d3a0cd6a43+56/items1.csv
 
   - cluster: clsr2
-    project: clsr2-j7d0g-ivdrm1hyym21vkq
     file:
       class: File
-      location: keep:ae6e9c3e9bfa52a0122ecb489d8198ff+61/file-on-clsr2.dat
+      location: keep:12707d325a3f4687674b858bd32beae9+56/items2.csv
 
   - cluster: clsr3
-    project: clsr3-j7d0g-e3njz2s53lyb0ka
     file:
       class: File
-      location: keep:0b43a0ef9ea592d5d7b299978dfa8643+61/file-on-clsr3.dat
+      location: keep:dbff6bb7fc43176527af5eb9dec28871+56/items3.csv
+
+intermediate_projects:
+  - clsr1-j7d0g-qxc4jcji7n4lafx
+  - clsr2-j7d0g-e7r20egb8hlgn53
+  - clsr3-j7d0g-vrl00zoku9spnen

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list