[ARVADOS] updated: 1.3.0-1084-gd33dab9c7
Git user
git at public.curoverse.com
Thu Jun 13 19:15:01 UTC 2019
Summary of changes:
.licenseignore | 1 +
doc/_includes/_federated_cwl.liquid | 2 +-
.../cwl/federated-workflows.html.textile.liquid | 2 +-
doc/user/cwl/federated/FileOnCluster.yml | 5 ++
doc/user/cwl/federated/cat.cwl | 14 -----
doc/user/cwl/federated/colors_to_select.txt | 2 +
doc/user/cwl/federated/extract.cwl | 22 +++++++
doc/user/cwl/federated/extract.py | 31 ++++++++++
.../cwl/federated/{federated.cwl => feddemo.cwl} | 68 +++++++++++-----------
doc/user/cwl/federated/file-on-clsr1.dat | 1 -
doc/user/cwl/federated/file-on-clsr2.dat | 1 -
doc/user/cwl/federated/file-on-clsr3.dat | 1 -
doc/user/cwl/federated/items1.csv | 29 +++++++++
doc/user/cwl/federated/items2.csv | 33 +++++++++++
doc/user/cwl/federated/items3.csv | 41 +++++++++++++
doc/user/cwl/federated/md5sum.cwl | 21 -------
doc/user/cwl/federated/merge.cwl | 23 ++++++++
doc/user/cwl/federated/merge.py | 15 +++++
doc/user/cwl/federated/shards.yml | 21 ++++---
19 files changed, 251 insertions(+), 82 deletions(-)
create mode 100644 doc/user/cwl/federated/FileOnCluster.yml
delete mode 100644 doc/user/cwl/federated/cat.cwl
create mode 100644 doc/user/cwl/federated/colors_to_select.txt
create mode 100644 doc/user/cwl/federated/extract.cwl
create mode 100644 doc/user/cwl/federated/extract.py
rename doc/user/cwl/federated/{federated.cwl => feddemo.cwl} (51%)
delete mode 100644 doc/user/cwl/federated/file-on-clsr1.dat
delete mode 100644 doc/user/cwl/federated/file-on-clsr2.dat
delete mode 100644 doc/user/cwl/federated/file-on-clsr3.dat
create mode 100644 doc/user/cwl/federated/items1.csv
create mode 100644 doc/user/cwl/federated/items2.csv
create mode 100644 doc/user/cwl/federated/items3.csv
delete mode 100644 doc/user/cwl/federated/md5sum.cwl
create mode 100644 doc/user/cwl/federated/merge.cwl
create mode 100644 doc/user/cwl/federated/merge.py
via d33dab9c7baafe56cdcd0e1266f03750531720b4 (commit)
via 7ffc69693f4fd7f5db67fba3102d99cfba4b8fef (commit)
from 21dfd2339fb0a4f501b43beedf3207c5d30aae1b (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit d33dab9c7baafe56cdcd0e1266f03750531720b4
Merge: 21dfd2339 7ffc69693
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date: Thu Jun 13 15:14:46 2019 -0400
Merge branch '15332-fed-demo' refs #15332
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>
commit 7ffc69693f4fd7f5db67fba3102d99cfba4b8fef
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date: Mon Jun 10 17:25:17 2019 -0400
15332: Updated federation demo to do something a little more interesting.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>
diff --git a/.licenseignore b/.licenseignore
index a9b6f5f6c..28ddf9c29 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -15,6 +15,7 @@ build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados
doc/fonts/*
doc/_includes/_config_default_yml.liquid
doc/user/cwl/federated/*
+doc/_includes/_federated_cwl.liquid
*/docker_image
docker/jobs/apt.arvados.org*.list
docker/jobs/1078ECD7.key
diff --git a/doc/_includes/_federated_cwl.liquid b/doc/_includes/_federated_cwl.liquid
index 59a629c5a..cfe8407e2 120000
--- a/doc/_includes/_federated_cwl.liquid
+++ b/doc/_includes/_federated_cwl.liquid
@@ -1 +1 @@
-../user/cwl/federated/federated.cwl
\ No newline at end of file
+../user/cwl/federated/feddemo.cwl
\ No newline at end of file
diff --git a/doc/user/cwl/federated-workflows.html.textile.liquid b/doc/user/cwl/federated-workflows.html.textile.liquid
index 7e2150dcc..01d656dd1 100644
--- a/doc/user/cwl/federated-workflows.html.textile.liquid
+++ b/doc/user/cwl/federated-workflows.html.textile.liquid
@@ -36,7 +36,7 @@ At this time, remote steps of a workflow on Workbench are not displayed. As a w
Run it like any other workflow:
<notextile>
-<pre><code>~$ <span class="userinput">arvados-cwl-runner federated.cwl shards.cwl</span>
+<pre><code>~$ <span class="userinput">arvados-cwl-runner feddemo.cwl shards.yml</span>
</code></pre>
</notextile>
diff --git a/doc/user/cwl/federated/FileOnCluster.yml b/doc/user/cwl/federated/FileOnCluster.yml
new file mode 100644
index 000000000..363d0717a
--- /dev/null
+++ b/doc/user/cwl/federated/FileOnCluster.yml
@@ -0,0 +1,5 @@
+name: FileOnCluster
+type: record
+fields:
+ file: File
+ cluster: string
\ No newline at end of file
diff --git a/doc/user/cwl/federated/cat.cwl b/doc/user/cwl/federated/cat.cwl
deleted file mode 100644
index 17132fe61..000000000
--- a/doc/user/cwl/federated/cat.cwl
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-cwlVersion: v1.0
-class: CommandLineTool
-inputs:
- inp:
- type: File[]
- inputBinding: {}
-outputs:
- joined: stdout
-stdout: joined.txt
-baseCommand: cat
diff --git a/doc/user/cwl/federated/colors_to_select.txt b/doc/user/cwl/federated/colors_to_select.txt
new file mode 100644
index 000000000..620b0084b
--- /dev/null
+++ b/doc/user/cwl/federated/colors_to_select.txt
@@ -0,0 +1,2 @@
+green
+blue
diff --git a/doc/user/cwl/federated/extract.cwl b/doc/user/cwl/federated/extract.cwl
new file mode 100644
index 000000000..f8fdedbd8
--- /dev/null
+++ b/doc/user/cwl/federated/extract.cwl
@@ -0,0 +1,22 @@
+cwlVersion: v1.0
+class: CommandLineTool
+requirements:
+ SchemaDefRequirement:
+ types:
+ - $import: FileOnCluster.yml
+inputs:
+ select_column: string
+ select_values: File
+ dataset: 'FileOnCluster.yml#FileOnCluster'
+ extract_py:
+ type: File
+ default:
+ class: File
+ location: extract.py
+outputs:
+ out:
+ type: File
+ outputBinding:
+ glob: extracted.csv
+
+arguments: [python, $(inputs.extract_py), $(inputs.select_column), $(inputs.select_values), $(inputs.dataset.file), $(inputs.dataset.cluster)]
diff --git a/doc/user/cwl/federated/extract.py b/doc/user/cwl/federated/extract.py
new file mode 100644
index 000000000..2d2c49dcb
--- /dev/null
+++ b/doc/user/cwl/federated/extract.py
@@ -0,0 +1,31 @@
+import csv
+import sys
+
+select_column = sys.argv[1]
+select_values = sys.argv[2]
+dataset = sys.argv[3]
+cluster = sys.argv[4]
+
+sv = open(select_values, "rt")
+selectvals = [s.strip() for s in sv]
+
+print("selectvals", selectvals)
+
+ds = csv.reader(open(dataset, "rt"))
+header = next(ds)
+print("header is", header)
+columnindex = None
+for i,v in enumerate(header):
+ if v == select_column:
+ columnindex = i
+if columnindex is None:
+ raise Exception("Column %s not found" % select_column)
+
+print("column index", columnindex)
+
+ex = csv.writer(open("extracted.csv", "wt"))
+ex.writerow(["cluster"]+list(header))
+
+for row in ds:
+ if row[columnindex] in selectvals:
+ ex.writerow([cluster]+list(row))
diff --git a/doc/user/cwl/federated/federated.cwl b/doc/user/cwl/federated/feddemo.cwl
similarity index 51%
rename from doc/user/cwl/federated/federated.cwl
rename to doc/user/cwl/federated/feddemo.cwl
index 5314a7675..a68ff444a 100644
--- a/doc/user/cwl/federated/federated.cwl
+++ b/doc/user/cwl/federated/feddemo.cwl
@@ -1,8 +1,11 @@
-#
-# Demonstrate Arvados federation features. This performs a parallel
-# scatter over some arbitrary number of files and federated clusters,
-# then joins the results.
-#
+# Demonstrate Arvados federation features. This example searches a
+# list of CSV files that are hosted on different Arvados clusters.
+# For each file, send a task to the remote cluster which will scan
+# the file and extract the rows where the column "select_column" has one
+# of the values appearing in the "select_values" file. The home
+# cluster then runs a task which pulls the results from the remote
+# clusters and merges the results to produce a final report.
+
cwlVersion: v1.0
class: Workflow
$namespaces:
@@ -19,50 +22,45 @@ requirements:
dockerPull: arvados/jobs
# Define a record type so we can conveniently associate the input
- # file, the cluster on which the file lives, and the project on that
- # cluster that will own the container requests and intermediate
- # outputs.
+ # file and the cluster where the task should run.
SchemaDefRequirement:
types:
- - name: FileOnCluster
- type: record
- fields:
- file: File
- cluster: string
- project: string
+ - $import: FileOnCluster.yml
inputs:
- # Expect an array of FileOnCluster records (defined above)
- # as our input.
- shards:
+ select_column: string
+ select_values: File
+
+ datasets:
type:
type: array
- items: FileOnCluster
+ items: FileOnCluster.yml#FileOnCluster
+
+ intermediate_projects: string[]
outputs:
# Will produce an output file with the results of the distributed
- # analysis jobs joined together.
+ # analysis jobs merged together.
joined:
type: File
- outputSource: gather-results/joined
+ outputSource: gather-results/out
steps:
distributed-analysis:
in:
- # Take "shards" array as input, we scatter over it below.
- shard: shards
-
- # Use an expression to extract the "file" field to assign to the
- # "inp" parameter of the tool.
- inp: {valueFrom: $(inputs.shard.file)}
+ select_column: select_column
+ select_values: select_values
+ dataset: datasets
+ intermediate_projects: intermediate_projects
# Scatter over shards, this means creating a parallel job for each
# element in the "shards" array. Expressions are evaluated for
# each element.
- scatter: shard
+ scatter: [dataset, intermediate_projects]
+ scatterMethod: dotproduct
- # Specify the cluster target for this job. This means each
- # separate scatter job will execute on the cluster that was
+ # Specify the cluster target for this task. This means each
+ # separate scatter task will execute on the cluster that was
# specified in the "cluster" field.
#
# Arvados handles streaming data between clusters, for example,
@@ -71,17 +69,17 @@ steps:
# the federation.
hints:
arv:ClusterTarget:
- cluster_id: $(inputs.shard.cluster)
- project_uuid: $(inputs.shard.project)
+ cluster_id: $(inputs.dataset.cluster)
+ project_uuid: $(inputs.intermediate_projects)
out: [out]
- run: md5sum.cwl
+ run: extract.cwl
# Collect the results of the distributed step and join them into a
# single output file. Arvados handles streaming inputs,
# intermediate results, and outputs between clusters on demand.
gather-results:
in:
- inp: distributed-analysis/out
- out: [joined]
- run: cat.cwl
+ dataset: distributed-analysis/out
+ out: [out]
+ run: merge.cwl
diff --git a/doc/user/cwl/federated/file-on-clsr1.dat b/doc/user/cwl/federated/file-on-clsr1.dat
deleted file mode 100644
index e79f1526c..000000000
--- a/doc/user/cwl/federated/file-on-clsr1.dat
+++ /dev/null
@@ -1 +0,0 @@
-file-on-clsr1.dat
diff --git a/doc/user/cwl/federated/file-on-clsr2.dat b/doc/user/cwl/federated/file-on-clsr2.dat
deleted file mode 100644
index 9179dc8a5..000000000
--- a/doc/user/cwl/federated/file-on-clsr2.dat
+++ /dev/null
@@ -1 +0,0 @@
-file-on-clsr2.dat
diff --git a/doc/user/cwl/federated/file-on-clsr3.dat b/doc/user/cwl/federated/file-on-clsr3.dat
deleted file mode 100644
index 58b590233..000000000
--- a/doc/user/cwl/federated/file-on-clsr3.dat
+++ /dev/null
@@ -1 +0,0 @@
-file-on-clsr3.dat
diff --git a/doc/user/cwl/federated/items1.csv b/doc/user/cwl/federated/items1.csv
new file mode 100644
index 000000000..59d2d322b
--- /dev/null
+++ b/doc/user/cwl/federated/items1.csv
@@ -0,0 +1,29 @@
+color,item
+blue,ball
+yellow,ball
+red,ball
+green,book
+purple,book
+red,book
+yellow,flower
+purple,flower
+red,bicycle
+red,ball
+green,picture
+yellow,ball
+purple,flower
+yellow,ball
+green,bicycle
+orange,book
+green,book
+orange,picture
+blue,book
+orange,car
+yellow,flower
+purple,ball
+blue,book
+orange,book
+orange,book
+yellow,book
+orange,car
+yellow,car
diff --git a/doc/user/cwl/federated/items2.csv b/doc/user/cwl/federated/items2.csv
new file mode 100644
index 000000000..566dab775
--- /dev/null
+++ b/doc/user/cwl/federated/items2.csv
@@ -0,0 +1,33 @@
+color,item
+green,bicycle
+red,flower
+blue,bicycle
+yellow,flower
+green,ball
+red,book
+red,bicycle
+yellow,ball
+blue,picture
+green,book
+orange,flower
+blue,ball
+orange,car
+green,book
+yellow,car
+orange,picture
+orange,car
+yellow,flower
+green,ball
+orange,car
+purple,book
+green,ball
+red,flower
+blue,car
+orange,flower
+blue,book
+blue,bicycle
+red,picture
+orange,flower
+orange,book
+blue,flower
+orange,book
diff --git a/doc/user/cwl/federated/items3.csv b/doc/user/cwl/federated/items3.csv
new file mode 100644
index 000000000..e820e4537
--- /dev/null
+++ b/doc/user/cwl/federated/items3.csv
@@ -0,0 +1,41 @@
+color,item
+purple,book
+green,book
+red,bicycle
+yellow,book
+orange,book
+green,car
+green,car
+blue,ball
+yellow,bicycle
+orange,book
+green,bicycle
+blue,flower
+red,bicycle
+purple,bicycle
+green,bicycle
+orange,ball
+yellow,car
+orange,ball
+red,ball
+red,car
+green,picture
+green,flower
+blue,picture
+green,car
+yellow,flower
+purple,flower
+green,ball
+yellow,bicycle
+orange,bicycle
+orange,flower
+yellow,picture
+purple,flower
+green,picture
+orange,car
+orange,picture
+yellow,car
+yellow,picture
+purple,picture
+purple,picture
+purple,flower
diff --git a/doc/user/cwl/federated/md5sum.cwl b/doc/user/cwl/federated/md5sum.cwl
deleted file mode 100644
index 9c78dc268..000000000
--- a/doc/user/cwl/federated/md5sum.cwl
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-cwlVersion: v1.0
-class: CommandLineTool
-$namespaces:
- arv: "http://arvados.org/cwl#"
-requirements:
- InlineJavascriptRequirement: {}
-inputs:
- inp:
- type: File
-outputs:
- out:
- type: File
- outputBinding:
- glob: out.txt
-stdin: $(inputs.inp.path)
-stdout: out.txt
-arguments: ["md5sum", "-"]
diff --git a/doc/user/cwl/federated/merge.cwl b/doc/user/cwl/federated/merge.cwl
new file mode 100644
index 000000000..a60d619f9
--- /dev/null
+++ b/doc/user/cwl/federated/merge.cwl
@@ -0,0 +1,23 @@
+cwlVersion: v1.0
+class: CommandLineTool
+requirements:
+ SchemaDefRequirement:
+ types:
+ - $import: FileOnCluster.yml
+inputs:
+ dataset:
+ type: File[]
+ inputBinding:
+ position: 1
+ merge_py:
+ type: File
+ default:
+ class: File
+ location: merge.py
+outputs:
+ out:
+ type: File
+ outputBinding:
+ glob: merged.csv
+
+arguments: [python, $(inputs.merge_py)]
diff --git a/doc/user/cwl/federated/merge.py b/doc/user/cwl/federated/merge.py
new file mode 100644
index 000000000..03c79f23c
--- /dev/null
+++ b/doc/user/cwl/federated/merge.py
@@ -0,0 +1,15 @@
+import sys
+import csv
+
+merged = open("merged.csv", "wt")
+
+wroteheader = False
+for s in sys.argv[1:]:
+ f = open(s, "rt")
+ header = next(f)
+ if not wroteheader:
+ merged.write(header)
+ wroteheader = True
+ for l in f:
+ merged.write(l)
+ f.close()
diff --git a/doc/user/cwl/federated/shards.yml b/doc/user/cwl/federated/shards.yml
index ed8a83ab3..14e346248 100644
--- a/doc/user/cwl/federated/shards.yml
+++ b/doc/user/cwl/federated/shards.yml
@@ -1,18 +1,25 @@
-shards:
+select_column: color
+select_values:
+ class: File
+ location: colors_to_select.txt
+
+datasets:
- cluster: clsr1
- project: clsr1-j7d0g-qxc4jcji7n4lafx
file:
class: File
- location: keep:485df2c5cec3207a32f49c42f1cdcca9+61/file-on-clsr1.dat
+ location: keep:0dcf9310e5bf0c07270416d3a0cd6a43+56/items1.csv
- cluster: clsr2
- project: clsr2-j7d0g-ivdrm1hyym21vkq
file:
class: File
- location: keep:ae6e9c3e9bfa52a0122ecb489d8198ff+61/file-on-clsr2.dat
+ location: keep:12707d325a3f4687674b858bd32beae9+56/items2.csv
- cluster: clsr3
- project: clsr3-j7d0g-e3njz2s53lyb0ka
file:
class: File
- location: keep:0b43a0ef9ea592d5d7b299978dfa8643+61/file-on-clsr3.dat
+ location: keep:dbff6bb7fc43176527af5eb9dec28871+56/items3.csv
+
+intermediate_projects:
+ - clsr1-j7d0g-qxc4jcji7n4lafx
+ - clsr2-j7d0g-e7r20egb8hlgn53
+ - clsr3-j7d0g-vrl00zoku9spnen
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list