[arvados] created: 2.1.0-2882-g07ff7264d
git repository hosting
git at public.arvados.org
Thu Sep 8 20:34:01 UTC 2022
at 07ff7264dc3ff32c8edf08e5e6827ff7f1f48cf7 (commit)
commit 07ff7264dc3ff32c8edf08e5e6827ff7f1f48cf7
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Thu Sep 8 16:33:38 2022 -0400
19464: Record & report provenance information from git
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 5094ea3bf..c763cb7ef 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -453,7 +453,7 @@ class ArvadosContainer(JobBase):
class RunnerContainer(Runner):
"""Submit and manage a container that runs arvados-cwl-runner."""
- def arvados_job_spec(self, runtimeContext):
+ def arvados_job_spec(self, runtimeContext, git_info):
"""Create an Arvados container request for this workflow.
The returned dict can be used to create a container passed as
@@ -515,7 +515,7 @@ class RunnerContainer(Runner):
"portable_data_hash": "%s" % workflowcollection
}
else:
- packed = packed_workflow(self.arvrunner, self.embedded_tool, self.merged_map, runtimeContext)
+ packed = packed_workflow(self.arvrunner, self.embedded_tool, self.merged_map, runtimeContext, git_info)
workflowpath = "/var/lib/cwl/workflow.json#main"
container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
"kind": "json",
@@ -595,7 +595,7 @@ class RunnerContainer(Runner):
def run(self, runtimeContext):
runtimeContext.keepprefix = "keep:"
- job_spec = self.arvados_job_spec(runtimeContext)
+ job_spec = self.arvados_job_spec(runtimeContext, self.git_info)
if runtimeContext.project_uuid:
job_spec["owner_uuid"] = runtimeContext.project_uuid
diff --git a/sdk/cwl/arvados_cwl/arvworkflow.py b/sdk/cwl/arvados_cwl/arvworkflow.py
index 51e7cd8b9..5f3feabf8 100644
--- a/sdk/cwl/arvados_cwl/arvworkflow.py
+++ b/sdk/cwl/arvados_cwl/arvworkflow.py
@@ -40,9 +40,10 @@ sum_res_pars = ("outdirMin", "outdirMax")
def upload_workflow(arvRunner, tool, job_order, project_uuid,
runtimeContext, uuid=None,
submit_runner_ram=0, name=None, merged_map=None,
- submit_runner_image=None):
+ submit_runner_image=None,
+ git_info=None):
- packed = packed_workflow(arvRunner, tool, merged_map, runtimeContext)
+ packed = packed_workflow(arvRunner, tool, merged_map, runtimeContext, git_info)
adjustDirObjs(job_order, trim_listing)
adjustFileObjs(job_order, trim_anonymous_location)
diff --git a/sdk/cwl/arvados_cwl/executor.py b/sdk/cwl/arvados_cwl/executor.py
index 8635d5fcf..dbbea9f4e 100644
--- a/sdk/cwl/arvados_cwl/executor.py
+++ b/sdk/cwl/arvados_cwl/executor.py
@@ -17,6 +17,7 @@ import copy
import json
import re
from functools import partial
+import subprocess
import time
import urllib
@@ -24,6 +25,7 @@ from cwltool.errors import WorkflowException
import cwltool.workflow
from schema_salad.sourceline import SourceLine
import schema_salad.validate as validate
+from schema_salad.ref_resolver import file_uri, uri_file_path
import arvados
import arvados.config
@@ -518,9 +520,59 @@ The 'jobs' API is no longer supported.
for req in job_reqs:
tool.requirements.append(req)
+ def get_git_info(self, tool):
+ in_a_git_repo = False
+ cwd = None
+
+ if tool.tool["id"].startswith("file://"):
+ # check if git is installed
+ try:
+ cwd = os.path.dirname(uri_file_path(tool.tool["id"]))
+ subprocess.run(["git", "log", "--format=%H", "-n1", "HEAD"], cwd=cwd, check=True, capture_output=True, text=True)
+ in_a_git_repo = True
+ except Exception as e:
+ pass
+
+ gitproperties = {}
+
+ if in_a_git_repo:
+ git_commit = subprocess.run(["git", "log", "--format=%H", "-n1", "HEAD"], cwd=cwd, capture_output=True, text=True).stdout
+ git_date = subprocess.run(["git", "log", "--format=%cD", "-n1", "HEAD"], cwd=cwd, capture_output=True, text=True).stdout
+ git_committer = subprocess.run(["git", "log", "--format=%cn <%ce>", "-n1", "HEAD"], cwd=cwd, capture_output=True, text=True).stdout
+ git_branch = subprocess.run(["git", "branch", "--show-current"], cwd=cwd, capture_output=True, text=True).stdout
+ git_origin = subprocess.run(["git", "remote", "get-url", "origin"], cwd=cwd, capture_output=True, text=True).stdout
+ git_status = subprocess.run(["git", "status", "--untracked-files=no", "--porcelain"], cwd=cwd, capture_output=True, text=True).stdout
+
+ gitproperties = {
+ "http://arvados.org/cwl#gitCommit": git_commit.strip(),
+ "http://arvados.org/cwl#gitDate": git_date.strip(),
+ "http://arvados.org/cwl#gitCommitter": git_committer.strip(),
+ "http://arvados.org/cwl#gitBranch": git_branch.strip(),
+ "http://arvados.org/cwl#gitOrigin": git_origin.strip(),
+ "http://arvados.org/cwl#gitStatus": git_status.strip(),
+ }
+ else:
+ for g in ("http://arvados.org/cwl#gitCommit",
+ "http://arvados.org/cwl#gitDate",
+ "http://arvados.org/cwl#gitCommitter",
+ "http://arvados.org/cwl#gitBranch",
+ "http://arvados.org/cwl#gitOrigin",
+ "http://arvados.org/cwl#gitStatus"):
+ if g in tool.metadata:
+ gitproperties = tool.metadata[g]
+
+ return gitproperties
+
def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None):
self.debug = runtimeContext.debug
+ git_info = self.get_git_info(updated_tool)
+ if git_info:
+ logger.info("Provenance of %s", updated_tool.tool["id"])
+ for g in git_info:
+ if git_info[g]:
+ logger.info(" %s: %s", g.split("#", 1)[1], git_info[g])
+
workbench1 = self.api.config()["Services"]["Workbench1"]["ExternalURL"]
workbench2 = self.api.config()["Services"]["Workbench2"]["ExternalURL"]
controller = self.api.config()["Services"]["Controller"]["ExternalURL"]
@@ -628,7 +680,8 @@ The 'jobs' API is no longer supported.
submit_runner_ram=runtimeContext.submit_runner_ram,
name=runtimeContext.name,
merged_map=merged_map,
- submit_runner_image=runtimeContext.submit_runner_image)
+ submit_runner_image=runtimeContext.submit_runner_image,
+ git_info=git_info)
self.stdout.write(uuid + "\n")
return (None, "success")
@@ -690,7 +743,8 @@ The 'jobs' API is no longer supported.
priority=runtimeContext.priority,
secret_store=self.secret_store,
collection_cache_size=runtimeContext.collection_cache_size,
- collection_cache_is_default=self.should_estimate_cache_size)
+ collection_cache_is_default=self.should_estimate_cache_size,
+ git_info=git_info)
else:
runtimeContext.runnerjob = tool.tool["id"]
diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py
index 225f4ae60..1544d05cd 100644
--- a/sdk/cwl/arvados_cwl/runner.py
+++ b/sdk/cwl/arvados_cwl/runner.py
@@ -604,7 +604,7 @@ def upload_docker(arvrunner, tool, runtimeContext):
upload_docker(arvrunner, s.embedded_tool, runtimeContext)
-def packed_workflow(arvrunner, tool, merged_map, runtimeContext):
+def packed_workflow(arvrunner, tool, merged_map, runtimeContext, git_info):
"""Create a packed workflow.
A "packed" workflow is one where all the components have been combined into a single document."""
@@ -644,6 +644,11 @@ def packed_workflow(arvrunner, tool, merged_map, runtimeContext):
for l in v:
visit(l, cur_id)
visit(packed, None)
+
+ if git_info:
+ for g in git_info:
+ packed[g] = git_info[g]
+
return packed
@@ -794,7 +799,8 @@ class Runner(Process):
intermediate_output_ttl=0, merged_map=None,
priority=None, secret_store=None,
collection_cache_size=256,
- collection_cache_is_default=True):
+ collection_cache_is_default=True,
+ git_info=None):
loadingContext = loadingContext.copy()
loadingContext.metadata = updated_tool.metadata.copy()
@@ -823,6 +829,7 @@ class Runner(Process):
self.priority = priority
self.secret_store = secret_store
self.enable_dev = loadingContext.enable_dev
+ self.git_info = git_info
self.submit_runner_cores = 1
self.submit_runner_ram = 1024 # defaut 1 GiB
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list