[arvados] created: 2.1.0-2882-g07ff7264d

git repository hosting git at public.arvados.org
Thu Sep 8 20:34:01 UTC 2022


        at  07ff7264dc3ff32c8edf08e5e6827ff7f1f48cf7 (commit)


commit 07ff7264dc3ff32c8edf08e5e6827ff7f1f48cf7
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Thu Sep 8 16:33:38 2022 -0400

    19464: Record & report provenance information from git
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 5094ea3bf..c763cb7ef 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -453,7 +453,7 @@ class ArvadosContainer(JobBase):
 class RunnerContainer(Runner):
     """Submit and manage a container that runs arvados-cwl-runner."""
 
-    def arvados_job_spec(self, runtimeContext):
+    def arvados_job_spec(self, runtimeContext, git_info):
         """Create an Arvados container request for this workflow.
 
         The returned dict can be used to create a container passed as
@@ -515,7 +515,7 @@ class RunnerContainer(Runner):
                 "portable_data_hash": "%s" % workflowcollection
             }
         else:
-            packed = packed_workflow(self.arvrunner, self.embedded_tool, self.merged_map, runtimeContext)
+            packed = packed_workflow(self.arvrunner, self.embedded_tool, self.merged_map, runtimeContext, git_info)
             workflowpath = "/var/lib/cwl/workflow.json#main"
             container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
                 "kind": "json",
@@ -595,7 +595,7 @@ class RunnerContainer(Runner):
 
     def run(self, runtimeContext):
         runtimeContext.keepprefix = "keep:"
-        job_spec = self.arvados_job_spec(runtimeContext)
+        job_spec = self.arvados_job_spec(runtimeContext, self.git_info)
         if runtimeContext.project_uuid:
             job_spec["owner_uuid"] = runtimeContext.project_uuid
 
diff --git a/sdk/cwl/arvados_cwl/arvworkflow.py b/sdk/cwl/arvados_cwl/arvworkflow.py
index 51e7cd8b9..5f3feabf8 100644
--- a/sdk/cwl/arvados_cwl/arvworkflow.py
+++ b/sdk/cwl/arvados_cwl/arvworkflow.py
@@ -40,9 +40,10 @@ sum_res_pars = ("outdirMin", "outdirMax")
 def upload_workflow(arvRunner, tool, job_order, project_uuid,
                     runtimeContext, uuid=None,
                     submit_runner_ram=0, name=None, merged_map=None,
-                    submit_runner_image=None):
+                    submit_runner_image=None,
+                    git_info=None):
 
-    packed = packed_workflow(arvRunner, tool, merged_map, runtimeContext)
+    packed = packed_workflow(arvRunner, tool, merged_map, runtimeContext, git_info)
 
     adjustDirObjs(job_order, trim_listing)
     adjustFileObjs(job_order, trim_anonymous_location)
diff --git a/sdk/cwl/arvados_cwl/executor.py b/sdk/cwl/arvados_cwl/executor.py
index 8635d5fcf..dbbea9f4e 100644
--- a/sdk/cwl/arvados_cwl/executor.py
+++ b/sdk/cwl/arvados_cwl/executor.py
@@ -17,6 +17,7 @@ import copy
 import json
 import re
 from functools import partial
+import subprocess
 import time
 import urllib
 
@@ -24,6 +25,7 @@ from cwltool.errors import WorkflowException
 import cwltool.workflow
 from schema_salad.sourceline import SourceLine
 import schema_salad.validate as validate
+from schema_salad.ref_resolver import file_uri, uri_file_path
 
 import arvados
 import arvados.config
@@ -518,9 +520,59 @@ The 'jobs' API is no longer supported.
             for req in job_reqs:
                 tool.requirements.append(req)
 
+    def get_git_info(self, tool):
+        in_a_git_repo = False
+        cwd = None
+
+        if tool.tool["id"].startswith("file://"):
+            # check if git is installed
+            try:
+                cwd = os.path.dirname(uri_file_path(tool.tool["id"]))
+                subprocess.run(["git", "log", "--format=%H", "-n1", "HEAD"], cwd=cwd, check=True, capture_output=True, text=True)
+                in_a_git_repo = True
+            except Exception as e:
+                pass
+
+        gitproperties = {}
+
+        if in_a_git_repo:
+            git_commit = subprocess.run(["git", "log", "--format=%H", "-n1", "HEAD"], cwd=cwd, capture_output=True, text=True).stdout
+            git_date = subprocess.run(["git", "log", "--format=%cD", "-n1", "HEAD"], cwd=cwd, capture_output=True, text=True).stdout
+            git_committer = subprocess.run(["git", "log", "--format=%cn <%ce>", "-n1", "HEAD"], cwd=cwd, capture_output=True, text=True).stdout
+            git_branch = subprocess.run(["git", "branch", "--show-current"], cwd=cwd, capture_output=True, text=True).stdout
+            git_origin = subprocess.run(["git", "remote", "get-url", "origin"], cwd=cwd, capture_output=True, text=True).stdout
+            git_status = subprocess.run(["git", "status", "--untracked-files=no", "--porcelain"], cwd=cwd, capture_output=True, text=True).stdout
+
+            gitproperties = {
+                "http://arvados.org/cwl#gitCommit": git_commit.strip(),
+                "http://arvados.org/cwl#gitDate": git_date.strip(),
+                "http://arvados.org/cwl#gitCommitter": git_committer.strip(),
+                "http://arvados.org/cwl#gitBranch": git_branch.strip(),
+                "http://arvados.org/cwl#gitOrigin": git_origin.strip(),
+                "http://arvados.org/cwl#gitStatus": git_status.strip(),
+            }
+        else:
+            for g in ("http://arvados.org/cwl#gitCommit",
+                      "http://arvados.org/cwl#gitDate",
+                      "http://arvados.org/cwl#gitCommitter",
+                      "http://arvados.org/cwl#gitBranch",
+                      "http://arvados.org/cwl#gitOrigin",
+                      "http://arvados.org/cwl#gitStatus"):
+                if g in tool.metadata:
+                    gitproperties = tool.metadata[g]
+
+        return gitproperties
+
     def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None):
         self.debug = runtimeContext.debug
 
+        git_info = self.get_git_info(updated_tool)
+        if git_info:
+            logger.info("Provenance of %s", updated_tool.tool["id"])
+            for g in git_info:
+                if git_info[g]:
+                    logger.info("  %s: %s", g.split("#", 1)[1], git_info[g])
+
         workbench1 = self.api.config()["Services"]["Workbench1"]["ExternalURL"]
         workbench2 = self.api.config()["Services"]["Workbench2"]["ExternalURL"]
         controller = self.api.config()["Services"]["Controller"]["ExternalURL"]
@@ -628,7 +680,8 @@ The 'jobs' API is no longer supported.
                                        submit_runner_ram=runtimeContext.submit_runner_ram,
                                        name=runtimeContext.name,
                                        merged_map=merged_map,
-                                       submit_runner_image=runtimeContext.submit_runner_image)
+                                       submit_runner_image=runtimeContext.submit_runner_image,
+                                       git_info=git_info)
                 self.stdout.write(uuid + "\n")
                 return (None, "success")
 
@@ -690,7 +743,8 @@ The 'jobs' API is no longer supported.
                                            priority=runtimeContext.priority,
                                            secret_store=self.secret_store,
                                            collection_cache_size=runtimeContext.collection_cache_size,
-                                           collection_cache_is_default=self.should_estimate_cache_size)
+                                           collection_cache_is_default=self.should_estimate_cache_size,
+                                           git_info=git_info)
                 else:
                     runtimeContext.runnerjob = tool.tool["id"]
 
diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py
index 225f4ae60..1544d05cd 100644
--- a/sdk/cwl/arvados_cwl/runner.py
+++ b/sdk/cwl/arvados_cwl/runner.py
@@ -604,7 +604,7 @@ def upload_docker(arvrunner, tool, runtimeContext):
             upload_docker(arvrunner, s.embedded_tool, runtimeContext)
 
 
-def packed_workflow(arvrunner, tool, merged_map, runtimeContext):
+def packed_workflow(arvrunner, tool, merged_map, runtimeContext, git_info):
     """Create a packed workflow.
 
     A "packed" workflow is one where all the components have been combined into a single document."""
@@ -644,6 +644,11 @@ def packed_workflow(arvrunner, tool, merged_map, runtimeContext):
             for l in v:
                 visit(l, cur_id)
     visit(packed, None)
+
+    if git_info:
+        for g in git_info:
+            packed[g] = git_info[g]
+
     return packed
 
 
@@ -794,7 +799,8 @@ class Runner(Process):
                  intermediate_output_ttl=0, merged_map=None,
                  priority=None, secret_store=None,
                  collection_cache_size=256,
-                 collection_cache_is_default=True):
+                 collection_cache_is_default=True,
+                 git_info=None):
 
         loadingContext = loadingContext.copy()
         loadingContext.metadata = updated_tool.metadata.copy()
@@ -823,6 +829,7 @@ class Runner(Process):
         self.priority = priority
         self.secret_store = secret_store
         self.enable_dev = loadingContext.enable_dev
+        self.git_info = git_info
 
         self.submit_runner_cores = 1
         self.submit_runner_ram = 1024  # defaut 1 GiB

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list