[arvados] created: 2.5.0-230-g5d73c8c82

git repository hosting git at public.arvados.org
Sun Mar 5 04:03:42 UTC 2023


        at  5d73c8c825577b65d48b9dec05428f3db104efb2 (commit)


commit 5d73c8c825577b65d48b9dec05428f3db104efb2
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Sat Mar 4 23:03:19 2023 -0500

    19975: Add OutOfMemoryRetry extension
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 52a9a6c20..74ca9312b 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -285,6 +285,7 @@ def add_arv_hints():
         "http://arvados.org/cwl#UsePreemptible",
         "http://arvados.org/cwl#OutputCollectionProperties",
         "http://arvados.org/cwl#KeepCacheTypeRequirement",
+        "http://arvados.org/cwl#OutOfMemoryRetry",
     ])
 
 def exit_signal_handler(sigcode, frame):
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
index fc370eb81..91a05e125 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
@@ -456,3 +456,30 @@ $graph:
         only or written to disk and memory-mapped.  The disk cache
         leverages the kernel's virtual memory system so "hot" data will
         generally still be kept in RAM.
+
+- name: OutOfMemoryRetry
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Detect when a failed tool run may have run out of memory, and
+    re-submit the container with more RAM.
+  fields:
+    - name: class
+      type: string
+      doc: "'arv:OutOfMemoryRetry"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: memoryErrorRegex
+      type: string?
+      doc: |
+        A regular expression that will be matched against the text of
+        stdout and stderr produced by the tool to determine whether a
+        failed run should be retried with more RAM.  The default matches
+        'bad_alloc', 'out of memory', and 'container using over 90% of memory'.
+    - name: memoryRetryMultiplier
+      type: float
+      doc: |
+        If the container failed on its first run, re-submit the
+        container with the RAM request multiplied by this factor.
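
The same OutOfMemoryRetry record is added to the v1.0, v1.1 and v1.2 schema
files below.  For context, a tool opts in via a hint; a minimal sketch of the
relevant portion of a CommandLineTool (the multiplier, regex value and RAM
figure are made up for illustration; only the hint name and its two fields
come from the schema above):

    $namespaces:
      arv: "http://arvados.org/cwl#"
    hints:
      arv:OutOfMemoryRetry:
        # Re-submit once with 2x the requested RAM if the first attempt
        # fails with an out-of-memory signature.
        memoryRetryMultiplier: 2
        # Optional: override the default out-of-memory log signature.
        memoryErrorRegex: "(bad_alloc|MemoryError)"
      ResourceRequirement:
        ramMin: 4096
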
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
index 69c0ed6cf..458d5a37a 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
@@ -399,3 +399,30 @@ $graph:
         only or written to disk and memory-mapped.  The disk cache
         leverages the kernel's virtual memory system so "hot" data will
         generally still be kept in RAM.
+
+- name: OutOfMemoryRetry
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Detect when a failed tool run may have run out of memory, and
+    re-submit the container with more RAM.
+  fields:
+    - name: class
+      type: string
+      doc: "'arv:OutOfMemoryRetry"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: memoryErrorRegex
+      type: string?
+      doc: |
+        A regular expression that will be matched against the text of
+        stdout and stderr produced by the tool to determine whether a
+        failed run should be retried with more RAM.  The default matches
+        'bad_alloc', 'out of memory', and 'container using over 90% of memory'.
+    - name: memoryRetryMultiplier
+      type: float
+      doc: |
+        If the container failed on its first run, re-submit the
+        container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
index 86cd06eff..f4246ed70 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
@@ -401,3 +401,31 @@ $graph:
         only or written to disk and memory-mapped.  The disk cache
         leverages the kernel's virtual memory system so "hot" data will
         generally still be kept in RAM.
+
+
+- name: OutOfMemoryRetry
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Detect when a failed tool run may have run out of memory, and
+    re-submit the container with more RAM.
+  fields:
+    - name: class
+      type: string
+      doc: "'arv:OutOfMemoryRetry"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: memoryErrorRegex
+      type: string?
+      doc: |
+        A regular expression that will be matched against the text of
+        stdout and stderr produced by the tool to determine whether a
+        failed run should be retried with more RAM.  The default matches
+        'bad_alloc', 'out of memory', and 'container using over 90% of memory'.
+    - name: memoryRetryMultiplier
+      type: float
+      doc: |
+        If the container failed on its first run, re-submit the
+        container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index e828b16d3..632e171b4 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -367,6 +367,12 @@ class ArvadosContainer(JobBase):
                 logger.warning("%s API revision is %s, revision %s is required to support setting properties on output collections.",
                                self.arvrunner.label(self), self.arvrunner.api._rootDesc["revision"], "20220510")
 
+        ramMultiplier = [1]
+
+        oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+        if oom_retry_req and oom_retry_req.get('memoryRetryMultiplier'):
+            ramMultiplier.append(oom_retry_req.get('memoryRetryMultiplier'))
+
         if runtimeContext.runnerjob.startswith("arvwf:"):
             wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]
             wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries)
@@ -380,7 +386,7 @@ class ArvadosContainer(JobBase):
         try:
             ram = runtime_constraints["ram"]
 
-            for i in range(1, 4):
+            for i in ramMultiplier:
                 runtime_constraints["ram"] = ram * i
 
                 if runtimeContext.submit_request_uuid:
@@ -400,7 +406,7 @@ class ArvadosContainer(JobBase):
                     break
 
             if response["container_uuid"] is None:
-                runtime_constraints["ram"] = ram * (self.attempt_count+1)
+                runtime_constraints["ram"] = ram * ramMultiplier[self.attempt_count]
 
             container_request["state"] = "Committed"
             response = self.arvrunner.api.container_requests().update(
@@ -423,6 +429,14 @@ class ArvadosContainer(JobBase):
             self.output_callback({}, "permanentFail")
 
     def out_of_memory_retry(self, record, container):
+        oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+        if oom_retry_req is None:
+            return False
+
+        # Exit code 137 (SIGKILL) with nothing in the logs usually means the kernel OOM killer acted.
+        if container["exit_code"] == 137:
+            return True
+
         logc = arvados.collection.CollectionReader(record["log_uuid"],
                                                    api_client=self.arvrunner.api,
                                                    keep_client=self.arvrunner.keep_client,
@@ -434,14 +448,9 @@ class ArvadosContainer(JobBase):
 
         done.logtail(logc, callback, "", maxlen=1000)
 
-        # Check OOM killed
-        oom_matches = r'container using over 9.% of memory'
-        if container["exit_code"] == 137 and re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
-            return True
-
         # Check allocation failure
-        bad_alloc_matches = r'(bad_alloc|out ?of ?memory)'
-        if re.search(bad_alloc_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+        oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|container using over 9.% of memory)'
+        if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
             return True
 
         return False
@@ -466,7 +475,7 @@ class ArvadosContainer(JobBase):
                 else:
                     processStatus = "permanentFail"
 
-                if processStatus == "permanentFail" and self.out_of_memory_retry(record, container):
+                if processStatus == "permanentFail" and self.attempt_count == 1 and self.out_of_memory_retry(record, container):
                     logger.info("%s Container failed with out of memory error, retrying with more RAM.",
                                  self.arvrunner.label(self))
                     self.job_runtime.submit_request_uuid = None
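
Taken together, this commit replaces the unconditional 1x/2x/3x RAM escalation
from the earlier WIP commits with an opt-in multiplier list, and gates the
retry on the hint being present.  A rough standalone sketch of the resulting
decision logic (plain Python; the dicts stand in for the hint and container
records, this is not the Arvados API):

    import re

    DEFAULT_OOM_REGEX = r'(bad_alloc|out ?of ?memory|container using over 9.% of memory)'

    def ram_multipliers(oom_retry_req):
        # The first attempt always runs at the requested RAM (factor 1); a
        # second attempt is only planned when the hint supplies a multiplier.
        multipliers = [1]
        if oom_retry_req and oom_retry_req.get('memoryRetryMultiplier'):
            multipliers.append(oom_retry_req.get('memoryRetryMultiplier'))
        return multipliers

    def looks_like_oom(oom_retry_req, exit_code, logtail):
        # Retry is only ever considered when the hint is present.
        if oom_retry_req is None:
            return False
        if exit_code == 137:
            # SIGKILL with no other evidence: assume the kernel OOM killer.
            return True
        pattern = oom_retry_req.get('memoryErrorRegex') or DEFAULT_OOM_REGEX
        return re.search(pattern, logtail, re.IGNORECASE | re.MULTILINE) is not None

So a tool that exits nonzero with 'std::bad_alloc' in its log tail, and that
carries a hint of {'memoryRetryMultiplier': 2}, gets exactly one more attempt
at twice the RAM; without the hint, looks_like_oom() never fires.
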

commit fe18471f4f1a04af7444d2162cab16eef2685677
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Sat Mar 4 17:46:37 2023 -0500

    19975: Retrying RAM wip
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 4d0fde744..e828b16d3 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -374,7 +374,8 @@ class ArvadosContainer(JobBase):
                 container_request["name"] = wfrecord["name"]
             container_request["properties"]["template_uuid"] = wfuuid
 
-        self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
+        if self.attempt_count == 0:
+            self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
 
         try:
             ram = runtime_constraints["ram"]
@@ -422,9 +423,6 @@ class ArvadosContainer(JobBase):
             self.output_callback({}, "permanentFail")
 
     def out_of_memory_retry(self, record, container):
-        if container["exit_code"] == 137:
-            return True
-
         logc = arvados.collection.CollectionReader(record["log_uuid"],
                                                    api_client=self.arvrunner.api,
                                                    keep_client=self.arvrunner.keep_client,
@@ -434,15 +432,16 @@ class ArvadosContainer(JobBase):
         def callback(v1, v2, v3):
             loglines[0] = v3
 
-        done.logtail(logc, callback, "", maxlen=200)
-
-        oom_matches = r'(bad_alloc|out ?of ?memory|Container using over 95% of memory)'
+        done.logtail(logc, callback, "", maxlen=1000)
 
-        print("Checking loglines", loglines[0])
-
-        print("Match", re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE))
+        # Check OOM killed
+        oom_matches = r'container using over 9.% of memory'
+        if container["exit_code"] == 137 and re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+            return True
 
-        if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+        # Check allocation failure
+        bad_alloc_matches = r'(bad_alloc|out ?of ?memory)'
+        if re.search(bad_alloc_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
             return True
 
         return False
@@ -471,6 +470,7 @@ class ArvadosContainer(JobBase):
                     logger.info("%s Container failed with out of memory error, retrying with more RAM.",
                                  self.arvrunner.label(self))
                     self.job_runtime.submit_request_uuid = None
+                    self.uuid = None
                     self.run(None)
                     retried = True
                     return
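
Note the tightened detection relative to the previous commit: an exit code of
137 no longer triggers a retry on its own, it must now coincide with the
memory-pressure warning in the log tail, and the tail window grows from 200
to 1000 lines.  The '.' in 'over 9.%' is a single-character wildcard, so the
pattern covers the whole 90-99% range of the warning.  A quick check in plain
Python, independent of Arvados:

    import re

    pattern = r'container using over 9.% of memory'
    for line in ('Container using over 95% of memory',
                 'Container using over 90% of memory',
                 'Container using over 85% of memory'):
        print(line, '->', bool(re.search(pattern, line, re.IGNORECASE)))
    # Prints True, True, False: only the 90-99% warnings match.
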

commit 39fc2f223fae40dc4fb160758e76ca39304b44af
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Sat Mar 4 17:25:17 2023 -0500

    19975: Initial work to auto-retry on OOM
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 742906c61..4d0fde744 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -15,6 +15,7 @@ import datetime
 import ciso8601
 import uuid
 import math
+import re
 
 import arvados_cwl.util
 import ruamel.yaml
@@ -56,6 +57,7 @@ class ArvadosContainer(JobBase):
         self.job_runtime = job_runtime
         self.running = False
         self.uuid = None
+        self.attempt_count = 0
 
     def update_pipeline_component(self, r):
         pass
@@ -88,7 +90,7 @@ class ArvadosContainer(JobBase):
         container_request["output_path"] = self.outdir
         container_request["cwd"] = self.outdir
         container_request["priority"] = runtimeContext.priority
-        container_request["state"] = "Committed"
+        container_request["state"] = "Uncommitted"
         container_request.setdefault("properties", {})
 
         container_request["properties"]["cwl_input"] = self.joborder
@@ -375,20 +377,40 @@ class ArvadosContainer(JobBase):
         self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
 
         try:
-            if runtimeContext.submit_request_uuid:
-                response = self.arvrunner.api.container_requests().update(
-                    uuid=runtimeContext.submit_request_uuid,
-                    body=container_request,
-                    **extra_submit_params
-                ).execute(num_retries=self.arvrunner.num_retries)
-            else:
-                response = self.arvrunner.api.container_requests().create(
-                    body=container_request,
-                    **extra_submit_params
-                ).execute(num_retries=self.arvrunner.num_retries)
+            ram = runtime_constraints["ram"]
+
+            for i in range(1, 4):
+                runtime_constraints["ram"] = ram * i
+
+                if runtimeContext.submit_request_uuid:
+                    response = self.arvrunner.api.container_requests().update(
+                        uuid=runtimeContext.submit_request_uuid,
+                        body=container_request,
+                        **extra_submit_params
+                    ).execute(num_retries=self.arvrunner.num_retries)
+                else:
+                    response = self.arvrunner.api.container_requests().create(
+                        body=container_request,
+                        **extra_submit_params
+                    ).execute(num_retries=self.arvrunner.num_retries)
+                    runtimeContext.submit_request_uuid = response["uuid"]
+
+                if response["container_uuid"] is not None:
+                    break
+
+            if response["container_uuid"] is None:
+                runtime_constraints["ram"] = ram * (self.attempt_count+1)
+
+            container_request["state"] = "Committed"
+            response = self.arvrunner.api.container_requests().update(
+                uuid=runtimeContext.submit_request_uuid,
+                body=container_request,
+                **extra_submit_params
+            ).execute(num_retries=self.arvrunner.num_retries)
 
             self.uuid = response["uuid"]
             self.arvrunner.process_submitted(self)
+            self.attempt_count += 1
 
             if response["state"] == "Final":
                 logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"])
@@ -399,8 +421,35 @@ class ArvadosContainer(JobBase):
             logger.debug("Container request was %s", container_request)
             self.output_callback({}, "permanentFail")
 
+    def out_of_memory_retry(self, record, container):
+        if container["exit_code"] == 137:
+            return True
+
+        logc = arvados.collection.CollectionReader(record["log_uuid"],
+                                                   api_client=self.arvrunner.api,
+                                                   keep_client=self.arvrunner.keep_client,
+                                                   num_retries=self.arvrunner.num_retries)
+
+        loglines = [""]
+        def callback(v1, v2, v3):
+            loglines[0] = v3
+
+        done.logtail(logc, callback, "", maxlen=200)
+
+        oom_matches = r'(bad_alloc|out ?of ?memory|Container using over 95% of memory)'
+
+        print("Checking loglines", loglines[0])
+
+        print("Match", re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE))
+
+        if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+            return True
+
+        return False
+
     def done(self, record):
         outputs = {}
+        retried = False
         try:
             container = self.arvrunner.api.containers().get(
                 uuid=record["container_uuid"]
@@ -418,6 +467,14 @@ class ArvadosContainer(JobBase):
                 else:
                     processStatus = "permanentFail"
 
+                if processStatus == "permanentFail" and self.out_of_memory_retry(record, container):
+                    logger.info("%s Container failed with out of memory error, retrying with more RAM.",
+                                 self.arvrunner.label(self))
+                    self.job_runtime.submit_request_uuid = None
+                    self.run(None)
+                    retried = True
+                    return
+
                 if rcode == 137:
                     logger.warning("%s Container may have been killed for using too much RAM.  Try resubmitting with a higher 'ramMin'.",
                                  self.arvrunner.label(self))
@@ -464,7 +521,8 @@ class ArvadosContainer(JobBase):
             logger.exception("%s while getting output object:", self.arvrunner.label(self))
             processStatus = "permanentFail"
         finally:
-            self.output_callback(outputs, processStatus)
+            if not retried:
+                self.output_callback(outputs, processStatus)
 
 
 class RunnerContainer(Runner):
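
The core change in this initial commit is the two-phase submission: the
container request is first created "Uncommitted", the RAM constraint is
escalated until the API reports a schedulable container, and only then is the
request flipped to "Committed".  A condensed sketch of that pattern (the api
object stands in for an Arvados API client; retries and error handling are
omitted, and runtime_constraints is assumed to be nested in the request body):

    def submit_with_ram_escalation(api, container_request, base_ram, factors=(1, 2, 3)):
        # Phase 1: probe schedulability with an Uncommitted request,
        # raising the RAM constraint until a container can be assigned.
        container_request["state"] = "Uncommitted"
        request_uuid = None
        for factor in factors:
            container_request["runtime_constraints"]["ram"] = base_ram * factor
            if request_uuid:
                response = api.container_requests().update(
                    uuid=request_uuid, body=container_request).execute()
            else:
                response = api.container_requests().create(
                    body=container_request).execute()
                request_uuid = response["uuid"]
            if response["container_uuid"] is not None:
                break

        # Phase 2: commit the request so it is actually scheduled to run.
        container_request["state"] = "Committed"
        return api.container_requests().update(
            uuid=request_uuid, body=container_request).execute()
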
