[arvados] created: 2.5.0-230-g5d73c8c82
git repository hosting
git at public.arvados.org
Sun Mar 5 04:03:42 UTC 2023
at 5d73c8c825577b65d48b9dec05428f3db104efb2 (commit)
commit 5d73c8c825577b65d48b9dec05428f3db104efb2
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Sat Mar 4 23:03:19 2023 -0500
19975: Add OutOfMemoryRetry extension
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 52a9a6c20..74ca9312b 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -285,6 +285,7 @@ def add_arv_hints():
"http://arvados.org/cwl#UsePreemptible",
"http://arvados.org/cwl#OutputCollectionProperties",
"http://arvados.org/cwl#KeepCacheTypeRequirement",
+ "http://arvados.org/cwl#OutOfMemoryRetry",
])
def exit_signal_handler(sigcode, frame):
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
index fc370eb81..91a05e125 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
@@ -456,3 +456,30 @@ $graph:
only or written to disk and memory-mapped. The disk cache
leverages the kernel's virtual memory system so "hot" data will
generally still be kept in RAM.
+
+- name: OutOfMemoryRetry
+ type: record
+ extends: cwl:ProcessRequirement
+ inVocab: false
+ doc: |
+ Detect when a failed tool run may have run out of memory, and
+ re-submit the container with more RAM.
+ fields:
+ - name: class
+ type: string
+ doc: "Always 'arv:OutOfMemoryRetry'"
+ jsonldPredicate:
+ _id: "@type"
+ _type: "@vocab"
+ - name: memoryErrorRegex
+ type: string?
+ doc: |
+ A regular expression that will be used on the text of stdout
+ and stderr produced by the tool to determine if a failed job
+ should be retried with more RAM. By default, searches for the
+ substrings 'bad_alloc' and 'OutOfMemory'.
+ - name: memoryRetryMultipler
+ type: float
+ doc: |
+ If the container failed on its first run, re-submit the
+ container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
index 69c0ed6cf..458d5a37a 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
@@ -399,3 +399,30 @@ $graph:
only or written to disk and memory-mapped. The disk cache
leverages the kernel's virtual memory system so "hot" data will
generally still be kept in RAM.
+
+- name: OutOfMemoryRetry
+ type: record
+ extends: cwl:ProcessRequirement
+ inVocab: false
+ doc: |
+ Detect when a failed tool run may have run out of memory, and
+ re-submit the container with more RAM.
+ fields:
+ - name: class
+ type: string
+ doc: "Always 'arv:OutOfMemoryRetry'"
+ jsonldPredicate:
+ _id: "@type"
+ _type: "@vocab"
+ - name: memoryErrorRegex
+ type: string?
+ doc: |
+ A regular expression that will be used on the text of stdout
+ and stderr produced by the tool to determine if a failed job
+ should be retried with more RAM. By default, searches for the
+ substrings 'bad_alloc' and 'OutOfMemory'.
+ - name: memoryRetryMultipler
+ type: float
+ doc: |
+ If the container failed on its first run, re-submit the
+ container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
index 86cd06eff..f4246ed70 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
@@ -401,3 +401,31 @@ $graph:
only or written to disk and memory-mapped. The disk cache
leverages the kernel's virtual memory system so "hot" data will
generally still be kept in RAM.
+
+
+- name: OutOfMemoryRetry
+ type: record
+ extends: cwl:ProcessRequirement
+ inVocab: false
+ doc: |
+ Detect when a failed tool run may have run out of memory, and
+ re-submit the container with more RAM.
+ fields:
+ - name: class
+ type: string
+ doc: "Always 'arv:OutOfMemoryRetry'"
+ jsonldPredicate:
+ _id: "@type"
+ _type: "@vocab"
+ - name: memoryErrorRegex
+ type: string?
+ doc: |
+ A regular expression that will be used on the text of stdout
+ and stderr produced by the tool to determine if a failed job
+ should be retried with more RAM. By default, searches for the
+ substrings 'bad_alloc' and 'OutOfMemory'.
+ - name: memoryRetryMultipler
+ type: float
+ doc: |
+ If the container failed on its first run, re-submit the
+ container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index e828b16d3..632e171b4 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -367,6 +367,12 @@ class ArvadosContainer(JobBase):
logger.warning("%s API revision is %s, revision %s is required to support setting properties on output collections.",
self.arvrunner.label(self), self.arvrunner.api._rootDesc["revision"], "20220510")
+ ramMultiplier = [1]
+
+ oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+ if oom_retry_req and oom_retry_req.get('memoryRetryMultipler'):
+ ramMultiplier.append(oom_retry_req.get('memoryRetryMultipler'))
+
if runtimeContext.runnerjob.startswith("arvwf:"):
wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]
wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries)
@@ -380,7 +386,7 @@ class ArvadosContainer(JobBase):
try:
ram = runtime_constraints["ram"]
- for i in range(1, 4):
+ for i in ramMultiplier:
runtime_constraints["ram"] = ram * i
if runtimeContext.submit_request_uuid:
@@ -400,7 +406,7 @@ class ArvadosContainer(JobBase):
break
if response["container_uuid"] is None:
- runtime_constraints["ram"] = ram * (self.attempt_count+1)
+ runtime_constraints["ram"] = ram * ramMultiplier[self.attempt_count]
container_request["state"] = "Committed"
response = self.arvrunner.api.container_requests().update(
@@ -423,6 +429,14 @@ class ArvadosContainer(JobBase):
self.output_callback({}, "permanentFail")
def out_of_memory_retry(self, record, container):
+ oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+ if oom_retry_req is None:
+ return False
+
+ # Sometimes it gets killed with no warning
+ if container["exit_code"] == 137:
+ return True
+
logc = arvados.collection.CollectionReader(record["log_uuid"],
api_client=self.arvrunner.api,
keep_client=self.arvrunner.keep_client,
@@ -434,14 +448,9 @@ class ArvadosContainer(JobBase):
done.logtail(logc, callback, "", maxlen=1000)
- # Check OOM killed
- oom_matches = r'container using over 9.% of memory'
- if container["exit_code"] == 137 and re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
- return True
-
# Check allocation failure
- bad_alloc_matches = r'(bad_alloc|out ?of ?memory)'
- if re.search(bad_alloc_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+ oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|container using over 9.% of memory)'
+ if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
return True
return False
@@ -466,7 +475,7 @@ class ArvadosContainer(JobBase):
else:
processStatus = "permanentFail"
- if processStatus == "permanentFail" and self.out_of_memory_retry(record, container):
+ if processStatus == "permanentFail" and self.attempt_count == 1 and self.out_of_memory_retry(record, container):
logger.info("%s Container failed with out of memory error, retrying with more RAM.",
self.arvrunner.label(self))
self.job_runtime.submit_request_uuid = None
commit fe18471f4f1a04af7444d2162cab16eef2685677
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Sat Mar 4 17:46:37 2023 -0500
19975: Retrying RAM wip
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 4d0fde744..e828b16d3 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -374,7 +374,8 @@ class ArvadosContainer(JobBase):
container_request["name"] = wfrecord["name"]
container_request["properties"]["template_uuid"] = wfuuid
- self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
+ if self.attempt_count == 0:
+ self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
try:
ram = runtime_constraints["ram"]
@@ -422,9 +423,6 @@ class ArvadosContainer(JobBase):
self.output_callback({}, "permanentFail")
def out_of_memory_retry(self, record, container):
- if container["exit_code"] == 137:
- return True
-
logc = arvados.collection.CollectionReader(record["log_uuid"],
api_client=self.arvrunner.api,
keep_client=self.arvrunner.keep_client,
@@ -434,15 +432,16 @@ class ArvadosContainer(JobBase):
def callback(v1, v2, v3):
loglines[0] = v3
- done.logtail(logc, callback, "", maxlen=200)
-
- oom_matches = r'(bad_alloc|out ?of ?memory|Container using over 95% of memory)'
+ done.logtail(logc, callback, "", maxlen=1000)
- print("Checking loglines", loglines[0])
-
- print("Match", re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE))
+ # Check OOM killed
+ oom_matches = r'container using over 9.% of memory'
+ if container["exit_code"] == 137 and re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+ return True
- if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+ # Check allocation failure
+ bad_alloc_matches = r'(bad_alloc|out ?of ?memory)'
+ if re.search(bad_alloc_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
return True
return False
@@ -471,6 +470,7 @@ class ArvadosContainer(JobBase):
logger.info("%s Container failed with out of memory error, retrying with more RAM.",
self.arvrunner.label(self))
self.job_runtime.submit_request_uuid = None
+ self.uuid = None
self.run(None)
retried = True
return
commit 39fc2f223fae40dc4fb160758e76ca39304b44af
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Sat Mar 4 17:25:17 2023 -0500
19975: Initial work to auto-retry on OOM
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 742906c61..4d0fde744 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -15,6 +15,7 @@ import datetime
import ciso8601
import uuid
import math
+import re
import arvados_cwl.util
import ruamel.yaml
@@ -56,6 +57,7 @@ class ArvadosContainer(JobBase):
self.job_runtime = job_runtime
self.running = False
self.uuid = None
+ self.attempt_count = 0
def update_pipeline_component(self, r):
pass
@@ -88,7 +90,7 @@ class ArvadosContainer(JobBase):
container_request["output_path"] = self.outdir
container_request["cwd"] = self.outdir
container_request["priority"] = runtimeContext.priority
- container_request["state"] = "Committed"
+ container_request["state"] = "Uncommitted"
container_request.setdefault("properties", {})
container_request["properties"]["cwl_input"] = self.joborder
@@ -375,20 +377,40 @@ class ArvadosContainer(JobBase):
self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
try:
- if runtimeContext.submit_request_uuid:
- response = self.arvrunner.api.container_requests().update(
- uuid=runtimeContext.submit_request_uuid,
- body=container_request,
- **extra_submit_params
- ).execute(num_retries=self.arvrunner.num_retries)
- else:
- response = self.arvrunner.api.container_requests().create(
- body=container_request,
- **extra_submit_params
- ).execute(num_retries=self.arvrunner.num_retries)
+ ram = runtime_constraints["ram"]
+
+ for i in range(1, 4):
+ runtime_constraints["ram"] = ram * i
+
+ if runtimeContext.submit_request_uuid:
+ response = self.arvrunner.api.container_requests().update(
+ uuid=runtimeContext.submit_request_uuid,
+ body=container_request,
+ **extra_submit_params
+ ).execute(num_retries=self.arvrunner.num_retries)
+ else:
+ response = self.arvrunner.api.container_requests().create(
+ body=container_request,
+ **extra_submit_params
+ ).execute(num_retries=self.arvrunner.num_retries)
+ runtimeContext.submit_request_uuid = response["uuid"]
+
+ if response["container_uuid"] is not None:
+ break
+
+ if response["container_uuid"] is None:
+ runtime_constraints["ram"] = ram * (self.attempt_count+1)
+
+ container_request["state"] = "Committed"
+ response = self.arvrunner.api.container_requests().update(
+ uuid=runtimeContext.submit_request_uuid,
+ body=container_request,
+ **extra_submit_params
+ ).execute(num_retries=self.arvrunner.num_retries)
self.uuid = response["uuid"]
self.arvrunner.process_submitted(self)
+ self.attempt_count += 1
if response["state"] == "Final":
logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"])
@@ -399,8 +421,35 @@ class ArvadosContainer(JobBase):
logger.debug("Container request was %s", container_request)
self.output_callback({}, "permanentFail")
+ def out_of_memory_retry(self, record, container):
+ if container["exit_code"] == 137:
+ return True
+
+ logc = arvados.collection.CollectionReader(record["log_uuid"],
+ api_client=self.arvrunner.api,
+ keep_client=self.arvrunner.keep_client,
+ num_retries=self.arvrunner.num_retries)
+
+ loglines = [""]
+ def callback(v1, v2, v3):
+ loglines[0] = v3
+
+ done.logtail(logc, callback, "", maxlen=200)
+
+ oom_matches = r'(bad_alloc|out ?of ?memory|Container using over 95% of memory)'
+
+ print("Checking loglines", loglines[0])
+
+ print("Match", re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE))
+
+ if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+ return True
+
+ return False
+
def done(self, record):
outputs = {}
+ retried = False
try:
container = self.arvrunner.api.containers().get(
uuid=record["container_uuid"]
@@ -418,6 +467,14 @@ class ArvadosContainer(JobBase):
else:
processStatus = "permanentFail"
+ if processStatus == "permanentFail" and self.out_of_memory_retry(record, container):
+ logger.info("%s Container failed with out of memory error, retrying with more RAM.",
+ self.arvrunner.label(self))
+ self.job_runtime.submit_request_uuid = None
+ self.run(None)
+ retried = True
+ return
+
if rcode == 137:
logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin'.",
self.arvrunner.label(self))
@@ -464,7 +521,8 @@ class ArvadosContainer(JobBase):
logger.exception("%s while getting output object:", self.arvrunner.label(self))
processStatus = "permanentFail"
finally:
- self.output_callback(outputs, processStatus)
+ if not retried:
+ self.output_callback(outputs, processStatus)
class RunnerContainer(Runner):
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list