[arvados] created: 2.6.0-505-gc0c3828dcb
git repository hosting
git at public.arvados.org
Tue Jan 2 18:39:24 UTC 2024
at c0c3828dcb8d8a396d8183bb2dcca60eda56bcdf (commit)
commit c0c3828dcb8d8a396d8183bb2dcca60eda56bcdf
Author: Alex Coleman <alex.coleman at curii.com>
Date: Tue Jan 2 13:36:23 2024 -0500
19982: Adding initial code
Adding initial code in arvcontainer.py, adding first draft of test, and adding new hint.
Arvados-DCO-1.1-Signed-off-by: Alex Coleman <alex.coleman at curii.com>
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 8108934aae..e90de431ff 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -290,6 +290,7 @@ def add_arv_hints():
"http://arvados.org/cwl#OutputCollectionProperties",
"http://arvados.org/cwl#KeepCacheTypeRequirement",
"http://arvados.org/cwl#OutOfMemoryRetry",
+ "http://arvados.org/cwl#SpotInstanceRetry",
])
def exit_signal_handler(sigcode, frame):
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index a94fdac522..4df20f10e3 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -429,6 +429,17 @@ class ArvadosContainer(JobBase):
logger.debug("Container request was %s", container_request)
self.output_callback({}, "permanentFail")
+
+ def spot_instance_retry(self, record, container):
+     """Decide whether a failed container should be retried as nonpreemptible.
+
+     Returns True only when the http://arvados.org/cwl#SpotInstanceRetry
+     hint is present on this job and the container record carries a truthy
+     "preemptionNotice" entry; otherwise returns False.
+
+     record is accepted so the signature matches out_of_memory_retry; it is
+     not consulted here.
+     """
+     spot_instance_retry_req, _ = self.get_requirement("http://arvados.org/cwl#SpotInstanceRetry")
+     if spot_instance_retry_req is None:
+         return False
+     # Use .get() so a container record without the key is treated as
+     # "not preempted" instead of raising KeyError.
+     return bool(container.get("preemptionNotice"))
+
+
+
def out_of_memory_retry(self, record, container):
oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
if oom_retry_req is None:
@@ -485,7 +496,12 @@ class ArvadosContainer(JobBase):
self.run(None)
retried = True
return
-
+ if processStatus == "permanentFail" and self.attempt_count == 1 and self.spot_instance_retry(record, container):
+     # Only the first attempt is retried. The "%s" placeholder needs the
+     # job label as its argument, otherwise the literal "%s" is logged.
+     logger.warning("%s Container failed with preemptible instance reclaimed, trying again nonpreemptible",
+                    self.arvrunner.label(self))
+     # Clear enable_preemptible before running the job again.
+     self.job_runtime.enable_preemptible = False
+     self.run(None)
+     retried = True
+     return
if rcode == 137:
logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.",
self.arvrunner.label(self))
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
index a2f404d7eb..fbd3ef54d8 100644
--- a/sdk/cwl/tests/test_container.py
+++ b/sdk/cwl/tests/test_container.py
@@ -579,7 +579,7 @@ class TestContainer(unittest.TestCase):
self.fail("RuntimeStatusLoggingHandler should not be called recursively")
- # Test to make sure that an exception raised from
+ # Test to make sure that an exception raised from
# get_current_container doesn't cause the logger to raise an
# exception
@mock.patch("arvados_cwl.util.get_current_container")
@@ -1708,3 +1708,26 @@ class TestWorkflow(unittest.TestCase):
api._rootDesc = copy.deepcopy(get_rootDesc())
runner = arvados_cwl.executor.ArvCwlExecutor(api)
self.assertEqual(runner.work_api, 'containers')
+
+ @mock.patch("arvados.collection.Collection")
+ def test_spot_instance_retry(self, collection_mock):
+     """Draft test: a preempted container should be rerun as nonpreemptible.
+
+     collection_mock is the patched arvados.collection.Collection that
+     mock.patch passes in as an extra positional argument; without the
+     parameter the test would raise TypeError before running.
+     """
+     arvados_cwl.add_arv_hints()
+
+     # TODO: add the SpotInstanceRetry hint to the tool under test
+
+     api = mock.MagicMock()
+
+     runner = mock.MagicMock()
+     runner.api = api
+     runner.num_retries = 0
+     runner.ignore_docker_for_reuse = False
+     runner.intermediate_output_ttl = 0
+     runner.secret_store = cwltool.secrets.SecretStore()
+
+     # Container record returned by the mocked API lookup.
+     runner.api.containers().get().execute.return_value = {
+         "state": "Complete",
+         "output": "abc+123",
+         "exit_code": 137
+     }
+     # Still a placeholder: fail with a message rather than a bare
+     # "assert False", which is stripped under -O and explains nothing.
+     self.fail("TODO: add assertions to make sure it reran as nonpreemptible")
\ No newline at end of file
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list