[ARVADOS] created: 1.2.0-9-gcb2a05a42
Git user
git at public.curoverse.com
Wed Sep 5 16:19:56 EDT 2018
at cb2a05a424c0fc4bb05d7ad171cd220a0507150e (commit)
commit cb2a05a424c0fc4bb05d7ad171cd220a0507150e
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Wed Sep 5 16:41:29 2018 -0300
13773: Containers & CRs show "Failing" when a child job failed.
Also, in the detailed view, an error panel is displayed with the error message
and its detail, if any.
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/apps/workbench/app/models/container_work_unit.rb b/apps/workbench/app/models/container_work_unit.rb
index 7f1052ebc..94b416173 100644
--- a/apps/workbench/app/models/container_work_unit.rb
+++ b/apps/workbench/app/models/container_work_unit.rb
@@ -117,12 +117,31 @@ class ContainerWorkUnit < ProxyWorkUnit
else
'Failed'
end
+ when 'Running'
+ if runtime_status[:error]
+ 'Failing'
+ else
+ state
+ end
else
- # Cancelled, Running, or Uncommitted (no container assigned)
+ # Cancelled, or Uncommitted (no container assigned)
state
end
end
+ def runtime_status
+ return get(:runtime_status, @container) || get(:runtime_status, @proxied)
+ end
+
+ def state_bootstrap_class
+ case state_label
+ when 'Failing'
+ 'danger'
+ else
+ super
+ end
+ end
+
def exit_code
get_combined(:exit_code)
end
diff --git a/apps/workbench/app/models/proxy_work_unit.rb b/apps/workbench/app/models/proxy_work_unit.rb
index 02f6b42fd..adf0bd7d6 100644
--- a/apps/workbench/app/models/proxy_work_unit.rb
+++ b/apps/workbench/app/models/proxy_work_unit.rb
@@ -278,7 +278,11 @@ class ProxyWorkUnit < WorkUnit
end
if is_failed?
- resp << " Check the Log tab for more detail about why it failed."
+ if runtime_status.andand[:error]
+ resp << " Check the error information below."
+ else
+ resp << " Check the Log tab for more detail about why it failed."
+ end
end
resp << "</p>"
diff --git a/apps/workbench/app/models/work_unit.rb b/apps/workbench/app/models/work_unit.rb
index f0c4230f2..493dd2f57 100644
--- a/apps/workbench/app/models/work_unit.rb
+++ b/apps/workbench/app/models/work_unit.rb
@@ -211,4 +211,8 @@ class WorkUnit
def template_uuid
# return the uuid of this work unit's template, if one exists
end
+
+ def runtime_status
+ # Returns this work unit's runtime_status, if any
+ end
end
diff --git a/apps/workbench/app/views/work_units/_show_component.html.erb b/apps/workbench/app/views/work_units/_show_component.html.erb
index 3bba31f78..d85329576 100644
--- a/apps/workbench/app/views/work_units/_show_component.html.erb
+++ b/apps/workbench/app/views/work_units/_show_component.html.erb
@@ -37,6 +37,28 @@ SPDX-License-Identifier: AGPL-3.0 %>
<% end %>
</div>
+<%# Display runtime error information %>
+<% if wu.runtime_status.andand[:error] %>
+<div class="container">
+ <div class="col-md-12">
+ <div class="panel panel-danger">
+ <div class="panel-heading">Error Information</div>
+ <div class="panel-body">
+ <%= wu.runtime_status[:error] %>
+ <%# Show collapsable detailed error information, if any %>
+ <% if wu.runtime_status[:errorDetail] %>
+ <a class="btn btn-sm btn-primary pull-right" data-toggle="collapse" data-target="#errorDetail">Toggle details</a>
+ <div class="clearfix"></div>
+ <div id="errorDetail" class="collapse">
+ <pre><%= wu.runtime_status[:errorDetail] %></pre>
+ </div>
+ <% end %>
+ </div>
+ </div>
+ </div>
+</div>
+<% end %>
+
<p>
<%= render(partial: 'work_units/component_detail', locals: {current_obj: wu}) %>
</p>
commit 05709dc092dccede4206b7cfe04f1574bc2299c4
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Thu Aug 23 17:28:53 2018 -0300
13773: Updates tests checking that runtime_status_error() is called when needed
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 2b4a5d935..7f56eac8c 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -216,7 +216,7 @@ http://doc.arvados.org/install/install-api-server.html#disable_api_methods
if not runtime_status.get('error'):
runtime_status.update({
'error': error_msg,
- 'errorDetail': error_log
+ 'errorDetail': error_log or "No error logs available"
})
# Further errors are only mentioned as a count
else:
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
index 69f3ae046..fcc9d1550 100644
--- a/sdk/cwl/tests/test_container.py
+++ b/sdk/cwl/tests/test_container.py
@@ -488,11 +488,67 @@ class TestContainer(unittest.TestCase):
})
self.assertFalse(api.collections().create.called)
+ self.assertFalse(runner.runtime_status_error.called)
arvjob.collect_outputs.assert_called_with("keep:abc+123")
arvjob.output_callback.assert_called_with({"out": "stuff"}, "success")
runner.add_intermediate_output.assert_called_with("zzzzz-4zz18-zzzzzzzzzzzzzz2")
+ @mock.patch("arvados_cwl.done.logtail")
+ @mock.patch("arvados.collection.CollectionReader")
+ @mock.patch("arvados.collection.Collection")
+ def test_child_failure(self, col, reader, logtail):
+ api = mock.MagicMock()
+
+ runner = mock.MagicMock()
+ runner.api = api
+ runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+ runner.num_retries = 0
+ runner.ignore_docker_for_reuse = False
+ runner.intermediate_output_ttl = 0
+ runner.secret_store = cwltool.secrets.SecretStore()
+ runner.label.return_value = '[container testjob]'
+
+ runner.api.containers().get().execute.return_value = {
+ "state":"Complete",
+ "output": "abc+123",
+ "exit_code": 1,
+ "log": "def+234"
+ }
+
+ col().open.return_value = []
+ logtail.return_value = 'some error detail'
+
+ arvjob = arvados_cwl.ArvadosContainer(runner,
+ mock.MagicMock(),
+ {},
+ None,
+ [],
+ [],
+ "testjob")
+ arvjob.output_callback = mock.MagicMock()
+ arvjob.collect_outputs = mock.MagicMock()
+ arvjob.successCodes = [0]
+ arvjob.outdir = "/var/spool/cwl"
+ arvjob.output_ttl = 3600
+ arvjob.collect_outputs.return_value = {"out": "stuff"}
+
+ arvjob.done({
+ "state": "Final",
+ "log_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1",
+ "output_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2",
+ "uuid": "zzzzz-xvhdp-zzzzzzzzzzzzzzz",
+ "container_uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz",
+ "modified_at": "2017-05-26T12:01:22Z"
+ })
+
+ runner.runtime_status_error.assert_called_with(
+ '[container testjob]',
+ 'zzzzz-xvhdp-zzzzzzzzzzzzzzz',
+ 'some error detail'
+ )
+ arvjob.output_callback.assert_called_with({"out": "stuff"}, "permanentFail")
+
# The test passes no builder.resources
# Hence the default resources will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
@mock.patch("arvados.commands.keepdocker.list_images_in_arv")
commit 43bb88d0e15c7dc257cc8b98d5862a1fa3681549
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Thu Aug 23 12:04:56 2018 -0300
13773: Updates runner container's runtime_status on child failures.
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 6edf00e7f..2b4a5d935 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -168,14 +168,13 @@ http://doc.arvados.org/install/install-api-server.html#disable_api_methods
with self.workflow_eval_lock:
if processStatus == "success":
logger.info("Overall process status is %s", processStatus)
- if self.pipeline:
- self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
- body={"state": "Complete"}).execute(num_retries=self.num_retries)
+ state = "Complete"
else:
logger.error("Overall process status is %s", processStatus)
- if self.pipeline:
- self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
- body={"state": "Failed"}).execute(num_retries=self.num_retries)
+ state = "Failed"
+ if self.pipeline:
+ self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
+ body={"state": state}).execute(num_retries=self.num_retries)
self.final_status = processStatus
self.final_output = out
self.workflow_eval_lock.notifyAll()
@@ -195,6 +194,53 @@ http://doc.arvados.org/install/install-api-server.html#disable_api_methods
self.task_queue.add(partial(j.done, record))
del self.processes[uuid]
+ def runtime_status_error(self, child_label, child_uuid, error_log):
+ """
+ Called from a failing child container. Records the first child error
+ on this runner's runtime_status field.
+ On subsequent errors, updates the 'error' key to show how many additional
+ failures happened.
+ """
+ error_msg = "%s %s failed" % (child_label, child_uuid)
+ logger.info(error_msg)
+ with self.workflow_eval_lock:
+ try:
+ current = self.api.containers().current().execute(num_retries=self.num_retries)
+ except ApiError as e:
+ # Status code 404 just means we're not running in a container.
+ if e.resp.status != 404:
+ logger.info("Getting current container: %s", e)
+ return
+ runtime_status = current.get('runtime_status', {})
+ # Save first fatal error
+ if not runtime_status.get('error'):
+ runtime_status.update({
+ 'error': error_msg,
+ 'errorDetail': error_log
+ })
+ # Further errors are only mentioned as a count
+ else:
+ error_msg = re.match(
+ r'^(.*failed)\s*\(?', runtime_status.get('error')).groups()[0]
+ more_failures = re.match(
+ r'.*\(.*(\d+) more\)', runtime_status.get('error'))
+ if more_failures:
+ failure_qty = int(more_failures.groups()[0])
+ runtime_status.update({
+ 'error': "%s (and %d more)" % (error_msg, failure_qty+1)
+ })
+ else:
+ runtime_status.update({
+ 'error': "%s (and 1 more)" % error_msg
+ })
+ try:
+ self.api.containers().update(uuid=current['uuid'],
+ body={
+ 'runtime_status': runtime_status,
+ }).execute(num_retries=self.num_retries)
+ except Exception as e:
+ logger.error("Updating runtime_status: %s", e)
+
def wrapped_callback(self, cb, obj, st):
with self.workflow_eval_lock:
cb(obj, st)
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 49c40b1da..d49b65002 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -320,7 +320,11 @@ class ArvadosContainer(JobBase):
api_client=self.arvrunner.api,
keep_client=self.arvrunner.keep_client,
num_retries=self.arvrunner.num_retries)
- done.logtail(logc, logger.error, "%s (%s) error log:" % (self.arvrunner.label(self), record["uuid"]), maxlen=40)
+ label = self.arvrunner.label(self)
+ error_log = done.logtail(
+ logc, logger.error,
+ "%s (%s) error log:" % (label, record["uuid"]), maxlen=40)
+ self.arvrunner.runtime_status_error(label, record["uuid"], error_log)
if record["output_uuid"]:
if self.arvrunner.trash_intermediate or self.arvrunner.intermediate_output_ttl:
diff --git a/sdk/cwl/arvados_cwl/done.py b/sdk/cwl/arvados_cwl/done.py
index 25efade2a..7f3cb36de 100644
--- a/sdk/cwl/arvados_cwl/done.py
+++ b/sdk/cwl/arvados_cwl/done.py
@@ -97,3 +97,4 @@ def logtail(logcollection, logfunc, header, maxlen=25):
logtxt = "\n ".join(l.strip() for l in loglines)
logfunc(header)
logfunc("\n %s", logtxt)
+ return logtxt
commit 40447d7fe7a73c6babd90fbb6d10f72f8de3a3e4
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Tue Aug 21 12:10:58 2018 -0300
13773: Add runtime_status to be accessible via API
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/services/api/app/controllers/arvados/v1/containers_controller.rb b/services/api/app/controllers/arvados/v1/containers_controller.rb
index 25cb0037a..65d8385ad 100644
--- a/services/api/app/controllers/arvados/v1/containers_controller.rb
+++ b/services/api/app/controllers/arvados/v1/containers_controller.rb
@@ -6,6 +6,7 @@ class Arvados::V1::ContainersController < ApplicationController
accept_attribute_as_json :environment, Hash
accept_attribute_as_json :mounts, Hash
accept_attribute_as_json :runtime_constraints, Hash
+ accept_attribute_as_json :runtime_status, Hash
accept_attribute_as_json :command, Array
accept_attribute_as_json :scheduling_parameters, Hash
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 0228eb2af..798124247 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -60,6 +60,7 @@ class Container < ArvadosModel
t.add :priority
t.add :progress
t.add :runtime_constraints
+ t.add :runtime_status
t.add :started_at
t.add :state
t.add :auth_uuid
commit 70eb5479023517624f1966791ccfaeb6d273017d
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Tue Aug 14 12:36:39 2018 -0300
13773: Expand test to prove that non-error runtime_status doesn't avoid reuse.
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/services/api/test/unit/container_test.rb b/services/api/test/unit/container_test.rb
index b1d35c55f..3d8bf841e 100644
--- a/services/api/test/unit/container_test.rb
+++ b/services/api/test/unit/container_test.rb
@@ -342,6 +342,7 @@ class ContainerTest < ActiveSupport::TestCase
progress: 0.1})
c_faster_started_first.update_attributes!({state: Container::Locked})
c_faster_started_first.update_attributes!({state: Container::Running,
+ runtime_status: {'warning' => 'This is not an error'},
progress: 0.15})
c_faster_started_second.update_attributes!({state: Container::Locked})
c_faster_started_second.update_attributes!({state: Container::Running,
commit fda792680a8fd21b1c80ea2a79b267381521935a
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Tue Aug 14 11:25:10 2018 -0300
13773: Fix reuse query & add tests
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 8c3f0eade..6edf00e7f 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -133,7 +133,7 @@ class ArvCwlRunner(object):
if arvargs.work_api is None:
raise Exception("No supported APIs")
else:
- raise Exception("Unsupported API '%s', expected one of %s" % (work_api, expected_api))
+ raise Exception("Unsupported API '%s', expected one of %s" % (arvargs.work_api, expected_api))
if self.work_api == "jobs":
logger.warn("""
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 380d4aafb..0228eb2af 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -280,7 +280,7 @@ class Container < ArvadosModel
# Check for non-failing Running candidates and return the most likely to finish sooner.
log_reuse_info { "checking for state=Running..." }
running = candidates.where(state: Running).
- where("NOT (runtime_status ? 'error')").
+ where("(runtime_status->'error') is null").
order('progress desc, started_at asc').
limit(1).first
if running
diff --git a/services/api/test/unit/container_test.rb b/services/api/test/unit/container_test.rb
index 83ab59b60..b1d35c55f 100644
--- a/services/api/test/unit/container_test.rb
+++ b/services/api/test/unit/container_test.rb
@@ -131,6 +131,57 @@ class ContainerTest < ActiveSupport::TestCase
end
end
+ test "Container runtime_status updates" do
+ set_user_from_auth :active
+ attrs = {
+ environment: {},
+ mounts: {"BAR" => "FOO"},
+ output_path: "/tmp",
+ priority: 1,
+ runtime_constraints: {"vcpus" => 1, "ram" => 1}
+ }
+ c1, _ = minimal_new(attrs)
+ assert_equal c1.runtime_status, {}
+
+ assert_equal Container::Queued, c1.state
+ assert_raises ActiveRecord::RecordInvalid do
+ c1.update_attributes! runtime_status: {'error' => 'Oops!'}
+ end
+
+ set_user_from_auth :dispatch1
+
+ # Allow updates when state = Locked
+ c1.update_attributes! state: Container::Locked
+ c1.update_attributes! runtime_status: {'error' => 'Oops!'}
+ assert c1.runtime_status.key? 'error'
+
+ # Reset when transitioning from Locked to Queued
+ c1.update_attributes! state: Container::Queued
+ assert_equal c1.runtime_status, {}
+
+ # Allow updates when state = Running
+ c1.update_attributes! state: Container::Locked
+ c1.update_attributes! state: Container::Running
+ c1.update_attributes! runtime_status: {'error' => 'Oops!'}
+ assert c1.runtime_status.key? 'error'
+
+ # Don't allow updates on other states
+ c1.update_attributes! state: Container::Complete
+ assert_raises ActiveRecord::RecordInvalid do
+ c1.update_attributes! runtime_status: {'error' => 'Some other error'}
+ end
+
+ set_user_from_auth :active
+ c2, _ = minimal_new(attrs)
+ assert_equal c2.runtime_status, {}
+ set_user_from_auth :dispatch1
+ c2.update_attributes! state: Container::Locked
+ c2.update_attributes! state: Container::Running
+ c2.update_attributes! state: Container::Cancelled
+ assert_raises ActiveRecord::RecordInvalid do
+ c2.update_attributes! runtime_status: {'error' => 'Oops!'}
+ end
+ end
test "Container serialized hash attributes sorted before save" do
env = {"C" => "3", "B" => "2", "A" => "1"}
@@ -277,6 +328,31 @@ class ContainerTest < ActiveSupport::TestCase
assert_equal reused.uuid, c_faster_started_second.uuid
end
+ test "find_reusable method should select non-failing running container" do
+ set_user_from_auth :active
+ common_attrs = REUSABLE_COMMON_ATTRS.merge({environment: {"var" => "running2"}})
+ c_slower, _ = minimal_new(common_attrs.merge({use_existing: false}))
+ c_faster_started_first, _ = minimal_new(common_attrs.merge({use_existing: false}))
+ c_faster_started_second, _ = minimal_new(common_attrs.merge({use_existing: false}))
+ # Confirm the 3 container UUIDs are different.
+ assert_equal 3, [c_slower.uuid, c_faster_started_first.uuid, c_faster_started_second.uuid].uniq.length
+ set_user_from_auth :dispatch1
+ c_slower.update_attributes!({state: Container::Locked})
+ c_slower.update_attributes!({state: Container::Running,
+ progress: 0.1})
+ c_faster_started_first.update_attributes!({state: Container::Locked})
+ c_faster_started_first.update_attributes!({state: Container::Running,
+ progress: 0.15})
+ c_faster_started_second.update_attributes!({state: Container::Locked})
+ c_faster_started_second.update_attributes!({state: Container::Running,
+ runtime_status: {'error' => 'Something bad happened'},
+ progress: 0.2})
+ reused = Container.find_reusable(common_attrs)
+ assert_not_nil reused
+ # Selected the non-failing container even if it's the one with less progress done
+ assert_equal reused.uuid, c_faster_started_first.uuid
+ end
+
test "find_reusable method should select locked container most likely to start sooner" do
set_user_from_auth :active
common_attrs = REUSABLE_COMMON_ATTRS.merge({environment: {"var" => "locked"}})
commit 5ea613b9dfb402666adb88b3d5e531bce86f5401
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Mon Aug 6 14:45:43 2018 -0300
13773: Add documentation.
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/doc/_includes/_container_runtime_status.liquid b/doc/_includes/_container_runtime_status.liquid
new file mode 100644
index 000000000..77051beb3
--- /dev/null
+++ b/doc/_includes/_container_runtime_status.liquid
@@ -0,0 +1,18 @@
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+h2. Runtime status
+
+Runtime status provides relevant information about a container's progress even while it is still in the Running state. This is used to avoid reusing containers that have not yet failed but definitely will, and also to make workflow debugging easier.
+
+The following keys have well known meanings:
+
+table(table table-bordered table-condensed).
+|_. Key|_. Type|_. Description|_. Notes|
+|error|string|The existence of this key indicates the container will definitely fail, or has already failed.|Optional.|
+|warning|string|Indicates something unusual happened or is currently happening, but isn't considered fatal.|Optional.|
+|activity|string|A message for the end user about what state the container is currently in.|Optional.|
+|errorDetail|string|Additional structured error details.|Optional.|
diff --git a/doc/api/methods/containers.html.textile.liquid b/doc/api/methods/containers.html.textile.liquid
index 30ec055a6..3384f9377 100644
--- a/doc/api/methods/containers.html.textile.liquid
+++ b/doc/api/methods/containers.html.textile.liquid
@@ -41,6 +41,10 @@ Generally this will contain additional keys that are not present in any correspo
"vcpus":2,
"API":true
}</code></pre>See "Runtime constraints":#runtime_constraints for more details.|
+|runtime_status|hash|Information related to the container's run, including its steps. Some keys have specific meaning and are described later in this page.|e.g.,
+<pre><code>{
+ "error": "This container won't be successful because at least one step has already failed."
+}</code></pre>See "Runtime status":#runtime_status for more details.|
|scheduling_parameters|hash|Parameters to be passed to the container scheduler when running this container.|e.g.,<pre><code>{
"partitions":["fastcpu","vfastcpu"]
}</code></pre>See "Scheduling parameters":#scheduling_parameters for more details.|
@@ -66,6 +70,8 @@ h2(#mount_types). {% include 'mount_types' %}
h2(#runtime_constraints). {% include 'container_runtime_constraints' %}
+h2(#runtime_status). {% include 'container_runtime_status' %}
+
h2(#scheduling_parameters). {% include 'container_scheduling_parameters' %}
h2. Methods
commit 5e0a13ebe8f0a25bf09de76024687481c81a19fe
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Thu Aug 2 15:51:01 2018 -0300
13773: Filter out running containers that will fail from reuse selection.
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 4c6cadd1d..380d4aafb 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -277,9 +277,10 @@ class Container < ArvadosModel
return usable
end
- # Check for Running candidates and return the most likely to finish sooner.
+ # Check for non-failing Running candidates and return the most likely to finish sooner.
log_reuse_info { "checking for state=Running..." }
running = candidates.where(state: Running).
+ where("NOT (runtime_status ? 'error')").
order('progress desc, started_at asc').
limit(1).first
if running
commit df6941fb0efb83b0c030fb284b9397dd2bd09167
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Tue Jul 31 10:38:56 2018 -0300
13773: Adds runtime_status field to containers on API server
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 7ec9845bc..4c6cadd1d 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -23,6 +23,7 @@ class Container < ArvadosModel
serialize :command, Array
serialize :scheduling_parameters, Hash
serialize :secret_mounts, Hash
+ serialize :runtime_status, Hash
before_validation :fill_field_defaults, :if => :new_record?
before_validation :set_timestamps
@@ -36,6 +37,7 @@ class Container < ArvadosModel
before_save :sort_serialized_attrs
before_save :update_secret_mounts_md5
before_save :scrub_secret_mounts
+ before_save :clear_runtime_status_when_queued
after_save :handle_completed
after_save :propagate_priority
after_commit { UpdatePriority.run_update_thread }
@@ -412,11 +414,14 @@ class Container < ArvadosModel
end
case self.state
- when Queued, Locked
+ when Locked
+ permitted.push :priority, :runtime_status
+
+ when Queued
permitted.push :priority
when Running
- permitted.push :priority, :progress, :output
+ permitted.push :priority, :progress, :output, :runtime_status
if self.state_changed?
permitted.push :started_at
end
@@ -533,6 +538,13 @@ class Container < ArvadosModel
end
end
+ def clear_runtime_status_when_queued
+ # Avoid leaking status messages between different dispatch attempts
+ if self.state_was == Locked && self.state == Queued
+ self.runtime_status = {}
+ end
+ end
+
def handle_completed
# This container is finished so finalize any associated container requests
# that are associated with this container.
diff --git a/services/api/db/migrate/20180904110712_add_runtime_status_to_containers.rb b/services/api/db/migrate/20180904110712_add_runtime_status_to_containers.rb
new file mode 100644
index 000000000..755c7c89e
--- /dev/null
+++ b/services/api/db/migrate/20180904110712_add_runtime_status_to_containers.rb
@@ -0,0 +1,6 @@
+class AddRuntimeStatusToContainers < ActiveRecord::Migration
+ def change
+ add_column :containers, :runtime_status, :jsonb, default: {}
+ add_index :containers, :runtime_status, using: :gin
+ end
+end
diff --git a/services/api/db/structure.sql b/services/api/db/structure.sql
index d7ee1532d..d1559c4c6 100644
--- a/services/api/db/structure.sql
+++ b/services/api/db/structure.sql
@@ -354,7 +354,8 @@ CREATE TABLE public.containers (
locked_by_uuid character varying(255),
scheduling_parameters text,
secret_mounts jsonb DEFAULT '{}'::jsonb,
- secret_mounts_md5 character varying DEFAULT '99914b932bd37a50b983c5e7c90ae93b'::character varying
+ secret_mounts_md5 character varying DEFAULT '99914b932bd37a50b983c5e7c90ae93b'::character varying,
+ runtime_status jsonb DEFAULT '{}'::jsonb
);
@@ -1914,6 +1915,13 @@ CREATE INDEX index_containers_on_owner_uuid ON public.containers USING btree (ow
--
+-- Name: index_containers_on_runtime_status; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX index_containers_on_runtime_status ON public.containers USING gin (runtime_status);
+
+
+--
-- Name: index_containers_on_secret_mounts_md5; Type: INDEX; Schema: public; Owner: -
--
@@ -3125,3 +3133,5 @@ INSERT INTO schema_migrations (version) VALUES ('20180820130357');
INSERT INTO schema_migrations (version) VALUES ('20180820135808');
+INSERT INTO schema_migrations (version) VALUES ('20180904110712');
+
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list