[ARVADOS] created: 1.2.0-9-gcb2a05a42

Git user git at public.curoverse.com
Wed Sep 5 16:19:56 EDT 2018


        at  cb2a05a424c0fc4bb05d7ad171cd220a0507150e (commit)


commit cb2a05a424c0fc4bb05d7ad171cd220a0507150e
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Wed Sep 5 16:41:29 2018 -0300

    13773: Containers & CRs show "Failing" when a child job failed.
    
    Also, at the detailed view an error panel is displayed with the error message
    and detail, if any.
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/apps/workbench/app/models/container_work_unit.rb b/apps/workbench/app/models/container_work_unit.rb
index 7f1052ebc..94b416173 100644
--- a/apps/workbench/app/models/container_work_unit.rb
+++ b/apps/workbench/app/models/container_work_unit.rb
@@ -117,12 +117,31 @@ class ContainerWorkUnit < ProxyWorkUnit
       else
         'Failed'
       end
+    when 'Running'
+      if runtime_status[:error]
+        'Failing'
+      else
+        state
+      end
     else
-      # Cancelled, Running, or Uncommitted (no container assigned)
+      # Cancelled, or Uncommitted (no container assigned)
       state
     end
   end
 
+  def runtime_status
+    return get(:runtime_status, @container) || get(:runtime_status, @proxied)
+  end
+
+  def state_bootstrap_class
+    case state_label
+    when 'Failing'
+      'danger'
+    else
+      super
+    end
+  end
+
   def exit_code
     get_combined(:exit_code)
   end
diff --git a/apps/workbench/app/models/proxy_work_unit.rb b/apps/workbench/app/models/proxy_work_unit.rb
index 02f6b42fd..adf0bd7d6 100644
--- a/apps/workbench/app/models/proxy_work_unit.rb
+++ b/apps/workbench/app/models/proxy_work_unit.rb
@@ -278,7 +278,11 @@ class ProxyWorkUnit < WorkUnit
     end
 
     if is_failed?
-      resp << " Check the Log tab for more detail about why it failed."
+      if runtime_status.andand[:error]
+        resp << " Check the error information below."
+      else
+        resp << " Check the Log tab for more detail about why it failed."
+      end
     end
     resp << "</p>"
 
diff --git a/apps/workbench/app/models/work_unit.rb b/apps/workbench/app/models/work_unit.rb
index f0c4230f2..493dd2f57 100644
--- a/apps/workbench/app/models/work_unit.rb
+++ b/apps/workbench/app/models/work_unit.rb
@@ -211,4 +211,8 @@ class WorkUnit
   def template_uuid
     # return the uuid of this work unit's template, if one exists
   end
+
+  def runtime_status
+    # Returns this work unit's runtime_status, if any
+  end
 end
diff --git a/apps/workbench/app/views/work_units/_show_component.html.erb b/apps/workbench/app/views/work_units/_show_component.html.erb
index 3bba31f78..d85329576 100644
--- a/apps/workbench/app/views/work_units/_show_component.html.erb
+++ b/apps/workbench/app/views/work_units/_show_component.html.erb
@@ -37,6 +37,28 @@ SPDX-License-Identifier: AGPL-3.0 %>
   <% end %>
 </div>
 
+<%# Display runtime error information %>
+<% if wu.runtime_status.andand[:error] %>
+<div class="container">
+  <div class="col-md-12">
+    <div class="panel panel-danger">
+      <div class="panel-heading">Error Information</div>
+      <div class="panel-body">
+        <%= wu.runtime_status[:error] %>
+        <%# Show collapsable detailed error information, if any %>
+        <% if wu.runtime_status[:errorDetail] %>
+        <a class="btn btn-sm btn-primary pull-right" data-toggle="collapse" data-target="#errorDetail">Toggle details</a>
+        <div class="clearfix"></div>
+        <div id="errorDetail" class="collapse">
+          <pre><%= wu.runtime_status[:errorDetail] %></pre>
+        </div>
+        <% end %>
+      </div>
+    </div>
+  </div>
+</div>
+<% end %>
+
 <p>
   <%= render(partial: 'work_units/component_detail', locals: {current_obj: wu}) %>
 </p>

commit 05709dc092dccede4206b7cfe04f1574bc2299c4
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Thu Aug 23 17:28:53 2018 -0300

    13773: Updates tests checking that runtime_status_error() is called when needed
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 2b4a5d935..7f56eac8c 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -216,7 +216,7 @@ http://doc.arvados.org/install/install-api-server.html#disable_api_methods
             if not runtime_status.get('error'):
                 runtime_status.update({
                     'error': error_msg,
-                    'errorDetail': error_log
+                    'errorDetail': error_log or "No error logs available"
                 })
             # Further errors are only mentioned as a count
             else:
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
index 69f3ae046..fcc9d1550 100644
--- a/sdk/cwl/tests/test_container.py
+++ b/sdk/cwl/tests/test_container.py
@@ -488,11 +488,67 @@ class TestContainer(unittest.TestCase):
         })
 
         self.assertFalse(api.collections().create.called)
+        self.assertFalse(runner.runtime_status_error.called)
 
         arvjob.collect_outputs.assert_called_with("keep:abc+123")
         arvjob.output_callback.assert_called_with({"out": "stuff"}, "success")
         runner.add_intermediate_output.assert_called_with("zzzzz-4zz18-zzzzzzzzzzzzzz2")
 
+    @mock.patch("arvados_cwl.done.logtail")
+    @mock.patch("arvados.collection.CollectionReader")
+    @mock.patch("arvados.collection.Collection")
+    def test_child_failure(self, col, reader, logtail):
+        api = mock.MagicMock()
+
+        runner = mock.MagicMock()
+        runner.api = api
+        runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+        runner.num_retries = 0
+        runner.ignore_docker_for_reuse = False
+        runner.intermediate_output_ttl = 0
+        runner.secret_store = cwltool.secrets.SecretStore()
+        runner.label.return_value = '[container testjob]'
+
+        runner.api.containers().get().execute.return_value = {
+            "state":"Complete",
+            "output": "abc+123",
+            "exit_code": 1,
+            "log": "def+234"
+        }
+
+        col().open.return_value = []
+        logtail.return_value = 'some error detail'
+
+        arvjob = arvados_cwl.ArvadosContainer(runner,
+                                              mock.MagicMock(),
+                                              {},
+                                              None,
+                                              [],
+                                              [],
+                                              "testjob")
+        arvjob.output_callback = mock.MagicMock()
+        arvjob.collect_outputs = mock.MagicMock()
+        arvjob.successCodes = [0]
+        arvjob.outdir = "/var/spool/cwl"
+        arvjob.output_ttl = 3600
+        arvjob.collect_outputs.return_value = {"out": "stuff"}
+
+        arvjob.done({
+            "state": "Final",
+            "log_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1",
+            "output_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2",
+            "uuid": "zzzzz-xvhdp-zzzzzzzzzzzzzzz",
+            "container_uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz",
+            "modified_at": "2017-05-26T12:01:22Z"
+        })
+
+        runner.runtime_status_error.assert_called_with(
+            '[container testjob]',
+            'zzzzz-xvhdp-zzzzzzzzzzzzzzz',
+            'some error detail'
+        )
+        arvjob.output_callback.assert_called_with({"out": "stuff"}, "permanentFail")
+
     # The test passes no builder.resources
     # Hence the default resources will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
     @mock.patch("arvados.commands.keepdocker.list_images_in_arv")

commit 43bb88d0e15c7dc257cc8b98d5862a1fa3681549
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Thu Aug 23 12:04:56 2018 -0300

    13773: Updates runner container's runtime_status on child failures.
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 6edf00e7f..2b4a5d935 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -168,14 +168,13 @@ http://doc.arvados.org/install/install-api-server.html#disable_api_methods
         with self.workflow_eval_lock:
             if processStatus == "success":
                 logger.info("Overall process status is %s", processStatus)
-                if self.pipeline:
-                    self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
-                                                         body={"state": "Complete"}).execute(num_retries=self.num_retries)
+                state = "Complete"
             else:
                 logger.error("Overall process status is %s", processStatus)
-                if self.pipeline:
-                    self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
-                                                         body={"state": "Failed"}).execute(num_retries=self.num_retries)
+                state = "Failed"
+            if self.pipeline:
+                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
+                                                        body={"state": state}).execute(num_retries=self.num_retries)
             self.final_status = processStatus
             self.final_output = out
             self.workflow_eval_lock.notifyAll()
@@ -195,6 +194,53 @@ http://doc.arvados.org/install/install-api-server.html#disable_api_methods
             self.task_queue.add(partial(j.done, record))
             del self.processes[uuid]
 
+    def runtime_status_error(self, child_label, child_uuid, error_log):
+        """
+        Called from a failing child container. Records the first child error
+        on this runner's runtime_status field.
+        On subsequent errors, updates the 'error' key to show how many additional
+        failures happened.
+        """
+        error_msg = "%s %s failed" % (child_label, child_uuid)
+        logger.info(error_msg)
+        with self.workflow_eval_lock:
+            try:
+                current = self.api.containers().current().execute(num_retries=self.num_retries)
+            except ApiError as e:
+                # Status code 404 just means we're not running in a container.
+                if e.resp.status != 404:
+                    logger.info("Getting current container: %s", e)
+                return
+            runtime_status = current.get('runtime_status', {})
+            # Save first fatal error
+            if not runtime_status.get('error'):
+                runtime_status.update({
+                    'error': error_msg,
+                    'errorDetail': error_log
+                })
+            # Further errors are only mentioned as a count
+            else:
+                error_msg = re.match(
+                    r'^(.*failed)\s*\(?', runtime_status.get('error')).groups()[0]
+                more_failures = re.match(
+                    r'.*\(.*(\d+) more\)', runtime_status.get('error'))
+                if more_failures:
+                    failure_qty = int(more_failures.groups()[0])
+                    runtime_status.update({
+                        'error': "%s (and %d more)" % (error_msg, failure_qty+1)
+                    })
+                else:
+                    runtime_status.update({
+                        'error': "%s (and 1 more)" % error_msg
+                    })
+            try:
+                self.api.containers().update(uuid=current['uuid'],
+                                            body={
+                                                'runtime_status': runtime_status,
+                                            }).execute(num_retries=self.num_retries)
+            except Exception as e:
+                logger.error("Updating runtime_status: %s", e)
+
     def wrapped_callback(self, cb, obj, st):
         with self.workflow_eval_lock:
             cb(obj, st)
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 49c40b1da..d49b65002 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -320,7 +320,11 @@ class ArvadosContainer(JobBase):
                                                            api_client=self.arvrunner.api,
                                                            keep_client=self.arvrunner.keep_client,
                                                            num_retries=self.arvrunner.num_retries)
-                done.logtail(logc, logger.error, "%s (%s) error log:" % (self.arvrunner.label(self), record["uuid"]), maxlen=40)
+                label = self.arvrunner.label(self)
+                error_log = done.logtail(
+                    logc, logger.error,
+                    "%s (%s) error log:" % (label, record["uuid"]), maxlen=40)
+                self.arvrunner.runtime_status_error(label, record["uuid"], error_log)
 
             if record["output_uuid"]:
                 if self.arvrunner.trash_intermediate or self.arvrunner.intermediate_output_ttl:
diff --git a/sdk/cwl/arvados_cwl/done.py b/sdk/cwl/arvados_cwl/done.py
index 25efade2a..7f3cb36de 100644
--- a/sdk/cwl/arvados_cwl/done.py
+++ b/sdk/cwl/arvados_cwl/done.py
@@ -97,3 +97,4 @@ def logtail(logcollection, logfunc, header, maxlen=25):
     logtxt = "\n  ".join(l.strip() for l in loglines)
     logfunc(header)
     logfunc("\n  %s", logtxt)
+    return logtxt

commit 40447d7fe7a73c6babd90fbb6d10f72f8de3a3e4
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Tue Aug 21 12:10:58 2018 -0300

    13773: Add runtime_status to be accesible via API
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/services/api/app/controllers/arvados/v1/containers_controller.rb b/services/api/app/controllers/arvados/v1/containers_controller.rb
index 25cb0037a..65d8385ad 100644
--- a/services/api/app/controllers/arvados/v1/containers_controller.rb
+++ b/services/api/app/controllers/arvados/v1/containers_controller.rb
@@ -6,6 +6,7 @@ class Arvados::V1::ContainersController < ApplicationController
   accept_attribute_as_json :environment, Hash
   accept_attribute_as_json :mounts, Hash
   accept_attribute_as_json :runtime_constraints, Hash
+  accept_attribute_as_json :runtime_status, Hash
   accept_attribute_as_json :command, Array
   accept_attribute_as_json :scheduling_parameters, Hash
 
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 0228eb2af..798124247 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -60,6 +60,7 @@ class Container < ArvadosModel
     t.add :priority
     t.add :progress
     t.add :runtime_constraints
+    t.add :runtime_status
     t.add :started_at
     t.add :state
     t.add :auth_uuid

commit 70eb5479023517624f1966791ccfaeb6d273017d
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Tue Aug 14 12:36:39 2018 -0300

    13773: Expand test to prove that non-error runtime_status doesn't avoid reuse.
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/services/api/test/unit/container_test.rb b/services/api/test/unit/container_test.rb
index b1d35c55f..3d8bf841e 100644
--- a/services/api/test/unit/container_test.rb
+++ b/services/api/test/unit/container_test.rb
@@ -342,6 +342,7 @@ class ContainerTest < ActiveSupport::TestCase
                                  progress: 0.1})
     c_faster_started_first.update_attributes!({state: Container::Locked})
     c_faster_started_first.update_attributes!({state: Container::Running,
+                                               runtime_status: {'warning' => 'This is not an error'},
                                                progress: 0.15})
     c_faster_started_second.update_attributes!({state: Container::Locked})
     c_faster_started_second.update_attributes!({state: Container::Running,

commit fda792680a8fd21b1c80ea2a79b267381521935a
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Tue Aug 14 11:25:10 2018 -0300

    13773: Fix reuse query & add tests
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 8c3f0eade..6edf00e7f 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -133,7 +133,7 @@ class ArvCwlRunner(object):
             if arvargs.work_api is None:
                 raise Exception("No supported APIs")
             else:
-                raise Exception("Unsupported API '%s', expected one of %s" % (work_api, expected_api))
+                raise Exception("Unsupported API '%s', expected one of %s" % (arvargs.work_api, expected_api))
 
         if self.work_api == "jobs":
             logger.warn("""
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 380d4aafb..0228eb2af 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -280,7 +280,7 @@ class Container < ArvadosModel
     # Check for non-failing Running candidates and return the most likely to finish sooner.
     log_reuse_info { "checking for state=Running..." }
     running = candidates.where(state: Running).
-              where("NOT (runtime_status ? 'error')").
+              where("(runtime_status->'error') is null").
               order('progress desc, started_at asc').
               limit(1).first
     if running
diff --git a/services/api/test/unit/container_test.rb b/services/api/test/unit/container_test.rb
index 83ab59b60..b1d35c55f 100644
--- a/services/api/test/unit/container_test.rb
+++ b/services/api/test/unit/container_test.rb
@@ -131,6 +131,57 @@ class ContainerTest < ActiveSupport::TestCase
     end
   end
 
+  test "Container runtime_status updates" do
+    set_user_from_auth :active
+    attrs = {
+      environment: {},
+      mounts: {"BAR" => "FOO"},
+      output_path: "/tmp",
+      priority: 1,
+      runtime_constraints: {"vcpus" => 1, "ram" => 1}
+    }
+    c1, _ = minimal_new(attrs)
+    assert_equal c1.runtime_status, {}
+
+    assert_equal Container::Queued, c1.state
+    assert_raises ActiveRecord::RecordInvalid do
+      c1.update_attributes! runtime_status: {'error' => 'Oops!'}
+    end
+
+    set_user_from_auth :dispatch1
+
+    # Allow updates when state = Locked
+    c1.update_attributes! state: Container::Locked
+    c1.update_attributes! runtime_status: {'error' => 'Oops!'}
+    assert c1.runtime_status.key? 'error'
+
+    # Reset when transitioning from Locked to Queued
+    c1.update_attributes! state: Container::Queued
+    assert_equal c1.runtime_status, {}
+
+    # Allow updates when state = Running
+    c1.update_attributes! state: Container::Locked
+    c1.update_attributes! state: Container::Running
+    c1.update_attributes! runtime_status: {'error' => 'Oops!'}
+    assert c1.runtime_status.key? 'error'
+
+    # Don't allow updates on other states
+    c1.update_attributes! state: Container::Complete
+    assert_raises ActiveRecord::RecordInvalid do
+      c1.update_attributes! runtime_status: {'error' => 'Some other error'}
+    end
+
+    set_user_from_auth :active
+    c2, _ = minimal_new(attrs)
+    assert_equal c2.runtime_status, {}
+    set_user_from_auth :dispatch1
+    c2.update_attributes! state: Container::Locked
+    c2.update_attributes! state: Container::Running
+    c2.update_attributes! state: Container::Cancelled
+    assert_raises ActiveRecord::RecordInvalid do
+      c2.update_attributes! runtime_status: {'error' => 'Oops!'}
+    end
+  end
 
   test "Container serialized hash attributes sorted before save" do
     env = {"C" => "3", "B" => "2", "A" => "1"}
@@ -277,6 +328,31 @@ class ContainerTest < ActiveSupport::TestCase
     assert_equal reused.uuid, c_faster_started_second.uuid
   end
 
+  test "find_reusable method should select non-failing running container" do
+    set_user_from_auth :active
+    common_attrs = REUSABLE_COMMON_ATTRS.merge({environment: {"var" => "running2"}})
+    c_slower, _ = minimal_new(common_attrs.merge({use_existing: false}))
+    c_faster_started_first, _ = minimal_new(common_attrs.merge({use_existing: false}))
+    c_faster_started_second, _ = minimal_new(common_attrs.merge({use_existing: false}))
+    # Confirm the 3 container UUIDs are different.
+    assert_equal 3, [c_slower.uuid, c_faster_started_first.uuid, c_faster_started_second.uuid].uniq.length
+    set_user_from_auth :dispatch1
+    c_slower.update_attributes!({state: Container::Locked})
+    c_slower.update_attributes!({state: Container::Running,
+                                 progress: 0.1})
+    c_faster_started_first.update_attributes!({state: Container::Locked})
+    c_faster_started_first.update_attributes!({state: Container::Running,
+                                               progress: 0.15})
+    c_faster_started_second.update_attributes!({state: Container::Locked})
+    c_faster_started_second.update_attributes!({state: Container::Running,
+                                                runtime_status: {'error' => 'Something bad happened'},
+                                                progress: 0.2})
+    reused = Container.find_reusable(common_attrs)
+    assert_not_nil reused
+    # Selected the non-failing container even if it's the one with less progress done
+    assert_equal reused.uuid, c_faster_started_first.uuid
+  end
+
   test "find_reusable method should select locked container most likely to start sooner" do
     set_user_from_auth :active
     common_attrs = REUSABLE_COMMON_ATTRS.merge({environment: {"var" => "locked"}})

commit 5ea613b9dfb402666adb88b3d5e531bce86f5401
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Mon Aug 6 14:45:43 2018 -0300

    13773: Add documentation.
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/doc/_includes/_container_runtime_status.liquid b/doc/_includes/_container_runtime_status.liquid
new file mode 100644
index 000000000..77051beb3
--- /dev/null
+++ b/doc/_includes/_container_runtime_status.liquid
@@ -0,0 +1,18 @@
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+h2. Runtime status
+
+Runtime status provides relevant information about a container's progress even while it is still in the Running state. This is used to avoid reusing containers that have not yet failed but definitely will, and also to make workflow debugging easier.
+
+The following keys have well known meanings:
+
+table(table table-bordered table-condensed).
+|_. Key|_. Type|_. Description|_. Notes|
+|error|string|The existence of this key indicates the container will definitely fail, or has already failed.|Optional.|
+|warning|string|Indicates something unusual happened or is currently happening, but isn't considered fatal.|Optional.|
+|activity|string|A message for the end user about what state the container is currently in.|Optional.|
+|errorDetail|string|Additional structured error details.|Optional.|
diff --git a/doc/api/methods/containers.html.textile.liquid b/doc/api/methods/containers.html.textile.liquid
index 30ec055a6..3384f9377 100644
--- a/doc/api/methods/containers.html.textile.liquid
+++ b/doc/api/methods/containers.html.textile.liquid
@@ -41,6 +41,10 @@ Generally this will contain additional keys that are not present in any correspo
   "vcpus":2,
   "API":true
 }</code></pre>See "Runtime constraints":#runtime_constraints for more details.|
+|runtime_status|hash|Information related to the container's run, including its steps. Some keys have specific meaning and are described later in this page.|e.g.,
+<pre><code>{
+  "error": "This container won't be successful because at least one step has already failed."
+}</code></pre>See "Runtime status":#runtime_status for more details.|
 |scheduling_parameters|hash|Parameters to be passed to the container scheduler when running this container.|e.g.,<pre><code>{
 "partitions":["fastcpu","vfastcpu"]
 }</code></pre>See "Scheduling parameters":#scheduling_parameters for more details.|
@@ -66,6 +70,8 @@ h2(#mount_types). {% include 'mount_types' %}
 
 h2(#runtime_constraints). {% include 'container_runtime_constraints' %}
 
+h2(#runtime_status). {% include 'container_runtime_status' %}
+
 h2(#scheduling_parameters). {% include 'container_scheduling_parameters' %}
 
 h2. Methods

commit 5e0a13ebe8f0a25bf09de76024687481c81a19fe
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Thu Aug 2 15:51:01 2018 -0300

    13773: Filter out running containers that will fail from reuse selection.
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 4c6cadd1d..380d4aafb 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -277,9 +277,10 @@ class Container < ArvadosModel
       return usable
     end
 
-    # Check for Running candidates and return the most likely to finish sooner.
+    # Check for non-failing Running candidates and return the most likely to finish sooner.
     log_reuse_info { "checking for state=Running..." }
     running = candidates.where(state: Running).
+              where("NOT (runtime_status ? 'error')").
               order('progress desc, started_at asc').
               limit(1).first
     if running

commit df6941fb0efb83b0c030fb284b9397dd2bd09167
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Tue Jul 31 10:38:56 2018 -0300

    13773: Adds runtime_status field to containers on API server
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 7ec9845bc..4c6cadd1d 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -23,6 +23,7 @@ class Container < ArvadosModel
   serialize :command, Array
   serialize :scheduling_parameters, Hash
   serialize :secret_mounts, Hash
+  serialize :runtime_status, Hash
 
   before_validation :fill_field_defaults, :if => :new_record?
   before_validation :set_timestamps
@@ -36,6 +37,7 @@ class Container < ArvadosModel
   before_save :sort_serialized_attrs
   before_save :update_secret_mounts_md5
   before_save :scrub_secret_mounts
+  before_save :clear_runtime_status_when_queued
   after_save :handle_completed
   after_save :propagate_priority
   after_commit { UpdatePriority.run_update_thread }
@@ -412,11 +414,14 @@ class Container < ArvadosModel
     end
 
     case self.state
-    when Queued, Locked
+    when Locked
+      permitted.push :priority, :runtime_status
+
+    when Queued
       permitted.push :priority
 
     when Running
-      permitted.push :priority, :progress, :output
+      permitted.push :priority, :progress, :output, :runtime_status
       if self.state_changed?
         permitted.push :started_at
       end
@@ -533,6 +538,13 @@ class Container < ArvadosModel
     end
   end
 
+  def clear_runtime_status_when_queued
+    # Avoid leaking status messages between different dispatch attempts
+    if self.state_was == Locked && self.state == Queued
+      self.runtime_status = {}
+    end
+  end
+
   def handle_completed
     # This container is finished so finalize any associated container requests
     # that are associated with this container.
diff --git a/services/api/db/migrate/20180904110712_add_runtime_status_to_containers.rb b/services/api/db/migrate/20180904110712_add_runtime_status_to_containers.rb
new file mode 100644
index 000000000..755c7c89e
--- /dev/null
+++ b/services/api/db/migrate/20180904110712_add_runtime_status_to_containers.rb
@@ -0,0 +1,6 @@
+class AddRuntimeStatusToContainers < ActiveRecord::Migration
+  def change
+    add_column :containers, :runtime_status, :jsonb, default: {}
+    add_index :containers, :runtime_status, using: :gin
+  end
+end
diff --git a/services/api/db/structure.sql b/services/api/db/structure.sql
index d7ee1532d..d1559c4c6 100644
--- a/services/api/db/structure.sql
+++ b/services/api/db/structure.sql
@@ -354,7 +354,8 @@ CREATE TABLE public.containers (
     locked_by_uuid character varying(255),
     scheduling_parameters text,
     secret_mounts jsonb DEFAULT '{}'::jsonb,
-    secret_mounts_md5 character varying DEFAULT '99914b932bd37a50b983c5e7c90ae93b'::character varying
+    secret_mounts_md5 character varying DEFAULT '99914b932bd37a50b983c5e7c90ae93b'::character varying,
+    runtime_status jsonb DEFAULT '{}'::jsonb
 );
 
 
@@ -1914,6 +1915,13 @@ CREATE INDEX index_containers_on_owner_uuid ON public.containers USING btree (ow
 
 
 --
+-- Name: index_containers_on_runtime_status; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX index_containers_on_runtime_status ON public.containers USING gin (runtime_status);
+
+
+--
 -- Name: index_containers_on_secret_mounts_md5; Type: INDEX; Schema: public; Owner: -
 --
 
@@ -3125,3 +3133,5 @@ INSERT INTO schema_migrations (version) VALUES ('20180820130357');
 
 INSERT INTO schema_migrations (version) VALUES ('20180820135808');
 
+INSERT INTO schema_migrations (version) VALUES ('20180904110712');
+

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list