[arvados] created: 2.6.0-259-g431bd5023
git repository hosting
git at public.arvados.org
Thu Jun 8 16:33:07 UTC 2023
at 431bd5023a19d58369dc18e582b1fc2a3d20a321 (commit)
commit 431bd5023a19d58369dc18e582b1fc2a3d20a321
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Thu Jun 8 12:31:48 2023 -0400
20614: Better query for live container requests when deciding to retry
Now takes into account the state and priority of the requesting
container, if any.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index d897ff7af..8a51d749f 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -730,7 +730,21 @@ class Container < ArvadosModel
self.with_lock do
act_as_system_user do
if self.state == Cancelled
- retryable_requests = ContainerRequest.where("container_uuid = ? and priority > 0 and state = 'Committed' and container_count < container_count_max", uuid)
+ # Cancelled means the container didn't run to completion.
+ # This happens either because it was cancelled by the user
+ # or because there was an infrastructure failure. We want
+ # to retry infrastructure failures automatically.
+ #
+ # Seach for live container requests to determine if we
+ # should retry the container.
+ retryable_requests = ContainerRequest.
+ joins('left outer join containers as requesting_container on container_requests.requesting_container_uuid = requesting_container.uuid').
+ where("container_requests.container_uuid = ? and "+
+ "container_requests.priority > 0 and "+
+ "(requesting_container.priority is null or (requesting_container.state = 'Running' and requesting_container.priority > 0)) and "+
+ "container_requests.state = 'Committed' and "+
+ "container_requests.container_count < container_requests.container_count_max", uuid).
+ order('container_requests.uuid asc')
else
retryable_requests = []
end
diff --git a/services/api/test/unit/container_request_test.rb b/services/api/test/unit/container_request_test.rb
index fedfe1521..86e9d93bb 100644
--- a/services/api/test/unit/container_request_test.rb
+++ b/services/api/test/unit/container_request_test.rb
@@ -950,6 +950,174 @@ class ContainerRequestTest < ActiveSupport::TestCase
end
+ test "Retry sub-request on error" do
+ set_user_from_auth :active
+ cr1 = create_minimal_req!(priority: 1, state: "Committed", container_count_max: 2, command: ["echo", "foo1"])
+ c1 = Container.find_by_uuid(cr1.container_uuid)
+ act_as_system_user do
+ c1.update_attributes!(state: Container::Locked)
+ c1.update_attributes!(state: Container::Running)
+ end
+
+ cr2 = with_container_auth(c1) do
+ create_minimal_req!(priority: 10, state: "Committed", container_count_max: 2, command: ["echo", "foo2"])
+ end
+ c2 = Container.find_by_uuid(cr2.container_uuid)
+ act_as_system_user do
+ c2.update_attributes!(state: Container::Locked)
+ c2.update_attributes!(state: Container::Running)
+ end
+
+ cr3 = with_container_auth(c2) do
+ create_minimal_req!(priority: 10, state: "Committed", container_count_max: 2, command: ["echo", "foo3"])
+ end
+ c3 = Container.find_by_uuid(cr3.container_uuid)
+
+ act_as_system_user do
+ c3.update_attributes!(state: Container::Locked)
+ c3.update_attributes!(state: Container::Running)
+ end
+
+ # All the containers are in running state
+
+ c3.reload
+ cr3.reload
+
+ # c3 still running
+ assert_equal 'Running', c3.state
+ assert_equal 1, cr3.container_count
+ assert_equal 'Committed', cr3.state
+
+ # c3 goes to cancelled state
+ act_as_system_user do
+ c3.state = "Cancelled"
+ c3.save!
+ end
+
+ cr3.reload
+
+ # Because the parent request is still live, it should
+ # be retried.
+ assert_equal 2, cr3.container_count
+ assert_equal 'Committed', cr3.state
+ end
+
+ test "Do not retry sub-request when process tree is cancelled" do
+ set_user_from_auth :active
+ cr1 = create_minimal_req!(priority: 1, state: "Committed", container_count_max: 2, command: ["echo", "foo1"])
+ c1 = Container.find_by_uuid(cr1.container_uuid)
+ act_as_system_user do
+ c1.update_attributes!(state: Container::Locked)
+ c1.update_attributes!(state: Container::Running)
+ end
+
+ cr2 = with_container_auth(c1) do
+ create_minimal_req!(priority: 10, state: "Committed", container_count_max: 2, command: ["echo", "foo2"])
+ end
+ c2 = Container.find_by_uuid(cr2.container_uuid)
+ act_as_system_user do
+ c2.update_attributes!(state: Container::Locked)
+ c2.update_attributes!(state: Container::Running)
+ end
+
+ cr3 = with_container_auth(c2) do
+ create_minimal_req!(priority: 10, state: "Committed", container_count_max: 2, command: ["echo", "foo3"])
+ end
+ c3 = Container.find_by_uuid(cr3.container_uuid)
+
+ act_as_system_user do
+ c3.update_attributes!(state: Container::Locked)
+ c3.update_attributes!(state: Container::Running)
+ end
+
+ # All the containers are in running state
+
+ # Now cancel the toplevel container request
+ act_as_system_user do
+ cr1.priority = 0
+ cr1.save!
+ end
+
+ c3.reload
+ cr3.reload
+
+ # c3 still running
+ assert_equal 'Running', c3.state
+ assert_equal 1, cr3.container_count
+ assert_equal 'Committed', cr3.state
+
+ # c3 goes to cancelled state
+ act_as_system_user do
+ assert_equal 0, c3.priority
+ c3.state = "Cancelled"
+ c3.save!
+ end
+
+ cr3.reload
+
+ # Because the parent process was cancelled, it _should not_ be
+ # retried.
+ assert_equal 1, cr3.container_count
+ assert_equal 'Final', cr3.state
+ end
+
+ test "Retry process tree on error" do
+ set_user_from_auth :active
+ cr1 = create_minimal_req!(priority: 1, state: "Committed", container_count_max: 2, command: ["echo", "foo1"])
+ c1 = Container.find_by_uuid(cr1.container_uuid)
+ act_as_system_user do
+ c1.update_attributes!(state: Container::Locked)
+ c1.update_attributes!(state: Container::Running)
+ end
+
+ cr2 = with_container_auth(c1) do
+ create_minimal_req!(priority: 10, state: "Committed", container_count_max: 2, command: ["echo", "foo2"])
+ end
+ c2 = Container.find_by_uuid(cr2.container_uuid)
+ act_as_system_user do
+ c2.update_attributes!(state: Container::Locked)
+ c2.update_attributes!(state: Container::Running)
+ end
+
+ cr3 = with_container_auth(c2) do
+ create_minimal_req!(priority: 10, state: "Committed", container_count_max: 2, command: ["echo", "foo3"])
+ end
+ c3 = Container.find_by_uuid(cr3.container_uuid)
+
+ act_as_system_user do
+ c3.update_attributes!(state: Container::Locked)
+ c3.update_attributes!(state: Container::Running)
+ end
+
+ # All the containers are in running state
+
+ c1.reload
+
+ # c1 goes to cancelled state
+ act_as_system_user do
+ c1.state = "Cancelled"
+ c1.save!
+ end
+
+ cr1.reload
+ cr2.reload
+ cr3.reload
+
+ # Because the root request is still live, it should be retried.
+ # Assumes the root is something like arvados-cwl-runner where
+ # container reuse enables it to more or less pick up where it left
+ # off.
+ assert_equal 2, cr1.container_count
+ assert_equal 'Committed', cr1.state
+
+ # These keep running.
+ assert_equal 1, cr2.container_count
+ assert_equal 'Committed', cr2.state
+
+ assert_equal 1, cr3.container_count
+ assert_equal 'Committed', cr3.state
+ end
+
test "Output collection name setting using output_name with name collision resolution" do
set_user_from_auth :active
output_name = 'unimaginative name'
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list