[ARVADOS] created: 1.3.0-831-g243010586
Git user
git at public.curoverse.com
Tue Apr 30 15:57:46 UTC 2019
at 24301058687be0d42883871d168c15dac98668c2 (commit)
commit 24301058687be0d42883871d168c15dac98668c2
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date: Tue Apr 30 11:53:01 2019 -0400
15164: Add locking between container completion and reuse
Addresses race condition between container completion and container
reuse. Without this locking, a container request can resolve and
attempt to reuse a container which is concurrently being completed,
resulting in a race condition that results in the container request
never being finalized.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 45cd13bbc..2bbdd0a07 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -648,64 +648,76 @@ class Container < ArvadosModel
# This container is finished so finalize any associated container requests
# that are associated with this container.
if self.state_changed? and self.final?
- act_as_system_user do
-
- if self.state == Cancelled
- retryable_requests = ContainerRequest.where("container_uuid = ? and priority > 0 and state = 'Committed' and container_count < container_count_max", uuid)
- else
- retryable_requests = []
- end
+ # These get wiped out by with_lock (which reloads the record),
+ # so record them now in case we need to schedule a retry.
+ prev_secret_mounts = self.secret_mounts_was
+ prev_runtime_token = self.runtime_token_was
+
+ # Need to take a lock on the container to ensure that any
+ # concurrent container requests that might try to reuse this
+ # container will block until the container completion
+ # transaction finishes. This ensure that concurrent container
+ # requests that try to reuse this container are finalized (on
+ # Complete) or don't reuse it (on Cancelled).
+ self.with_lock do
+ act_as_system_user do
+ if self.state == Cancelled
+ retryable_requests = ContainerRequest.where("container_uuid = ? and priority > 0 and state = 'Committed' and container_count < container_count_max", uuid)
+ else
+ retryable_requests = []
+ end
- if retryable_requests.any?
- c_attrs = {
- command: self.command,
- cwd: self.cwd,
- environment: self.environment,
- output_path: self.output_path,
- container_image: self.container_image,
- mounts: self.mounts,
- runtime_constraints: self.runtime_constraints,
- scheduling_parameters: self.scheduling_parameters,
- secret_mounts: self.secret_mounts_was,
- runtime_token: self.runtime_token_was,
- runtime_user_uuid: self.runtime_user_uuid,
- runtime_auth_scopes: self.runtime_auth_scopes
- }
- c = Container.create! c_attrs
- retryable_requests.each do |cr|
- cr.with_lock do
- leave_modified_by_user_alone do
- # Use row locking because this increments container_count
- cr.container_uuid = c.uuid
- cr.save!
+ if retryable_requests.any?
+ c_attrs = {
+ command: self.command,
+ cwd: self.cwd,
+ environment: self.environment,
+ output_path: self.output_path,
+ container_image: self.container_image,
+ mounts: self.mounts,
+ runtime_constraints: self.runtime_constraints,
+ scheduling_parameters: self.scheduling_parameters,
+ secret_mounts: prev_secret_mounts,
+ runtime_token: prev_runtime_token,
+ runtime_user_uuid: self.runtime_user_uuid,
+ runtime_auth_scopes: self.runtime_auth_scopes
+ }
+ c = Container.create! c_attrs
+ retryable_requests.each do |cr|
+ cr.with_lock do
+ leave_modified_by_user_alone do
+ # Use row locking because this increments container_count
+ cr.container_uuid = c.uuid
+ cr.save!
+ end
end
end
end
- end
- # Notify container requests associated with this container
- ContainerRequest.where(container_uuid: uuid,
- state: ContainerRequest::Committed).each do |cr|
- leave_modified_by_user_alone do
- cr.finalize!
+ # Notify container requests associated with this container
+ ContainerRequest.where(container_uuid: uuid,
+ state: ContainerRequest::Committed).each do |cr|
+ leave_modified_by_user_alone do
+ cr.finalize!
+ end
end
- end
- # Cancel outstanding container requests made by this container.
- ContainerRequest.
- includes(:container).
- where(requesting_container_uuid: uuid,
- state: ContainerRequest::Committed).each do |cr|
- leave_modified_by_user_alone do
- cr.update_attributes!(priority: 0)
- cr.container.reload
- if cr.container.state == Container::Queued || cr.container.state == Container::Locked
- # If the child container hasn't started yet, finalize the
- # child CR now instead of leaving it "on hold", i.e.,
- # Queued with priority 0. (OTOH, if the child is already
- # running, leave it alone so it can get cancelled the
- # usual way, get a copy of the log collection, etc.)
- cr.update_attributes!(state: ContainerRequest::Final)
+ # Cancel outstanding container requests made by this container.
+ ContainerRequest.
+ includes(:container).
+ where(requesting_container_uuid: uuid,
+ state: ContainerRequest::Committed).each do |cr|
+ leave_modified_by_user_alone do
+ cr.update_attributes!(priority: 0)
+ cr.container.reload
+ if cr.container.state == Container::Queued || cr.container.state == Container::Locked
+ # If the child container hasn't started yet, finalize the
+ # child CR now instead of leaving it "on hold", i.e.,
+ # Queued with priority 0. (OTOH, if the child is already
+ # running, leave it alone so it can get cancelled the
+ # usual way, get a copy of the log collection, etc.)
+ cr.update_attributes!(state: ContainerRequest::Final)
+ end
end
end
end
diff --git a/services/api/app/models/container_request.rb b/services/api/app/models/container_request.rb
index 24882860e..45db4ee91 100644
--- a/services/api/app/models/container_request.rb
+++ b/services/api/app/models/container_request.rb
@@ -119,13 +119,34 @@ class ContainerRequest < ArvadosModel
end
def finalize_if_needed
- if state == Committed && Container.find_by_uuid(container_uuid).final?
- reload
- act_as_system_user do
- leave_modified_by_user_alone do
- finalize!
+ return if state != Committed
+ while true
+ # get container lock first, then lock current container request
+ # (same order as Container#handle_completed). Locking always
+ # reloads the Container and ContainerRequest records.
+ c = Container.find_by_uuid(container_uuid)
+ c.lock!
+ self.lock!
+
+ if container_uuid != c.uuid
+ # After locking, we've noticed a race, the container_uuid is
+ # different than the container record we just loaded. This
+ # can happen if Container#handle_completed scheduled a new
+ # container for retry and set container_uuid while we were
+ # waiting on the container lock. Restart the loop and get the
+ # new container.
+ redo
+ end
+
+ if state == Committed && c.final?
+ # The current container is
+ act_as_system_user do
+ leave_modified_by_user_alone do
+ finalize!
+ end
end
end
+ return true
end
end
@@ -210,7 +231,18 @@ class ContainerRequest < ArvadosModel
return false
end
if state_changed? and state == Committed and container_uuid.nil?
- self.container_uuid = Container.resolve(self).uuid
+ while true
+ c = Container.resolve(self)
+ c.lock!
+ if c.state == Container::Cancelled
+ # Lost a race, we have a lock on the container but the
+ # container was cancelled in a different request, restart
+ # the loop and resolve request to a new container.
+ redo
+ end
+ self.container_uuid = c.uuid
+ break
+ end
end
if self.container_uuid != self.container_uuid_was
if self.container_count_changed?
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list