[ARVADOS] updated: caa5dd776dfad5e50592a5cc2824c70ac3474b46
git at public.curoverse.com
git at public.curoverse.com
Thu Sep 25 14:08:44 EDT 2014
Summary of changes:
services/api/app/models/job.rb | 15 ++++++++++
services/api/script/crunch-dispatch.rb | 53 ++++++++++++++++++----------------
2 files changed, 43 insertions(+), 25 deletions(-)
via caa5dd776dfad5e50592a5cc2824c70ac3474b46 (commit)
from 469f117ead24509639fb5b6ba6c9bd1b6067460c (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit caa5dd776dfad5e50592a5cc2824c70ac3474b46
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Sep 25 14:08:25 2014 -0400
3859: Check crunch-job return code properly. Restore job state change
validation.
diff --git a/services/api/app/models/job.rb b/services/api/app/models/job.rb
index ea9990d..81744c7 100644
--- a/services/api/app/models/job.rb
+++ b/services/api/app/models/job.rb
@@ -14,6 +14,7 @@ class Job < ArvadosModel
validate :ensure_script_version_is_commit
validate :find_docker_image_locator
validate :validate_status
+ validate :validate_state_change
has_many :commit_ancestors, :foreign_key => :descendant, :primary_key => :script_version
has_many(:nodes, foreign_key: :job_uuid, primary_key: :uuid)
@@ -330,4 +331,18 @@ class Job < ArvadosModel
end
end
+ def validate_state_change
+ if self.state_changed?
+ if self.state_was.in? [Complete, Failed, Cancelled]
+ # Once in a finished state, don't permit any changes
+ errors.add :state, "invalid change from #{self.state_was} to #{self.state}"
+ return false
+ elsif self.state_was == Running and not self.state.in? [Complete, Failed, Cancelled]
+ # From running, can only transition to a finished state
+ errors.add :state, "invalid change from #{self.state_was} to #{self.state}"
+ return false
+ end
+ end
+ true
+ end
end
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 45fb6dd..68af85d 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -419,34 +419,37 @@ class Dispatcher
end
# Wait the thread (returns a Process::Status)
- exit_status = j_done[:wait_thr].value
+ exit_status = j_done[:wait_thr].value.exitstatus
jobrecord = Job.find_by_uuid(job_done.uuid)
- if exit_status.to_i != 75 and jobrecord.started_at
- # Clean up state fields in case crunch-job exited without
- # putting the job in a suitable "finished" state.
- jobrecord.running = false
- jobrecord.finished_at ||= Time.now
- if jobrecord.success.nil?
- jobrecord.success = false
+ if exit_status != 0
+ # crunch-job exited with some kind of failure.
+ if exit_status != 75 and jobrecord.started_at
+ # Clean up state fields in case crunch-job exited without
+ # putting the job in a suitable "finished" state.
+ jobrecord.running = false
+ jobrecord.finished_at ||= Time.now
+ if jobrecord.success.nil?
+ jobrecord.success = false
+ end
+ jobrecord.save!
+ else
+ # Don't fail the job if crunch-job didn't even get as far as
+ # starting it. If the job failed to run due to an infrastructure
+ # issue with crunch-job or slurm, we want the job to stay in the
+ # queue. If crunch-job exited after losing a race to another
+ # crunch-job process, it exits 75 and we should leave the job
+ # record alone so the winner of the race do its thing.
+ #
+ # There is still an unhandled race condition: If our crunch-job
+ # process is about to lose a race with another crunch-job
+ # process, but crashes before getting to its "exit 75" (for
+ # example, "cannot fork" or "cannot reach API server") then we
+ # will assume incorrectly that it's our process's fault
+ # jobrecord.started_at is non-nil, and mark the job as failed
+ # even though the winner of the race is probably still doing
+ # fine.
end
- jobrecord.save!
- else
- # Don't fail the job if crunch-job didn't even get as far as
- # starting it. If the job failed to run due to an infrastructure
- # issue with crunch-job or slurm, we want the job to stay in the
- # queue. If crunch-job exited after losing a race to another
- # crunch-job process, it exits 75 and we should leave the job
- # record alone so the winner of the race do its thing.
- #
- # There is still an unhandled race condition: If our crunch-job
- # process is about to lose a race with another crunch-job
- # process, but crashes before getting to its "exit 75" (for
- # example, "cannot fork" or "cannot reach API server") then we
- # will assume incorrectly that it's our process's fault
- # jobrecord.started_at is non-nil, and mark the job as failed
- # even though the winner of the race is probably still doing
- # fine.
end
# Invalidate the per-job auth token
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list