[ARVADOS] created: b3d286cda65b90e4dd0aaef88f085f45ea855ed5
Git user
git at public.curoverse.com
Wed Mar 29 11:50:03 EDT 2017
at b3d286cda65b90e4dd0aaef88f085f45ea855ed5 (commit)
commit b3d286cda65b90e4dd0aaef88f085f45ea855ed5
Author: Tom Clegg <tom at curoverse.com>
Date: Wed Mar 29 11:49:49 2017 -0400
11235: Log a message when a job is interrupted by node failure.
...and say what's going to happen as a result, even if that is not
"giving up" yet.
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index bea1657..2ae99f0 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -684,17 +684,20 @@ class CrunchDispatch
jobrecord = Job.find_by_uuid(job_done.uuid)
if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid)
+ $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure")
# Only this crunch-dispatch process can retry the job:
# it's already locked, and there's no way to put it back in the
# Queued state. Put it in our internal todo list unless the job
# has failed this way excessively.
@job_retry_counts[jobrecord.uuid] += 1
exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT
+ do_what_next = "give up now"
if exit_tempfail
@todo_job_retries[jobrecord.uuid] = jobrecord
- else
- $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up")
+ do_what_next = "re-attempt"
end
+ $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " +
+ "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}")
end
if !exit_tempfail
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list