[ARVADOS] updated: 2a64eae3cf8363c596feda5337ea20ce356ca11f
git at public.curoverse.com
git at public.curoverse.com
Wed Jun 4 10:00:01 EDT 2014
Summary of changes:
sdk/cli/bin/arv-run-pipeline-instance | 2 +-
services/api/script/crunch-dispatch.rb | 48 +++++++++++++++++++++++++---------
2 files changed, 37 insertions(+), 13 deletions(-)
via 2a64eae3cf8363c596feda5337ea20ce356ca11f (commit)
via 114df81b90be76e6921b9f20c9ddb272567c82e1 (commit)
from a276e40691a8f96b321879de2279159ef08b804f (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 2a64eae3cf8363c596feda5337ea20ce356ca11f
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Mon Jun 2 16:44:26 2014 -0400
2955: crunch-dispatch now sends a clean environment to crunch-job. (cherry-picked from #2882)
Conflicts:
services/api/script/crunch-dispatch.rb
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 0e9e069..f49f21b 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -1,5 +1,7 @@
#!/usr/bin/env ruby
+require 'trollop'
+
include Process
$warned = {}
@@ -20,6 +22,10 @@ if ENV["CRUNCH_DISPATCH_LOCKFILE"]
end
end
+$trollopts = Trollop::options do
+ opt :use_env, "Pass selected environment variables (PATH, PYTHONPATH, RUBYLIB, GEM_PATH, PERLLIB) to crunch-job"
+end
+
ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
require File.dirname(__FILE__) + '/../config/boot'
@@ -152,9 +158,23 @@ class Dispatcher
end
if Server::Application.config.crunch_job_user
- cmd_args.unshift("sudo", "-E", "-u",
- Server::Application.config.crunch_job_user,
- "PERLLIB=#{ENV['PERLLIB']}")
+ cmd_args.unshift("sudo", "-E", "-u", Server::Application.config.crunch_job_user)
+ end
+
+ cmd_args << "HOME=/dev/null"
+ cmd_args << "ARVADOS_API_HOST=#{ENV['ARVADOS_API_HOST']}"
+ cmd_args << "ARVADOS_API_HOST_INSECURE=#{ENV['ARVADOS_API_HOST_INSECURE']}" if ENV['ARVADOS_API_HOST_INSECURE']
+
+ ENV.each do |k, v|
+ cmd_args << "#{k}=#{v}" if k.starts_with? "CRUNCH_"
+ end
+
+ if $trollopts.use_env
+ cmd_args << "PATH=#{ENV['PATH']}"
+ cmd_args << "PYTHONPATH=#{ENV['PYTHONPATH']}"
+ cmd_args << "PERLLIB=#{ENV['PERLLIB']}"
+ cmd_args << "RUBYLIB=#{ENV['RUBYLIB']}"
+ cmd_args << "GEM_PATH=#{ENV['GEM_PATH']}"
end
job_auth = ApiClientAuthorization.
@@ -194,10 +214,10 @@ class Dispatcher
cmd_args << '--git-dir'
cmd_args << arvados_internal
- $stderr.puts "dispatch: #{cmd_args.join ' '}"
+ $stderr.puts "dispatch: #{cmd_args}"
begin
- i, o, e, t = Open3.popen3(*cmd_args)
+ i, o, e, t = Open3.popen3({}, *cmd_args, { :unsetenv_others => true})
rescue
$stderr.puts "dispatch: popen3: #{$!}"
sleep 1
commit 114df81b90be76e6921b9f20c9ddb272567c82e1
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Jun 4 09:55:24 2014 -0400
2955: Added check that an orphan job hasn't produced any logs for 5 minutes
before automatically failing it. arv-run-pipeline-instance will identify jobs
that are running=false and success=false as failed even if finished_at is null.
diff --git a/sdk/cli/bin/arv-run-pipeline-instance b/sdk/cli/bin/arv-run-pipeline-instance
index fc636df..e9a7654 100755
--- a/sdk/cli/bin/arv-run-pipeline-instance
+++ b/sdk/cli/bin/arv-run-pipeline-instance
@@ -578,7 +578,7 @@ class WhRunPipelineInstance
failed = 0
@components.each do |cname, c|
if c[:job]
- if c[:job][:finished_at]
+ if c[:job][:finished_at] or (c[:job][:running] == false and c[:job][:success] == false)
ended += 1
if c[:job][:success] == true
succeeded += 1
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index bde9b67..0e9e069 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -38,13 +38,17 @@ class Dispatcher
def refresh_running
Job.running.each do |jobrecord|
if !@running[jobrecord.uuid]
- # job is marked running, but not actually running. so fail it
- jobrecord.running = false
- jobrecord.finished_at ||= Time.now
- if jobrecord.success.nil?
- jobrecord.success = false
+ f = Log.filter(["object_uuid", "=", jobrecord.uuid]).limit(1).order("created_at desc").results.first
+ if (Time.now - f.created_at) > 300
+ # job is marked running, but not known to crunch-dispatcher, and
+ # hasn't produced any log entries for 5 minutes, so mark it as failed.
+ jobrecord.running = false
+ jobrecord.finished_at ||= Time.now
+ if jobrecord.success.nil?
+ jobrecord.success = false
+ end
+ jobrecord.save!
end
- jobrecord.save!
end
end
end
@@ -382,7 +386,7 @@ class Dispatcher
end
end
else
- refresh_running unless did_recently(:refresh_running, 30.0)
+ refresh_running unless did_recently(:refresh_running, 60.0)
refresh_todo unless did_recently(:refresh_todo, 1.0)
update_node_status
unless @todo.empty? or did_recently(:start_jobs, 1.0) or $signal[:term]
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list