[ARVADOS] updated: 378e6e0cd313541c395893e832e82a85856d5105
git at public.curoverse.com
git at public.curoverse.com
Wed Dec 23 14:29:27 EST 2015
Summary of changes:
services/api/lib/crunch_dispatch.rb | 83 +++++++++++++++++++++++++-----
services/api/script/cancel_stale_jobs.rb | 44 ----------------
services/api/script/fail-jobs.rb | 18 +++++++
services/api/test/fixtures/files/proc_stat | 14 +++++
services/api/test/unit/fail_jobs_test.rb | 79 ++++++++++++++++++++++++++++
5 files changed, 181 insertions(+), 57 deletions(-)
delete mode 100755 services/api/script/cancel_stale_jobs.rb
create mode 100755 services/api/script/fail-jobs.rb
create mode 100644 services/api/test/fixtures/files/proc_stat
create mode 100644 services/api/test/unit/fail_jobs_test.rb
via 378e6e0cd313541c395893e832e82a85856d5105 (commit)
via b3b9aeee4dba20bcddd8cb4ee2cdcd3c8a34eaec (commit)
from 6cef45f57d81e79ee692742e2817285bec00daa0 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 378e6e0cd313541c395893e832e82a85856d5105
Merge: 6cef45f b3b9aee
Author: Tom Clegg <tom at curoverse.com>
Date: Wed Dec 23 14:28:41 2015 -0500
Merge branch '7965-fail-abandoned-jobs' closes #7965
commit b3b9aeee4dba20bcddd8cb4ee2cdcd3c8a34eaec
Author: Tom Clegg <tom at curoverse.com>
Date: Fri Dec 18 14:33:28 2015 -0500
7965: Fail orphaned jobs still marked "running" after reboot.
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 5d598d4..05f85c7 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -2,6 +2,7 @@ require 'open3'
require 'shellwords'
class CrunchDispatch
+ extend DbCurrentTime
include ApplicationHelper
include Process
@@ -194,7 +195,7 @@ class CrunchDispatch
nodelist
end
- def fail_job job, message
+ def fail_job job, message, skip_lock: false
$stderr.puts "dispatch: #{job.uuid}: #{message}"
begin
Log.new(object_uuid: job.uuid,
@@ -206,7 +207,7 @@ class CrunchDispatch
$stderr.puts "dispatch: log.create failed"
end
- if not have_job_lock?(job)
+ if not skip_lock and not have_job_lock?(job)
begin
job.lock @authorizations[job.uuid].user.uuid
rescue ArvadosModel::AlreadyLockedError
@@ -339,16 +340,7 @@ class CrunchDispatch
raise "Unknown crunch_job_wrapper: #{Server::Application.config.crunch_job_wrapper}"
end
- if Server::Application.config.crunch_job_user
- cmd_args.unshift("sudo", "-E", "-u",
- Server::Application.config.crunch_job_user,
- "LD_LIBRARY_PATH=#{ENV['LD_LIBRARY_PATH']}",
- "PATH=#{ENV['PATH']}",
- "PERLLIB=#{ENV['PERLLIB']}",
- "PYTHONPATH=#{ENV['PYTHONPATH']}",
- "RUBYLIB=#{ENV['RUBYLIB']}",
- "GEM_PATH=#{ENV['GEM_PATH']}")
- end
+ cmd_args = sudo_preface + cmd_args
next unless get_authorization job
@@ -362,7 +354,7 @@ class CrunchDispatch
# reasonable thing to do at this point.
repo = Repository.where(name: job.repository).first
if repo.nil? or repo.server_path.nil?
- fail_job "Repository #{job.repository} not found under #{@repo_root}"
+ fail_job job, "Repository #{job.repository} not found under #{@repo_root}"
next
end
ready &&= get_commit repo.server_path, job.script_version
@@ -810,6 +802,51 @@ class CrunchDispatch
end
end
+ def fail_jobs before: nil
+ act_as_system_user do
+ threshold = nil
+ if before == 'reboot'
+ boottime = nil
+ open('/proc/stat').map(&:split).each do |stat, t|
+ if stat == 'btime'
+ boottime = t
+ end
+ end
+ if not boottime
+ raise "Could not find btime in /proc/stat"
+ end
+ threshold = Time.at(boottime.to_i)
+ elsif before
+ threshold = Time.parse(before, Time.now)
+ else
+ threshold = db_current_time
+ end
+ Rails.logger.info "fail_jobs: threshold is #{threshold}"
+
+ if Rails.configuration.crunch_job_wrapper == :slurm_immediate
+ # [["slurm_job_id", "slurm_job_name"], ...]
+ squeue = File.popen(['squeue', '-h', '-o', '%i %j']).readlines.map do |line|
+ line.strip.split(' ', 2)
+ end
+ else
+ squeue = []
+ end
+
+ Job.where('state = ? and started_at < ?', Job::Running, threshold).
+ each do |job|
+ Rails.logger.debug "fail_jobs: #{job.uuid} started #{job.started_at}"
+ squeue.each do |slurm_id, slurm_name|
+ if slurm_name == job.uuid
+ Rails.logger.info "fail_jobs: scancel #{slurm_id} for #{job.uuid}"
+ scancel slurm_id
+ end
+ end
+ fail_job(job, "cleaned up stale job: started before #{threshold}",
+ skip_lock: true)
+ end
+ end
+ end
+
protected
def have_job_lock?(job)
@@ -851,4 +888,24 @@ class CrunchDispatch
running_job[:stderr_flushed_at] = Time.now
end
end
+
+ def scancel slurm_id
+ cmd = sudo_preface + ['scancel', slurm_id]
+ puts File.popen(cmd).read
+ if not $?.success?
+ Rails.logger.error "scancel #{slurm_id.shellescape}: $?"
+ end
+ end
+
+ def sudo_preface
+ return [] if not Server::Application.config.crunch_job_user
+ ["sudo", "-E", "-u",
+ Server::Application.config.crunch_job_user,
+ "LD_LIBRARY_PATH=#{ENV['LD_LIBRARY_PATH']}",
+ "PATH=#{ENV['PATH']}",
+ "PERLLIB=#{ENV['PERLLIB']}",
+ "PYTHONPATH=#{ENV['PYTHONPATH']}",
+ "RUBYLIB=#{ENV['RUBYLIB']}",
+ "GEM_PATH=#{ENV['GEM_PATH']}"]
+ end
end
diff --git a/services/api/script/cancel_stale_jobs.rb b/services/api/script/cancel_stale_jobs.rb
deleted file mode 100755
index 4949ec0..0000000
--- a/services/api/script/cancel_stale_jobs.rb
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env ruby
-
-
-if ENV["CRUNCH_DISPATCH_LOCKFILE"]
- lockfilename = ENV.delete "CRUNCH_DISPATCH_LOCKFILE"
- lockfile = File.open(lockfilename, File::RDWR|File::CREAT, 0644)
- unless lockfile.flock File::LOCK_EX|File::LOCK_NB
- abort "Lock unavailable on #{lockfilename} - exit"
- end
-end
-
-ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
-
-require File.dirname(__FILE__) + '/../config/boot'
-require File.dirname(__FILE__) + '/../config/environment'
-
-class CancelJobs
- include ApplicationHelper
-
- def cancel_stale_jobs
- act_as_system_user do
- Job.running.each do |jobrecord|
- f = Log.where("object_uuid=?", jobrecord.uuid).limit(1).order("created_at desc").first
- if f
- age = (Time.now - f.created_at)
- if age > 300
- $stderr.puts "dispatch: failing orphan job #{jobrecord.uuid}, last log is #{age} seconds old"
- # job is marked running, but not known to crunch-dispatcher, and
- # hasn't produced any log entries for 5 minutes, so mark it as failed.
- jobrecord.running = false
- jobrecord.cancelled_at ||= Time.now
- jobrecord.finished_at ||= Time.now
- if jobrecord.success.nil?
- jobrecord.success = false
- end
- jobrecord.save!
- end
- end
- end
- end
- end
-end
-
-CancelJobs.new.cancel_stale_jobs
diff --git a/services/api/script/fail-jobs.rb b/services/api/script/fail-jobs.rb
new file mode 100755
index 0000000..fd9212c
--- /dev/null
+++ b/services/api/script/fail-jobs.rb
@@ -0,0 +1,18 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+opts = Trollop::options do
+ banner 'Fail jobs that have state=="Running".'
+ banner 'Options:'
+ opt(:before,
+ 'fail only jobs that started before the given time (or "reboot")',
+ type: :string)
+end
+
+ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
+require File.dirname(__FILE__) + '/../config/boot'
+require File.dirname(__FILE__) + '/../config/environment'
+require Rails.root.join('lib/crunch_dispatch.rb')
+
+CrunchDispatch.new.fail_jobs before: opts[:before]
diff --git a/services/api/test/fixtures/files/proc_stat b/services/api/test/fixtures/files/proc_stat
new file mode 100644
index 0000000..eac6c47
--- /dev/null
+++ b/services/api/test/fixtures/files/proc_stat
@@ -0,0 +1,14 @@
+cpu 1632063 14136 880034 1195938459 1041039 63 21266 52811 0 0
+cpu0 291707 2191 123004 199461836 32816 58 4488 13329 0 0
+cpu1 279247 2288 168096 199443605 20358 0 3320 7776 0 0
+cpu2 243805 1099 145178 199516577 19542 0 2656 6975 0 0
+cpu3 225772 3025 145032 199534463 21217 0 2260 6578 0 0
+cpu4 280505 2581 151177 198587478 885147 2 4446 10116 0 0
+cpu5 311025 2950 147545 199394498 61957 2 4093 8035 0 0
+intr 165887918 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8993615 9108946 8200 0 50911 0 12573182 7875376 8631 0 44633 0 10027365 7325091 8544 0 59992 0 9835855 6999541 8145 0 65576 0 9789778 8583897 8184 0 55917 0 10003804 8546910 8448 0 53484 0 463 150 3174990 11523 3836341 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ctxt 255708943
+btime 1448378837
+processes 924315
+procs_running 1
+procs_blocked 0
+softirq 105120691 0 21194262 1261637 20292759 0 0 40708 13638302 27046 48665977
diff --git a/services/api/test/unit/fail_jobs_test.rb b/services/api/test/unit/fail_jobs_test.rb
new file mode 100644
index 0000000..c390b32
--- /dev/null
+++ b/services/api/test/unit/fail_jobs_test.rb
@@ -0,0 +1,79 @@
+require 'test_helper'
+require 'crunch_dispatch'
+
+class FailJobsTest < ActiveSupport::TestCase
+ include DbCurrentTime
+
+ BOOT_TIME = 1448378837
+
+ setup do
+ @job = {}
+ act_as_user users(:admin) do
+ @job[:before_reboot] = Job.create!(state: 'Running',
+ running: true,
+ started_at: Time.at(BOOT_TIME - 300))
+ @job[:after_reboot] = Job.create!(state: 'Running',
+ running: true,
+ started_at: Time.at(BOOT_TIME + 300))
+ @job[:complete] = Job.create!(state: 'Running',
+ running: true,
+ started_at: Time.at(BOOT_TIME - 300))
+ @job[:complete].update_attributes(state: 'Complete')
+ @job[:complete].update_attributes(finished_at: Time.at(BOOT_TIME + 100))
+ @job[:queued] = jobs(:queued)
+
+ @job.values.each do |job|
+ # backdate timestamps
+ Job.where(uuid: job.uuid).
+ update_all(created_at: Time.at(BOOT_TIME - 330),
+ modified_at: (job.finished_at ||
+ job.started_at ||
+ Time.at(BOOT_TIME - 300)))
+ end
+ end
+ @dispatch = CrunchDispatch.new
+ @test_start_time = db_current_time
+ end
+
+ test 'cancel slurm jobs' do
+ Rails.configuration.crunch_job_wrapper = :slurm_immediate
+ Rails.configuration.crunch_job_user = 'foobar'
+ fake_squeue = File.popen("echo 1234 #{@job[:before_reboot].uuid}")
+ fake_scancel = File.popen("true")
+ File.expects(:popen).
+ with(['squeue', '-h', '-o', '%i %j']).
+ returns(fake_squeue)
+ File.expects(:popen).
+ with(includes('sudo', '-u', 'foobar', 'scancel', '1234')).
+ returns(fake_scancel)
+ @dispatch.fail_jobs(before: Time.at(BOOT_TIME).to_s)
+ assert_end_states
+ end
+
+ test 'use reboot time' do
+ Rails.configuration.crunch_job_wrapper = nil
+ @dispatch.expects(:open).once.with('/proc/stat').
+ returns open(Rails.root.join('test/fixtures/files/proc_stat'))
+ @dispatch.fail_jobs(before: 'reboot')
+ assert_end_states
+ end
+
+ test 'command line help' do
+ cmd = Rails.root.join('script/fail-jobs.rb').to_s
+ assert_match /Options:.*--before=/m, File.popen([cmd, '--help']).read
+ end
+
+ protected
+
+ def assert_end_states
+ @job.values.map &:reload
+ assert_equal 'Failed', @job[:before_reboot].state
+ assert_equal false, @job[:before_reboot].running
+ assert_equal false, @job[:before_reboot].success
+ assert_operator @job[:before_reboot].finished_at, :>=, @test_start_time
+ assert_operator @job[:before_reboot].finished_at, :<=, db_current_time
+ assert_equal 'Running', @job[:after_reboot].state
+ assert_equal 'Complete', @job[:complete].state
+ assert_equal 'Queued', @job[:queued].state
+ end
+end
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list