[ARVADOS] updated: 25af6c40181b95f13ec3e9e366c53cb50868d065
Git user
git at public.curoverse.com
Wed Feb 22 14:49:49 EST 2017
Summary of changes:
.../methods/container_requests.html.textile.liquid | 15 +--
sdk/cli/bin/crunch-job | 5 +-
sdk/cli/test/binstub_output_coll_owner/python | 4 +
sdk/cli/test/test_crunch-job.rb | 9 ++
sdk/go/dispatch/dispatch.go | 18 +--
services/api/app/models/arvados_model.rb | 6 +-
services/api/app/models/container.rb | 43 +++----
services/api/db/structure.sql | 130 ++++++++++-----------
services/api/lib/crunch_dispatch.rb | 64 +++++-----
services/api/lib/has_uuid.rb | 4 +-
services/api/lib/sweep_trashed_collections.rb | 3 +
services/api/test/unit/collection_test.rb | 16 +++
services/api/test/unit/container_test.rb | 12 +-
services/api/test/unit/crunch_dispatch_test.rb | 3 +-
services/api/test/unit/fail_jobs_test.rb | 6 +-
services/api/test/unit/link_test.rb | 24 ++--
services/nodemanager/arvnodeman/jobqueue.py | 2 +-
17 files changed, 196 insertions(+), 168 deletions(-)
create mode 100755 sdk/cli/test/binstub_output_coll_owner/python
via 25af6c40181b95f13ec3e9e366c53cb50868d065 (commit)
via 3a31db02591a2f57d51b98ba9add7d835c5c6c26 (commit)
via 0c529ed05805507b4d2c903b9587e9b61cec5ee6 (commit)
via 423d27d0d7439e95dd8ef6b1dbfe890055cc0fa9 (commit)
via 802af81e13dd11a7f2d9796a2ada8faf3b722477 (commit)
via 1732df243e8ddc1eab0eca157e4b83bd8079f774 (commit)
via 8b75947ee3f99b87eec443763653ca6ae3eb21e1 (commit)
via 4bd54c4c42f13434b84410b7917a8cd208d613d7 (commit)
via 4f0e07d462b7860bb10686c27fac16970220377f (commit)
via 4cd89bd1767bece226c412ae7c9ea37669e8706b (commit)
via d069de03a99dc58fd38f241435fcbaac84e9f63a (commit)
via d0ba6eaac1387ad817de1c2df2d7f4f00800aaa5 (commit)
via b1fffbeb4e06d0ec36c41c2fd9a0f23871f081c5 (commit)
via 484370cbfe47b04e1d4222dd4a7606171c87a324 (commit)
via 85887cd7fed798345e340480062b8ffcf3cf053a (commit)
via 3a6966e5997ed5de342947759042ec5584f770c6 (commit)
via 0403a0c816df1edea311b9197147fd254d131712 (commit)
via 264ffa31bae106bb6c36643e13186289b6cd0e18 (commit)
from 2b34839cdf95291a7356554e05e50b9ced177dd6 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 25af6c40181b95f13ec3e9e366c53cb50868d065
Merge: 3a31db0 0c529ed
Author: radhika <radhika at curoverse.com>
Date: Wed Feb 22 14:49:37 2017 -0500
Merge branch 'master' into 10979-cancelled-job-nodes
commit 3a31db02591a2f57d51b98ba9add7d835c5c6c26
Author: radhika <radhika at curoverse.com>
Date: Wed Feb 22 14:48:47 2017 -0500
10979: refactor squeue invocations
diff --git a/sdk/go/dispatch/dispatch.go b/sdk/go/dispatch/dispatch.go
index 371cbb1..990d74e 100644
--- a/sdk/go/dispatch/dispatch.go
+++ b/sdk/go/dispatch/dispatch.go
@@ -205,6 +205,9 @@ func (d *Dispatcher) Unlock(uuid string) error {
// TrackContainer starts a tracker for given uuid if one is not already existing, despite its state.
func (d *Dispatcher) TrackContainer(uuid string) {
+ d.mtx.Lock()
+ defer d.mtx.Unlock()
+
if d.trackers == nil {
d.trackers = make(map[string]*runTracker)
}
@@ -214,14 +217,6 @@ func (d *Dispatcher) TrackContainer(uuid string) {
return
}
- d.mtx.Lock()
- defer d.mtx.Unlock()
-
- _, alreadyTracking = d.trackers[uuid]
- if alreadyTracking {
- return
- }
-
var cntr arvados.Container
err := d.Arv.Call("GET", "containers", uuid, "", nil, &cntr)
if err != nil {
@@ -229,10 +224,7 @@ func (d *Dispatcher) TrackContainer(uuid string) {
return
}
- tracker := &runTracker{updates: make(chan arvados.Container, 1)}
- tracker.updates <- cntr
-
- d.trackers[uuid] = tracker
+ d.trackers[uuid] = d.start(c)
}
type runTracker struct {
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 34fed92..cf19cc4 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -871,22 +871,14 @@ class CrunchDispatch
end
Rails.logger.info "fail_jobs: threshold is #{threshold}"
- if Rails.configuration.crunch_job_wrapper == :slurm_immediate
- # [["slurm_job_id", "slurm_job_name"], ...]
- squeue = File.popen(['squeue', '-h', '-o', '%i %j']).readlines.map do |line|
- line.strip.split(' ', 2)
- end
- else
- squeue = []
- end
-
+ squeue = squeue_jobs
Job.where('state = ? and started_at < ?', Job::Running, threshold).
each do |job|
Rails.logger.debug "fail_jobs: #{job.uuid} started #{job.started_at}"
- squeue.each do |slurm_id, slurm_name|
+ squeue.each do |slurm_name|
if slurm_name == job.uuid
- Rails.logger.info "fail_jobs: scancel #{slurm_id} for #{job.uuid}"
- scancel slurm_id
+ Rails.logger.info "fail_jobs: scancel #{job.uuid}"
+ scancel slurm_name
end
end
fail_job(job, "cleaned up stale job: started before #{threshold}",
@@ -897,21 +889,19 @@ class CrunchDispatch
def check_orphaned_slurm_jobs
act_as_system_user do
- if Rails.configuration.crunch_job_wrapper == :slurm_immediate
- squeue_uuids = File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line|
- line.strip.split(' ', 1)
- end.collect{|l| l[0]}.
- select{|uuid| uuid.match(HasUuid::UUID_REGEX)}.
- select{|uuid| !@running.has_key?(uuid)}
-
- return if squeue_uuids.size == 0
-
- scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state=? or modified_at>?)',
- squeue_uuids, 'Running', (Time.now - 60)).collect(&:uuid)
- scancel_uuids.each do |uuid|
- Rails.logger.info "orphaned job: scancel #{uuid}"
- scancel uuid, true
- end
+ squeue_uuids = squeue_jobs.select{|uuid| uuid.match(HasUuid::UUID_REGEX)}.
+ select{|uuid| !@running.has_key?(uuid)}
+
+ return if squeue_uuids.size == 0
+
+ scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state in (?) or modified_at>?)',
+ squeue_uuids,
+ ['Running', 'Queued'],
+ (Time.now - 60)).
+ collect(&:uuid)
+ scancel_uuids.each do |uuid|
+ Rails.logger.info "orphaned job: scancel #{uuid}"
+ scancel uuid
end
end
end
@@ -970,14 +960,22 @@ class CrunchDispatch
end
end
- def scancel slurm_id, use_name=false
- scancel_cmd = ['scancel']
- scancel_cmd << '-n' if use_name
- scancel_cmd << slurm_id
- cmd = sudo_preface + scancel_cmd
+ # An array of job_uuids in squeue
+ def squeue_jobs
+ if Rails.configuration.crunch_job_wrapper == :slurm_immediate
+ squeue = File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line|
+ line.strip
+ end
+ else
+ squeue = []
+ end
+ end
+
+ def scancel slurm_name
+ cmd = sudo_preface + ['scancel', '-n', slurm_name]
puts File.popen(cmd).read
if not $?.success?
- Rails.logger.error "scancel #{slurm_id.shellescape}: $?"
+ Rails.logger.error "scancel #{slurm_name.shellescape}: $?"
end
end
end
diff --git a/services/api/test/unit/crunch_dispatch_test.rb b/services/api/test/unit/crunch_dispatch_test.rb
index 8e604ee..4646f7a 100644
--- a/services/api/test/unit/crunch_dispatch_test.rb
+++ b/services/api/test/unit/crunch_dispatch_test.rb
@@ -209,6 +209,7 @@ class CrunchDispatchTest < ActiveSupport::TestCase
dispatch = CrunchDispatch.new
squeue_resp = File.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\n")
+ scancel_resp = File.popen("true")
File.expects(:popen).
with(['squeue', '-a', '-h', '-o', '%j']).
@@ -216,7 +217,7 @@ class CrunchDispatchTest < ActiveSupport::TestCase
File.expects(:popen).
with(dispatch.sudo_preface + ['scancel', '-n', 'zzzzz-8i9sb-4cf0abc123e809j']).
- returns(squeue_resp)
+ returns(scancel_resp)
dispatch.check_orphaned_slurm_jobs
end
diff --git a/services/api/test/unit/fail_jobs_test.rb b/services/api/test/unit/fail_jobs_test.rb
index 1f5847a..8c6539e 100644
--- a/services/api/test/unit/fail_jobs_test.rb
+++ b/services/api/test/unit/fail_jobs_test.rb
@@ -38,13 +38,13 @@ class FailJobsTest < ActiveSupport::TestCase
test 'cancel slurm jobs' do
Rails.configuration.crunch_job_wrapper = :slurm_immediate
Rails.configuration.crunch_job_user = 'foobar'
- fake_squeue = File.popen("echo 1234 #{@job[:before_reboot].uuid}")
+ fake_squeue = File.popen("echo #{@job[:before_reboot].uuid}")
fake_scancel = File.popen("true")
File.expects(:popen).
- with(['squeue', '-h', '-o', '%i %j']).
+ with(['squeue', '-a', '-h', '-o', '%j']).
returns(fake_squeue)
File.expects(:popen).
- with(includes('sudo', '-u', 'foobar', 'scancel', '1234')).
+ with(includes('sudo', '-u', 'foobar', 'scancel', '-n', @job[:before_reboot].uuid)).
returns(fake_scancel)
@dispatch.fail_jobs(before: Time.at(BOOT_TIME).to_s)
assert_end_states
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list