[ARVADOS] updated: 25af6c40181b95f13ec3e9e366c53cb50868d065

Git user git at public.curoverse.com
Wed Feb 22 14:49:49 EST 2017


Summary of changes:
 .../methods/container_requests.html.textile.liquid |  15 +--
 sdk/cli/bin/crunch-job                             |   5 +-
 sdk/cli/test/binstub_output_coll_owner/python      |   4 +
 sdk/cli/test/test_crunch-job.rb                    |   9 ++
 sdk/go/dispatch/dispatch.go                        |  18 +--
 services/api/app/models/arvados_model.rb           |   6 +-
 services/api/app/models/container.rb               |  43 +++----
 services/api/db/structure.sql                      | 130 ++++++++++-----------
 services/api/lib/crunch_dispatch.rb                |  64 +++++-----
 services/api/lib/has_uuid.rb                       |   4 +-
 services/api/lib/sweep_trashed_collections.rb      |   3 +
 services/api/test/unit/collection_test.rb          |  16 +++
 services/api/test/unit/container_test.rb           |  12 +-
 services/api/test/unit/crunch_dispatch_test.rb     |   3 +-
 services/api/test/unit/fail_jobs_test.rb           |   6 +-
 services/api/test/unit/link_test.rb                |  24 ++--
 services/nodemanager/arvnodeman/jobqueue.py        |   2 +-
 17 files changed, 196 insertions(+), 168 deletions(-)
 create mode 100755 sdk/cli/test/binstub_output_coll_owner/python

       via  25af6c40181b95f13ec3e9e366c53cb50868d065 (commit)
       via  3a31db02591a2f57d51b98ba9add7d835c5c6c26 (commit)
       via  0c529ed05805507b4d2c903b9587e9b61cec5ee6 (commit)
       via  423d27d0d7439e95dd8ef6b1dbfe890055cc0fa9 (commit)
       via  802af81e13dd11a7f2d9796a2ada8faf3b722477 (commit)
       via  1732df243e8ddc1eab0eca157e4b83bd8079f774 (commit)
       via  8b75947ee3f99b87eec443763653ca6ae3eb21e1 (commit)
       via  4bd54c4c42f13434b84410b7917a8cd208d613d7 (commit)
       via  4f0e07d462b7860bb10686c27fac16970220377f (commit)
       via  4cd89bd1767bece226c412ae7c9ea37669e8706b (commit)
       via  d069de03a99dc58fd38f241435fcbaac84e9f63a (commit)
       via  d0ba6eaac1387ad817de1c2df2d7f4f00800aaa5 (commit)
       via  b1fffbeb4e06d0ec36c41c2fd9a0f23871f081c5 (commit)
       via  484370cbfe47b04e1d4222dd4a7606171c87a324 (commit)
       via  85887cd7fed798345e340480062b8ffcf3cf053a (commit)
       via  3a6966e5997ed5de342947759042ec5584f770c6 (commit)
       via  0403a0c816df1edea311b9197147fd254d131712 (commit)
       via  264ffa31bae106bb6c36643e13186289b6cd0e18 (commit)
      from  2b34839cdf95291a7356554e05e50b9ced177dd6 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 25af6c40181b95f13ec3e9e366c53cb50868d065
Merge: 3a31db0 0c529ed
Author: radhika <radhika at curoverse.com>
Date:   Wed Feb 22 14:49:37 2017 -0500

    Merge branch 'master' into 10979-cancelled-job-nodes


commit 3a31db02591a2f57d51b98ba9add7d835c5c6c26
Author: radhika <radhika at curoverse.com>
Date:   Wed Feb 22 14:48:47 2017 -0500

    10979: refactor squeue invocations

diff --git a/sdk/go/dispatch/dispatch.go b/sdk/go/dispatch/dispatch.go
index 371cbb1..990d74e 100644
--- a/sdk/go/dispatch/dispatch.go
+++ b/sdk/go/dispatch/dispatch.go
@@ -205,6 +205,9 @@ func (d *Dispatcher) Unlock(uuid string) error {
 
 // TrackContainer starts a tracker for given uuid if one is not already existing, despite its state.
 func (d *Dispatcher) TrackContainer(uuid string) {
+	d.mtx.Lock()
+	defer d.mtx.Unlock()
+
 	if d.trackers == nil {
 		d.trackers = make(map[string]*runTracker)
 	}
@@ -214,14 +217,6 @@ func (d *Dispatcher) TrackContainer(uuid string) {
 		return
 	}
 
-	d.mtx.Lock()
-	defer d.mtx.Unlock()
-
-	_, alreadyTracking = d.trackers[uuid]
-	if alreadyTracking {
-		return
-	}
-
 	var cntr arvados.Container
 	err := d.Arv.Call("GET", "containers", uuid, "", nil, &cntr)
 	if err != nil {
@@ -229,10 +224,7 @@ func (d *Dispatcher) TrackContainer(uuid string) {
 		return
 	}
 
-	tracker := &runTracker{updates: make(chan arvados.Container, 1)}
-	tracker.updates <- cntr
-
-	d.trackers[uuid] = tracker
+	d.trackers[uuid] = d.start(c)
 }
 
 type runTracker struct {
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 34fed92..cf19cc4 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -871,22 +871,14 @@ class CrunchDispatch
       end
       Rails.logger.info "fail_jobs: threshold is #{threshold}"
 
-      if Rails.configuration.crunch_job_wrapper == :slurm_immediate
-        # [["slurm_job_id", "slurm_job_name"], ...]
-        squeue = File.popen(['squeue', '-h', '-o', '%i %j']).readlines.map do |line|
-          line.strip.split(' ', 2)
-        end
-      else
-        squeue = []
-      end
-
+      squeue = squeue_jobs
       Job.where('state = ? and started_at < ?', Job::Running, threshold).
         each do |job|
         Rails.logger.debug "fail_jobs: #{job.uuid} started #{job.started_at}"
-        squeue.each do |slurm_id, slurm_name|
+        squeue.each do |slurm_name|
           if slurm_name == job.uuid
-            Rails.logger.info "fail_jobs: scancel #{slurm_id} for #{job.uuid}"
-            scancel slurm_id
+            Rails.logger.info "fail_jobs: scancel #{job.uuid}"
+            scancel slurm_name
           end
         end
         fail_job(job, "cleaned up stale job: started before #{threshold}",
@@ -897,21 +889,19 @@ class CrunchDispatch
 
   def check_orphaned_slurm_jobs
     act_as_system_user do
-      if Rails.configuration.crunch_job_wrapper == :slurm_immediate
-        squeue_uuids = File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line|
-          line.strip.split(' ', 1)
-        end.collect{|l| l[0]}.
-            select{|uuid| uuid.match(HasUuid::UUID_REGEX)}.
-            select{|uuid| !@running.has_key?(uuid)}
-
-        return if squeue_uuids.size == 0
-
-        scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state=? or modified_at>?)',
-                                                 squeue_uuids, 'Running', (Time.now - 60)).collect(&:uuid)
-        scancel_uuids.each do |uuid|
-          Rails.logger.info "orphaned job: scancel #{uuid}"
-          scancel uuid, true
-        end
+      squeue_uuids = squeue_jobs.select{|uuid| uuid.match(HasUuid::UUID_REGEX)}.
+                                  select{|uuid| !@running.has_key?(uuid)}
+
+      return if squeue_uuids.size == 0
+
+      scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state in (?) or modified_at>?)',
+                                               squeue_uuids,
+                                               ['Running', 'Queued'],
+                                               (Time.now - 60)).
+                                         collect(&:uuid)
+      scancel_uuids.each do |uuid|
+        Rails.logger.info "orphaned job: scancel #{uuid}"
+        scancel uuid
       end
     end
   end
@@ -970,14 +960,22 @@ class CrunchDispatch
     end
   end
 
-  def scancel slurm_id, use_name=false
-    scancel_cmd = ['scancel']
-    scancel_cmd << '-n' if use_name
-    scancel_cmd << slurm_id
-    cmd = sudo_preface + scancel_cmd
+  # An array of job_uuids in squeue
+  def squeue_jobs
+    if Rails.configuration.crunch_job_wrapper == :slurm_immediate
+      squeue = File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line|
+        line.strip
+      end
+    else
+      squeue = []
+    end
+  end
+
+  def scancel slurm_name
+    cmd = sudo_preface + ['scancel', '-n', slurm_name]
     puts File.popen(cmd).read
     if not $?.success?
-      Rails.logger.error "scancel #{slurm_id.shellescape}: $?"
+      Rails.logger.error "scancel #{slurm_name.shellescape}: $?"
     end
   end
 end
diff --git a/services/api/test/unit/crunch_dispatch_test.rb b/services/api/test/unit/crunch_dispatch_test.rb
index 8e604ee..4646f7a 100644
--- a/services/api/test/unit/crunch_dispatch_test.rb
+++ b/services/api/test/unit/crunch_dispatch_test.rb
@@ -209,6 +209,7 @@ class CrunchDispatchTest < ActiveSupport::TestCase
       dispatch = CrunchDispatch.new
 
       squeue_resp = File.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\n")
+      scancel_resp = File.popen("true")
 
       File.expects(:popen).
         with(['squeue', '-a', '-h', '-o', '%j']).
@@ -216,7 +217,7 @@ class CrunchDispatchTest < ActiveSupport::TestCase
 
       File.expects(:popen).
         with(dispatch.sudo_preface + ['scancel', '-n', 'zzzzz-8i9sb-4cf0abc123e809j']).
-        returns(squeue_resp)
+        returns(scancel_resp)
 
       dispatch.check_orphaned_slurm_jobs
     end
diff --git a/services/api/test/unit/fail_jobs_test.rb b/services/api/test/unit/fail_jobs_test.rb
index 1f5847a..8c6539e 100644
--- a/services/api/test/unit/fail_jobs_test.rb
+++ b/services/api/test/unit/fail_jobs_test.rb
@@ -38,13 +38,13 @@ class FailJobsTest < ActiveSupport::TestCase
   test 'cancel slurm jobs' do
     Rails.configuration.crunch_job_wrapper = :slurm_immediate
     Rails.configuration.crunch_job_user = 'foobar'
-    fake_squeue = File.popen("echo 1234 #{@job[:before_reboot].uuid}")
+    fake_squeue = File.popen("echo #{@job[:before_reboot].uuid}")
     fake_scancel = File.popen("true")
     File.expects(:popen).
-      with(['squeue', '-h', '-o', '%i %j']).
+      with(['squeue', '-a', '-h', '-o', '%j']).
       returns(fake_squeue)
     File.expects(:popen).
-      with(includes('sudo', '-u', 'foobar', 'scancel', '1234')).
+      with(includes('sudo', '-u', 'foobar', 'scancel', '-n', @job[:before_reboot].uuid)).
       returns(fake_scancel)
     @dispatch.fail_jobs(before: Time.at(BOOT_TIME).to_s)
     assert_end_states

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list