[ARVADOS] created: 3d84dd62a85647e9b3dfc34af8303c09b9923498

git at public.curoverse.com git at public.curoverse.com
Wed Sep 24 14:36:38 EDT 2014


        at  3d84dd62a85647e9b3dfc34af8303c09b9923498 (commit)


commit 3d84dd62a85647e9b3dfc34af8303c09b9923498
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 24 14:36:21 2014 -0400

    3899: crunch-dispatch uses state column to mark crashed jobs as failed.

diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 08ea229..69c2d57 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -417,14 +417,11 @@ class Dispatcher
     exit_status = j_done[:wait_thr].value
 
     jobrecord = Job.find_by_uuid(job_done.uuid)
-    if exit_status.to_i != 75 and jobrecord.started_at
-      # Clean up state fields in case crunch-job exited without
-      # putting the job in a suitable "finished" state.
-      jobrecord.running = false
-      jobrecord.finished_at ||= Time.now
-      if jobrecord.success.nil?
-        jobrecord.success = false
-      end
+    if exit_status.to_i != 75 and jobrecord.state == "Running"
+      # crunch-job did not return exit code 75 (see below) and left the job in
+      # the "Running" state, which means there was an unhandled error.  Fail
+      # the job.
+      jobrecord.state = "Failed"
       jobrecord.save!
     else
       # Don't fail the job if crunch-job didn't even get as far as

commit 93c7127afd21f82662e9b471e7c617ae8c59ad19
Merge: 68dc1fc 2a8d349
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 24 14:21:50 2014 -0400

    Merge remote-tracking branch 'origin/master' into 3988-crunch-use-job-state


commit 68dc1fcaec00d7d954be19f2c184e04626639616
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 24 14:11:33 2014 -0400

    3899: Fix syntax errors in arv-run-pipeline-instance, and update job record if
    job is queued or running.

diff --git a/sdk/cli/bin/arv-run-pipeline-instance b/sdk/cli/bin/arv-run-pipeline-instance
index bc87c5d..040a71d 100755
--- a/sdk/cli/bin/arv-run-pipeline-instance
+++ b/sdk/cli/bin/arv-run-pipeline-instance
@@ -509,7 +509,7 @@ class WhRunPipelineInstance
         # the job's current state")
         c_already_finished = (c[:job] &&
                               c[:job][:uuid] &&
-                              ["Complete", "Failed", "Cancelled"].include? c[:job][:state])
+                              ["Complete", "Failed", "Cancelled"].include?(c[:job][:state]))
         if !c[:job] and
             c[:script_parameters].select { |pname, p| p.is_a? Hash and p[:output_of]}.empty?
           # No job yet associated with this component and is component inputs
@@ -526,7 +526,7 @@ class WhRunPipelineInstance
             :owner_uuid => owner_uuid,
             :is_locked_by_uuid => (@options[:run_jobs_here] ? owner_uuid : nil),
             :submit_id => my_submit_id,
-            :state => (if @options[:run_jobs_here] then "Running" else "Queued")
+            :state => (if @options[:run_jobs_here] then "Running" else "Queued" end)
           }, {
             # This is the right place to put these attributes when
             # dealing with new API servers.
@@ -582,8 +582,8 @@ class WhRunPipelineInstance
         end
 
         if c[:job] and c[:job][:uuid]
-          if c[:job][:state] == "Running"
-            # Job is running so update copy of job record
+          if ["Running", "Queued"].include?(c[:job][:state])
+            # Job is running (or may be soon) so update copy of job record
             c[:job] = JobCache.get(c[:job][:uuid])
           end
 
@@ -655,8 +655,11 @@ class WhRunPipelineInstance
           elsif c[:job][:state] == "Running"
             # Job is still running
             moretodo = true
-          elsif c[:job][:cancelled_at]
+          elsif c[:job][:state] == "Cancelled"
             debuglog "component #{cname} job #{c[:job][:uuid]} cancelled."
+            moretodo = false
+          elsif c[:job][:state] == "Failed"
+            moretodo = false
           end
         end
       end
@@ -759,16 +762,18 @@ class WhRunPipelineInstance
         @components.each do |cname, c|
           jstatus = if !c[:job]
                       "-"
-                    elsif c[:job][:state] == "Running"
-                      "#{c[:job][:tasks_summary].inspect}"
-                    elsif c[:job][:state] == "Complete"
-                      c[:job][:output]
-                    elsif c[:job][:state] == "Cancelled"
-                      "cancelled #{c[:job][:cancelled_at]}"
-                    elsif c[:job][:state] == "Failed"
-                      "failed #{c[:job][:finished_at]}"
-                    elsif c[:job][:state] == "Queued"
-                      "queued #{c[:job][:created_at]}"
+                    else case c[:job][:state]
+                         when "Running"
+                           "#{c[:job][:tasks_summary].inspect}"
+                         when "Complete"
+                           c[:job][:output]
+                         when "Cancelled"
+                           "cancelled #{c[:job][:cancelled_at]}"
+                         when "Failed"
+                           "failed #{c[:job][:finished_at]}"
+                         when "Queued"
+                           "queued #{c[:job][:created_at]}"
+                         end
                     end
           f.puts "#{cname.to_s.ljust namewidth} #{c[:job] ? c[:job][:uuid] : '-'.ljust(27)} #{jstatus}"
         end

commit 4c53b3aee935b41e11c71ff1468a10d6a80c390c
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 24 13:32:01 2014 -0400

    3988: look at job state instead of cancelled_at to determine if the job should
    be stopped.

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 3dd1627..fbd7bef 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1037,11 +1037,12 @@ sub check_refresh_wanted
       my $Job2 = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
       for my $attr ('cancelled_at',
                     'cancelled_by_user_uuid',
-                    'cancelled_by_client_uuid') {
+                    'cancelled_by_client_uuid',
+                    'state') {
         $Job->{$attr} = $Job2->{$attr};
       }
-      if ($Job->{'cancelled_at'}) {
-        Log (undef, "Job cancelled at " . $Job->{cancelled_at} .
+      if ($Job->{'state'} ne "Running") {
+        Log (undef, "Job state changed to " . $Job->{'state'} . " at " . $Job->{cancelled_at} .
              " by user " . $Job->{cancelled_by_user_uuid});
         $main::success = 0;
         $main::please_freeze = 1;

commit c8ad3a2a0f27964eff5d5c2d845afab728836473
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Sep 24 13:28:58 2014 -0400

    3988: Update crunch-job, arv-run-pipeline-instance and workbench to read/write
    "state" field of jobs instead of using running/success (+
    cancelled_at/started_at/finished_at) to figure out what state the job is in.

diff --git a/apps/workbench/app/views/jobs/_show_recent.html.erb b/apps/workbench/app/views/jobs/_show_recent.html.erb
index b19b7d9..c823fc5 100644
--- a/apps/workbench/app/views/jobs/_show_recent.html.erb
+++ b/apps/workbench/app/views/jobs/_show_recent.html.erb
@@ -83,7 +83,7 @@
             <td>
               <% if j.finished_at.is_a? Time %>
 	      <%= raw('ran ' + distance_of_time_in_words(j.finished_at, j.started_at).sub('about ','~').sub(' ',' ')) %>
-              <% elsif j.running %>
+              <% elsif j.state == "Running" %>
               <span class="badge badge-success" title="tasks finished">&#x2714; <%= j.tasks_summary[:done] %></span>
               <span class="badge badge-info" title="tasks running">&#x2708; <%= j.tasks_summary[:running] %></span>
               <span class="badge" title="tasks todo">&#x2709; <%= j.tasks_summary[:todo] %></span>
diff --git a/apps/workbench/app/views/jobs/show.html.erb b/apps/workbench/app/views/jobs/show.html.erb
index d3047ce..276aec5 100644
--- a/apps/workbench/app/views/jobs/show.html.erb
+++ b/apps/workbench/app/views/jobs/show.html.erb
@@ -1,5 +1,5 @@
 <% content_for :tab_line_buttons do %>
-    <% if @object.running %>
+    <% if @object.state == "Running" %>
     <%= form_tag "/jobs/#{@object.uuid}/cancel", style: "display:inline; padding-left: 1em" do |f| %>
       <%= button_tag "Cancel running job", {class: 'btn btn-sm btn-danger', id: "cancel-job-button"} %>
     <% end %>
diff --git a/apps/workbench/app/views/users/_tables.html.erb b/apps/workbench/app/views/users/_tables.html.erb
index ebb5201..acde5ce 100644
--- a/apps/workbench/app/views/users/_tables.html.erb
+++ b/apps/workbench/app/views/users/_tables.html.erb
@@ -59,7 +59,7 @@
 
             <td>
               <small>
-                <% if j.success and j.output %>
+                <% if j.state == "Complete" and j.output %>
                   <a href="<%= collection_path(j.output) %>">
                     <% collections = collections_for_object(j.output) %>
                       <% if collections && !collections.empty? %>
diff --git a/sdk/cli/bin/arv-run-pipeline-instance b/sdk/cli/bin/arv-run-pipeline-instance
index 472c20b..bc87c5d 100755
--- a/sdk/cli/bin/arv-run-pipeline-instance
+++ b/sdk/cli/bin/arv-run-pipeline-instance
@@ -509,7 +509,7 @@ class WhRunPipelineInstance
         # the job's current state")
         c_already_finished = (c[:job] &&
                               c[:job][:uuid] &&
-                              !c[:job][:success].nil?)
+                              ["Complete", "Failed", "Cancelled"].include? c[:job][:state])
         if !c[:job] and
             c[:script_parameters].select { |pname, p| p.is_a? Hash and p[:output_of]}.empty?
           # No job yet associated with this component and is component inputs
@@ -526,6 +526,7 @@ class WhRunPipelineInstance
             :owner_uuid => owner_uuid,
             :is_locked_by_uuid => (@options[:run_jobs_here] ? owner_uuid : nil),
             :submit_id => my_submit_id,
+            :state => (if @options[:run_jobs_here] then "Running" else "Queued")
           }, {
             # This is the right place to put these attributes when
             # dealing with new API servers.
@@ -546,7 +547,7 @@ class WhRunPipelineInstance
           end
         end
 
-        if c[:job] and c[:run_in_process] and c[:job][:success].nil?
+        if c[:job] and c[:run_in_process] and not ["Complete", "Failed", "Cancelled"].include? c[:job][:state]
           report_status
           begin
             require 'open3'
@@ -575,21 +576,18 @@ class WhRunPipelineInstance
             debuglog "Interrupted (#{e}). Failing job.", 0
             $arv.job.update(uuid: c[:job][:uuid],
                             job: {
-                              finished_at: Time.now,
-                              running: false,
-                              success: false
+                              state: "Failed"
                             })
           end
         end
 
         if c[:job] and c[:job][:uuid]
-          if (c[:job][:running] or
-              not (c[:job][:finished_at] or c[:job][:cancelled_at]))
+          if c[:job][:state] == "Running"
             # Job is running so update copy of job record
             c[:job] = JobCache.get(c[:job][:uuid])
           end
 
-          if c[:job][:success]
+          if c[:job][:state] == "Complete"
             # Populate script_parameters of other components waiting for
             # this job
             @components.each do |c2name, c2|
@@ -654,8 +652,7 @@ class WhRunPipelineInstance
                 end
               end
             end
-          elsif c[:job][:running] ||
-              (!c[:job][:started_at] && !c[:job][:cancelled_at])
+          elsif c[:job][:state] == "Running"
             # Job is still running
             moretodo = true
           elsif c[:job][:cancelled_at]
@@ -686,21 +683,17 @@ class WhRunPipelineInstance
       end
     end
 
-    ended = 0
-    succeeded = 0
-    failed = 0
-    @components.each do |cname, c|
-      if c[:job]
-        if c[:job][:finished_at] or c[:job][:cancelled_at] or (c[:job][:running] == false and c[:job][:success] == false)
-          ended += 1
-          if c[:job][:success] == true
-            succeeded += 1
-          elsif c[:job][:success] == false or c[:job][:cancelled_at]
-            failed += 1
-          end
-        end
-      end
-    end
+    ended = @components.map { |cname, c| 
+      if c[:job] and ["Complete", "Failed", "Cancelled"].include? c[:job][:state] then 1 else 0 end 
+    }.reduce(:+) || 0
+
+    succeeded = @components.map { |cname, c| 
+      if c[:job] and ["Complete"].include? c[:job][:state] then 1 else 0 end 
+    }.reduce(:+) || 0
+
+    failed = @components.map { |cname, c| 
+      if c[:job] and ["Failed", "Cancelled"].include? c[:job][:state] then 1 else 0  end 
+    }.reduce(:+) || 0
 
     success = (succeeded == @components.length)
 
@@ -766,19 +759,15 @@ class WhRunPipelineInstance
         @components.each do |cname, c|
           jstatus = if !c[:job]
                       "-"
-                    elsif c[:job][:running]
+                    elsif c[:job][:state] == "Running"
                       "#{c[:job][:tasks_summary].inspect}"
-                    elsif c[:job][:success]
+                    elsif c[:job][:state] == "Complete"
                       c[:job][:output]
-                    elsif c[:job][:cancelled_at]
+                    elsif c[:job][:state] == "Cancelled"
                       "cancelled #{c[:job][:cancelled_at]}"
-                    elsif c[:job][:finished_at]
+                    elsif c[:job][:state] == "Failed"
                       "failed #{c[:job][:finished_at]}"
-                    elsif c[:job][:started_at]
-                      "started #{c[:job][:started_at]}"
-                    elsif c[:job][:is_locked_by_uuid]
-                      "starting #{c[:job][:started_at]}"
-                    else
+                    elsif c[:job][:state] == "Queued"
                       "queued #{c[:job][:created_at]}"
                     end
           f.puts "#{cname.to_s.ljust namewidth} #{c[:job] ? c[:job][:uuid] : '-'.ljust(27)} #{jstatus}"
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 70f379e..3dd1627 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -161,6 +161,10 @@ if ($job_has_uuid)
       Log(undef, "Job is locked by " . $Job->{'is_locked_by_uuid'});
       exit EX_TEMPFAIL;
     }
+    if ($Job->{'state'} ne 'Queued') {
+      Log(undef, "Job state is " . $Job->{'state'} . ", but I can only start queued jobs.");
+      exit EX_TEMPFAIL;
+    }
     if ($Job->{'success'} ne undef) {
       Log(undef, "Job 'success' flag (" . $Job->{'success'} . ") is not null");
       exit EX_TEMPFAIL;
@@ -287,9 +291,7 @@ if ($job_has_uuid)
     Log(undef, "Error while updating / locking job, exiting ".EX_TEMPFAIL);
     exit EX_TEMPFAIL;
   }
-  $Job->update_attributes('started_at' => scalar gmtime,
-                          'running' => 1,
-                          'success' => undef,
+  $Job->update_attributes('state' => 'Running',
                           'tasks_summary' => { 'failed' => 0,
                                                'todo' => 1,
                                                'running' => 0,
@@ -876,12 +878,14 @@ Log (undef, "finish");
 save_meta();
 
 if ($job_has_uuid) {
-  $Job->update_attributes('running' => 0,
-                          'success' => $collated_output && $main::success,
-                          'finished_at' => scalar gmtime)
+  if ($collated_output && $main::success) {
+    $Job->update_attributes('state' => 'Complete')
+  } else {
+    $Job->update_attributes('state' => 'Failed')
+  }
 }
 
-exit ($Job->{'success'} ? 1 : 0);
+exit ($Job->{'state'} != 'Complete' ? 1 : 0);
 
 
 
@@ -1336,9 +1340,12 @@ sub croak
 sub cleanup
 {
   return if !$job_has_uuid;
-  $Job->update_attributes('running' => 0,
-                          'success' => 0,
-                          'finished_at' => scalar gmtime);
+  if ($Job->{'cancelled_at'}) {
+    $Job->update_attributes('state' => 'Cancelled',
+                            'finished_at' => scalar gmtime);
+  } else {
+    $Job->update_attributes('state' => 'Failed');
+  }
 }
 
 

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list