[ARVADOS] created: 0888c5335f4e1868cdcebda6b6f76c2138978c6b

Git user git at public.curoverse.com
Tue Sep 19 22:21:42 EDT 2017


        at  0888c5335f4e1868cdcebda6b6f76c2138978c6b (commit)


commit 0888c5335f4e1868cdcebda6b6f76c2138978c6b
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Sep 19 22:14:37 2017 -0400

    12084: Fix dispatcher getting bogged down on "too many open files".
    
    Previously, handling the "too many open files" error from popen3
    consisted of sleeping 1 second and trying the next job in the
    queue. When lots of jobs were queued, this meant getting stuck in
    start_jobs() for a long time, futilely trying to start different jobs,
    which all failed for the same reason.
    
    Worse, being stuck here meant none of the working jobs could finish,
    which meant no file descriptors could be freed.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index ca6e28b..3cabc1e 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -429,8 +429,11 @@ class CrunchDispatch
         i, o, e, t = Open3.popen3(*cmd_args)
       rescue
         $stderr.puts "dispatch: popen3: #{$!}"
-        sleep 1
-        next
+        # This is a dispatch problem like "Too many open files";
+        # retrying another job right away would be futile. Just return
+        # and hope things are better next time, after (at least) a
+        # did_recently() delay.
+        return
       end
 
       $stderr.puts "dispatch: job #{job.uuid}"

commit 6175b80719275d88f7c2bb0a8c15417dc9eb246b
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Sep 19 21:54:05 2017 -0400

    12084: Remove dead code.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 643eace..ca6e28b 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -633,32 +633,11 @@ class CrunchDispatch
     pid_done = nil
     j_done = nil
 
-    if false
-      begin
-        pid_done = waitpid(-1, Process::WNOHANG | Process::WUNTRACED)
-        if pid_done
-          j_done = @running.values.
-            select { |j| j[:wait_thr].pid == pid_done }.
-            first
-        end
-      rescue SystemCallError
-        # I have @running processes but system reports I have no
-        # children. This is likely to happen repeatedly if it happens at
-        # all; I will log this no more than once per child process I
-        # start.
-        if 0 < @running.select { |uuid,j| j[:warned_waitpid_error].nil? }.size
-          children = @running.values.collect { |j| j[:wait_thr].pid }.join ' '
-          $stderr.puts "dispatch: IPC bug: waitpid() error (#{$!}), but I have children #{children}"
-        end
-        @running.each do |uuid,j| j[:warned_waitpid_error] = true end
-      end
-    else
-      @running.each do |uuid, j|
-        if !j[:wait_thr].status
-          pid_done = j[:wait_thr].pid
-          j_done = j
-          break
-        end
+    @running.each do |uuid, j|
+      if !j[:wait_thr].status
+        pid_done = j[:wait_thr].pid
+        j_done = j
+        break
       end
     end
 

commit fa2f64c6a39809e47bab8639ddee958b0b7ebdc3
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Sep 19 21:51:38 2017 -0400

    12084: Reap children whose threads exit abnormally (status == nil).
    
    Ruby's Thread#status returns false if the thread terminated normally,
    and nil if it terminated with an exception; either way the thread is
    finished and its child process must be reaped.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 230f03e..643eace 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -654,9 +654,10 @@ class CrunchDispatch
       end
     else
       @running.each do |uuid, j|
-        if j[:wait_thr].status == false
+        if !j[:wait_thr].status
           pid_done = j[:wait_thr].pid
           j_done = j
+          break
         end
       end
     end

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list