[ARVADOS] created: 0888c5335f4e1868cdcebda6b6f76c2138978c6b
Git user
git at public.curoverse.com
Tue Sep 19 22:21:42 EDT 2017
at 0888c5335f4e1868cdcebda6b6f76c2138978c6b (commit)
commit 0888c5335f4e1868cdcebda6b6f76c2138978c6b
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Sep 19 22:14:37 2017 -0400
12084: Fix dispatcher getting bogged down on "too many open files".
Previously, handling the "too many open files" error from popen3
consisted of sleeping 1 second and trying the next job in the
queue. When lots of jobs were queued, this meant getting stuck in
start_jobs() for a long time, futilely trying to start different jobs,
which all failed for the same reason.
Worse, being stuck here meant none of the working jobs could finish,
which meant no file descriptors could be freed.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index ca6e28b..3cabc1e 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -429,8 +429,11 @@ class CrunchDispatch
i, o, e, t = Open3.popen3(*cmd_args)
rescue
$stderr.puts "dispatch: popen3: #{$!}"
- sleep 1
- next
+ # This is a dispatch problem like "Too many open files";
+ # retrying another job right away would be futile. Just return
+ # and hope things are better next time, after (at least) a
+ # did_recently() delay.
+ return
end
$stderr.puts "dispatch: job #{job.uuid}"
commit 6175b80719275d88f7c2bb0a8c15417dc9eb246b
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Sep 19 21:54:05 2017 -0400
12084: Remove dead code.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 643eace..ca6e28b 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -633,32 +633,11 @@ class CrunchDispatch
pid_done = nil
j_done = nil
- if false
- begin
- pid_done = waitpid(-1, Process::WNOHANG | Process::WUNTRACED)
- if pid_done
- j_done = @running.values.
- select { |j| j[:wait_thr].pid == pid_done }.
- first
- end
- rescue SystemCallError
- # I have @running processes but system reports I have no
- # children. This is likely to happen repeatedly if it happens at
- # all; I will log this no more than once per child process I
- # start.
- if 0 < @running.select { |uuid,j| j[:warned_waitpid_error].nil? }.size
- children = @running.values.collect { |j| j[:wait_thr].pid }.join ' '
- $stderr.puts "dispatch: IPC bug: waitpid() error (#{$!}), but I have children #{children}"
- end
- @running.each do |uuid,j| j[:warned_waitpid_error] = true end
- end
- else
- @running.each do |uuid, j|
- if !j[:wait_thr].status
- pid_done = j[:wait_thr].pid
- j_done = j
- break
- end
+ @running.each do |uuid, j|
+ if !j[:wait_thr].status
+ pid_done = j[:wait_thr].pid
+ j_done = j
+ break
end
end
commit fa2f64c6a39809e47bab8639ddee958b0b7ebdc3
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Sep 19 21:51:38 2017 -0400
12084: Reap children whose threads exit abnormally (status == nil).
Ruby's Thread#status returns false if the thread terminated normally, and
nil if it terminated abnormally (i.e., with an exception).
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 230f03e..643eace 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -654,9 +654,10 @@ class CrunchDispatch
end
else
@running.each do |uuid, j|
- if j[:wait_thr].status == false
+ if !j[:wait_thr].status
pid_done = j[:wait_thr].pid
j_done = j
+ break
end
end
end
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list