[ARVADOS] created: d70f2d85958f988be3c227870d7adc578ee12c8a
git at public.curoverse.com
git at public.curoverse.com
Wed Feb 25 09:51:07 EST 2015
at d70f2d85958f988be3c227870d7adc578ee12c8a (commit)
commit d70f2d85958f988be3c227870d7adc578ee12c8a
Author: Brett Smith <brett at curoverse.com>
Date: Wed Feb 25 09:50:53 2015 -0500
5283: Improve reliability of crunch-job output collation.

* Check the results of all pipe opens, exit statuses, and writes.
  Log any problems.
* Have fetch_block return undef when it encounters trouble, rather
  than dying. create_output_collection already checks for this, so it
  effectively bubbles up the error.
* Retry all of the associated API calls.
* Kill the manifest creation pipe if we give up on it, per the TODO.

This probably won't resolve #5283, but hopefully these changes will
give us additional information to help diagnose the problem.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index d69aee6..0d49eae 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1252,16 +1252,19 @@ sub process_stderr
sub fetch_block
{
my $hash = shift;
- my ($keep, $child_out, $output_block);
-
- my $cmd = "arv-get \Q$hash\E";
- open($keep, '-|', $cmd) or die "fetch_block: $cmd: $!";
- $output_block = '';
+ my $keep;
+ if (!open($keep, "-|", "arv-get", "--retries", retry_count(), $hash)) {
+ Log(undef, "fetch_block run error from arv-get $hash: $!");
+ return undef;
+ }
+ my $output_block = "";
while (1) {
my $buf;
my $bytes = sysread($keep, $buf, 1024 * 1024);
if (!defined $bytes) {
- die "reading from arv-get: $!";
+ Log(undef, "fetch_block read error from arv-get: $!");
+ $output_block = undef;
+ last;
} elsif ($bytes == 0) {
# sysread returns 0 at the end of the pipe.
last;
@@ -1271,6 +1274,10 @@ sub fetch_block
}
}
close $keep;
+ if ($?) {
+ Log(undef, "fetch_block arv-get exited " . exit_status_s($?));
+ $output_block = undef;
+ }
return $output_block;
}
@@ -1283,14 +1290,13 @@ sub create_output_collection
Log (undef, "collate");
my ($child_out, $child_in);
- my $pid = open2($child_out, $child_in, 'python', '-c',
- 'import arvados; ' .
- 'import sys; ' .
- 'print arvados.api()' .
- '.collections()' .
- '.create(body={"manifest_text":sys.stdin.read()})' .
- '.execute()["portable_data_hash"]'
- );
+ my $pid = open2($child_out, $child_in, 'python', '-c', q{
+import arvados
+import sys
+print (arvados.api("v1").collections().
+ create(body={"manifest_text": sys.stdin.read()}).
+ execute(num_retries=int(sys.argv[1]))["portable_data_hash"])
+}, retry_count());
my $task_idx = -1;
for (@jobstep)
@@ -1298,35 +1304,46 @@ sub create_output_collection
++$task_idx;
next unless exists $_->{'arvados_task'}->{'output'};
my $output = $_->{'arvados_task'}->{output};
- if ($output !~ /^[0-9a-f]{32}(\+\S+)*$/)
- {
- print $child_in $output;
- }
- elsif (defined (my $outblock = fetch_block ($output)))
- {
- print $child_in $outblock;
+ my $next_write;
+ if ($output =~ /^[0-9a-f]{32}(\+\S+)*$/) {
+ $next_write = fetch_block($output);
+ } else {
+ $next_write = $output;
}
- else
- {
+ if (defined($next_write)) {
+ if (!defined(syswrite($child_in, $next_write))) {
+ # There's been an error writing. Stop the loop.
+ # We'll log details about the exit code later.
+ last;
+ }
+ } else {
my $uuid = $_->{'arvados_task'}->{'uuid'};
Log (undef, "Error retrieving '$output' output by task $task_idx ($uuid)");
$main::success = 0;
}
}
- $child_in->close;
+ close($child_in);
my $joboutput;
my $s = IO::Select->new($child_out);
if ($s->can_read(120)) {
sysread($child_out, $joboutput, 64 * 1024 * 1024);
- chomp($joboutput);
- # TODO: Ensure exit status == 0.
+ waitpid($pid, 0);
+ if ($?) {
+ Log(undef, "output collection creation exited " . exit_status_s($?));
+ $joboutput = undef;
+ } else {
+ chomp($joboutput);
+ }
} else {
Log (undef, "timed out while creating output collection");
+ foreach my $signal (2, 2, 2, 15, 15, 9) {
+ kill($signal, $pid);
+ last if waitpid($pid, WNOHANG) == -1;
+ sleep(1);
+ }
}
- # TODO: kill $pid instead of waiting, now that we've decided to
- # ignore further output.
- waitpid($pid, 0);
+ close($child_out);
return $joboutput;
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list