[ARVADOS] updated: 62fea2a463eb0d8c3aef84a56ba84561a196666b
git at public.curoverse.com
git at public.curoverse.com
Fri Feb 27 14:23:33 EST 2015
Summary of changes:
sdk/cli/bin/crunch-job | 90 +++++++++++++++++++++++++++++++-------------------
1 file changed, 56 insertions(+), 34 deletions(-)
via 62fea2a463eb0d8c3aef84a56ba84561a196666b (commit)
via 679e083df15373a6570b0834bf2e95237a040dc7 (commit)
via 0e4de53ab1c26a83a5f542d52935358497959c3e (commit)
via eab3fd07f8f2eea43a373454a91ba0ca3fc60a6b (commit)
from b10c68ef2d5ec1418653594576e8d66b1df3453e (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 62fea2a463eb0d8c3aef84a56ba84561a196666b
Merge: b10c68e 679e083
Author: Brett Smith <brett at curoverse.com>
Date: Fri Feb 27 14:23:02 2015 -0500
Merge branch '5283-crunch-collation-safety-wip'
Closes #5283, #5306.
commit 679e083df15373a6570b0834bf2e95237a040dc7
Author: Brett Smith <brett at curoverse.com>
Date: Fri Feb 27 14:22:18 2015 -0500
5283: Log more crunch-job output handling.
Requested during code review.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 567b5a2..9fb14b1 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -941,7 +941,7 @@ if (!$collated_output) {
Log (undef, "Failed to write output collection");
}
else {
- Log(undef, "output hash " . $collated_output);
+ Log(undef, "job output $collated_output");
$Job->update_attributes('output' => $collated_output);
}
@@ -1055,7 +1055,9 @@ sub reapchildren
$Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
$Jobstep->{'arvados_task'}->save;
process_stderr ($jobstepid, $task_success);
- Log ($jobstepid, "output " . $Jobstep->{'arvados_task'}->{output});
+ Log ($jobstepid, sprintf("task output (%d bytes): %s",
+ length($Jobstep->{'arvados_task'}->{output}),
+ $Jobstep->{'arvados_task'}->{output}));
close $reader{$jobstepid};
delete $reader{$jobstepid};
@@ -1298,6 +1300,7 @@ print (arvados.api("v1").collections().
}, retry_count());
my $task_idx = -1;
+ my $manifest_size = 0;
for (@jobstep)
{
++$task_idx;
@@ -1314,6 +1317,8 @@ print (arvados.api("v1").collections().
# There's been an error writing. Stop the loop.
# We'll log details about the exit code later.
last;
+ } else {
+ $manifest_size += length($next_write);
}
} else {
my $uuid = $_->{'arvados_task'}->{'uuid'};
@@ -1322,11 +1327,12 @@ print (arvados.api("v1").collections().
}
}
close($child_in);
+ Log(undef, "collated output manifest text to send to API server is $manifest_size bytes with access tokens");
my $joboutput;
my $s = IO::Select->new($child_out);
if ($s->can_read(120)) {
- sysread($child_out, $joboutput, 64 * 1024 * 1024);
+ sysread($child_out, $joboutput, 1024 * 1024);
waitpid($pid, 0);
if ($?) {
Log(undef, "output collection creation exited " . exit_status_s($?));
commit 0e4de53ab1c26a83a5f542d52935358497959c3e
Author: Brett Smith <brett at curoverse.com>
Date: Wed Feb 25 11:37:26 2015 -0500
5283: crunch-job doesn't use freeze logic after a job fails.
If the job has failed permanently, we want to go through all the
end-of-job logic. Previously, we were getting sidetracked into
freeze_if_want_freeze, which skips some steps like setting the
permanent job output record. Refs #4472.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 068da88..567b5a2 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1037,7 +1037,6 @@ sub reapchildren
if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
# Give up on this task, and the whole job
$main::success = 0;
- $main::please_freeze = 1;
}
# Put this task back on the todo queue
push @jobstep_todo, $jobstepid;
commit eab3fd07f8f2eea43a373454a91ba0ca3fc60a6b
Author: Brett Smith <brett at curoverse.com>
Date: Fri Feb 27 14:20:12 2015 -0500
5283: Improve reliability of crunch-job output collation.
* Check the results of all pipe opens, exit statuses, and writes.
Log any problems.
* Have fetch_block return undef when it encounters trouble, rather
than dying. create_output_collection already checks for this, so it
effectively bubbles up the error.
* Retry all of the associated API calls.
* Kill the manifest creation pipe if we give up on it, per the TODO.
This probably won't resolve #5283, but hopefully these changes will
give us additional information to help diagnose the problem.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index d69aee6..068da88 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1252,16 +1252,19 @@ sub process_stderr
sub fetch_block
{
my $hash = shift;
- my ($keep, $child_out, $output_block);
-
- my $cmd = "arv-get \Q$hash\E";
- open($keep, '-|', $cmd) or die "fetch_block: $cmd: $!";
- $output_block = '';
+ my $keep;
+ if (!open($keep, "-|", "arv-get", "--retries", retry_count(), $hash)) {
+ Log(undef, "fetch_block run error from arv-get $hash: $!");
+ return undef;
+ }
+ my $output_block = "";
while (1) {
my $buf;
my $bytes = sysread($keep, $buf, 1024 * 1024);
if (!defined $bytes) {
- die "reading from arv-get: $!";
+ Log(undef, "fetch_block read error from arv-get: $!");
+ $output_block = undef;
+ last;
} elsif ($bytes == 0) {
# sysread returns 0 at the end of the pipe.
last;
@@ -1271,6 +1274,10 @@ sub fetch_block
}
}
close $keep;
+ if ($?) {
+ Log(undef, "fetch_block arv-get exited " . exit_status_s($?));
+ $output_block = undef;
+ }
return $output_block;
}
@@ -1283,50 +1290,60 @@ sub create_output_collection
Log (undef, "collate");
my ($child_out, $child_in);
- my $pid = open2($child_out, $child_in, 'python', '-c',
- 'import arvados; ' .
- 'import sys; ' .
- 'print arvados.api()' .
- '.collections()' .
- '.create(body={"manifest_text":sys.stdin.read()})' .
- '.execute()["portable_data_hash"]'
- );
+ my $pid = open2($child_out, $child_in, 'python', '-c', q{
+import arvados
+import sys
+print (arvados.api("v1").collections().
+ create(body={"manifest_text": sys.stdin.read()}).
+ execute(num_retries=int(sys.argv[1]))["portable_data_hash"])
+}, retry_count());
my $task_idx = -1;
for (@jobstep)
{
++$task_idx;
- next unless exists $_->{'arvados_task'}->{'output'};
my $output = $_->{'arvados_task'}->{output};
- if ($output !~ /^[0-9a-f]{32}(\+\S+)*$/)
- {
- print $child_in $output;
- }
- elsif (defined (my $outblock = fetch_block ($output)))
- {
- print $child_in $outblock;
+ next if (!defined($output));
+ my $next_write;
+ if ($output =~ /^[0-9a-f]{32}(\+\S+)*$/) {
+ $next_write = fetch_block($output);
+ } else {
+ $next_write = $output;
}
- else
- {
+ if (defined($next_write)) {
+ if (!defined(syswrite($child_in, $next_write))) {
+ # There's been an error writing. Stop the loop.
+ # We'll log details about the exit code later.
+ last;
+ }
+ } else {
my $uuid = $_->{'arvados_task'}->{'uuid'};
Log (undef, "Error retrieving '$output' output by task $task_idx ($uuid)");
$main::success = 0;
}
}
- $child_in->close;
+ close($child_in);
my $joboutput;
my $s = IO::Select->new($child_out);
if ($s->can_read(120)) {
sysread($child_out, $joboutput, 64 * 1024 * 1024);
- chomp($joboutput);
- # TODO: Ensure exit status == 0.
+ waitpid($pid, 0);
+ if ($?) {
+ Log(undef, "output collection creation exited " . exit_status_s($?));
+ $joboutput = undef;
+ } else {
+ chomp($joboutput);
+ }
} else {
Log (undef, "timed out while creating output collection");
+ foreach my $signal (2, 2, 2, 15, 15, 9) {
+ kill($signal, $pid);
+ last if waitpid($pid, WNOHANG) == -1;
+ sleep(1);
+ }
}
- # TODO: kill $pid instead of waiting, now that we've decided to
- # ignore further output.
- waitpid($pid, 0);
+ close($child_out);
return $joboutput;
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list