[ARVADOS] updated: b85dc15076a4e9d38d1223fa4a8e614dd821def5
git at public.curoverse.com
Tue Dec 30 14:43:41 EST 2014
       via  b85dc15076a4e9d38d1223fa4a8e614dd821def5 (commit)
      from  cf328df185a2f12c19f1e4f64f7184f8ad3635d0 (commit)

Summary of changes:
 services/api/script/crunch-failure-report.py | 137 +++++++++++++++++----------
 1 file changed, 86 insertions(+), 51 deletions(-)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit b85dc15076a4e9d38d1223fa4a8e614dd821def5
Author: Tim Pierce <twp at curoverse.com>
Date: Tue Dec 30 14:42:28 2014 -0500
4598: fetch logs from Keep, more failure reporting
Per standup review: fetch logs with a CollectionReader on the log
collection uuid, rather than fetching log records from the API server.
Perform full failure reporting including job URL details.
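For reference, the CollectionReader-based fetch described above reduces to a short sketch like the one below. The job UUID is a placeholder, an Arvados API host and token are assumed to be configured in the environment, and error handling is omitted:

    import arvados

    # Assumes ARVADOS_API_HOST and ARVADOS_API_TOKEN are set.
    api = arvados.api('v1')

    # Placeholder job UUID; substitute a real one from your cluster.
    job = api.jobs().get(uuid='zzzzz-8i9sb-0123456789abcde').execute()

    # A finished job's 'log' field names a Keep collection that holds
    # "<job uuid>.log.txt"; open it through a CollectionReader.
    log_collection = arvados.CollectionReader(job['log'], api)
    log_lines = log_collection.open(
        "{}.log.txt".format(job['uuid'])).readlines()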
diff --git a/services/api/script/crunch-failure-report.py b/services/api/script/crunch-failure-report.py
index 989a421..8c1529b 100755
--- a/services/api/script/crunch-failure-report.py
+++ b/services/api/script/crunch-failure-report.py
@@ -10,12 +10,11 @@ import arvados
# Useful configuration variables:
-# The number of log lines preceding a job failure message that should
-# be collected.
-FAILED_JOB_LOG_LINES = 10
+# Number of log lines to use as context in diagnosing failure.
+LOG_CONTEXT_LINES = 10
-# Regex that signifies a failed job.
-FAILED_JOB_REGEX = re.compile('fail')
+# Regex that signifies a failed task.
+FAILED_TASK_REGEX = re.compile(r' \d+ failure (.*permanent)')
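+# (matches, e.g., a log line containing " 3 failure (... permanent)")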
# Regex that signifies a successful job.
SUCCESSFUL_JOB_REGEX = re.compile('finished')
@@ -38,7 +37,15 @@ def parse_arguments(arguments):
default='fail',
help='Regular expression to match on Crunch error output lines.')
- return arg_parser.parse_args(arguments)
+ args = arg_parser.parse_args(arguments)
+
+ if args.start and not is_valid_timestamp(args.start):
+ raise ValueError(args.start)
+ if args.end and not is_valid_timestamp(args.end):
+ raise ValueError(args.end)
+
+ return args
+
def api_timestamp(when=None):
"""Returns a string representing the timestamp 'when' in a format
@@ -50,6 +57,9 @@ def api_timestamp(when=None):
return when.strftime("%Y-%m-%dT%H:%M:%SZ")
+def is_valid_timestamp(ts):
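+    # Truthy when ts is an API-format UTC timestamp, e.g. "2014-12-30T19:42:28Z".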
+ return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', ts)
+
def jobs_created_between_dates(api, start, end):
return arvados.util.list_all(
api.jobs().list,
@@ -57,15 +67,15 @@ def jobs_created_between_dates(api, start, end):
['created_at', '<=', end] ]))
-def job_error_logs(api, job_uuid):
- return arvados.util.list_all(
- api.logs().list,
- filters=json.dumps([ ['object_uuid', '=', job_uuid],
- ['event_type', '=', 'stderr'] ]))
+def job_logs(api, job):
+ # Returns the contents of the log for this job (as an array of lines).
+ log_filename = "{}.log.txt".format(job['uuid'])
+ log_collection = arvados.CollectionReader(job['log'], api)
+ return log_collection.open(log_filename).readlines()
-def is_failed_job(logline):
- return FAILED_JOB_REGEX.search(logline) != None
+def is_failed_task(logline):
+    return FAILED_TASK_REGEX.search(logline) is not None
def is_successful_job(logline):
@@ -85,60 +95,85 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
# Find all jobs created within the specified window,
# and their corresponding job logs.
- log("fetching jobs between {} and {}".format(start_time, end_time))
jobs_created = jobs_created_between_dates(api, start_time, end_time)
+ jobs_failed = [job for job in jobs_created if job['state'] == 'Failed']
+ jobs_successful = [job for job in jobs_created if job['state'] == 'Complete']
log("jobs created: {}".format(len(jobs_created)))
-
# Find failed jobs and record the job failure text.
- jobs_successful = set()
- jobs_failed = set()
jobs_failed_types = {}
- for j in jobs_created:
- # Skip this log entry if we've already recorded
- # the job failure.
- job_uuid = j['uuid']
- if job_uuid in jobs_failed:
- continue
- logs = job_error_logs(api, job_uuid)
- log("fetched {} job error logs for {}".format(len(logs), job_uuid))
- # If this line marks a failed job, record it and
- # the preceding log lines.
+ for job in jobs_failed:
+ job_uuid = job['uuid']
+ log("fetching job {} log from keep".format(job_uuid))
+ logs = job_logs(api, job)
+ # Find the first permanent task failure, and collect the
+ # preceding log lines.
for i, lg in enumerate(logs):
- if is_failed_job(lg['properties']['text']):
- jobs_failed.add(job_uuid)
- # Classify this job failure.
- lastlogs = "\n".join(
- [ l['properties']['text'] for l in logs[i-FAILED_JOB_LOG_LINES:i] ])
- log("searching job {} lastlogs: {}".format(job_uuid, lastlogs))
- for failtype in JOB_FAILURE_TYPES:
- if re.search(failtype, lastlogs):
- jobs_failed_types.setdefault(failtype, set())
- jobs_failed_types[failtype].add(job_uuid)
- continue
- # no specific reason found
- jobs_failed_types.setdefault('unknown', set())
- jobs_failed_types['unknown'].add(job_uuid)
- break
- elif is_successful_job(lg['properties']['text']):
- jobs_successful.add(job_uuid)
+ if is_failed_task(lg):
+ # Get preceding log record to provide context.
+ log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0
+ log_end = i + 1
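+            # logs[log_start:log_end] is the failing line plus up to
+            # LOG_CONTEXT_LINES preceding lines.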
+ lastlogs = ''.join(logs[log_start:log_end])
+ # try to identify the type of failure.
+ fail_reason = 'unknown'
+ for rgx in JOB_FAILURE_TYPES:
+ if re.search(rgx, lastlogs):
+ fail_reason = rgx
+ log("job {} failed for reason {}".format(job_uuid, fail_reason))
+ break
+ jobs_failed_types.setdefault(fail_reason, set())
+ jobs_failed_types[fail_reason].add(job_uuid)
break
+        else:
+            # The loop completed without a break: the job is recorded
+            # as "failed", but no permanent task failure was found in
+            # its log.
+            jobs_failed_types.setdefault('unknown', set())
+            jobs_failed_types['unknown'].add(job_uuid)
# Report percentages of successful, failed and unfinished jobs.
+ print "Start: {:20s}".format(start_time)
+ print "End: {:20s}".format(end_time)
+ print ""
+
+ print "Overview"
+ print ""
+
job_start_count = len(jobs_created)
job_success_count = len(jobs_successful)
job_fail_count = len(jobs_failed)
job_unfinished_count = job_start_count - job_success_count - job_fail_count
- print "Started: {0:4d}".format(job_start_count)
- print "Successful: {0:4d} ({1:3.0%})".format(job_success_count, job_success_count / float(job_start_count))
- print "Failed: {0:4d} ({1:3.0%})".format(job_fail_count, job_fail_count / float(job_start_count))
- print "In progress: {0:4d} ({1:3.0%})".format(job_unfinished_count, job_unfinished_count / float(job_start_count))
+ print " Started: {:4d}".format(job_start_count)
+ print " Successful: {:4d} ({:3.0%})".format(
+ job_success_count, job_success_count / float(job_start_count))
+ print " Failed: {:4d} ({:3.0%})".format(
+ job_fail_count, job_fail_count / float(job_start_count))
+ print " In progress: {:4d} ({:3.0%})".format(
+ job_unfinished_count, job_unfinished_count / float(job_start_count))
+ print ""
# Report failure types.
- for failtype in jobs_failed_types:
- print "{0:20s}: {1:4d} ({2:3.0%})".format(
- failtype, len(jobs_failed_types), len(jobs_failed_types) / float(job_fail_count))
+ failure_summary = ""
+ failure_detail = ""
+
+ for failtype, job_uuids in jobs_failed_types.iteritems():
+ failstat = " {:s} {:4d} ({:3.0%})\n".format(
+ failtype, len(job_uuids), len(job_uuids) / float(job_fail_count))
+ failure_summary = failure_summary + failstat
+ failure_detail = failure_detail + failstat
+ for j in job_uuids:
+ failure_detail = failure_detail + " http://crvr.se/{}\n".format(j)
+ failure_detail = failure_detail + "\n"
+
+ print "Failures by class"
+ print ""
+ print failure_summary
+
+ print "Failures by class (detail):"
+ print ""
+ print failure_detail
+
+ return 0
+
if __name__ == "__main__":
sys.exit(main())
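Beyond the diff itself, the classification scheme is compact enough to illustrate standalone. Below is a minimal sketch using the failed-task regex and context-window size from the script; the JOB_FAILURE_TYPES entries and the sample log lines are made up for illustration, since the real list is defined elsewhere in crunch-failure-report.py and does not appear in this diff:

    import re

    # Constants as defined in the script above.
    LOG_CONTEXT_LINES = 10
    FAILED_TASK_REGEX = re.compile(r' \d+ failure (.*permanent)')

    # Hypothetical failure-type regexes; the real JOB_FAILURE_TYPES
    # list is not part of this diff.
    JOB_FAILURE_TYPES = ['out of memory', 'docker']

    def classify_failure(logs):
        # Scan for the first permanent task failure; classify it by
        # searching the failing line plus its preceding context lines.
        for i, line in enumerate(logs):
            if FAILED_TASK_REGEX.search(line):
                start = max(i - LOG_CONTEXT_LINES, 0)
                context = ''.join(logs[start:i + 1])
                for rgx in JOB_FAILURE_TYPES:
                    if re.search(rgx, context):
                        return rgx
                return 'unknown'
        # No permanent task failure found in the log at all.
        return None

    # Made-up log excerpt for demonstration.
    sample_log = [
        "stderr crunchstat: out of memory, killing task\n",
        "backing off node before retrying\n",
        "status: 3 failure (permanent) after 3 attempts\n",
    ]
    print classify_failure(sample_log)   # prints: out of memory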
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list