[ARVADOS] updated: cf328df185a2f12c19f1e4f64f7184f8ad3635d0
git at public.curoverse.com
Tue Dec 30 11:02:17 EST 2014
Summary of changes:
 services/api/script/crunch-failure-report.py | 116 ++++++++++++++++++++++-----
 1 file changed, 94 insertions(+), 22 deletions(-)

       via  cf328df185a2f12c19f1e4f64f7184f8ad3635d0 (commit)
      from  b03a6a8b2c20a0579cd724baeb9283bd5f0d1f08 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit cf328df185a2f12c19f1e4f64f7184f8ad3635d0
Author: Tim Pierce <twp at curoverse.com>
Date: Tue Dec 30 11:00:51 2014 -0500
    4598: bug fixes, added full stats collection

    Added code to report full stats on failed, successful, and incomplete
    jobs, and to perform basic reporting on the causes of failed jobs (not
    yet working).
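As context for the diff below, the classification step can be sketched on its
own. This is a minimal illustrative sketch, not the committed code:
classify_failure() is a hypothetical helper (the patch does this work inline
in main()), and the commit message notes the cause reporting is not yet
working.

    import re

    # Patterns by which to classify failures (taken from the patch).
    JOB_FAILURE_TYPES = [ 'User not found on host' ]

    def classify_failure(job_uuid, lastlogs, jobs_failed_types):
        # Bucket the job under the first failure pattern matching the
        # log excerpt; fall back to 'unknown' when none matches.
        for failtype in JOB_FAILURE_TYPES:
            if re.search(failtype, lastlogs):
                jobs_failed_types.setdefault(failtype, set()).add(job_uuid)
                return failtype
        jobs_failed_types.setdefault('unknown', set()).add(job_uuid)
        return 'unknown'

    # Usage with a made-up job UUID and log excerpt:
    buckets = {}
    classify_failure('zzzzz-8i9sb-0123456789abcde',
                     'User not found on host compute0.example', buckets)
    # buckets == {'User not found on host': set(['zzzzz-8i9sb-0123456789abcde'])}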
diff --git a/services/api/script/crunch-failure-report.py b/services/api/script/crunch-failure-report.py
index e2b5d2d..989a421 100755
--- a/services/api/script/crunch-failure-report.py
+++ b/services/api/script/crunch-failure-report.py
@@ -8,6 +8,21 @@ import sys
 import arvados
 
+# Useful configuration variables:
+
+# The number of log lines preceding a job failure message that should
+# be collected.
+FAILED_JOB_LOG_LINES = 10
+
+# Regex that signifies a failed job.
+FAILED_JOB_REGEX = re.compile('fail')
+
+# Regex that signifies a successful job.
+SUCCESSFUL_JOB_REGEX = re.compile('finished')
+
+# List of regexes by which to classify failures.
+JOB_FAILURE_TYPES = [ 'User not found on host' ]
+
 def parse_arguments(arguments):
     arg_parser = argparse.ArgumentParser(
         description='Produce a report of Crunch failures within a specified time range')
@@ -34,6 +49,31 @@ def api_timestamp(when=None):
         when = datetime.datetime.utcnow()
     return when.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+def jobs_created_between_dates(api, start, end):
+    return arvados.util.list_all(
+        api.jobs().list,
+        filters=json.dumps([ ['created_at', '>=', start],
+                             ['created_at', '<=', end] ]))
+
+
+def job_error_logs(api, job_uuid):
+    return arvados.util.list_all(
+        api.logs().list,
+        filters=json.dumps([ ['object_uuid', '=', job_uuid],
+                             ['event_type', '=', 'stderr'] ]))
+
+
+def is_failed_job(logline):
+    return FAILED_JOB_REGEX.search(logline) is not None
+
+
+def is_successful_job(logline):
+    return SUCCESSFUL_JOB_REGEX.search(logline) is not None
+
+def log(s):
+    print >>sys.stderr, "{}: {}".format(datetime.datetime.utcnow(), s)
+
 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     args = parse_arguments(arguments)
@@ -42,31 +82,63 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     now = datetime.datetime.utcnow()
     start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
     end_time = args.end or api_timestamp(now)
-    match_re = re.compile(args.match)
-    logs = arvados.util.list_all(
-        api.logs().list,
-        filters=json.dumps([ ['created_at', '>=', start_time],
-                             ['created_at', '<=', end_time],
-                             ['event_type', '=', 'stderr'] ]))
+    # Find all jobs created within the specified window,
+    # and their corresponding job logs.
+    log("fetching jobs between {} and {}".format(start_time, end_time))
+    jobs_created = jobs_created_between_dates(api, start_time, end_time)
+    log("jobs created: {}".format(len(jobs_created)))
+
+    # Find failed jobs and record the job failure text.
+    jobs_successful = set()
+    jobs_failed = set()
+    jobs_failed_types = {}
+    for j in jobs_created:
+        # Skip this job if we've already recorded its failure.
+        job_uuid = j['uuid']
+        if job_uuid in jobs_failed:
+            continue
+        logs = job_error_logs(api, job_uuid)
+        log("fetched {} job error logs for {}".format(len(logs), job_uuid))
+        # Scan the log entries: the first line that marks the job as
+        # failed or successful determines its outcome.
+        for i, lg in enumerate(logs):
+            if is_failed_job(lg['properties']['text']):
+                jobs_failed.add(job_uuid)
+                # Classify this job failure from the preceding log lines.
+                lastlogs = "\n".join(
+                    [ l['properties']['text'] for l in logs[max(0, i-FAILED_JOB_LOG_LINES):i] ])
+                log("searching job {} lastlogs: {}".format(job_uuid, lastlogs))
+                for failtype in JOB_FAILURE_TYPES:
+                    if re.search(failtype, lastlogs):
+                        jobs_failed_types.setdefault(failtype, set())
+                        jobs_failed_types[failtype].add(job_uuid)
+                        break
+                else:
+                    # No specific cause found.
+                    jobs_failed_types.setdefault('unknown', set())
+                    jobs_failed_types['unknown'].add(job_uuid)
+                break
+            elif is_successful_job(lg['properties']['text']):
+                jobs_successful.add(job_uuid)
+                break
+
+    # Report percentages of successful, failed, and unfinished jobs.
+    job_start_count = len(jobs_created)
+    job_success_count = len(jobs_successful)
+    job_fail_count = len(jobs_failed)
+    job_unfinished_count = job_start_count - job_success_count - job_fail_count
-    log_stats = {}
-    for log in logs:
-        for logline in log['properties']['text'].splitlines():
-            # Remove timestamp at beginning of log lines
-            # Mon Dec 1 23:59:55 2014
-            stderr_msg = re.sub(
-                '\w{3} \w{3} +\d+ \d{2}:\d{2}:\d{2} \d{4} +',
-                '',
-                logline)
-            if match_re.search(stderr_msg):
-                log_uuid = log['uuid']
-                log_stats.setdefault(stderr_msg, []).append(log_uuid)
-
-    # Sort the keys of log stats in decreasing order of frequency.
-    for k in sorted(log_stats.keys(), cmp=lambda a,b: cmp(len(log_stats[b]), len(log_stats[a]))):
-        print "{}: {}".format(k, len(log_stats[k]))
+ print "Started: {0:4d}".format(job_start_count)
+ print "Successful: {0:4d} ({1:3.0%})".format(job_success_count, job_success_count / float(job_start_count))
+ print "Failed: {0:4d} ({1:3.0%})".format(job_fail_count, job_fail_count / float(job_start_count))
+ print "In progress: {0:4d} ({1:3.0%})".format(job_unfinished_count, job_unfinished_count / float(job_start_count))
+ # Report failure types.
+ for failtype in jobs_failed_types:
+ print "{0:20s}: {1:4d} ({2:3.0%})".format(
+ failtype, len(jobs_failed_types), len(jobs_failed_types) / float(job_fail_count))
 if __name__ == "__main__":
     sys.exit(main())
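Assuming the script's existing argument parser (outside this diff's context)
defines --start and --end to populate args.start and args.end above, a report
over an explicit window would be run like:

    $ python services/api/script/crunch-failure-report.py \
          --start 2014-12-29T00:00:00Z --end 2014-12-30T00:00:00Z

With no arguments, main() falls back to the 24 hours ending at the current time.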
-----------------------------------------------------------------------
hooks/post-receive