[ARVADOS] updated: 7ee73a40614a891111a4c82e83a998b4b429766b
git at public.curoverse.com
Wed Jan 7 09:44:33 EST 2015
      via  7ee73a40614a891111a4c82e83a998b4b429766b (commit)
     from  4b9208f2b118fa4f8baab6ad3ad3caa77f6cb143 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

Summary of changes:
 services/api/script/crunch-failure-report.py | 219 ---------------------------
 1 file changed, 219 deletions(-)
 delete mode 100755 services/api/script/crunch-failure-report.py
commit 7ee73a40614a891111a4c82e83a998b4b429766b
Author: Tim Pierce <twp at curoverse.com>
Date: Wed Jan 7 09:43:56 2015 -0500
    4598: rename script

    Renamed crunch-failure-report.py to crunch_failure_report.py to permit
    importing (and eventually testing).
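A hyphen is not a valid character in a Python identifier, so the old
filename could not be imported with a plain import statement. As a
minimal sketch of what the rename enables (assuming a hypothetical
test module sitting alongside the script):

    # test_crunch_failure_report.py (hypothetical)
    import crunch_failure_report    # importable after the rename
    # "import crunch-failure-report" would be a SyntaxError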
diff --git a/services/api/script/crunch-failure-report.py b/services/api/script/crunch-failure-report.py
deleted file mode 100755
index 31ad0fe..0000000
--- a/services/api/script/crunch-failure-report.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#! /usr/bin/env python
-
-import argparse
-import datetime
-import json
-import re
-import sys
-
-import arvados
-
-# Useful configuration variables:
-
-# Number of log lines to use as context in diagnosing failure.
-LOG_CONTEXT_LINES = 10
-
-# Regex that signifies a failed task.
-FAILED_TASK_REGEX = re.compile(r' \d+ failure (.*permanent)')
-
-# Regular expressions used to classify failure types.
-JOB_FAILURE_TYPES = {
- 'sys/docker': 'Cannot destroy container',
- 'crunch/node': 'User not found on host',
- 'slurm/comm': 'Communication connection failure'
-}
-
-def parse_arguments(arguments):
- arg_parser = argparse.ArgumentParser(
- description='Produce a report of Crunch failures within a specified time range')
-
- arg_parser.add_argument(
- '--start',
- help='Start date and time')
- arg_parser.add_argument(
- '--end',
- help='End date and time')
-
- args = arg_parser.parse_args(arguments)
-
- if args.start and not is_valid_timestamp(args.start):
- raise ValueError(args.start)
- if args.end and not is_valid_timestamp(args.end):
- raise ValueError(args.end)
-
- return args
-
-
-def api_timestamp(when=None):
- """Returns a string representing the timestamp 'when' in a format
- suitable for delivering to the API server. Defaults to the
- current time.
- """
- if when is None:
- when = datetime.datetime.utcnow()
- return when.strftime("%Y-%m-%dT%H:%M:%SZ")
-
-
-def is_valid_timestamp(ts):
-    return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', ts)
-
-
-def jobs_created_between_dates(api, start, end):
- return arvados.util.list_all(
- api.jobs().list,
- filters=json.dumps([ ['created_at', '>=', start],
- ['created_at', '<=', end] ]))
-
-
-def job_logs(api, job):
-    """Return the contents of the log for this job, as a list of lines."""
- if job['log']:
- log_collection = arvados.CollectionReader(job['log'], api)
- log_filename = "{}.log.txt".format(job['uuid'])
- return log_collection.open(log_filename).readlines()
- return []
-
-
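-# Cache of user full names keyed by user UUID, so each user is fetched only once.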
-user_names = {}
-def job_user_name(api, user_uuid):
- def _lookup_user_name(api, user_uuid):
- try:
- return api.users().get(uuid=user_uuid).execute()['full_name']
- except arvados.errors.ApiError:
- return user_uuid
-
- if user_uuid not in user_names:
- user_names[user_uuid] = _lookup_user_name(api, user_uuid)
- return user_names[user_uuid]
-
-
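-# Cache of pipeline names keyed by job UUID, to avoid repeated API lookups.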
-job_pipeline_names = {}
-def job_pipeline_name(api, job_uuid):
- def _lookup_pipeline_name(api, job_uuid):
- try:
- pipelines = api.pipeline_instances().list(
- filters='[["components", "like", "%{}%"]]'.format(job_uuid)).execute()
- pi = pipelines['items'][0]
- if pi['name']:
- return pi['name']
- else:
- # Use the pipeline template name
- pt = api.pipeline_templates().get(uuid=pi['pipeline_template_uuid']).execute()
- return pt['name']
- except (TypeError, ValueError, IndexError):
- return ""
-
- if job_uuid not in job_pipeline_names:
- job_pipeline_names[job_uuid] = _lookup_pipeline_name(api, job_uuid)
- return job_pipeline_names[job_uuid]
-
-
-def is_failed_task(logline):
-    return FAILED_TASK_REGEX.search(logline) is not None
-
-
-def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
- args = parse_arguments(arguments)
-
- api = arvados.api('v1')
-
- now = datetime.datetime.utcnow()
- start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
- end_time = args.end or api_timestamp(now)
-
- # Find all jobs created within the specified window,
- # and their corresponding job logs.
- jobs_created = jobs_created_between_dates(api, start_time, end_time)
- jobs_by_state = {}
- for job in jobs_created:
- jobs_by_state.setdefault(job['state'], [])
- jobs_by_state[job['state']].append(job)
-
- # Find failed jobs and record the job failure text.
-
- # failure_stats maps failure types (e.g. "sys/docker") to
- # a set of job UUIDs that failed for that reason.
- failure_stats = {}
-    for job in jobs_by_state.get('Failed', []):
- job_uuid = job['uuid']
- logs = job_logs(api, job)
- # Find the first permanent task failure, and collect the
- # preceding log lines.
- failure_type = None
- for i, lg in enumerate(logs):
- if is_failed_task(lg):
- # Get preceding log record to provide context.
- log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0
- log_end = i + 1
- lastlogs = ''.join(logs[log_start:log_end])
- # try to identify the type of failure.
- for key, rgx in JOB_FAILURE_TYPES.iteritems():
- if re.search(rgx, lastlogs):
- failure_type = key
- break
- if failure_type is not None:
- break
- if failure_type is None:
- failure_type = 'unknown'
- failure_stats.setdefault(failure_type, set())
- failure_stats[failure_type].add(job_uuid)
-
- # Report percentages of successful, failed and unfinished jobs.
- print "Start: {:20s}".format(start_time)
- print "End: {:20s}".format(end_time)
- print ""
-
- print "Overview"
- print ""
-
- job_start_count = len(jobs_created)
- print " {: <25s} {:4d}".format('Started', job_start_count)
- for state in ['Complete', 'Failed', 'Queued', 'Cancelled', 'Running']:
- if state in jobs_by_state:
- job_count = len(jobs_by_state[state])
- job_percentage = job_count / float(job_start_count)
- print " {: <25s} {:4d} ({: >4.0%})".format(state,
- job_count,
- job_percentage)
- print ""
-
- # Report failure types.
- failure_summary = ""
- failure_detail = ""
-
- # Generate a mapping from failed job uuids to job records, to assist
- # in generating detailed statistics for job failures.
- jobs_failed_map = { job['uuid']: job for job in jobs_by_state.get('Failed', []) }
-
- # sort the failure stats in descending order by occurrence.
- sorted_failures = sorted(failure_stats,
- reverse=True,
- key=lambda failure_type: len(failure_stats[failure_type]))
- for failtype in sorted_failures:
- job_uuids = failure_stats[failtype]
- failstat = " {: <25s} {:4d} ({: >4.0%})\n".format(
- failtype,
- len(job_uuids),
- len(job_uuids) / float(len(jobs_by_state['Failed'])))
- failure_summary = failure_summary + failstat
- failure_detail = failure_detail + failstat
- for j in job_uuids:
- job_info = jobs_failed_map[j]
- job_owner = job_user_name(api, job_info['modified_by_user_uuid'])
- job_name = job_pipeline_name(api, job_info['uuid'])
- failure_detail = failure_detail + " {} {: <15.15s} {:29.29s}\n".format(j, job_owner, job_name)
- failure_detail = failure_detail + "\n"
-
- print "Failures by class"
- print ""
- print failure_summary
-
- print "Failures by class (detail)"
- print ""
- print failure_detail
-
- return 0
-
-
-if __name__ == "__main__":
- sys.exit(main())
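Under its new name the script would presumably be invoked along the
lines of:

    python crunch_failure_report.py --start 2015-01-06T00:00:00Z --end 2015-01-07T00:00:00Z

where both timestamps use the API server's UTC format; omitting them
reports on the preceding 24 hours.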
-----------------------------------------------------------------------
hooks/post-receive
--