[ARVADOS] updated: 1ee492f33846d35b4ead20fbdbbc3b496719bd86

git at public.curoverse.com git at public.curoverse.com
Tue Dec 30 16:07:59 EST 2014


Summary of changes:
 services/api/script/crunch-failure-report.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

       via  1ee492f33846d35b4ead20fbdbbc3b496719bd86 (commit)
      from  d71422fa22ffa7c397f9e7e8bc59622002a8ac53 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 1ee492f33846d35b4ead20fbdbbc3b496719bd86
Author: Tim Pierce <twp at curoverse.com>
Date:   Tue Dec 30 16:07:02 2014 -0500

    4598: added failure types and short names
    
    Added the sys/docker failure type. Failures are now reported by short
    failure name rather than by regex.

diff --git a/services/api/script/crunch-failure-report.py b/services/api/script/crunch-failure-report.py
index e572053..63d729c 100755
--- a/services/api/script/crunch-failure-report.py
+++ b/services/api/script/crunch-failure-report.py
@@ -16,8 +16,11 @@ LOG_CONTEXT_LINES = 10
 # Regex that signifies a failed task.
 FAILED_TASK_REGEX = re.compile(' \d+ failure (.*permanent)')
 
-# List of regexes by which to classify failures.
-JOB_FAILURE_TYPES = [ 'User not found on host' ]
+# Regular expressions used to classify failure types.
+JOB_FAILURE_TYPES = {
+    'sys/docker': 'Cannot destroy container',
+    'crunch/node': 'User not found on host'
+}
 
 def parse_arguments(arguments):
     arg_parser = argparse.ArgumentParser(
@@ -87,7 +90,10 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     jobs_successful = [job for job in jobs_created if job['state'] == 'Complete']
 
     # Find failed jobs and record the job failure text.
-    jobs_failed_types = {}
+
+    # failure_stats maps failure types (e.g. "sys/docker") to
+    # a set of job UUIDs that failed for that reason.
+    failure_stats = {}
     for job in jobs_failed:
         job_uuid = job['uuid']
         logs = job_logs(api, job)
@@ -100,18 +106,18 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
                 log_end = i + 1
                 lastlogs = ''.join(logs[log_start:log_end])
                 # try to identify the type of failure.
-                fail_reason = 'unknown'
-                for rgx in JOB_FAILURE_TYPES:
+                failure_type = 'unknown'
+                for key, rgx in JOB_FAILURE_TYPES.iteritems():
                     if re.search(rgx, lastlogs):
-                        fail_reason = rgx
+                        failure_type = key
                         break
-                jobs_failed_types.setdefault(fail_reason, set())
-                jobs_failed_types[fail_reason].add(job_uuid)
+                failure_stats.setdefault(failure_type, set())
+                failure_stats[failure_type].add(job_uuid)
                 break
             # If we got here, the job is recorded as "failed" but we
             # could not find the failure of any specific task.
-            jobs_failed_types.setdefault('unknown', set())
-            jobs_failed_types['unknown'].add(job_uuid)
+            failure_stats.setdefault('unknown', set())
+            failure_stats['unknown'].add(job_uuid)
 
     # Report percentages of successful, failed and unfinished jobs.
     print "Start: {:20s}".format(start_time)
@@ -139,7 +145,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     failure_summary = ""
     failure_detail = ""
 
-    for failtype, job_uuids in jobs_failed_types.iteritems():
+    for failtype, job_uuids in failure_stats.iteritems():
         failstat = "  {:s} {:4d} ({:3.0%})\n".format(
             failtype, len(job_uuids), len(job_uuids) / float(job_fail_count))
         failure_summary = failure_summary + failstat

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list