[ARVADOS] created: a7e2a2a7c0915de92345acbc209f5d941bf22b0a

Mon Dec 21 16:53:43 EST 2015

at  a7e2a2a7c0915de92345acbc209f5d941bf22b0a (commit)


commit a7e2a2a7c0915de92345acbc209f5d941bf22b0a
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Dec 21 16:53:37 2015 -0500

    7883: Recommend more economical values for resource_constraints.

diff --git a/tools/crunchstat-summary/bin/crunchstat-summary b/tools/crunchstat-summary/bin/crunchstat-summary
index f42d30b..c32b50e 100755
--- a/tools/crunchstat-summary/bin/crunchstat-summary
+++ b/tools/crunchstat-summary/bin/crunchstat-summary
@@ -4,8 +4,11 @@ from __future__ import print_function
 
 import crunchstat_summary.command
 import crunchstat_summary.summarizer
+import logging
 import sys
 
+logging.getLogger().addHandler(logging.StreamHandler())
+
 args = crunchstat_summary.command.ArgumentParser().parse_args(sys.argv[1:])
 s = crunchstat_summary.command.Command(args).summarizer()
 s.run()
diff --git a/tools/crunchstat-summary/crunchstat_summary/summarizer.py b/tools/crunchstat-summary/crunchstat_summary/summarizer.py
index 1b7f950..49b67ff 100644
--- a/tools/crunchstat-summary/crunchstat_summary/summarizer.py
+++ b/tools/crunchstat-summary/crunchstat_summary/summarizer.py
@@ -3,13 +3,21 @@ from __future__ import print_function
 import arvados
 import collections
 import functools
+import itertools
+import logging
+import math
 import re
 import sys
 
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
 
 class Summarizer(object):
-    def __init__(self, logdata):
+    existing_constraints = {}
+
+    def __init__(self, logdata, label='job'):
         self._logdata = logdata
+        self.label = label
 
     def run(self):
         # stats_max: {category: {stat: val}}
@@ -64,9 +72,9 @@ class Summarizer(object):
                             this_interval_s = val
                             continue
                         elif not (this_interval_s > 0):
-                            print("BUG? interval stat given with duration {!r}".
-                                  format(this_interval_s),
-                                  file=sys.stderr)
+                            logger.error(
+                                "BUG? interval stat given with duration {!r}".
+                                format(this_interval_s))
                             continue
                         else:
                             stat = stat + '__rate'
@@ -75,12 +83,7 @@ class Summarizer(object):
                         self.task_stats[task_id][category][stat] = val
                     if val > self.stats_max[category][stat]:
                         self.stats_max[category][stat] = val
-
-    def report(self):
-        return "\n".join(self._report_gen()) + "\n"
-
-    def _report_gen(self):
-        job_tot = collections.defaultdict(
+        self.job_tot = collections.defaultdict(
             functools.partial(collections.defaultdict, int))
         for task_id, task_stat in self.task_stats.iteritems():
             for category, stat_last in task_stat.iteritems():
@@ -88,7 +91,14 @@ class Summarizer(object):
                     if stat in ['cpus', 'cache', 'swap', 'rss']:
                         # meaningless stats like 16 cpu cores x 5 tasks = 80
                         continue
-                    job_tot[category][stat] += val
+                    self.job_tot[category][stat] += val
+
+    def report(self):
+        return "\n".join(itertools.chain(
+            self._report_gen(),
+            self._recommend_gen())) + "\n"
+
+    def _report_gen(self):
         yield "\t".join(['category', 'metric', 'task_max', 'task_max_rate', 'job_total'])
         for category, stat_max in sorted(self.stats_max.iteritems()):
             for stat, val in sorted(stat_max.iteritems()):
@@ -96,7 +106,7 @@ class Summarizer(object):
                     continue
                 max_rate = self._format(stat_max.get(stat+'__rate', '-'))
                 val = self._format(val)
-                tot = self._format(job_tot[category].get(stat, '-'))
+                tot = self._format(self.job_tot[category].get(stat, '-'))
                 yield "\t".join([category, stat, str(val), max_rate, tot])
         for args in (
                 ('Max CPU time spent by a single task: {}s',
@@ -106,7 +116,8 @@ class Summarizer(object):
                  self.stats_max['cpu']['user+sys__rate'],
                  lambda x: x * 100),
                 ('Overall CPU usage: {}%',
-                 job_tot['cpu']['user+sys'] / job_tot['time']['elapsed'],
+                 self.job_tot['cpu']['user+sys'] /
+                 self.job_tot['time']['elapsed'],
                  lambda x: x * 100),
                 ('Max memory used by a single task: {}GB',
                  self.stats_max['mem']['rss'],
@@ -124,6 +135,47 @@ class Summarizer(object):
                 val = transform(val)
             yield "# "+format_string.format(self._format(val))
 
+    def _recommend_gen(self):
+        return itertools.chain(
+            self._recommend_cpu(),
+            self._recommend_ram())
+
+    def _recommend_cpu(self):
+        """Recommend asking for 4 cores if max CPU usage was 333%"""
+
+        cpu_max_rate = self.stats_max['cpu']['user+sys__rate']
+        if cpu_max_rate == float('-Inf'):
+            logger.warning('%s: no CPU usage data', self.label)
+            return
+        used_cores = int(math.ceil(cpu_max_rate))
+        asked_cores =  self.existing_constraints.get('min_cores_per_node')
+        if asked_cores is None or used_cores < asked_cores:
+            yield (
+                '#!! {} max CPU usage was {}% -- '
+                'try runtime_constraints "min_cores_per_node":{}'
+            ).format(
+                self.label,
+                int(math.ceil(cpu_max_rate*100)),
+                int(used_cores))
+
+    def _recommend_ram(self):
+        """Recommend asking for 2048 MiB RAM if max rss was 1248 MiB"""
+
+        used_ram = self.stats_max['mem']['rss']
+        if used_ram == float('-Inf'):
+            logger.warning('%s: no memory usage data', self.label)
+            return
+        used_ram = math.ceil(float(used_ram) / (1<<20))
+        asked_ram = self.existing_constraints.get('min_ram_mb_per_node')
+        if asked_ram is None or math.ceil(used_ram/(1<<10)) < asked_ram/(1<<10):
+            yield (
+                '#!! {} never used more than {} MiB RAM -- '
+                'try runtime_constraints "min_ram_mb_per_node":{}'
+            ).format(
+                self.label,
+                int(used_ram),
+                int(math.ceil(used_ram/(1<<10))*(1<<10)))
+
     def _format(self, val):
         """Return a string representation of a stat.
 
@@ -133,6 +185,7 @@ class Summarizer(object):
         else:
             return '{}'.format(val)
 
+
 class CollectionSummarizer(Summarizer):
     def __init__(self, collection_id):
         collection = arvados.collection.CollectionReader(collection_id)
@@ -141,7 +194,9 @@ class CollectionSummarizer(Summarizer):
             raise ValueError(
                 "collection {} has {} files; need exactly one".format(
                     collection_id, len(filenames)))
-        super(CollectionSummarizer, self).__init__(collection.open(filenames[0]))
+        super(CollectionSummarizer, self).__init__(
+            collection.open(filenames[0]))
+
 
 class JobSummarizer(CollectionSummarizer):
     def __init__(self, job):
@@ -150,12 +205,15 @@ class JobSummarizer(CollectionSummarizer):
             self.job = arv.jobs().get(uuid=job).execute()
         else:
             self.job = job
+        self.label = self.job['uuid']
+        self.existing_constraints = self.job.get('runtime_constraints', {})
         if not self.job['log']:
             raise ValueError(
                 "job {} has no log; live summary not implemented".format(
                     self.job['uuid']))
         super(JobSummarizer, self).__init__(self.job['log'])
 
+
 class PipelineSummarizer():
     def __init__(self, pipeline_instance_uuid):
         arv = arvados.api('v1')
@@ -164,16 +222,17 @@ class PipelineSummarizer():
         self.summarizers = collections.OrderedDict()
         for cname, component in instance['components'].iteritems():
             if 'job' not in component:
-                print("{}: skipping component with no job assigned".format(
-                    cname), file=sys.stderr)
+                logger.warning(
+                    "%s: skipping component with no job assigned", cname)
             elif component['job'].get('log') is None:
-                print("{}: skipping component with no log available".format(
-                    cname), file=sys.stderr)
+                logger.warning(
+                    "%s: skipping job %s with no log available",
+                    cname, component['job'].get('uuid'))
             else:
-                print("{}: reading log from {}".format(
-                    cname, component['job']['log']), file=sys.stderr)
-                summarizer = CollectionSummarizer(component['job']['log'])
-                summarizer.job_uuid = component['job']['uuid']
+                logger.debug(
+                    "%s: reading log from %s", cname, component['job']['log'])
+                summarizer = JobSummarizer(component['job'])
+                summarizer.label = cname
                 self.summarizers[cname] = summarizer
 
     def run(self):
@@ -184,7 +243,7 @@ class PipelineSummarizer():
         txt = ''
         for cname, summarizer in self.summarizers.iteritems():
             txt += '### Summary for {} ({})\n'.format(
-                cname, summarizer.job_uuid)
+                cname, summarizer.job['uuid'])
             txt += summarizer.report()
             txt += '\n'
         return txt
diff --git a/tools/crunchstat-summary/tests/logfile_20151204190335.txt.gz.report b/tools/crunchstat-summary/tests/logfile_20151204190335.txt.gz.report
index ef7beb1..c94cd24 100644
--- a/tools/crunchstat-summary/tests/logfile_20151204190335.txt.gz.report
+++ b/tools/crunchstat-summary/tests/logfile_20151204190335.txt.gz.report
@@ -28,3 +28,5 @@ time	elapsed	80	-	80
 # Max memory used by a single task: 0.35GB
 # Max network traffic in a single task: 1.79GB
 # Max network speed in a single interval: 42.58MB/s
+#!! job max CPU usage was 13% -- try runtime_constraints "min_cores_per_node":1
+#!! job never used more than 334 MiB RAM -- try runtime_constraints "min_ram_mb_per_node":1024
diff --git a/tools/crunchstat-summary/tests/logfile_20151210063411.txt.gz.report b/tools/crunchstat-summary/tests/logfile_20151210063411.txt.gz.report
index 38af3e7..e711824 100644
--- a/tools/crunchstat-summary/tests/logfile_20151210063411.txt.gz.report
+++ b/tools/crunchstat-summary/tests/logfile_20151210063411.txt.gz.report
@@ -15,3 +15,4 @@ time	elapsed	2	-	4
 # Overall CPU usage: 0.00%
 # Max memory used by a single task: 0.00GB
 # Max network traffic in a single task: 0.00GB
+#!! job never used more than 1 MiB RAM -- try runtime_constraints "min_ram_mb_per_node":1024
diff --git a/tools/crunchstat-summary/tests/logfile_20151210063439.txt.gz.report b/tools/crunchstat-summary/tests/logfile_20151210063439.txt.gz.report
index 7e42d61..5772cb4 100644
--- a/tools/crunchstat-summary/tests/logfile_20151210063439.txt.gz.report
+++ b/tools/crunchstat-summary/tests/logfile_20151210063439.txt.gz.report
@@ -15,3 +15,4 @@ time	elapsed	2	-	3
 # Overall CPU usage: 0.00%
 # Max memory used by a single task: 0.00GB
 # Max network traffic in a single task: 0.00GB
+#!! job never used more than 1 MiB RAM -- try runtime_constraints "min_ram_mb_per_node":1024
diff --git a/tools/crunchstat-summary/tests/test_examples.py b/tools/crunchstat-summary/tests/test_examples.py
index 1bce693..a19d7ad 100644
--- a/tools/crunchstat-summary/tests/test_examples.py
+++ b/tools/crunchstat-summary/tests/test_examples.py
@@ -24,7 +24,7 @@ class ReportDiff(unittest.TestCase):
             expect, got, fromfile=expectfile, tofile="(generated)")))
 
 
-class ExampleLogsTestCase(ReportDiff):
+class SummarizeFile(ReportDiff):
     def test_example_files(self):
         for fnm in glob.glob(os.path.join(TESTS_DIR, '*.txt.gz')):
             logfile = os.path.join(TESTS_DIR, fnm)
@@ -35,22 +35,28 @@ class ExampleLogsTestCase(ReportDiff):
             self.diff_known_report(logfile, summarizer)
 
 
-class LookupJobUUID(ReportDiff):
-    fake_uuid = 'zzzzz-8i9sb-jq0ekny1xou3zoh'
+class SummarizeJob(ReportDiff):
+    fake_job_uuid = 'zzzzz-8i9sb-jjjjjjjjjjjjjjj'
+    fake_log_id = 'fake-log-collection-id'
+    fake_job = {
+        'uuid': fake_job_uuid,
+        'log': fake_log_id,
+    }
+    logfile = os.path.join(TESTS_DIR, 'logfile_20151204190335.txt.gz')
 
     @mock.patch('arvados.collection.CollectionReader')
     @mock.patch('arvados.api')
-    def test_job_uuid(self, mock_api, mock_cr):
-        logfile = os.path.join(TESTS_DIR, 'logfile_20151204190335.txt.gz')
-        mock_api().jobs().get().execute.return_value = {'log': 'fake-uuid'}
+    def test_job_report(self, mock_api, mock_cr):
+        mock_api().jobs().get().execute.return_value = self.fake_job
         mock_cr().__iter__.return_value = ['fake-logfile.txt']
-        mock_cr().open.return_value = gzip.open(logfile)
+        mock_cr().open.return_value = gzip.open(self.logfile)
         args = crunchstat_summary.command.ArgumentParser().parse_args(
-            ['--job', self.fake_uuid])
+            ['--job', self.fake_job_uuid])
         summarizer = crunchstat_summary.command.Command(args).summarizer()
         summarizer.run()
-        self.diff_known_report(logfile, summarizer)
-        mock_api().jobs().get.assert_called_with(uuid=self.fake_uuid)
+        self.diff_known_report(self.logfile, summarizer)
+        mock_api().jobs().get.assert_called_with(uuid=self.fake_job_uuid)
+        mock_cr.assert_called_with(self.fake_log_id)
         mock_cr().open.assert_called_with('fake-logfile.txt')
 
 
@@ -63,12 +69,20 @@ class SummarizePipeline(ReportDiff):
                 'job': {
                     'uuid': 'zzzzz-8i9sb-000000000000000',
                     'log': 'fake-log-pdh-0',
+                    'runtime_constraints': {
+                        'min_ram_mb_per_node': 1024,
+                        'min_cores_per_node': 1,
+                    },
                 },
             }],
             ['bar', {
                 'job': {
                     'uuid': 'zzzzz-8i9sb-000000000000001',
                     'log': 'fake-log-pdh-1',
+                    'runtime_constraints': {
+                        'min_ram_mb_per_node': 1024,
+                        'min_cores_per_node': 1,
+                    },
                 },
             }],
             ['no-job-assigned', {}],
@@ -81,6 +95,10 @@ class SummarizePipeline(ReportDiff):
                 'job': {
                     'uuid': 'zzzzz-8i9sb-000000000000002',
                     'log': 'fake-log-pdh-2',
+                    'runtime_constraints': {
+                        'min_ram_mb_per_node': 1024,
+                        'min_cores_per_node': 1,
+                    },
                 },
             }]]),
     }
@@ -98,13 +116,16 @@ class SummarizePipeline(ReportDiff):
         summarizer = crunchstat_summary.command.Command(args).summarizer()
         summarizer.run()
 
+        job_report = [
+            line for line in open(logfile+'.report').readlines()
+            if not line.startswith('#!! ')]
         expect = (
             ['### Summary for foo (zzzzz-8i9sb-000000000000000)\n'] +
-            open(logfile+'.report').readlines() + ['\n'] +
+            job_report + ['\n'] +
             ['### Summary for bar (zzzzz-8i9sb-000000000000001)\n'] +
-            open(logfile+'.report').readlines() + ['\n'] +
+            job_report + ['\n'] +
             ['### Summary for baz (zzzzz-8i9sb-000000000000002)\n'] +
-            open(logfile+'.report').readlines())
+            job_report)
         self.diff_report(summarizer, expect)
         mock_cr.assert_has_calls(
             [

commit ef6352c883eb5ab027e8bf3e27477d7f9102c286
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Dec 21 14:42:44 2015 -0500

    7883: Add --pipeline-instance mode: generate a report for each finished component.

diff --git a/tools/crunchstat-summary/bin/crunchstat-summary b/tools/crunchstat-summary/bin/crunchstat-summary
index 662d783..f42d30b 100755
--- a/tools/crunchstat-summary/bin/crunchstat-summary
+++ b/tools/crunchstat-summary/bin/crunchstat-summary
@@ -7,6 +7,6 @@ import crunchstat_summary.summarizer
 import sys
 
 args = crunchstat_summary.command.ArgumentParser().parse_args(sys.argv[1:])
-s = crunchstat_summary.summarizer.Summarizer(args)
+s = crunchstat_summary.command.Command(args).summarizer()
 s.run()
 print(s.report(), end='')
diff --git a/tools/crunchstat-summary/crunchstat_summary/command.py b/tools/crunchstat-summary/crunchstat_summary/command.py
index 8186e5d..fc37190 100644
--- a/tools/crunchstat-summary/crunchstat_summary/command.py
+++ b/tools/crunchstat-summary/crunchstat_summary/command.py
@@ -1,4 +1,8 @@
 import argparse
+import gzip
+import sys
+
+from crunchstat_summary import summarizer
 
 
 class ArgumentParser(argparse.ArgumentParser):
@@ -10,5 +14,27 @@ class ArgumentParser(argparse.ArgumentParser):
             '--job', type=str, metavar='UUID',
             help='Look up the specified job and read its log data from Keep')
         src.add_argument(
+            '--pipeline-instance', type=str, metavar='UUID',
+            help='Summarize each component of the given pipeline instance')
+        src.add_argument(
             '--log-file', type=str,
             help='Read log data from a regular file')
+
+
+class Command(object):
+    def __init__(self, args):
+        self.args = args
+
+    def summarizer(self):
+        if self.args.pipeline_instance:
+            return summarizer.PipelineSummarizer(self.args.pipeline_instance)
+        elif self.args.job:
+            return summarizer.JobSummarizer(self.args.job)
+        elif self.args.log_file:
+            if self.args.log_file.endswith('.gz'):
+                fh = gzip.open(self.args.log_file)
+            else:
+                fh = open(self.args.log_file)
+            return summarizer.Summarizer(fh)
+        else:
+            return summarizer.Summarizer(sys.stdin)
diff --git a/tools/crunchstat-summary/crunchstat_summary/summarizer.py b/tools/crunchstat-summary/crunchstat_summary/summarizer.py
index ac0964b..1b7f950 100644
--- a/tools/crunchstat-summary/crunchstat_summary/summarizer.py
+++ b/tools/crunchstat-summary/crunchstat_summary/summarizer.py
@@ -3,14 +3,13 @@ from __future__ import print_function
 import arvados
 import collections
 import functools
-import gzip
 import re
 import sys
 
 
 class Summarizer(object):
-    def __init__(self, args):
-        self.args = args
+    def __init__(self, logdata):
+        self._logdata = logdata
 
     def run(self):
         # stats_max: {category: {stat: val}}
@@ -20,7 +19,7 @@ class Summarizer(object):
         # task_stats: {task_id: {category: {stat: val}}}
         self.task_stats = collections.defaultdict(
             functools.partial(collections.defaultdict, dict))
-        for line in self._logdata():
+        for line in self._logdata:
             m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) success in (?P<elapsed>\d+) seconds', line)
             if m:
                 task_id = m.group('seq')
@@ -35,6 +34,8 @@ class Summarizer(object):
             if m.group('category').endswith(':'):
                 # "notice:" etc.
                 continue
+            elif m.group('category') == 'error':
+                continue
             task_id = m.group('seq')
             this_interval_s = None
             for group in ['current', 'interval']:
@@ -44,10 +45,15 @@ class Summarizer(object):
                 words = m.group(group).split(' ')
                 stats = {}
                 for val, stat in zip(words[::2], words[1::2]):
-                    if '.' in val:
-                        stats[stat] = float(val)
-                    else:
-                        stats[stat] = int(val)
+                    try:
+                        if '.' in val:
+                            stats[stat] = float(val)
+                        else:
+                            stats[stat] = int(val)
+                    except ValueError as e:
+                        raise ValueError(
+                            'Error parsing {} stat in "{}": {!r}'.format(
+                                stat, line, e))
                 if 'user' in stats or 'sys' in stats:
                     stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0)
                 if 'tx' in stats or 'rx' in stats:
@@ -127,25 +133,58 @@ class Summarizer(object):
         else:
             return '{}'.format(val)
 
-    def _logdata(self):
-        if self.args.log_file:
-            if self.args.log_file.endswith('.gz'):
-                return gzip.open(self.args.log_file)
-            else:
-                return open(self.args.log_file)
-        elif self.args.job:
-            arv = arvados.api('v1')
-            job = arv.jobs().get(uuid=self.args.job).execute()
-            if not job['log']:
-                raise ValueError(
-                    "job {} has no log; live summary not implemented".format(
-                        self.args.job))
-            collection = arvados.collection.CollectionReader(job['log'])
-            filenames = [filename for filename in collection]
-            if len(filenames) != 1:
-                raise ValueError(
-                    "collection {} has {} files; need exactly one".format(
-                        job.log, len(filenames)))
-            return collection.open(filenames[0])
+class CollectionSummarizer(Summarizer):
+    def __init__(self, collection_id):
+        collection = arvados.collection.CollectionReader(collection_id)
+        filenames = [filename for filename in collection]
+        if len(filenames) != 1:
+            raise ValueError(
+                "collection {} has {} files; need exactly one".format(
+                    collection_id, len(filenames)))
+        super(CollectionSummarizer, self).__init__(collection.open(filenames[0]))
+
+class JobSummarizer(CollectionSummarizer):
+    def __init__(self, job):
+        arv = arvados.api('v1')
+        if isinstance(job, str):
+            self.job = arv.jobs().get(uuid=job).execute()
         else:
-            return sys.stdin
+            self.job = job
+        if not self.job['log']:
+            raise ValueError(
+                "job {} has no log; live summary not implemented".format(
+                    self.job['uuid']))
+        super(JobSummarizer, self).__init__(self.job['log'])
+
+class PipelineSummarizer():
+    def __init__(self, pipeline_instance_uuid):
+        arv = arvados.api('v1')
+        instance = arv.pipeline_instances().get(
+            uuid=pipeline_instance_uuid).execute()
+        self.summarizers = collections.OrderedDict()
+        for cname, component in instance['components'].iteritems():
+            if 'job' not in component:
+                print("{}: skipping component with no job assigned".format(
+                    cname), file=sys.stderr)
+            elif component['job'].get('log') is None:
+                print("{}: skipping component with no log available".format(
+                    cname), file=sys.stderr)
+            else:
+                print("{}: reading log from {}".format(
+                    cname, component['job']['log']), file=sys.stderr)
+                summarizer = CollectionSummarizer(component['job']['log'])
+                summarizer.job_uuid = component['job']['uuid']
+                self.summarizers[cname] = summarizer
+
+    def run(self):
+        for summarizer in self.summarizers.itervalues():
+            summarizer.run()
+
+    def report(self):
+        txt = ''
+        for cname, summarizer in self.summarizers.iteritems():
+            txt += '### Summary for {} ({})\n'.format(
+                cname, summarizer.job_uuid)
+            txt += summarizer.report()
+            txt += '\n'
+        return txt
diff --git a/tools/crunchstat-summary/tests/test_examples.py b/tools/crunchstat-summary/tests/test_examples.py
index dbc3843..1bce693 100644
--- a/tools/crunchstat-summary/tests/test_examples.py
+++ b/tools/crunchstat-summary/tests/test_examples.py
@@ -1,22 +1,115 @@
+import arvados
+import collections
 import crunchstat_summary.command
 import crunchstat_summary.summarizer
 import difflib
 import glob
+import gzip
+import mock
 import os
 import unittest
 
 
-class ExampleLogsTestCase(unittest.TestCase):
+TESTS_DIR = os.path.dirname(os.path.abspath(__file__))
+
+class ReportDiff(unittest.TestCase):
+    def diff_known_report(self, logfile, summarizer):
+        expectfile = logfile+'.report'
+        expect = open(expectfile).readlines()
+        self.diff_report(summarizer, expect, expectfile=expectfile)
+
+    def diff_report(self, summarizer, expect, expectfile=None):
+        got = [x+"\n" for x in summarizer.report().strip("\n").split("\n")]
+        self.assertEqual(got, expect, "\n"+"".join(difflib.context_diff(
+            expect, got, fromfile=expectfile, tofile="(generated)")))
+
+
+class ExampleLogsTestCase(ReportDiff):
     def test_example_files(self):
-        dirname = os.path.dirname(os.path.abspath(__file__))
-        for fnm in glob.glob(os.path.join(dirname, '*.txt.gz')):
-            logfile = os.path.join(dirname, fnm)
+        for fnm in glob.glob(os.path.join(TESTS_DIR, '*.txt.gz')):
+            logfile = os.path.join(TESTS_DIR, fnm)
             args = crunchstat_summary.command.ArgumentParser().parse_args(
                 ['--log-file', logfile])
-            summarizer = crunchstat_summary.summarizer.Summarizer(args)
+            summarizer = crunchstat_summary.command.Command(args).summarizer()
             summarizer.run()
-            got = [x+"\n" for x in summarizer.report().strip("\n").split("\n")]
-            expectfile = logfile+'.report'
-            expect = open(expectfile).readlines()
-            self.assertEqual(got, expect, "\n"+"".join(difflib.context_diff(
-                expect, got, fromfile=expectfile, tofile="(generated)")))
+            self.diff_known_report(logfile, summarizer)
+
+
+class LookupJobUUID(ReportDiff):
+    fake_uuid = 'zzzzz-8i9sb-jq0ekny1xou3zoh'
+
+    @mock.patch('arvados.collection.CollectionReader')
+    @mock.patch('arvados.api')
+    def test_job_uuid(self, mock_api, mock_cr):
+        logfile = os.path.join(TESTS_DIR, 'logfile_20151204190335.txt.gz')
+        mock_api().jobs().get().execute.return_value = {'log': 'fake-uuid'}
+        mock_cr().__iter__.return_value = ['fake-logfile.txt']
+        mock_cr().open.return_value = gzip.open(logfile)
+        args = crunchstat_summary.command.ArgumentParser().parse_args(
+            ['--job', self.fake_uuid])
+        summarizer = crunchstat_summary.command.Command(args).summarizer()
+        summarizer.run()
+        self.diff_known_report(logfile, summarizer)
+        mock_api().jobs().get.assert_called_with(uuid=self.fake_uuid)
+        mock_cr().open.assert_called_with('fake-logfile.txt')
+
+
+class SummarizePipeline(ReportDiff):
+    fake_instance = {
+        'uuid': 'zzzzz-d1hrv-i3e77t9z5y8j9cc',
+        'owner_uuid': 'zzzzz-tpzed-xurymjxw79nv3jz',
+        'components': collections.OrderedDict([
+            ['foo', {
+                'job': {
+                    'uuid': 'zzzzz-8i9sb-000000000000000',
+                    'log': 'fake-log-pdh-0',
+                },
+            }],
+            ['bar', {
+                'job': {
+                    'uuid': 'zzzzz-8i9sb-000000000000001',
+                    'log': 'fake-log-pdh-1',
+                },
+            }],
+            ['no-job-assigned', {}],
+            ['unfinished-job', {
+                'job': {
+                    'uuid': 'zzzzz-8i9sb-xxxxxxxxxxxxxxx',
+                },
+            }],
+            ['baz', {
+                'job': {
+                    'uuid': 'zzzzz-8i9sb-000000000000002',
+                    'log': 'fake-log-pdh-2',
+                },
+            }]]),
+    }
+
+    @mock.patch('arvados.collection.CollectionReader')
+    @mock.patch('arvados.api')
+    def test_pipeline(self, mock_api, mock_cr):
+        logfile = os.path.join(TESTS_DIR, 'logfile_20151204190335.txt.gz')
+        mock_api().pipeline_instances().get().execute. \
+            return_value = self.fake_instance
+        mock_cr().__iter__.return_value = ['fake-logfile.txt']
+        mock_cr().open.side_effect = [gzip.open(logfile) for _ in range(3)]
+        args = crunchstat_summary.command.ArgumentParser().parse_args(
+            ['--pipeline-instance', self.fake_instance['uuid']])
+        summarizer = crunchstat_summary.command.Command(args).summarizer()
+        summarizer.run()
+
+        expect = (
+            ['### Summary for foo (zzzzz-8i9sb-000000000000000)\n'] +
+            open(logfile+'.report').readlines() + ['\n'] +
+            ['### Summary for bar (zzzzz-8i9sb-000000000000001)\n'] +
+            open(logfile+'.report').readlines() + ['\n'] +
+            ['### Summary for baz (zzzzz-8i9sb-000000000000002)\n'] +
+            open(logfile+'.report').readlines())
+        self.diff_report(summarizer, expect)
+        mock_cr.assert_has_calls(
+            [
+                mock.call('fake-log-pdh-0'),
+                mock.call('fake-log-pdh-1'),
+                mock.call('fake-log-pdh-2'),
+            ], any_order=True)
+        mock_cr().open.assert_called_with('fake-logfile.txt')

-----------------------------------------------------------------------


hooks/post-receive
--