[ARVADOS] created: 1.3.0-1240-g73c4c526b

Wed Jul 3 17:53:53 UTC 2019

at  73c4c526b0248c1574677558a8a78d33c29788b1 (commit)


commit 73c4c526b0248c1574677558a8a78d33c29788b1
Author: Tom Morris <tfmorris at veritasgenetics.com>
Date:   Wed Jul 3 13:52:34 2019 -0400

    Add a simple temp disk utilization based recommendation.
    
    refs #13913
    
    Arvados-DCO-1.1-Signed-off-by: Tom Morris <tfmorris at veritasgenetics.com>

diff --git a/tools/crunchstat-summary/crunchstat_summary/summarizer.py b/tools/crunchstat-summary/crunchstat_summary/summarizer.py
index a86702ed7..305042b2b 100644
--- a/tools/crunchstat-summary/crunchstat_summary/summarizer.py
+++ b/tools/crunchstat-summary/crunchstat_summary/summarizer.py
@@ -335,7 +335,9 @@ class Summarizer(object):
         return itertools.chain(
             self._recommend_cpu(),
             self._recommend_ram(),
-            self._recommend_keep_cache())
+            self._recommend_keep_cache(),
+            self._recommend_temp_disk(),
+            )
 
     def _recommend_cpu(self):
         """Recommend asking for 4 cores if max CPU usage was 333%"""
@@ -438,6 +440,21 @@ class Summarizer(object):
                 math.ceil(asked_cache * 2 / self._runtime_constraint_mem_unit()))
 
 
+    def _recommend_temp_disk(self):
+        """Recommend decreasing temp disk if utilization < 50%"""
+        total = float(self.job_tot['statfs']['total'])
+        utilization = (float(self.job_tot['statfs']['used']) / total)
+
+        if utilization < 50.8:
+            yield (
+                '#!! {} max temp disk utilization was {:.0f}% of {:.0f} MiB -- '
+                'consider reducing "tmpdirMin" and/or "outdirMin"'
+            ).format(
+                self.label,
+                utilization * 100.0,
+                total / MB)
+
+
     def _format(self, val):
         """Return a string representation of a stat.
 

commit c7f6db1306862a42d1726c16a1714cf1b26632ef
Author: Tom Morris <tfmorris at veritasgenetics.com>
Date:   Mon Apr 22 20:48:53 2019 -0400

    WIP Graph tmp disk usage
    
    Plots temp disk space used as well as total available.
    
    Also adds support for multiple metrics per graph and
    groups CPU and network stats by category.
    
    refs #13913
    
    Arvados-DCO-1.1-Signed-off-by: Tom Morris <tfmorris at veritasgenetics.com>

diff --git a/tools/crunchstat-summary/crunchstat_summary/dygraphs.py b/tools/crunchstat-summary/crunchstat_summary/dygraphs.py
index 6df440a14..14b9ad256 100644
--- a/tools/crunchstat-summary/crunchstat_summary/dygraphs.py
+++ b/tools/crunchstat-summary/crunchstat_summary/dygraphs.py
@@ -2,6 +2,8 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
+import sys
+
 import crunchstat_summary.webchart
 
 
@@ -13,21 +15,34 @@ class DygraphsChart(crunchstat_summary.webchart.WebChart):
     def headHTML(self):
         return '<link rel="stylesheet" href="{}">\n'.format(self.CSS)
 
-    def chartdata(self, label, tasks, stat):
+    def chartdata(self, label, tasks, stats):
+        '''For Crunch2, label is the name of container request,
+        tasks is the top level container and
+        stats is index by a tuple of (category, metric).
+        '''
         return {
-            'data': self._collate_data(tasks, stat),
+            'data': self._collate_data(tasks, stats),
             'options': {
+                'legend': 'always',
                 'connectSeparatedPoints': True,
-                'labels': ['elapsed']+[uuid for uuid, _ in tasks.items()],
-                'title': '{}: {} {}'.format(label, stat[0], stat[1]),
+                'labels': ['elapsed'] +  stats[1],
+                'title': '{}: {}'.format(label, stats[0]),
             },
         }
 
-    def _collate_data(self, tasks, stat):
+    def _collate_data(self, tasks, stats):
         data = []
         nulls = []
+        # uuid is category for crunch2
         for uuid, task in tasks.items():
-            for pt in task.series[stat]:
-                data.append([pt[0].total_seconds()] + nulls + [pt[1]])
+            # All stats in a category are assumed to have the same time base and same number of samples
+            category = stats[0]
+            series_names = stats[1]
+            sn0 = series_names[0]
+            series = task.series[(category,sn0)]
+            for i in range(len(series)):
+                pt = series[i]
+                vals = [task.series[(category,stat)][i][1] for stat in series_names[1:]]
+                data.append([pt[0].total_seconds()] + nulls + [pt[1]] + vals)
             nulls.append(None)
         return sorted(data)
diff --git a/tools/crunchstat-summary/crunchstat_summary/reader.py b/tools/crunchstat-summary/crunchstat_summary/reader.py
index 8ccdbc2fc..44fc0c567 100644
--- a/tools/crunchstat-summary/crunchstat_summary/reader.py
+++ b/tools/crunchstat-summary/crunchstat_summary/reader.py
@@ -5,6 +5,7 @@
 import arvados
 import itertools
 import queue
+import sys
 import threading
 
 from crunchstat_summary import logger
@@ -25,7 +26,8 @@ class CollectionReader(object):
         filenames = [filename for filename in collection]
         # Crunch2 has multiple stats files
         if len(filenames) > 1:
-            filenames = ['crunchstat.txt', 'arv-mount.txt']
+            filenames = ['crunchstat.txt', 'hoststat.txt', 'arv-mount.txt']
+            filenames = ['hoststat.txt']
         for filename in filenames:
             try:
                 self._readers.append(collection.open(filename))
diff --git a/tools/crunchstat-summary/crunchstat_summary/summarizer.py b/tools/crunchstat-summary/crunchstat_summary/summarizer.py
index 884f16b4a..a86702ed7 100644
--- a/tools/crunchstat-summary/crunchstat_summary/summarizer.py
+++ b/tools/crunchstat-summary/crunchstat_summary/summarizer.py
@@ -129,13 +129,14 @@ class Summarizer(object):
                 try:
                     self.label = m.group('job_uuid')
                 except IndexError:
-                    self.label = 'container'
-            if m.group('category').endswith(':'):
+                    self.label = 'label #1'
+            category = m.group('category')
+            if category.endswith(':'):
                 # "stderr crunchstat: notice: ..."
                 continue
-            elif m.group('category') in ('error', 'caught'):
+            elif category in ('error', 'caught'):
                 continue
-            elif m.group('category') in ('read', 'open', 'cgroup', 'CID', 'Running'):
+            elif category in ('read', 'open', 'cgroup', 'CID', 'Running'):
                 # "stderr crunchstat: read /proc/1234/net/dev: ..."
                 # (old logs are less careful with unprefixed error messages)
                 continue
@@ -221,11 +222,11 @@ class Summarizer(object):
                     if group == 'interval' and this_interval_s:
                             stat = stat + '__rate'
                             val = val / this_interval_s
-                            if stat in ['user+sys__rate', 'tx+rx__rate']:
+                            if stat in ['user+sys__rate', 'user__rate', 'sys__rate', 'tx+rx__rate', 'rx__rate', 'tx__rate']:
                                 task.series[category, stat].append(
                                     (timestamp - self.starttime, val))
                     else:
-                        if stat in ['rss']:
+                        if stat in ['rss','used','total']:
                             task.series[category, stat].append(
                                 (timestamp - self.starttime, val))
                         self.task_stats[task_id][category][stat] = val
@@ -315,7 +316,13 @@ class Summarizer(object):
                  (float(self.job_tot['blkio:0:0']['read']) /
                  float(self.job_tot['net:keep0']['rx']))
                  if self.job_tot['net:keep0']['rx'] > 0 else 0,
-                 lambda x: x * 100.0)):
+                 lambda x: x * 100.0),
+               ('Temp disk utilization {}%',
+                 (float(self.job_tot['statfs']['used']) /
+                 float(self.job_tot['statfs']['total']))
+                 if self.job_tot['statfs']['total'] > 0 else 0,
+                 lambda x: x * 100.0),
+                ):
             format_string, val, transform = args
             if val == float('-Inf'):
                 continue
diff --git a/tools/crunchstat-summary/crunchstat_summary/webchart.py b/tools/crunchstat-summary/crunchstat_summary/webchart.py
index cf0c1e67a..31afcf64e 100644
--- a/tools/crunchstat-summary/crunchstat_summary/webchart.py
+++ b/tools/crunchstat-summary/crunchstat_summary/webchart.py
@@ -45,10 +45,13 @@ class WebChart(object):
                 'label': s.long_label(),
                 'charts': [
                     self.chartdata(s.label, s.tasks, stat)
-                    for stat in (('cpu', 'user+sys__rate'),
-                                 ('mem', 'rss'),
-                                 ('net:eth0', 'tx+rx__rate'),
-                                 ('net:keep0', 'tx+rx__rate'))],
+                    for stat in (('cpu', ['user+sys__rate', 'user__rate', 'sys__rate']),
+                                 ('mem', ['rss']),
+                                 ('net:eth0', ['tx+rx__rate','rx__rate','tx__rate']),
+                                 ('net:keep0', ['tx+rx__rate','rx__rate','tx__rate']),
+                                 ('statfs', ['used', 'total']),
+                                 )
+                    ],
             }
             for s in self.summarizers]
 

-----------------------------------------------------------------------


hooks/post-receive
--