[ARVADOS] updated: a0a7b1a0c6f43a80449a76c00fb0c30858d38233
git at public.curoverse.com
Mon Apr 28 18:31:35 EDT 2014
Summary of changes:
services/datamanager/experimental/datamanager.py | 79 +++++++++++++++++++---
1 file changed, 70 insertions(+), 9 deletions(-)
via a0a7b1a0c6f43a80449a76c00fb0c30858d38233 (commit)
via bfb190a4c3f5bd995ccae37536c3234b14eaf3ed (commit)
via 389f092ce9dc3a08245530fcf70d535783e62813 (commit)
from de9903cfc08ea7c3da459e7c4ee5a744d52a7c89 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list them
in full below.
commit a0a7b1a0c6f43a80449a76c00fb0c30858d38233
Author: Misha Zatsman <misha at curoverse.com>
Date: Mon Apr 28 22:17:52 2014 +0000
Added printing of garbage collection report to CSV file. Fixed bug in free disk space computation. Closes #2622
diff --git a/services/datamanager/experimental/datamanager.py b/services/datamanager/experimental/datamanager.py
index 0448823..12b8d6a 100755
--- a/services/datamanager/experimental/datamanager.py
+++ b/services/datamanager/experimental/datamanager.py
@@ -4,6 +4,7 @@ import arvados
import argparse
import cgi
+import csv
import json
import logging
import math
@@ -405,12 +406,21 @@ def computeGarbageCollectionCandidates():
       mtime,
       disk_size,
       cumulative_disk_size,
-       float(free_keep_space - cumulative_disk_size)/total_keep_space))
+       float(free_keep_space + cumulative_disk_size)/total_keep_space))
  print 'The oldest Garbage Collection Candidates: '
  pprint.pprint(garbage_collection_report[:20])

+def outputGarbageCollectionReport(filename):
+  with open(filename, 'wb') as csvfile:
+    gcwriter = csv.writer(csvfile)
+    gcwriter.writerow(['block uuid', 'latest mtime', 'disk size',
+                       'cumulative size', 'disk free'])
+    for line in garbage_collection_report:
+      gcwriter.writerow(line)
+
+
def detectReplicationProblems():
  blocks_not_in_any_collections.update(
    set(block_to_replication.keys()).difference(block_to_collections.keys()))
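[Editor's note: the new function is the standard csv.writer pattern. A
minimal self-contained sketch with invented rows shaped like
garbage_collection_report entries ('wb' is the script's Python 2 idiom
for csv output files):

  import csv

  # Invented rows: (block uuid, latest mtime, disk size,
  #                 cumulative size, disk free)
  rows = [('block-1', 1398721072, 1024, 1024, 0.51),
          ('block-2', 1398721090, 2048, 3072, 0.52)]

  with open('gc_report.csv', 'wb') as csvfile:  # Python 2: binary mode
    gcwriter = csv.writer(csvfile)
    gcwriter.writerow(['block uuid', 'latest mtime', 'disk size',
                       'cumulative size', 'disk free'])
    for row in rows:
      gcwriter.writerow(row)
]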
@@ -491,6 +501,10 @@ parser.add_argument('--user-storage-log-event-type',
                    default='user-storage-report',
                    help=('The event type to set when logging user '
                          'storage usage to workbench.'))
+parser.add_argument('--garbage-collection-file',
+                    default='',
+                    help=('The file to write a garbage collection report, or '
+                          'leave empty for no report.'))

args = None
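[Editor's note: with the new flag, an invocation along the lines of

  python datamanager.py --garbage-collection-file gc_report.csv

(shown for illustration; other arguments keep their defaults) writes the
report, while the empty-string default leaves report generation off.]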
@@ -544,7 +558,7 @@ multiplied by current replication level)
* cumulative disk size: The sum of this block's disk size and all the
blocks listed above it
* disk free: The proportion of our disk space that would be free if we
-deleted this block and all the above. So this is (free disk space -
+deleted this block and all the above. So this is (free disk space +
cumulative disk size) / total disk capacity
"""
@@ -603,12 +617,17 @@ def loadAllData():
  computeReplication(keep_blocks)

-  computeGarbageCollectionCandidates()
-
  log.info('average replication level is %f',
           (float(sum(block_to_replication.values())) /
            len(block_to_replication)))

+  computeGarbageCollectionCandidates()
+
+  if args.garbage_collection_file:
+    log.info('Writing garbage Collection report to %s',
+             args.garbage_collection_file)
+    outputGarbageCollectionReport(args.garbage_collection_file)
+
  detectReplicationProblems()
  computeUserStorageUsage()
commit bfb190a4c3f5bd995ccae37536c3234b14eaf3ed
Author: Misha Zatsman <misha at curoverse.com>
Date: Mon Apr 28 21:58:57 2014 +0000
Added computation of resulting free space in garbage collection report.
diff --git a/services/datamanager/experimental/datamanager.py b/services/datamanager/experimental/datamanager.py
index 684e07f..0448823 100755
--- a/services/datamanager/experimental/datamanager.py
+++ b/services/datamanager/experimental/datamanager.py
@@ -400,10 +400,12 @@ def computeGarbageCollectionCandidates():
  for block,mtime in garbage_collection_priority:
    disk_size = blockDiskUsage(block)
    cumulative_disk_size += disk_size
-    garbage_collection_report.append((block,
-                                      mtime,
-                                      disk_size,
-                                      cumulative_disk_size))
+    garbage_collection_report.append(
+      (block,
+       mtime,
+       disk_size,
+       cumulative_disk_size,
+       float(free_keep_space - cumulative_disk_size)/total_keep_space))
  print 'The oldest Garbage Collection Candidates: '
  pprint.pprint(garbage_collection_report[:20])
@@ -541,9 +543,9 @@ cumulative size)
multiplied by current replication level)
* cumulative disk size: The sum of this block's disk size and all the
blocks listed above it
-* TODO: disk free: The proportion of our disk space that would be free
-if we deleted this block and all the above. So this is (current disk
-space used - cumulative disk size) / total disk capacity
+* disk free: The proportion of our disk space that would be free if we
+deleted this block and all the above. So this is (free disk space -
+cumulative disk size) / total disk capacity
"""
# Stuff to report on
commit 389f092ce9dc3a08245530fcf70d535783e62813
Author: Misha Zatsman <misha at curoverse.com>
Date: Mon Apr 28 21:48:47 2014 +0000
Added retrieval of stats from keep servers and computing total and free space on keep disks.
diff --git a/services/datamanager/experimental/datamanager.py b/services/datamanager/experimental/datamanager.py
index 4bd2a26..684e07f 100755
--- a/services/datamanager/experimental/datamanager.py
+++ b/services/datamanager/experimental/datamanager.py
@@ -4,6 +4,7 @@ import arvados
import argparse
import cgi
+import json
import logging
import math
import pprint
@@ -352,6 +353,29 @@ def getKeepBlocks(keep_servers):
                     if line])
  return blocks

+def getKeepStats(keep_servers):
+  MOUNT_COLUMN = 5
+  TOTAL_COLUMN = 1
+  FREE_COLUMN = 3
+  DISK_BLOCK_SIZE = 1024
+  stats = []
+  for host,port in keep_servers:
+    response = urllib2.urlopen('http://%s:%d/status.json' % (host, port))
+
+    parsed_json = json.load(response)
+    df_entries = [line.split()
+                  for line in parsed_json['df'].split('\n')
+                  if line]
+    keep_volumes = [columns
+                    for columns in df_entries
+                    if 'keep' in columns[MOUNT_COLUMN]]
+    total_space = DISK_BLOCK_SIZE*sum(map(int,map(itemgetter(TOTAL_COLUMN),
+                                                  keep_volumes)))
+    free_space = DISK_BLOCK_SIZE*sum(map(int,map(itemgetter(FREE_COLUMN),
+                                                 keep_volumes)))
+    stats.append([total_space, free_space])
+  return stats
+
def computeReplication(keep_blocks):
  for server_blocks in keep_blocks:
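[Editor's note: to make the parsing in getKeepStats() concrete, here is
the same logic run against a fabricated df listing (the device names,
mount points, and numbers are invented; the column positions follow
standard df output, where column 1 is 1K-blocks, column 3 is Available,
and column 5 is Mounted on):

  from operator import itemgetter

  MOUNT_COLUMN = 5   # 'Mounted on' column of df output
  TOTAL_COLUMN = 1   # '1K-blocks' column
  FREE_COLUMN = 3    # 'Available' column
  DISK_BLOCK_SIZE = 1024

  # A fabricated 'df' payload of the kind status.json carries:
  df_text = ('Filesystem 1K-blocks Used Available Use% Mounted on\n'
             '/dev/sda1 10485760 8388608 2097152 80% /\n'
             '/dev/sdb1 104857600 52428800 52428800 50% /mnt/keep0\n')

  df_entries = [line.split() for line in df_text.split('\n') if line]
  keep_volumes = [columns for columns in df_entries
                  if 'keep' in columns[MOUNT_COLUMN]]
  total_space = DISK_BLOCK_SIZE*sum(map(int, map(itemgetter(TOTAL_COLUMN),
                                                 keep_volumes)))
  free_space = DISK_BLOCK_SIZE*sum(map(int, map(itemgetter(FREE_COLUMN),
                                                keep_volumes)))
  # Only /mnt/keep0 survives the 'keep' filter, so total_space is
  # 100 GiB in bytes and free_space is 50 GiB in bytes.
]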
@@ -498,6 +522,10 @@ user_to_usage = defaultdict(lambda : [0,]*NUM_COLS)
keep_servers = []
keep_blocks = []
+keep_stats = []
+total_keep_space = 0
+free_keep_space = 0
+
block_to_replication = defaultdict(lambda: 0)
block_to_latest_mtime = maxdict()
@@ -559,6 +587,18 @@ def loadAllData():
  global keep_blocks
  keep_blocks = getKeepBlocks(keep_servers)

+  log.info('Getting Stats from each Keep Server.')
+  global keep_stats, total_keep_space, free_keep_space
+  keep_stats = getKeepStats(keep_servers)
+
+  total_keep_space = sum(map(itemgetter(0), keep_stats))
+  free_keep_space = sum(map(itemgetter(1), keep_stats))
+
+  log.info('Total disk space: %s, Free disk space: %s (%d%%).' %
+           (fileSizeFormat(total_keep_space),
+            fileSizeFormat(free_keep_space),
+            100*free_keep_space/total_keep_space))
+
  computeReplication(keep_blocks)
  computeGarbageCollectionCandidates()
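[Editor's note: a worked illustration of the aggregation above. The
per-server numbers are invented, and fileSizeFormat() is the script's
own helper, so plain byte counts are shown instead:

  from operator import itemgetter

  # Invented [total_bytes, free_bytes] pairs, one per Keep server:
  keep_stats = [[107374182400, 53687091200],  # 100 GiB total, 50 GiB free
                [107374182400, 32212254720]]  # 100 GiB total, 30 GiB free

  total_keep_space = sum(map(itemgetter(0), keep_stats))  # 200 GiB in bytes
  free_keep_space = sum(map(itemgetter(1), keep_stats))   # 80 GiB in bytes
  print 100*free_keep_space/total_keep_space              # prints 40

The %d%% in the log line relies on Python 2 integer division, which
truncates the percentage to a whole number.]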
-----------------------------------------------------------------------
hooks/post-receive