[ARVADOS] updated: f76947a7c1ba973a11e563b0977d54a2ece2ce38

git at public.curoverse.com git at public.curoverse.com
Fri Apr 4 22:02:50 EDT 2014


Summary of changes:
 services/datamanager/datamanager.py |  157 ++++++++++++++++++++++++++---------
 1 file changed, 117 insertions(+), 40 deletions(-)

       via  f76947a7c1ba973a11e563b0977d54a2ece2ce38 (commit)
       via  e3ce426fda2e7d2283d7092f988245c6900c8949 (commit)
       via  cd2672dc235efaacd533aadeef489994fe684a25 (commit)
      from  40cc2b01c2a3ff911549c2d31c8195905109633d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit f76947a7c1ba973a11e563b0977d54a2ece2ce38
Author: Misha Zatsman <misha at curoverse.com>
Date:   Sat Apr 5 02:01:24 2014 +0000

    Started using global to actually assign to global variables instead of local variables that shadow them. Added webserver and threads!

diff --git a/services/datamanager/datamanager.py b/services/datamanager/datamanager.py
index a33afe1..458c70a 100755
--- a/services/datamanager/datamanager.py
+++ b/services/datamanager/datamanager.py
@@ -300,10 +300,13 @@ keep_servers = []
 keep_blocks = []
 block_to_replication = defaultdict(lambda: 0)
 
+all_data_loaded = False
+
 def loadAllData():
   checkUserIsAdmin()
 
   log.info('Building Collection List')
+  global collection_uuids
   collection_uuids = filter(None, [extractUuid(candidate)
                                    for candidate in buildCollectionsList()])
 
@@ -324,11 +327,13 @@ def loadAllData():
   reportBusiestUsers()
 
   log.info('Getting Keep Servers')
+  global keep_servers
   keep_servers = getKeepServers()
 
   print keep_servers
 
   log.info('Getting Blocks from each Keep Server.')
+  global keep_blocks
   keep_blocks = getKeepBlocks(keep_servers)
 
   computeReplication(keep_blocks)
@@ -337,7 +342,56 @@ def loadAllData():
 
   reportUserDiskUsage()
 
-loadAllData()
-
-# http://stackoverflow.com/questions/14088294/multithreaded-web-server-in-python
-
+  global all_data_loaded
+  all_data_loaded = True
+
+
+class DataManagerHandler(BaseHTTPRequestHandler):
+
+  def writeTop(self, title):
+    self.wfile.write('<HTML><HEAD><TITLE>%s</TITLE></HEAD>\n<BODY>' % title)
+    
+  def writeBottom(self):
+    self.wfile.write('</BODY></HTML>\n')
+    
+  def writeHomePage(self):
+    self.send_response(200)
+    self.end_headers()
+    self.writeTop('Home')
+    self.wfile.write('<TABLE>')
+    self.wfile.write('<TR><TH>user'
+                     '<TH>unweighted readable block size'
+                     '<TH>weighted readable block size'
+                     '<TH>unweighted persisted block size'
+                     '<TH>weighted persisted block size</TR>\n')
+    for user, usage in user_to_usage.items():
+      self.wfile.write('<TR><TD>%s<TD>%s<TD>%s<TD>%s<TD>%s</TR>\n' %
+                       (user,
+                        fileSizeFormat(usage[UNWEIGHTED_READ_SIZE_COL]),
+                        fileSizeFormat(usage[WEIGHTED_READ_SIZE_COL]),
+                        fileSizeFormat(usage[UNWEIGHTED_PERSIST_SIZE_COL]),
+                        fileSizeFormat(usage[WEIGHTED_PERSIST_SIZE_COL])))
+    self.wfile.write('</TABLE>\n')
+    self.writeBottom()
+
+  def do_GET(self):
+    if not all_data_loaded:
+      self.send_response(503)
+      self.end_headers()
+      self.writeTop('Not ready')
+      self.wfile.write('Sorry, but I am still loading all the data I need.\n')
+      self.writeBottom()
+    else:
+      self.writeHomePage()
+    return
+
+class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
+  """Handle requests in a separate thread."""
+
+#if __name__ == '__main__':
+
+loader = threading.Thread(target = loadAllData, name = 'loader')
+loader.start()
+
+server = ThreadedHTTPServer(('localhost', 9090), DataManagerHandler)
+server.serve_forever()

commit e3ce426fda2e7d2283d7092f988245c6900c8949
Author: Misha Zatsman <misha at curoverse.com>
Date:   Sat Apr 5 00:43:06 2014 +0000

    Moved main flow into loadAllData method to get ready for multithreading.

diff --git a/services/datamanager/datamanager.py b/services/datamanager/datamanager.py
index 2a3594d..a33afe1 100755
--- a/services/datamanager/datamanager.py
+++ b/services/datamanager/datamanager.py
@@ -3,14 +3,18 @@
 import arvados
 
 import argparse
+
 import logging
 import pprint
 import math
 import re
+import threading
 import urllib2
 
+from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
 from collections import defaultdict
 from operator import itemgetter
+from SocketServer import ThreadingMixIn
 
 arv = arvados.api('v1')
 
@@ -236,11 +240,9 @@ def getKeepBlocks(keep_servers):
 
 
 def computeReplication(keep_blocks):
-  block_to_replication = defaultdict(lambda: 0)
   for server_blocks in keep_blocks:
     for block_uuid, _ in server_blocks:
       block_to_replication[block_uuid] += 1
-  return block_to_replication
 
 
 # This is the main flow here
@@ -275,25 +277,10 @@ stderr_handler.setFormatter(
   logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s'))
 log.addHandler(stderr_handler)
 
-checkUserIsAdmin()
-
-log.info('Building Collection List')
-collection_uuids = filter(None, [extractUuid(candidate)
-                                 for candidate in buildCollectionsList()])
-
-log.info('Reading Collections')
-readCollections(collection_uuids)
-
-if args.verbose:
-  pprint.pprint(CollectionInfo.all_by_uuid)
-
-log.info('Reading Links')
-readLinks()
-
-reportMostPopularCollections()
+# Global Data - don't try this at home
+collection_uuids = []
 
 # These maps all map from uuids to a set of uuids
-# The sets all contain collection uuids.
 block_to_collections = defaultdict(set)  # keep blocks
 reader_to_collections = defaultdict(set)  # collection(s) for which the user has read access
 persister_to_collections = defaultdict(set)  # collection(s) which the user has persisted
@@ -302,11 +289,6 @@ block_to_persisters = defaultdict(set)
 reader_to_blocks = defaultdict(set)
 persister_to_blocks = defaultdict(set)
 
-log.info('Building Maps')
-buildMaps()
-
-reportBusiestUsers()
-
 UNWEIGHTED_READ_SIZE_COL = 0
 WEIGHTED_READ_SIZE_COL = 1
 UNWEIGHTED_PERSIST_SIZE_COL = 2
@@ -314,16 +296,48 @@ WEIGHTED_PERSIST_SIZE_COL = 3
 NUM_COLS = 4
 user_to_usage = defaultdict(lambda : [0,]*NUM_COLS)
 
-log.info('Getting Keep Servers')
-keep_servers = getKeepServers()
+keep_servers = []
+keep_blocks = []
+block_to_replication = defaultdict(lambda: 0)
+
+def loadAllData():
+  checkUserIsAdmin()
+
+  log.info('Building Collection List')
+  collection_uuids = filter(None, [extractUuid(candidate)
+                                   for candidate in buildCollectionsList()])
+
+  log.info('Reading Collections')
+  readCollections(collection_uuids)
+
+  if args.verbose:
+    pprint.pprint(CollectionInfo.all_by_uuid)
+
+  log.info('Reading Links')
+  readLinks()
+
+  reportMostPopularCollections()
+
+  log.info('Building Maps')
+  buildMaps()
+
+  reportBusiestUsers()
+
+  log.info('Getting Keep Servers')
+  keep_servers = getKeepServers()
+
+  print keep_servers
+
+  log.info('Getting Blocks from each Keep Server.')
+  keep_blocks = getKeepBlocks(keep_servers)
+
+  computeReplication(keep_blocks)
 
-print keep_servers
+  log.info('average replication level is %f', (float(sum(block_to_replication.values())) / len(block_to_replication)))
 
-log.info('Getting Blocks from each Keep Server.')
-keep_blocks = getKeepBlocks(keep_servers)
+  reportUserDiskUsage()
 
-block_to_replication = computeReplication(keep_blocks)
+loadAllData()
 
-log.info('average replication level is %f', (float(sum(block_to_replication.values())) / len(block_to_replication)))
+# http://stackoverflow.com/questions/14088294/multithreaded-web-server-in-python
 
-reportUserDiskUsage()

commit cd2672dc235efaacd533aadeef489994fe684a25
Author: Misha Zatsman <misha at curoverse.com>
Date:   Fri Apr 4 22:03:45 2014 +0000

    Added logging to datamanager, as a step towards writing an HTTP server.

diff --git a/services/datamanager/datamanager.py b/services/datamanager/datamanager.py
index 2a642f2..2a3594d 100755
--- a/services/datamanager/datamanager.py
+++ b/services/datamanager/datamanager.py
@@ -3,12 +3,13 @@
 import arvados
 
 import argparse
+import logging
 import pprint
+import math
 import re
 import urllib2
 
 from collections import defaultdict
-from math import log
 from operator import itemgetter
 
 arv = arvados.api('v1')
@@ -16,7 +17,7 @@ arv = arvados.api('v1')
 # Adapted from http://stackoverflow.com/questions/4180980/formatting-data-quantity-capacity-as-string
 byteunits = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
 def fileSizeFormat(value):
-  exponent = 0 if value == 0 else int(log(value, 1024))
+  exponent = 0 if value == 0 else int(math.log(value, 1024))
   return "%7.2f %-3s" % (float(value) / pow(1024, exponent),
                          byteunits[exponent])
 
@@ -65,14 +66,14 @@ def checkUserIsAdmin():
   current_user = arv.users().current().execute()
 
   if not current_user['is_admin']:
-    # TODO(misha): Use a logging framework here
-    print ('Warning current user %s (%s - %s) does not have admin access '
-           'and will not see much of the data.' %
-           (current_user['full_name'],
-            current_user['email'],
-            current_user['uuid']))
+    log.warning('Current user %s (%s - %s) does not have '
+                'admin access and will not see much of the data.',
+                current_user['full_name'],
+                current_user['email'],
+                current_user['uuid'])
     if args.require_admin_user:
-      print 'Exiting, rerun with --no-require-admin-user if you wish to continue.'
+      log.critical('Exiting, rerun with --no-require-admin-user '
+                   'if you wish to continue.')
       exit(1)
 
 def buildCollectionsList():
@@ -259,6 +260,7 @@ parser.add_argument('-u',
                     help='uuid of specific collection to process')
 parser.add_argument('--require-admin-user',
                     action='store_true',
+                    default=True,
                     help='Fail if the user is not an admin [default]')
 parser.add_argument('--no-require-admin-user',
                     dest='require_admin_user',
@@ -266,19 +268,26 @@ parser.add_argument('--no-require-admin-user',
                     help='Allow users without admin permissions with only a warning.')
 args = parser.parse_args()
 
+log = logging.getLogger('arvados.services.datamanager')
+stderr_handler = logging.StreamHandler()
+log.setLevel(logging.INFO)
+stderr_handler.setFormatter(
+  logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s'))
+log.addHandler(stderr_handler)
+
 checkUserIsAdmin()
 
-print 'Building Collection List'
+log.info('Building Collection List')
 collection_uuids = filter(None, [extractUuid(candidate)
                                  for candidate in buildCollectionsList()])
 
-print 'Reading Collections'
+log.info('Reading Collections')
 readCollections(collection_uuids)
 
 if args.verbose:
   pprint.pprint(CollectionInfo.all_by_uuid)
 
-print 'Reading Links'
+log.info('Reading Links')
 readLinks()
 
 reportMostPopularCollections()
@@ -293,7 +302,7 @@ block_to_persisters = defaultdict(set)
 reader_to_blocks = defaultdict(set)
 persister_to_blocks = defaultdict(set)
 
-print 'Building Maps'
+log.info('Building Maps')
 buildMaps()
 
 reportBusiestUsers()
@@ -305,16 +314,16 @@ WEIGHTED_PERSIST_SIZE_COL = 3
 NUM_COLS = 4
 user_to_usage = defaultdict(lambda : [0,]*NUM_COLS)
 
-print 'Getting Keep Servers'
+log.info('Getting Keep Servers')
 keep_servers = getKeepServers()
 
 print keep_servers
 
-print 'Getting Blocks from each Keep Server.'
+log.info('Getting Blocks from each Keep Server.')
 keep_blocks = getKeepBlocks(keep_servers)
 
 block_to_replication = computeReplication(keep_blocks)
 
-print 'average replication level is %f' % (float(sum(block_to_replication.values())) / len(block_to_replication))
+log.info('average replication level is %f', (float(sum(block_to_replication.values())) / len(block_to_replication)))
 
 reportUserDiskUsage()

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list