[ARVADOS] created: f5ed1dba16a6cdc1b5b83f0ab9c7a3bf36ef6fc7

git at public.curoverse.com git at public.curoverse.com
Wed Mar 18 10:44:27 EDT 2015


        at  f5ed1dba16a6cdc1b5b83f0ab9c7a3bf36ef6fc7 (commit)


commit f5ed1dba16a6cdc1b5b83f0ab9c7a3bf36ef6fc7
Author: Brett Smith <brett at curoverse.com>
Date:   Wed Mar 18 10:44:18 2015 -0400

    5319: Improve collection PDH fix performance with LIKE searches.
    
    PostgreSQL regexp searches use a lot of RAM, and these queries run out
    of RAM on qr1hi.  Prefer LIKE queries, which use less RAM and are more
    portable.  We have to do multiple searches, but that's life.

diff --git a/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb b/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb
index d983e7b..9bbe113 100644
--- a/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb
+++ b/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb
@@ -55,19 +55,26 @@ class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration
   end
 
   def each_bad_collection
-    # It's important to make sure that this line doesn't swap.  The
-    # worst case scenario is that it finds a batch of collections that
-    # all have maximum size manifests (64MiB).  With a batch size of
-    # 50, that's about 3GiB.  Figure it will end up being 4GiB after
-    # other ActiveRecord overhead.  That's a size we're comfortable with.
-    Collection.where("manifest_text ~ '\\+[A-Z]'").
-        find_each(batch_size: 50) do |coll|
-      stripped_manifest = coll.manifest_text.
-        gsub(/( [0-9a-f]{32}(\+\d+)?)(\+\S+)/, '\1')
-      stripped_pdh = sprintf("%s+%i",
-                             Digest::MD5.hexdigest(stripped_manifest),
-                             stripped_manifest.bytesize)
-      yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh)
+    seen_uuids = []
+    ("A".."Z").each do |hint_char|
+      query = Collection.where("manifest_text LIKE '%+#{hint_char}%'")
+      unless seen_uuids.empty?
+        query = query.where("uuid NOT IN (?)", seen_uuids)
+      end
+      # It's important to make sure that this line doesn't swap.  The
+      # worst case scenario is that it finds a batch of collections that
+      # all have maximum size manifests (64MiB).  With a batch size of
+      # 50, that's about 3GiB.  Figure it will end up being 4GiB after
+      # other ActiveRecord overhead.  That's a size we're comfortable with.
+      query.find_each(batch_size: 50) do |coll|
+        seen_uuids << coll.uuid
+        stripped_manifest = coll.manifest_text.
+          gsub(/( [0-9a-f]{32}(\+\d+)?)\+\S+/, '\1')
+        stripped_pdh = sprintf("%s+%i",
+                               Digest::MD5.hexdigest(stripped_manifest),
+                               stripped_manifest.bytesize)
+        yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh)
+      end
     end
   end
 

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list