[ARVADOS] created: 1.2.0-73-gaf1125bd1

Git user git at public.curoverse.com
Mon Sep 17 17:54:50 EDT 2018


        at  af1125bd1bc10f6ac2f9129261176c4510aadd54 (commit)


commit af1125bd1bc10f6ac2f9129261176c4510aadd54
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Sep 17 17:50:13 2018 -0400

    13752: Migrate file_names column using multiple transactions.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/api/db/migrate/20180917205609_recompute_file_names_index.rb b/services/api/db/migrate/20180917205609_recompute_file_names_index.rb
new file mode 100644
index 000000000..3dbc0ec3d
--- /dev/null
+++ b/services/api/db/migrate/20180917205609_recompute_file_names_index.rb
@@ -0,0 +1,51 @@
+class RecomputeFileNamesIndex < ActiveRecord::Migration
+  def do_batch(pdhs:)
+    ActiveRecord::Base.connection.exec_query('BEGIN')
+    Collection.select(:portable_data_hash, :manifest_text).where(portable_data_hash: pdhs).distinct(:portable_data_hash).each do |c|
+      ActiveRecord::Base.connection.exec_query("update collections set file_names=$1 where portable_data_hash=$2",
+                                               "update file_names index",
+                                               [[nil, c.manifest_files], [nil, c.portable_data_hash]])
+    end
+    ActiveRecord::Base.connection.exec_query('COMMIT')
+  end
+  def up
+    # Process collections in multiple transactions, where the total
+    # size of all manifest_texts processed in a transaction is no more
+    # than batch_size_max.  Collections whose manifest_text is bigger
+    # than batch_size_max are updated in their own individual
+    # transactions.
+    batch_size_max = 1 << 28    # 256 MiB
+    batch_size = 0
+    batch_pdhs = {}
+    last_pdh = '0'
+    total = Collection.distinct.count(:portable_data_hash)
+    done = 0
+    any = true
+    while any
+      any = false
+      Collection.
+        unscoped.
+        select(:portable_data_hash).distinct.
+        order(:portable_data_hash).
+        where('portable_data_hash > ?', last_pdh).
+        limit(1000).each do |c|
+        any = true
+        last_pdh = c.portable_data_hash
+        manifest_size = c.portable_data_hash.split('+')[1].to_i
+        if batch_size > 0 && batch_size + manifest_size > batch_size_max
+          do_batch(pdhs: batch_pdhs.keys)
+          done += batch_pdhs.size
+          Rails.logger.info("RecomputeFileNamesIndex: #{done}/#{total}")
+          batch_pdhs = {}
+          batch_size = 0
+        end
+        batch_pdhs[c.portable_data_hash] = true
+        batch_size += manifest_size
+      end
+    end
+    do_batch(pdhs: batch_pdhs.keys)
+    Rails.logger.info("RecomputeFileNamesIndex: finished")
+  end
+  def down
+  end
+end
diff --git a/services/api/db/structure.sql b/services/api/db/structure.sql
index 427c9afb5..f8d9b3f35 100644
--- a/services/api/db/structure.sql
+++ b/services/api/db/structure.sql
@@ -3169,3 +3169,5 @@ INSERT INTO schema_migrations (version) VALUES ('20180824155207');
 
 INSERT INTO schema_migrations (version) VALUES ('20180904110712');
 
+INSERT INTO schema_migrations (version) VALUES ('20180917205609');
+

commit 591fd5d18644037426b58abc0d21bb2ccbcae888
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Sep 17 17:49:21 2018 -0400

    13752: De-duplicate file and stream names in collections.file_names.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/api/app/models/collection.rb b/services/api/app/models/collection.rb
index 85b12a377..7e561712a 100644
--- a/services/api/app/models/collection.rb
+++ b/services/api/app/models/collection.rb
@@ -192,11 +192,16 @@ class Collection < ArvadosModel
   def manifest_files
     return '' if !self.manifest_text
 
+    done = {}
     names = ''
     self.manifest_text.scan(/ \d+:\d+:(\S+)/) do |name|
+      next if done[name]
+      done[name] = true
       names << name.first.gsub('\040',' ') + "\n"
     end
     self.manifest_text.scan(/^\.\/(\S+)/m) do |stream_name|
+      next if done[stream_name]
+      done[stream_name] = true
       names << stream_name.first.gsub('\040',' ') + "\n"
     end
     names

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list