[ARVADOS] created: 1.2.0-73-gaf1125bd1
Git user
git at public.curoverse.com
Mon Sep 17 17:54:50 EDT 2018
at af1125bd1bc10f6ac2f9129261176c4510aadd54 (commit)
commit af1125bd1bc10f6ac2f9129261176c4510aadd54
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Mon Sep 17 17:50:13 2018 -0400
13752: Migrate file_names column using multiple transactions.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/api/db/migrate/20180917205609_recompute_file_names_index.rb b/services/api/db/migrate/20180917205609_recompute_file_names_index.rb
new file mode 100644
index 000000000..3dbc0ec3d
--- /dev/null
+++ b/services/api/db/migrate/20180917205609_recompute_file_names_index.rb
@@ -0,0 +1,51 @@
+class RecomputeFileNamesIndex < ActiveRecord::Migration # Data migration: rewrites collections.file_names from Collection#manifest_files, one batch of PDHs at a time.
+ def do_batch(pdhs:) # pdhs: array of portable_data_hash values to update between one BEGIN/COMMIT pair.
+ ActiveRecord::Base.connection.exec_query('BEGIN') # Raw BEGIN/COMMIT so each batch is its own transaction. NOTE(review): nothing issues ROLLBACK if an update raises -- confirm that is acceptable.
+ Collection.select(:portable_data_hash, :manifest_text).where(portable_data_hash: pdhs).distinct(:portable_data_hash).each do |c| # NOTE(review): not unscoped, unlike the scan in #up -- confirm whether Collection has a default scope that could exclude rows here.
+ ActiveRecord::Base.connection.exec_query("update collections set file_names=$1 where portable_data_hash=$2",
+ "update file_names index",
+ [[nil, c.manifest_files], [nil, c.portable_data_hash]]) # Binds: $1 = recomputed file name list, $2 = the PDH; the UPDATE touches every row with that PDH.
+ end
+ ActiveRecord::Base.connection.exec_query('COMMIT')
+ end
+ def up # Walks distinct PDHs in ascending order, 1000 per query, and flushes the pending batch whenever the size budget would be exceeded.
+ # Process collections in multiple transactions, where the total
+ # size of all manifest_texts processed in a transaction is no more
+ # than batch_size_max. Collections whose manifest_text is bigger
+ # than batch_size_max are updated in their own individual
+ # transactions.
+ batch_size_max = 1 << 28 # 256 MiB
+ batch_size = 0 # Running sum of manifest sizes in the pending batch.
+ batch_pdhs = {} # Pending PDHs, stored as hash keys.
+ last_pdh = '0' # Pagination cursor: each query fetches PDHs strictly greater than this value.
+ total = Collection.distinct.count(:portable_data_hash) # Used only in the progress log message below.
+ done = 0
+ any = true
+ while any # Ends after a query that returns no rows.
+ any = false
+ Collection.
+ unscoped.
+ select(:portable_data_hash).distinct.
+ order(:portable_data_hash).
+ where('portable_data_hash > ?', last_pdh).
+ limit(1000).each do |c|
+ any = true
+ last_pdh = c.portable_data_hash
+ manifest_size = c.portable_data_hash.split('+')[1].to_i # Integer after the first '+' in the PDH; presumably the manifest_text size in bytes -- TODO confirm.
+ if batch_size > 0 && batch_size + manifest_size > batch_size_max # Flush first if adding this PDH would exceed the budget; an empty batch always accepts it.
+ do_batch(pdhs: batch_pdhs.keys)
+ done += batch_pdhs.size
+ Rails.logger.info("RecomputeFileNamesIndex: #{done}/#{total}")
+ batch_pdhs = {}
+ batch_size = 0
+ end
+ batch_pdhs[c.portable_data_hash] = true
+ batch_size += manifest_size
+ end
+ end
+ do_batch(pdhs: batch_pdhs.keys) # Flush the remainder (possibly an empty list). NOTE(review): this last batch is never added to `done`, so no final done/total line is logged.
+ Rails.logger.info("RecomputeFileNamesIndex: finished")
+ end
+ def down # No-op: nothing is undone when rolling back.
+ end
+end
diff --git a/services/api/db/structure.sql b/services/api/db/structure.sql
index 427c9afb5..f8d9b3f35 100644
--- a/services/api/db/structure.sql
+++ b/services/api/db/structure.sql
@@ -3169,3 +3169,5 @@ INSERT INTO schema_migrations (version) VALUES ('20180824155207');
INSERT INTO schema_migrations (version) VALUES ('20180904110712');
+INSERT INTO schema_migrations (version) VALUES ('20180917205609');
+
commit 591fd5d18644037426b58abc0d21bb2ccbcae888
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Mon Sep 17 17:49:21 2018 -0400
13752: De-duplicate file and stream names in collections.file_names.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/api/app/models/collection.rb b/services/api/app/models/collection.rb
index 85b12a377..7e561712a 100644
--- a/services/api/app/models/collection.rb
+++ b/services/api/app/models/collection.rb
@@ -192,11 +192,16 @@ class Collection < ArvadosModel
def manifest_files
return '' if !self.manifest_text
+ done = {}
names = ''
self.manifest_text.scan(/ \d+:\d+:(\S+)/) do |name|
+ next if done[name]
+ done[name] = true
names << name.first.gsub('\040',' ') + "\n"
end
self.manifest_text.scan(/^\.\/(\S+)/m) do |stream_name|
+ next if done[stream_name]
+ done[stream_name] = true
names << stream_name.first.gsub('\040',' ') + "\n"
end
names
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list