[ARVADOS] updated: 1.3.0-606-g670987617

Git user git at public.curoverse.com
Wed Mar 27 21:28:39 UTC 2019


Summary of changes:
 services/api/app/models/collection.rb              |  2 +-
 services/api/app/models/container.rb               | 58 ++++++----------------
 .../20190322174136_add_file_info_to_collection.rb  | 22 +++++++-
 services/api/test/unit/container_test.rb           |  8 ++-
 4 files changed, 45 insertions(+), 45 deletions(-)

       via  6709876170511ade8e24fe60bf77da24bc4a03d4 (commit)
       via  fe08de0ddf1d3e536cb3518dcb9c82ca62197273 (commit)
       via  d6fb38cac6a7f9f98f534b4638ce5918ba94c135 (commit)
      from  a737669021ac34683deecda8130e21b243e14174 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 6709876170511ade8e24fe60bf77da24bc4a03d4
Author: Eric Biagiotti <ebiagiotti at veritasgenetics.com>
Date:   Wed Mar 27 17:28:31 2019 -0400

    14484: Fixes tests that intentionally pass in invalid manifests
    
    Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti <ebiagiotti at veritasgenetics.com>

diff --git a/services/api/app/models/collection.rb b/services/api/app/models/collection.rb
index 2cebd5438..73a8df11f 100644
--- a/services/api/app/models/collection.rb
+++ b/services/api/app/models/collection.rb
@@ -199,7 +199,7 @@ class Collection < ArvadosModel
   end
 
   def set_file_count_and_total_size
-    if self.manifest_text_changed?
+    if self.manifest_text_changed? && self.valid?
       m = Keep::Manifest.new(self.manifest_text)
       self.file_size_total = m.files_size
       self.file_count = m.files_count

commit fe08de0ddf1d3e536cb3518dcb9c82ca62197273
Author: Eric Biagiotti <ebiagiotti at veritasgenetics.com>
Date:   Wed Mar 27 16:56:47 2019 -0400

    14484: Adds test for pdh grouping functionality in the container model
    
    Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti <ebiagiotti at veritasgenetics.com>

diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index f3da80082..0f48a7501 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -411,8 +411,7 @@ class Container < ArvadosModel
   #
   # Correctly groups pdhs to use for batch database updates. Helps avoid
   # updating too many database rows in a single transaction.
-  def self.group_pdhs_for_multiple_transactions(distinct_ordered_pdhs, distinct_pdh_count, log_prefix)
-    batch_size_max = 1 << 28 # 256 MiB
+  def self.group_pdhs_for_multiple_transactions(distinct_ordered_pdhs, distinct_pdh_count, batch_size_max, log_prefix)
     batch_size = 0
     batch_pdhs = {}
     last_pdh = '0'
diff --git a/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb b/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
index 99db8133d..3e87b0c8b 100755
--- a/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
+++ b/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
@@ -46,7 +46,11 @@ class AddFileInfoToCollection < ActiveRecord::Migration
       end
     }
 
-    Container.group_pdhs_for_multiple_transactions(ordered_pdh_query, distinct_pdh_count, "AddFileInfoToCollection") do |pdhs|
+    batch_size_max = 1 << 28 # 256 MiB
+    Container.group_pdhs_for_multiple_transactions(ordered_pdh_query,
+                                                   distinct_pdh_count,
+                                                   batch_size_max,
+                                                   "AddFileInfoToCollection") do |pdhs|
       do_batch(pdhs)
     end
   end
diff --git a/services/api/test/unit/container_test.rb b/services/api/test/unit/container_test.rb
index 2b7fda8d7..783d2a985 100644
--- a/services/api/test/unit/container_test.rb
+++ b/services/api/test/unit/container_test.rb
@@ -962,8 +962,14 @@ class ContainerTest < ActiveSupport::TestCase
   test "pdh_grouping_by_manifest_size" do
     batch_size_max = 200
     pdhs_in = ['x1+30', 'x2+30', 'x3+201', 'x4+100', 'x5+100']
+    pdh_lambda = lambda { |last_pdh, &block|
+      pdhs = pdhs_in.select{|pdh| pdh > last_pdh} 
+      pdhs.each do |p|
+        block.call(p)
+      end
+    }
     batched_pdhs = []
-    Container.group_pdhs_by_manifest_size(pdhs_in, batch_size_max) do |pdhs|
+    Container.group_pdhs_for_multiple_transactions(pdh_lambda, pdhs_in.size, batch_size_max, "") do |pdhs|
       batched_pdhs << pdhs
     end
     expected = [['x1+30', 'x2+30'], ['x3+201'], ['x4+100', 'x5+100']]

commit d6fb38cac6a7f9f98f534b4638ce5918ba94c135
Author: Eric Biagiotti <ebiagiotti at veritasgenetics.com>
Date:   Wed Mar 27 16:35:14 2019 -0400

    14484: Simplifies pdh transaction grouping, keeps SQL in the migration
    
    Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti <ebiagiotti at veritasgenetics.com>

diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 694aa5a0d..f3da80082 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -411,58 +411,33 @@ class Container < ArvadosModel
   #
   # Correctly groups pdhs to use for batch database updates. Helps avoid
   # updating too many database rows in a single transaction.
-  def self.group_pdhs_for_multiple_transactions(log_prefix)
+  def self.group_pdhs_for_multiple_transactions(distinct_ordered_pdhs, distinct_pdh_count, log_prefix)
     batch_size_max = 1 << 28 # 256 MiB
+    batch_size = 0
+    batch_pdhs = {}
     last_pdh = '0'
     done = 0
     any = true
 
-    total = ActiveRecord::Base.connection.exec_query(
-      "SELECT DISTINCT portable_data_hash FROM collections"
-    ).rows.count
-
     while any
       any = false
-      pdhs_res = ActiveRecord::Base.connection.exec_query(
-        "SELECT DISTINCT portable_data_hash FROM collections "\
-        "WHERE portable_data_hash > '#{last_pdh}' "\
-        "GROUP BY portable_data_hash LIMIT 1000"
-      )
-      break if pdhs_res.rows.count.zero?
-
-      pdhs = pdhs_res.rows.collect { |r| r[0] }
-      Container.group_pdhs_by_manifest_size(pdhs, batch_size_max) do |grouped_pdhs|
+      distinct_ordered_pdhs.call(last_pdh) do |pdh|
         any = true
-        yield grouped_pdhs
-        done += grouped_pdhs.size
-        last_pdh = pdhs[-1]
-        Rails.logger.info(log_prefix + ": #{done}/#{total}")
-      end
-    end
-    Rails.logger.info(log_prefix + ": finished")
-  end
-
-  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb relies on this function.
-  #
-  # Change with caution!
-  #
-  # Given an array of pdhs, yield a subset array of pdhs when the total
-  # size of all manifest_texts is no more than batch_size_max. Pdhs whose manifest_text 
-  # is bigger than batch_size_max are yielded by themselves
-  def self.group_pdhs_by_manifest_size(pdhs, batch_size_max)
-    batch_size = 0
-    batch_pdhs = {}
-    pdhs.each do |pdh|
-      manifest_size = pdh.split('+')[1].to_i
-      if batch_size > 0 && batch_size + manifest_size > batch_size_max
-        yield batch_pdhs.keys
-        batch_pdhs = {}
-        batch_size = 0
+        last_pdh = pdh
+        manifest_size = pdh.split('+')[1].to_i
+        if batch_size > 0 && batch_size + manifest_size > batch_size_max
+          yield batch_pdhs.keys
+          done += batch_pdhs.size
+          Rails.logger.info(log_prefix + ": #{done}/#{distinct_pdh_count}")
+          batch_pdhs = {}
+          batch_size = 0
+        end
+        batch_pdhs[pdh] = true
+        batch_size += manifest_size
       end
-      batch_pdhs[pdh] = true
-      batch_size += manifest_size
     end
     yield batch_pdhs.keys
+    Rails.logger.info(log_prefix + ": finished")
   end
 
   protected
diff --git a/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb b/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
index c1c336247..99db8133d 100755
--- a/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
+++ b/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
@@ -30,7 +30,23 @@ class AddFileInfoToCollection < ActiveRecord::Migration
     add_column :collections, :file_count, :integer, default: 0, null: false
     add_column :collections, :file_size_total, :integer, default: 0, null: false
 
-    Container.group_pdhs_for_multiple_transactions("AddFileInfoToCollection") do |pdhs|
+    distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
+      "SELECT DISTINCT portable_data_hash FROM collections"
+    ).rows.count
+
+    # Generator that queries for all the distinct pdhs greater than last_pdh
+    ordered_pdh_query = lambda { |last_pdh, &block|
+      pdhs = ActiveRecord::Base.connection.exec_query(
+        "SELECT DISTINCT portable_data_hash FROM collections "\
+        "WHERE portable_data_hash > '#{last_pdh}' "\
+        "ORDER BY portable_data_hash LIMIT 1000"
+      )
+      pdhs.rows.each do |row|
+        block.call(row[0])
+      end
+    }
+
+    Container.group_pdhs_for_multiple_transactions(ordered_pdh_query, distinct_pdh_count, "AddFileInfoToCollection") do |pdhs|
       do_batch(pdhs)
     end
   end

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list