[ARVADOS] created: 1.3.0-964-g0d8adf9bb

Git user git at public.curoverse.com
Fri May 31 18:53:44 UTC 2019

        at  0d8adf9bb22e169fb47d64ca19045283ff5fafd1 (commit)

commit 0d8adf9bb22e169fb47d64ca19045283ff5fafd1
Author: Ward Vandewege <wvandewege at veritasgenetics.com>
Date:   Fri May 31 14:51:50 2019 -0400

    Move the population of the new columns on the collections table to a standalone
    script that should be run separately from the migration. Add a note to the
    upgrade documentation along those lines. Make the script not blow up on
    collections with invalid manifests, but rather just skip them.
    refs #15093
    refs #14484
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <wvandewege at veritasgenetics.com>

diff --git a/doc/admin/upgrading.html.textile.liquid b/doc/admin/upgrading.html.textile.liquid
index 09bef2a62..def8bed79 100644
--- a/doc/admin/upgrading.html.textile.liquid
+++ b/doc/admin/upgrading.html.textile.liquid
@@ -32,6 +32,10 @@ TODO: extract this information based on git commit messages and generate changel
 h3. current master branch
+h4. Populating the new file_count and file_size_total columns on the collections table
+As part of story "#14484":https://dev.arvados.org/issues/14484, two new columns were added to the collections table in a database migration. These columns are initialized with a zero value. In order to populate them, it is necessary to run a script called <code class="userinput">populate-file-info-columns-in-collections.rb</code> from the script directory of the API server. This can be done out of band, ideally directly after the API server has been upgraded to v1.4.0.
 h4. Stricter collection manifest validation on the API server
 As a consequence of "#14482":https://dev.arvados.org/issues/14482, the Ruby SDK does a more rigorous collection manifest validation. Collections created after 2015-05 are unlikely to be invalid, however you may check for invalid manifests using the script below.
diff --git a/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb b/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
old mode 100755
new mode 100644
index 61f9b2d88..c0cd40d28
--- a/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
+++ b/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
@@ -2,58 +2,16 @@
 # SPDX-License-Identifier: AGPL-3.0
-require "arvados/keep"
-require "group_pdhs"
 class AddFileInfoToCollection < ActiveRecord::Migration[4.2]
-  def do_batch(pdhs)
-    pdhs_str = ''
-    pdhs.each do |pdh|
-      pdhs_str << "'" << pdh << "'" << ","
-    end
-    collections = ActiveRecord::Base.connection.exec_query(
-      "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
-      "WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
-    )
-    collections.rows.each do |row|
-      manifest = Keep::Manifest.new(row[1])
-      ActiveRecord::Base.connection.exec_query("BEGIN")
-      ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
-                                               "file_size_total=#{manifest.files_size} "\
-                                               "WHERE portable_data_hash='#{row[0]}'")
-      ActiveRecord::Base.connection.exec_query("COMMIT")
-    end
-  end
   def up
     add_column :collections, :file_count, :integer, default: 0, null: false
     add_column :collections, :file_size_total, :integer, limit: 8, default: 0, null: false
-    distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
-      "SELECT DISTINCT portable_data_hash FROM collections"
-    ).rows.count
-    # Generator that queries for all the distinct pdhs greater than last_pdh
-    ordered_pdh_query = lambda { |last_pdh, &block|
-      pdhs = ActiveRecord::Base.connection.exec_query(
-        "SELECT DISTINCT portable_data_hash FROM collections "\
-        "WHERE portable_data_hash > '#{last_pdh}' "\
-        "ORDER BY portable_data_hash LIMIT 1000"
-      )
-      pdhs.rows.each do |row|
-        block.call(row[0])
-      end
-    }
-    batch_size_max = 1 << 28 # 256 MiB
-    GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
-                                                   distinct_pdh_count,
-                                                   batch_size_max,
-                                                   "AddFileInfoToCollection") do |pdhs|
-      do_batch(pdhs)
-    end
+    puts "Collections now have two new columns, file_count and file_size_total."
+    puts "They were initialized with a zero value. If you are upgrading an Arvados"
+    puts "installation, please run the populate-file-info-columns-in-collections.rb"
+    puts "script to populate the columns. If this is a new installation, that is not"
+    puts "necessary."
   def down
diff --git a/services/api/script/populate-file-info-columns-in-collections.rb b/services/api/script/populate-file-info-columns-in-collections.rb
new file mode 100755
index 000000000..b0bc5a21a
--- /dev/null
+++ b/services/api/script/populate-file-info-columns-in-collections.rb
@@ -0,0 +1,97 @@
+#!/usr/bin/env ruby
+# Copyright (C) The Arvados Authors. All rights reserved.
+# SPDX-License-Identifier: AGPL-3.0
+# Arvados version 1.4.0 introduces two new columns on the collections table named
+#   file_count
+#   file_size_total
+# The database migration that adds these columns does not populate them with data;
+# it only initializes them to zero.
+# This script populates the columns for collections whose file_count is zero.
+# It skips collections that have invalid manifests, but prints the details of
+# those collections.
+# Run the script as
+# cd script
+# RAILS_ENV=production bundle exec populate-file-info-columns-in-collections.rb
+ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
+require File.dirname(__FILE__) + '/../config/boot'
+require File.dirname(__FILE__) + '/../config/environment'
+require "arvados/keep"
+require "group_pdhs"
+  def do_batch(pdhs)
+    pdhs_str = ''
+    pdhs.each do |pdh|
+      pdhs_str << "'" << pdh << "'" << ","
+    end
+    collections = ActiveRecord::Base.connection.exec_query(
+      "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
+      "WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
+    )
+    collections.rows.each do |row|
+      begin
+        manifest = Keep::Manifest.new(row[1])
+        ActiveRecord::Base.connection.exec_query("BEGIN")
+        ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
+                                                 "file_size_total=#{manifest.files_size} "\
+                                                 "WHERE portable_data_hash='#{row[0]}'")
+        ActiveRecord::Base.connection.exec_query("COMMIT")
+      rescue ArgumentError => detail
+        require 'pp'
+        puts
+        puts "*************** Row detail ***************"
+        puts
+        pp row
+        puts
+        puts "************ Collection detail ***********"
+        puts
+        pp Collection.find_by_portable_data_hash(row[0])
+        puts
+        puts "************** Error detail **************"
+        puts
+        pp detail
+        puts
+        puts "Skipping this collection, continuing!"
+        next
+      end
+    end
+  end
+def main
+  distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
+    "SELECT DISTINCT portable_data_hash FROM collections"
+  ).rows.count
+  # Generator that queries for all the distinct pdhs greater than last_pdh
+  ordered_pdh_query = lambda { |last_pdh, &block|
+    pdhs = ActiveRecord::Base.connection.exec_query(
+      "SELECT DISTINCT portable_data_hash FROM collections "\
+      "WHERE file_count=0 and portable_data_hash > '#{last_pdh}' "\
+      "ORDER BY portable_data_hash LIMIT 1000"
+    )
+    pdhs.rows.each do |row|
+      block.call(row[0])
+    end
+  }
+  batch_size_max = 1 << 28 # 256 MiB
+  GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
+                                                 distinct_pdh_count,
+                                                 batch_size_max,
+                                                 "AddFileInfoToCollection") do |pdhs|
+    do_batch(pdhs)
+  end



More information about the arvados-commits mailing list