[ARVADOS] updated: bafb417c531d6094697a13a7465313d1cc7c0bc9

git at public.curoverse.com git at public.curoverse.com
Wed Sep 10 12:52:11 EDT 2014


Summary of changes:
 apps/workbench/Gemfile                             |  2 +-
 apps/workbench/Gemfile.lock                        |  4 +-
 .../app/controllers/collections_controller.rb      |  9 +--
 apps/workbench/app/models/collection.rb            | 14 ++--
 sdk/ruby/lib/arvados/keep.rb                       | 77 ++++++++++++++++---
 sdk/ruby/test/test_keep_manifest.rb                | 87 +++++++++++++++++-----
 services/api/Gemfile                               |  2 +-
 services/api/Gemfile.lock                          |  4 +-
 services/api/app/models/collection.rb              |  9 +--
 9 files changed, 154 insertions(+), 54 deletions(-)

       via  bafb417c531d6094697a13a7465313d1cc7c0bc9 (commit)
      from  b0440160e64caecbff160a26f62301dd15d84c7f (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit bafb417c531d6094697a13a7465313d1cc7c0bc9
Author: Brett Smith <brett at curoverse.com>
Date:   Wed Sep 10 12:39:58 2014 -0400

    3842: Keep::Manifest concatenates file information from manifest.
    
    The previous implementation failed to consider the possibility that
    file information would be spread across multiple lines of a manifest.
    This would cause, e.g., the same file to be yielded many times from
    each_file.
    
    This requirement makes it impossible to return file size information
    without parsing the entire manifest.  Because of that, I have reworked
    the Ruby SDK API so that method names are more consistent with their
    performance characteristics.  I have also added some methods to do
    some basic file existence checking that do not require parsing the
    whole manifest.
    
    Closes #3842.  Refs #3720.

diff --git a/apps/workbench/Gemfile b/apps/workbench/Gemfile
index 2a17598..c7e3870 100644
--- a/apps/workbench/Gemfile
+++ b/apps/workbench/Gemfile
@@ -3,7 +3,7 @@ source 'https://rubygems.org'
 gem 'rails', '~> 4.1.0'
 gem 'minitest', '>= 5.0.0'
 
-gem 'arvados', '>= 0.1.20140905115707'
+gem 'arvados', '>= 0.1.20140910123800'
 
 # Bundle edge Rails instead:
 # gem 'rails', :git => 'git://github.com/rails/rails.git'
diff --git a/apps/workbench/Gemfile.lock b/apps/workbench/Gemfile.lock
index f0e20f1..7d98bf1 100644
--- a/apps/workbench/Gemfile.lock
+++ b/apps/workbench/Gemfile.lock
@@ -39,7 +39,7 @@ GEM
     addressable (2.3.6)
     andand (1.3.3)
     arel (5.0.1.20140414130214)
-    arvados (0.1.20140905115707)
+    arvados (0.1.20140910123800)
       activesupport (>= 3.2.13)
       andand
       google-api-client (~> 0.6.3)
@@ -229,7 +229,7 @@ PLATFORMS
 DEPENDENCIES
   RedCloth
   andand
-  arvados (>= 0.1.20140905115707)
+  arvados (>= 0.1.20140910123800)
   bootstrap-sass (~> 3.1.0)
   bootstrap-x-editable-rails
   capybara
diff --git a/apps/workbench/app/controllers/collections_controller.rb b/apps/workbench/app/controllers/collections_controller.rb
index 41b4771..f87579e 100644
--- a/apps/workbench/app/controllers/collections_controller.rb
+++ b/apps/workbench/app/controllers/collections_controller.rb
@@ -145,7 +145,7 @@ class CollectionsController < ApplicationController
     end
     if usable_token.nil?
       return  # Response already rendered.
-    elsif params[:file].nil? or not file_in_collection?(coll, params[:file])
+    elsif params[:file].nil? or not coll.manifest.has_file?(params[:file])
       return render_not_found
     end
     opts = params.merge(arvados_api_token: usable_token)
@@ -277,13 +277,6 @@ class CollectionsController < ApplicationController
     return nil
   end
 
-  def file_in_collection?(collection, filename)
-    target = CollectionsHelper.file_path(File.split(filename))
-    collection.manifest.each_file.any? do |file_spec|
-      CollectionsHelper.file_path(file_spec) == target
-    end
-  end
-
   def file_enumerator(opts)
     FileStreamer.new opts
   end
diff --git a/apps/workbench/app/models/collection.rb b/apps/workbench/app/models/collection.rb
index 24e6da5..edd87ad 100644
--- a/apps/workbench/app/models/collection.rb
+++ b/apps/workbench/app/models/collection.rb
@@ -21,17 +21,17 @@ class Collection < ArvadosBase
   end
 
   def manifest
-    Keep::Manifest.new(manifest_text || "")
+    if @manifest.nil? or manifest_text_changed?
+      @manifest = Keep::Manifest.new(manifest_text || "")
+    end
+    @manifest
   end
 
   def files
     # This method provides backwards compatibility for code that relied on
     # the old files field in API results.  New code should use manifest
     # methods directly.
-    if @files.nil? or manifest_text_changed?
-      @files = manifest.each_file.to_a
-    end
-    @files
+    manifest.files
   end
 
   def content_summary
@@ -39,11 +39,11 @@ class Collection < ArvadosBase
   end
 
   def total_bytes
-    manifest.each_file.inject(0) { |sum, filespec| sum + filespec.last }
+    manifest.files.inject(0) { |sum, filespec| sum + filespec.last }
   end
 
   def files_tree
-    tree = manifest.each_file.group_by do |file_spec|
+    tree = manifest.files.group_by do |file_spec|
       File.split(file_spec.first)
     end
     return [] if tree.empty?
diff --git a/sdk/ruby/lib/arvados/keep.rb b/sdk/ruby/lib/arvados/keep.rb
index 6b18ddf..8afe13f 100644
--- a/sdk/ruby/lib/arvados/keep.rb
+++ b/sdk/ruby/lib/arvados/keep.rb
@@ -94,9 +94,10 @@ module Keep
     # Class to parse a manifest text and provide common views of that data.
     def initialize(manifest_text)
       @text = manifest_text
+      @files = nil
     end
 
-    def each_stream
+    def each_line
       return to_enum(__method__) unless block_given?
       @text.each_line do |line|
         tokens = line.split
@@ -110,17 +111,8 @@ module Keep
       end
     end
 
-    def each_file
-      return to_enum(__method__) unless block_given?
-      each_stream do |streamname, blocklist, filelist|
-        filelist.each do |filespec|
-          start_pos, filesize, filename = filespec.split(':', 3)
-          yield [streamname, filename, filesize.to_i]
-        end
-      end
-    end
-
     def unescape(s)
+      # Parse backslash escapes in a Keep manifest stream or file name.
       s.gsub(/\\(\\|[0-7]{3})/) do |_|
         case $1
         when '\\'
@@ -130,5 +122,68 @@ module Keep
         end
       end
     end
+
+    def each_file_spec(speclist)
+      return to_enum(__method__, speclist) unless block_given?
+      speclist.each do |filespec|
+        start_pos, filesize, filename = filespec.split(':', 3)
+        yield [start_pos.to_i, filesize.to_i, filename]
+      end
+    end
+
+    def files
+      if @files.nil?
+        file_sizes = Hash.new(0)
+        each_line do |streamname, blocklist, filelist|
+          each_file_spec(filelist) do |_, filesize, filename|
+            file_sizes[[streamname, filename]] += filesize
+          end
+        end
+        @files = file_sizes.each_pair.map do |(streamname, filename), size|
+          [streamname, filename, size]
+        end
+      end
+      @files
+    end
+
+    def files_count(stop_after=nil)
+      # Return the number of files represented in this manifest.
+      # If stop_after is provided, files_count will read the manifest
+      # incrementally, and return immediately when it counts that number of
+      # files.  This can help you avoid parsing the entire manifest if you
+      # just want to check if a small number of files are specified.
+      if stop_after.nil? or not @files.nil?
+        return files.size
+      end
+      seen_files = {}
+      each_line do |streamname, blocklist, filelist|
+        each_file_spec(filelist) do |_, _, filename|
+          seen_files[[streamname, filename]] = true
+          return stop_after if (seen_files.size >= stop_after)
+        end
+      end
+      seen_files.size
+    end
+
+    def exact_file_count?(want_count)
+      files_count(want_count + 1) == want_count
+    end
+
+    def minimum_file_count?(want_count)
+      files_count(want_count) >= want_count
+    end
+
+    def has_file?(want_stream, want_file=nil)
+      if want_file.nil?
+        want_stream, want_file = File.split(want_stream)
+      end
+      each_line do |stream_name, _, filelist|
+        if (stream_name == want_stream) and
+            each_file_spec(filelist).any? { |_, _, name| name == want_file }
+          return true
+        end
+      end
+      false
+    end
   end
 end
diff --git a/sdk/ruby/test/test_keep_manifest.rb b/sdk/ruby/test/test_keep_manifest.rb
index 99417b5..af4698e 100644
--- a/sdk/ruby/test/test_keep_manifest.rb
+++ b/sdk/ruby/test/test_keep_manifest.rb
@@ -7,52 +7,56 @@ end
 
 class ManifestTest < Minitest::Test
   SIMPLEST_MANIFEST = ". #{random_block(9)} 0:9:simple.txt\n"
+  MULTIBLOCK_FILE_MANIFEST =
+    [". #{random_block(8)} 0:4:repfile 4:4:uniqfile",
+     "./s1 #{random_block(6)} 0:3:repfile 3:3:uniqfile",
+     ". #{random_block(8)} 0:7:uniqfile2 7:1:repfile\n"].join("\n")
   MULTILEVEL_MANIFEST =
     [". #{random_block(9)} 0:3:file1 3:3:file2 6:3:file3\n",
      "./dir1 #{random_block(9)} 0:3:file1 3:3:file2 6:3:file3\n",
      "./dir1/subdir #{random_block(9)} 0:3:file1 3:3:file2 6:3:file3\n",
      "./dir2 #{random_block(9)} 0:3:file1 3:3:file2 6:3:file3\n"].join("")
 
-  def test_simple_each_stream_array
+  def test_simple_each_line_array
     manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
     stream_name, block_s, file = SIMPLEST_MANIFEST.strip.split
-    stream_a = manifest.each_stream.to_a
+    stream_a = manifest.each_line.to_a
     assert_equal(1, stream_a.size, "wrong number of streams")
     assert_equal(stream_name, stream_a[0][0])
     assert_equal([block_s], stream_a[0][1].map(&:to_s))
     assert_equal([file], stream_a[0][2])
   end
 
-  def test_simple_each_stream_block
+  def test_simple_each_line_block
     manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
     result = []
-    manifest.each_stream do |stream, blocks, files|
+    manifest.each_line do |stream, blocks, files|
       result << files
     end
     assert_equal([[SIMPLEST_MANIFEST.split.last]], result,
-                 "wrong result from each_stream block")
+                 "wrong result from each_line block")
   end
 
-  def test_multilevel_each_stream
+  def test_multilevel_each_line
     manifest = Keep::Manifest.new(MULTILEVEL_MANIFEST)
     seen = []
-    manifest.each_stream do |stream, blocks, files|
+    manifest.each_line do |stream, blocks, files|
       refute(seen.include?(stream),
-             "each_stream already yielded stream #{stream}")
+             "each_line already yielded stream #{stream}")
       seen << stream
       assert_equal(3, files.size, "wrong file count for stream #{stream}")
     end
     assert_equal(4, seen.size, "wrong number of streams")
   end
 
-  def test_empty_each_stream
-    assert_empty(Keep::Manifest.new("").each_stream.to_a)
+  def test_empty_each_line
+    assert_empty(Keep::Manifest.new("").each_line.to_a)
   end
 
   def test_backslash_escape_parsing
     m_text = "./dir\\040name #{random_block} 0:0:file\\\\name\\011\\here.txt\n"
     manifest = Keep::Manifest.new(m_text)
-    streams = manifest.each_stream.to_a
+    streams = manifest.each_line.to_a
     assert_equal(1, streams.size, "wrong number of streams with whitespace")
     assert_equal("./dir name", streams.first.first,
                  "wrong stream name with whitespace")
@@ -60,15 +64,15 @@ class ManifestTest < Minitest::Test
                  "wrong filename(s) with whitespace")
   end
 
-  def test_simple_each_file_array
+  def test_simple_files
     manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
-    assert_equal([[".", "simple.txt", 9]], manifest.each_file.to_a)
+    assert_equal([[".", "simple.txt", 9]], manifest.files)
   end
 
-  def test_multilevel_each_file
+  def test_multilevel_files
     manifest = Keep::Manifest.new(MULTILEVEL_MANIFEST)
     seen = Hash.new { |this, key| this[key] = [] }
-    manifest.each_file do |stream, basename, size|
+    manifest.files.each do |stream, basename, size|
       refute(seen[stream].include?(basename),
              "each_file repeated #{stream}/#{basename}")
       seen[stream] << basename
@@ -80,8 +84,57 @@ class ManifestTest < Minitest::Test
     end
   end
 
-  def test_each_file_handles_filenames_with_colons
+  def test_files_with_colons_in_names
     manifest = Keep::Manifest.new(". #{random_block(9)} 0:9:file:test.txt\n")
-    assert_equal([[".", "file:test.txt", 9]], manifest.each_file.to_a)
+    assert_equal([[".", "file:test.txt", 9]], manifest.files)
+  end
+
+  def test_files_spanning_multiple_blocks
+    manifest = Keep::Manifest.new(MULTIBLOCK_FILE_MANIFEST)
+    assert_equal([[".", "repfile", 5],
+                  [".", "uniqfile", 4],
+                  [".", "uniqfile2", 7],
+                  ["./s1", "repfile", 3],
+                  ["./s1", "uniqfile", 3]],
+                 manifest.files.sort)
+  end
+
+  def test_minimum_file_count_simple
+    manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
+    assert(manifest.minimum_file_count?(1), "real minimum file count false")
+    refute(manifest.minimum_file_count?(2), "fake minimum file count true")
+  end
+
+  def test_minimum_file_count_multiblock
+    manifest = Keep::Manifest.new(MULTIBLOCK_FILE_MANIFEST)
+    assert(manifest.minimum_file_count?(2), "low minimum file count false")
+    assert(manifest.minimum_file_count?(5), "real minimum file count false")
+    refute(manifest.minimum_file_count?(6), "fake minimum file count true")
+  end
+
+  def test_exact_file_count_simple
+    manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
+    assert(manifest.exact_file_count?(1), "exact file count false")
+    refute(manifest.exact_file_count?(0), "-1 file count true")
+    refute(manifest.exact_file_count?(2), "+1 file count true")
+  end
+
+  def test_exact_file_count_multiblock
+    manifest = Keep::Manifest.new(MULTIBLOCK_FILE_MANIFEST)
+    assert(manifest.exact_file_count?(5), "exact file count false")
+    refute(manifest.exact_file_count?(4), "-1 file count true")
+    refute(manifest.exact_file_count?(6), "+1 file count true")
+  end
+
+  def test_has_file
+    manifest = Keep::Manifest.new(MULTIBLOCK_FILE_MANIFEST)
+    assert(manifest.has_file?("./repfile"), "one-arg repfile not found")
+    assert(manifest.has_file?(".", "repfile"), "two-arg repfile not found")
+    assert(manifest.has_file?("./s1/repfile"), "one-arg s1/repfile not found")
+    assert(manifest.has_file?("./s1", "repfile"), "two-arg s1/repfile not found")
+    refute(manifest.has_file?("./s1/uniqfile2"), "one-arg missing file found")
+    refute(manifest.has_file?("./s1", "uniqfile2"), "two-arg missing file found")
+    refute(manifest.has_file?("./s2/repfile"), "one-arg missing stream found")
+    refute(manifest.has_file?("./s2", "repfile"), "two-arg missing stream found")
   end
 end
diff --git a/services/api/Gemfile b/services/api/Gemfile
index e6a7857..0192887 100644
--- a/services/api/Gemfile
+++ b/services/api/Gemfile
@@ -71,7 +71,7 @@ gem 'database_cleaner'
 
 gem 'themes_for_rails'
 
-gem 'arvados', '>= 0.1.20140905165259'
+gem 'arvados', '>= 0.1.20140910123800'
 gem 'arvados-cli', '>= 0.1.20140905165259'
 
 # pg_power lets us use partial indexes in schema.rb in Rails 3
diff --git a/services/api/Gemfile.lock b/services/api/Gemfile.lock
index 8912ce6..25bbf11 100644
--- a/services/api/Gemfile.lock
+++ b/services/api/Gemfile.lock
@@ -35,7 +35,7 @@ GEM
     addressable (2.3.6)
     andand (1.3.3)
     arel (3.0.3)
-    arvados (0.1.20140905165259)
+    arvados (0.1.20140910123800)
       activesupport (>= 3.2.13)
       andand
       google-api-client (~> 0.6.3)
@@ -223,7 +223,7 @@ PLATFORMS
 DEPENDENCIES
   acts_as_api
   andand
-  arvados (>= 0.1.20140905165259)
+  arvados (>= 0.1.20140910123800)
   arvados-cli (>= 0.1.20140905165259)
   coffee-rails (~> 3.2.0)
   database_cleaner
diff --git a/services/api/app/models/collection.rb b/services/api/app/models/collection.rb
index 18a25ec..84101ff 100644
--- a/services/api/app/models/collection.rb
+++ b/services/api/app/models/collection.rb
@@ -163,11 +163,10 @@ class Collection < ArvadosModel
       coll_match = readable_by(*readers).where(portable_data_hash: loc.to_s).limit(1).first
       if coll_match
         # Check if the Collection contains exactly one file whose name
-        # looks like a saved Docker image.  We only take up to two
-        # files, because any number >1 means there's no match.
-        coll_files = Keep::Manifest.new(coll_match.manifest_text).each_file.take(2)
-        if (coll_files.size == 1) and
-            (coll_files[0][1] =~ /^[0-9A-Fa-f]{64}\.tar$/)
+        # looks like a saved Docker image.
+        manifest = Keep::Manifest.new(coll_match.manifest_text)
+        if manifest.exact_file_count?(1) and
+            (manifest.files[0][1] =~ /^[0-9A-Fa-f]{64}\.tar$/)
           return [coll_match]
         end
       end

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list