[ARVADOS] updated: bafb417c531d6094697a13a7465313d1cc7c0bc9
git at public.curoverse.com
git at public.curoverse.com
Wed Sep 10 12:52:11 EDT 2014
Summary of changes:
apps/workbench/Gemfile | 2 +-
apps/workbench/Gemfile.lock | 4 +-
.../app/controllers/collections_controller.rb | 9 +--
apps/workbench/app/models/collection.rb | 14 ++--
sdk/ruby/lib/arvados/keep.rb | 77 ++++++++++++++++---
sdk/ruby/test/test_keep_manifest.rb | 87 +++++++++++++++++-----
services/api/Gemfile | 2 +-
services/api/Gemfile.lock | 4 +-
services/api/app/models/collection.rb | 9 +--
9 files changed, 154 insertions(+), 54 deletions(-)
via bafb417c531d6094697a13a7465313d1cc7c0bc9 (commit)
from b0440160e64caecbff160a26f62301dd15d84c7f (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit bafb417c531d6094697a13a7465313d1cc7c0bc9
Author: Brett Smith <brett at curoverse.com>
Date: Wed Sep 10 12:39:58 2014 -0400
3842: Keep::Manifest concatenates file information from manifest.
The previous implementation failed to consider the possibility that
file information would be spread across multiple lines of a manifest.
This would cause, e.g., the same file to be yielded many times from
each_file.
This requirement makes it impossible to return file size information
without parsing the entire manifest. Because of that, I have reworked
the Ruby SDK API so that method names are more consistent with their
performance characteristics. I have also added some methods to do
some basic file existence checking that do not require parsing the
whole manifest.
Closes #3842. Refs #3720.
diff --git a/apps/workbench/Gemfile b/apps/workbench/Gemfile
index 2a17598..c7e3870 100644
--- a/apps/workbench/Gemfile
+++ b/apps/workbench/Gemfile
@@ -3,7 +3,7 @@ source 'https://rubygems.org'
gem 'rails', '~> 4.1.0'
gem 'minitest', '>= 5.0.0'
-gem 'arvados', '>= 0.1.20140905115707'
+gem 'arvados', '>= 0.1.20140910123800'
# Bundle edge Rails instead:
# gem 'rails', :git => 'git://github.com/rails/rails.git'
diff --git a/apps/workbench/Gemfile.lock b/apps/workbench/Gemfile.lock
index f0e20f1..7d98bf1 100644
--- a/apps/workbench/Gemfile.lock
+++ b/apps/workbench/Gemfile.lock
@@ -39,7 +39,7 @@ GEM
addressable (2.3.6)
andand (1.3.3)
arel (5.0.1.20140414130214)
- arvados (0.1.20140905115707)
+ arvados (0.1.20140910123800)
activesupport (>= 3.2.13)
andand
google-api-client (~> 0.6.3)
@@ -229,7 +229,7 @@ PLATFORMS
DEPENDENCIES
RedCloth
andand
- arvados (>= 0.1.20140905115707)
+ arvados (>= 0.1.20140910123800)
bootstrap-sass (~> 3.1.0)
bootstrap-x-editable-rails
capybara
diff --git a/apps/workbench/app/controllers/collections_controller.rb b/apps/workbench/app/controllers/collections_controller.rb
index 41b4771..f87579e 100644
--- a/apps/workbench/app/controllers/collections_controller.rb
+++ b/apps/workbench/app/controllers/collections_controller.rb
@@ -145,7 +145,7 @@ class CollectionsController < ApplicationController
end
if usable_token.nil?
return # Response already rendered.
- elsif params[:file].nil? or not file_in_collection?(coll, params[:file])
+ elsif params[:file].nil? or not coll.manifest.has_file?(params[:file])
return render_not_found
end
opts = params.merge(arvados_api_token: usable_token)
@@ -277,13 +277,6 @@ class CollectionsController < ApplicationController
return nil
end
- def file_in_collection?(collection, filename)
- target = CollectionsHelper.file_path(File.split(filename))
- collection.manifest.each_file.any? do |file_spec|
- CollectionsHelper.file_path(file_spec) == target
- end
- end
-
def file_enumerator(opts)
FileStreamer.new opts
end
diff --git a/apps/workbench/app/models/collection.rb b/apps/workbench/app/models/collection.rb
index 24e6da5..edd87ad 100644
--- a/apps/workbench/app/models/collection.rb
+++ b/apps/workbench/app/models/collection.rb
@@ -21,17 +21,17 @@ class Collection < ArvadosBase
end
def manifest
- Keep::Manifest.new(manifest_text || "")
+ if @manifest.nil? or manifest_text_changed?
+ @manifest = Keep::Manifest.new(manifest_text || "")
+ end
+ @manifest
end
def files
# This method provides backwards compatibility for code that relied on
# the old files field in API results. New code should use manifest
# methods directly.
- if @files.nil? or manifest_text_changed?
- @files = manifest.each_file.to_a
- end
- @files
+ manifest.files
end
def content_summary
@@ -39,11 +39,11 @@ class Collection < ArvadosBase
end
def total_bytes
- manifest.each_file.inject(0) { |sum, filespec| sum + filespec.last }
+ manifest.files.inject(0) { |sum, filespec| sum + filespec.last }
end
def files_tree
- tree = manifest.each_file.group_by do |file_spec|
+ tree = manifest.files.group_by do |file_spec|
File.split(file_spec.first)
end
return [] if tree.empty?
diff --git a/sdk/ruby/lib/arvados/keep.rb b/sdk/ruby/lib/arvados/keep.rb
index 6b18ddf..8afe13f 100644
--- a/sdk/ruby/lib/arvados/keep.rb
+++ b/sdk/ruby/lib/arvados/keep.rb
@@ -94,9 +94,10 @@ module Keep
# Class to parse a manifest text and provide common views of that data.
def initialize(manifest_text)
@text = manifest_text
+ @files = nil
end
- def each_stream
+ def each_line
return to_enum(__method__) unless block_given?
@text.each_line do |line|
tokens = line.split
@@ -110,17 +111,8 @@ module Keep
end
end
- def each_file
- return to_enum(__method__) unless block_given?
- each_stream do |streamname, blocklist, filelist|
- filelist.each do |filespec|
- start_pos, filesize, filename = filespec.split(':', 3)
- yield [streamname, filename, filesize.to_i]
- end
- end
- end
-
def unescape(s)
+ # Parse backslash escapes in a Keep manifest stream or file name.
s.gsub(/\\(\\|[0-7]{3})/) do |_|
case $1
when '\\'
@@ -130,5 +122,68 @@ module Keep
end
end
end
+
+ def each_file_spec(speclist)
+ return to_enum(__method__, speclist) unless block_given?
+ speclist.each do |filespec|
+ start_pos, filesize, filename = filespec.split(':', 3)
+ yield [start_pos.to_i, filesize.to_i, filename]
+ end
+ end
+
+ def files
+ if @files.nil?
+ file_sizes = Hash.new(0)
+ each_line do |streamname, blocklist, filelist|
+ each_file_spec(filelist) do |_, filesize, filename|
+ file_sizes[[streamname, filename]] += filesize
+ end
+ end
+ @files = file_sizes.each_pair.map do |(streamname, filename), size|
+ [streamname, filename, size]
+ end
+ end
+ @files
+ end
+
+ def files_count(stop_after=nil)
+ # Return the number of files represented in this manifest.
+ # If stop_after is provided, files_count will read the manifest
+ # incrementally, and return immediately when it counts that number of
+ # files. This can help you avoid parsing the entire manifest if you
+ # just want to check if a small number of files are specified.
+ if stop_after.nil? or not @files.nil?
+ return files.size
+ end
+ seen_files = {}
+ each_line do |streamname, blocklist, filelist|
+ each_file_spec(filelist) do |_, _, filename|
+ seen_files[[streamname, filename]] = true
+ return stop_after if (seen_files.size >= stop_after)
+ end
+ end
+ seen_files.size
+ end
+
+ def exact_file_count?(want_count)
+ files_count(want_count + 1) == want_count
+ end
+
+ def minimum_file_count?(want_count)
+ files_count(want_count) >= want_count
+ end
+
+ def has_file?(want_stream, want_file=nil)
+ if want_file.nil?
+ want_stream, want_file = File.split(want_stream)
+ end
+ each_line do |stream_name, _, filelist|
+ if (stream_name == want_stream) and
+ each_file_spec(filelist).any? { |_, _, name| name == want_file }
+ return true
+ end
+ end
+ false
+ end
end
end
diff --git a/sdk/ruby/test/test_keep_manifest.rb b/sdk/ruby/test/test_keep_manifest.rb
index 99417b5..af4698e 100644
--- a/sdk/ruby/test/test_keep_manifest.rb
+++ b/sdk/ruby/test/test_keep_manifest.rb
@@ -7,52 +7,56 @@ end
class ManifestTest < Minitest::Test
SIMPLEST_MANIFEST = ". #{random_block(9)} 0:9:simple.txt\n"
+ MULTIBLOCK_FILE_MANIFEST =
+ [". #{random_block(8)} 0:4:repfile 4:4:uniqfile",
+ "./s1 #{random_block(6)} 0:3:repfile 3:3:uniqfile",
+ ". #{random_block(8)} 0:7:uniqfile2 7:1:repfile\n"].join("\n")
MULTILEVEL_MANIFEST =
[". #{random_block(9)} 0:3:file1 3:3:file2 6:3:file3\n",
"./dir1 #{random_block(9)} 0:3:file1 3:3:file2 6:3:file3\n",
"./dir1/subdir #{random_block(9)} 0:3:file1 3:3:file2 6:3:file3\n",
"./dir2 #{random_block(9)} 0:3:file1 3:3:file2 6:3:file3\n"].join("")
- def test_simple_each_stream_array
+ def test_simple_each_line_array
manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
stream_name, block_s, file = SIMPLEST_MANIFEST.strip.split
- stream_a = manifest.each_stream.to_a
+ stream_a = manifest.each_line.to_a
assert_equal(1, stream_a.size, "wrong number of streams")
assert_equal(stream_name, stream_a[0][0])
assert_equal([block_s], stream_a[0][1].map(&:to_s))
assert_equal([file], stream_a[0][2])
end
- def test_simple_each_stream_block
+ def test_simple_each_line_block
manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
result = []
- manifest.each_stream do |stream, blocks, files|
+ manifest.each_line do |stream, blocks, files|
result << files
end
assert_equal([[SIMPLEST_MANIFEST.split.last]], result,
- "wrong result from each_stream block")
+ "wrong result from each_line block")
end
- def test_multilevel_each_stream
+ def test_multilevel_each_line
manifest = Keep::Manifest.new(MULTILEVEL_MANIFEST)
seen = []
- manifest.each_stream do |stream, blocks, files|
+ manifest.each_line do |stream, blocks, files|
refute(seen.include?(stream),
- "each_stream already yielded stream #{stream}")
+ "each_line already yielded stream #{stream}")
seen << stream
assert_equal(3, files.size, "wrong file count for stream #{stream}")
end
assert_equal(4, seen.size, "wrong number of streams")
end
- def test_empty_each_stream
- assert_empty(Keep::Manifest.new("").each_stream.to_a)
+ def test_empty_each_line
+ assert_empty(Keep::Manifest.new("").each_line.to_a)
end
def test_backslash_escape_parsing
m_text = "./dir\\040name #{random_block} 0:0:file\\\\name\\011\\here.txt\n"
manifest = Keep::Manifest.new(m_text)
- streams = manifest.each_stream.to_a
+ streams = manifest.each_line.to_a
assert_equal(1, streams.size, "wrong number of streams with whitespace")
assert_equal("./dir name", streams.first.first,
"wrong stream name with whitespace")
@@ -60,15 +64,15 @@ class ManifestTest < Minitest::Test
"wrong filename(s) with whitespace")
end
- def test_simple_each_file_array
+ def test_simple_files
manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
- assert_equal([[".", "simple.txt", 9]], manifest.each_file.to_a)
+ assert_equal([[".", "simple.txt", 9]], manifest.files)
end
- def test_multilevel_each_file
+ def test_multilevel_files
manifest = Keep::Manifest.new(MULTILEVEL_MANIFEST)
seen = Hash.new { |this, key| this[key] = [] }
- manifest.each_file do |stream, basename, size|
+ manifest.files.each do |stream, basename, size|
refute(seen[stream].include?(basename),
"each_file repeated #{stream}/#{basename}")
seen[stream] << basename
@@ -80,8 +84,57 @@ class ManifestTest < Minitest::Test
end
end
- def test_each_file_handles_filenames_with_colons
+ def test_files_with_colons_in_names
manifest = Keep::Manifest.new(". #{random_block(9)} 0:9:file:test.txt\n")
- assert_equal([[".", "file:test.txt", 9]], manifest.each_file.to_a)
+ assert_equal([[".", "file:test.txt", 9]], manifest.files)
+ end
+
+ def test_files_spanning_multiple_blocks
+ manifest = Keep::Manifest.new(MULTIBLOCK_FILE_MANIFEST)
+ assert_equal([[".", "repfile", 5],
+ [".", "uniqfile", 4],
+ [".", "uniqfile2", 7],
+ ["./s1", "repfile", 3],
+ ["./s1", "uniqfile", 3]],
+ manifest.files.sort)
+ end
+
+ def test_minimum_file_count_simple
+ manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
+ assert(manifest.minimum_file_count?(1), "real minimum file count false")
+ refute(manifest.minimum_file_count?(2), "fake minimum file count true")
+ end
+
+ def test_minimum_file_count_multiblock
+ manifest = Keep::Manifest.new(MULTIBLOCK_FILE_MANIFEST)
+ assert(manifest.minimum_file_count?(2), "low minimum file count false")
+ assert(manifest.minimum_file_count?(5), "real minimum file count false")
+ refute(manifest.minimum_file_count?(6), "fake minimum file count true")
+ end
+
+ def test_exact_file_count_simple
+ manifest = Keep::Manifest.new(SIMPLEST_MANIFEST)
+ assert(manifest.exact_file_count?(1), "exact file count false")
+ refute(manifest.exact_file_count?(0), "-1 file count true")
+ refute(manifest.exact_file_count?(2), "+1 file count true")
+ end
+
+ def test_exact_file_count_multiblock
+ manifest = Keep::Manifest.new(MULTIBLOCK_FILE_MANIFEST)
+ assert(manifest.exact_file_count?(5), "exact file count false")
+ refute(manifest.exact_file_count?(4), "-1 file count true")
+ refute(manifest.exact_file_count?(6), "+1 file count true")
+ end
+
+ def test_has_file
+ manifest = Keep::Manifest.new(MULTIBLOCK_FILE_MANIFEST)
+ assert(manifest.has_file?("./repfile"), "one-arg repfile not found")
+ assert(manifest.has_file?(".", "repfile"), "two-arg repfile not found")
+ assert(manifest.has_file?("./s1/repfile"), "one-arg s1/repfile not found")
+ assert(manifest.has_file?("./s1", "repfile"), "two-arg s1/repfile not found")
+ refute(manifest.has_file?("./s1/uniqfile2"), "one-arg missing file found")
+ refute(manifest.has_file?("./s1", "uniqfile2"), "two-arg missing file found")
+ refute(manifest.has_file?("./s2/repfile"), "one-arg missing stream found")
+ refute(manifest.has_file?("./s2", "repfile"), "two-arg missing stream found")
end
end
diff --git a/services/api/Gemfile b/services/api/Gemfile
index e6a7857..0192887 100644
--- a/services/api/Gemfile
+++ b/services/api/Gemfile
@@ -71,7 +71,7 @@ gem 'database_cleaner'
gem 'themes_for_rails'
-gem 'arvados', '>= 0.1.20140905165259'
+gem 'arvados', '>= 0.1.20140910123800'
gem 'arvados-cli', '>= 0.1.20140905165259'
# pg_power lets us use partial indexes in schema.rb in Rails 3
diff --git a/services/api/Gemfile.lock b/services/api/Gemfile.lock
index 8912ce6..25bbf11 100644
--- a/services/api/Gemfile.lock
+++ b/services/api/Gemfile.lock
@@ -35,7 +35,7 @@ GEM
addressable (2.3.6)
andand (1.3.3)
arel (3.0.3)
- arvados (0.1.20140905165259)
+ arvados (0.1.20140910123800)
activesupport (>= 3.2.13)
andand
google-api-client (~> 0.6.3)
@@ -223,7 +223,7 @@ PLATFORMS
DEPENDENCIES
acts_as_api
andand
- arvados (>= 0.1.20140905165259)
+ arvados (>= 0.1.20140910123800)
arvados-cli (>= 0.1.20140905165259)
coffee-rails (~> 3.2.0)
database_cleaner
diff --git a/services/api/app/models/collection.rb b/services/api/app/models/collection.rb
index 18a25ec..84101ff 100644
--- a/services/api/app/models/collection.rb
+++ b/services/api/app/models/collection.rb
@@ -163,11 +163,10 @@ class Collection < ArvadosModel
coll_match = readable_by(*readers).where(portable_data_hash: loc.to_s).limit(1).first
if coll_match
# Check if the Collection contains exactly one file whose name
- # looks like a saved Docker image. We only take up to two
- # files, because any number >1 means there's no match.
- coll_files = Keep::Manifest.new(coll_match.manifest_text).each_file.take(2)
- if (coll_files.size == 1) and
- (coll_files[0][1] =~ /^[0-9A-Fa-f]{64}\.tar$/)
+ # looks like a saved Docker image.
+ manifest = Keep::Manifest.new(coll_match.manifest_text)
+ if manifest.exact_file_count?(1) and
+ (manifest.files[0][1] =~ /^[0-9A-Fa-f]{64}\.tar$/)
return [coll_match]
end
end
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list