[ARVADOS] created: 1.3.0-1718-ga50fab630
Git user
git at public.curoverse.com
Fri Oct 11 05:23:59 UTC 2019
at a50fab63068c1e8d67ce1d477c6f2c2429464b5c (commit)
commit a50fab63068c1e8d67ce1d477c6f2c2429464b5c
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Fri Oct 11 01:11:45 2019 -0400
15699: Fix handling of streams with multiple refs to a block ID.
The manifest normalization code in the Ruby SDK (and therefore
Workbench) was based on an incorrect assumption that each block
locator could only appear once in a given stream.
If a manifest referenced the same block more than once, copying a file
from that manifest into a new one would produce a new file with the
correct size, but wrong data.
The new code uses a different strategy that deduplicates block
references in common cases, although not in all possible cases.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/sdk/ruby/lib/arvados/collection.rb b/sdk/ruby/lib/arvados/collection.rb
index f236ce83a..796d1785a 100644
--- a/sdk/ruby/lib/arvados/collection.rb
+++ b/sdk/ruby/lib/arvados/collection.rb
@@ -207,7 +207,7 @@ module Arv
loop do
ii = (lo + hi) / 2
range = @ranges[ii]
- if range.include?(target)
+ if range.include?(target) && (target < range.end || ii == hi)
return ii
elsif ii == lo
raise RangeError.new("%i not in segment" % target)
@@ -481,14 +481,13 @@ module Arv
def initialize(name)
@name = name
- @loc_ranges = {}
+ @loc_ranges = []
@loc_range_start = 0
@file_specs = []
end
def add_file(coll_file)
coll_file.each_segment do |segment|
- extend_locator_ranges(segment.locators)
extend_file_specs(coll_file.name, segment)
end
end
@@ -498,48 +497,53 @@ module Arv
""
else
"%s %s %s\n" % [escape_name(@name),
- @loc_ranges.keys.join(" "),
+ @loc_ranges.collect(&:locator).join(" "),
@file_specs.join(" ")]
end
end
private
- def extend_locator_ranges(locators)
- locators.
- select { |loc_s| not @loc_ranges.include?(loc_s) }.
- each do |loc_s|
- @loc_ranges[loc_s] = LocatorRange.new(loc_s, @loc_range_start)
- @loc_range_start = @loc_ranges[loc_s].end
+ def extend_file_specs(filename, segment)
+ found_overlap = false
+ # Find the longest prefix of segment.locators that's a suffix
+ # of the existing @loc_ranges. If we find one, drop those
+ # locators (they'll be added back below, when we're handling
+ # the normal/no-overlap case).
+ (1..segment.locators.length).each do |overlap|
+ if @loc_ranges.length >= overlap && @loc_ranges[-overlap..-1].collect(&:locator) == segment.locators[0..overlap-1]
+ (1..overlap).each do
+ discarded = @loc_ranges.pop
+ @loc_range_start -= (discarded.end - discarded.begin)
+ end
+ found_overlap = true
+ break
+ end
end
- end
- def extend_file_specs(filename, segment)
- # Given a filename and a LocatorSegment, add the smallest
- # possible array of file spec strings to @file_specs that
- # builds the file from available locators.
- filename = escape_name(filename)
- start_pos = segment.start_pos
- length = segment.length
- start_loc = segment.locators.first
- prev_loc = start_loc
- # Build a list of file specs by iterating through the segment's
- # locators and preparing a file spec for each contiguous range.
- segment.locators[1..-1].each do |loc_s|
- range = @loc_ranges[loc_s]
- if range.begin != @loc_ranges[prev_loc].end
- range_start, range_length =
- start_and_length_at(start_loc, prev_loc, start_pos, length)
- @file_specs << "#{range_start}:#{range_length}:#{filename}"
- start_pos = 0
- length -= range_length
- start_loc = loc_s
+ # If there was no overlap at the end of our existing
+ # @loc_ranges, check whether the full set of segment.locators
+ # appears earlier in @loc_ranges. If so, use those instead of
+ # appending the same locators again.
+ if !found_overlap && segment.locators.length < @loc_ranges.length
+ segment_start = 0
+ (0.. at loc_ranges.length-1).each do |ri|
+ if @loc_ranges[ri..ri+segment.locators.length-1].collect(&:locator) == segment.locators
+ @file_specs << "#{segment.start_pos + @loc_ranges[ri].begin}:#{segment.length}:#{escape_name(filename)}"
+ return
+ end
end
- prev_loc = loc_s
end
- range_start, range_length =
- start_and_length_at(start_loc, prev_loc, start_pos, length)
- @file_specs << "#{range_start}:#{range_length}:#{filename}"
+
+ segment_start = @loc_range_start
+ segment_end = segment_start
+ segment.locators.each do |loc_s|
+ r = LocatorRange.new(loc_s, @loc_range_start)
+ @loc_ranges << r
+ @loc_range_start = r.end
+ segment_end += (r.end - r.begin)
+ end
+ @file_specs << "#{segment.start_pos + segment_start}:#{segment.length}:#{escape_name(filename)}"
end
def escape_name(name)
@@ -547,12 +551,6 @@ module Arv
s.each_byte.map { |c| "\\%03o" % c }.join("")
end
end
-
- def start_and_length_at(start_key, end_key, start_pos, length)
- range_begin = @loc_ranges[start_key].begin + start_pos
- range_length = [@loc_ranges[end_key].end - range_begin, length].min
- [range_begin, range_length]
- end
end
end
end
diff --git a/sdk/ruby/test/test_collection.rb b/sdk/ruby/test/test_collection.rb
index 288fd263f..29ec6d418 100644
--- a/sdk/ruby/test/test_collection.rb
+++ b/sdk/ruby/test/test_collection.rb
@@ -15,6 +15,10 @@ class CollectionTest < Minitest::Test
"./s1 #{TWO_BY_TWO_BLOCKS.last} 0:5:f1 5:4:f3\n"]
TWO_BY_TWO_MANIFEST_S = TWO_BY_TWO_MANIFEST_A.join("")
+ def abcde_blocks
+ ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+9", "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb+9", "cccccccccccccccccccccccccccccccc+9", "dddddddddddddddddddddddddddddddd+9", "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee+9"]
+ end
+
### .new
def test_empty_construction
@@ -145,12 +149,12 @@ class CollectionTest < Minitest::Test
test_normalization_file_spans_two_whole_blocks("2:3:f1 2:3:f1", 1)
end
- def test_normalization_dedups_locators
+ def test_normalization_handles_duplicate_locator
blocks = random_blocks(2, 5)
coll = Arv::Collection.new(". %s %s 1:8:f1 11:8:f1\n" %
[blocks.join(" "), blocks.reverse.join(" ")])
coll.normalize
- assert_equal(". #{blocks.join(' ')} 1:8:f1 6:4:f1 0:4:f1\n",
+ assert_equal(". #{blocks.join(' ')} #{blocks[0]} 1:8:f1 6:8:f1\n",
coll.manifest_text)
end
@@ -395,6 +399,26 @@ class CollectionTest < Minitest::Test
dst_coll.manifest_text)
end
+ def test_copy_with_repeated_blocks
+ blocks = abcde_blocks
+ src_coll = Arv::Collection.new(". #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[3]} #{blocks[4]} 27:27:f1\n")
+ dst_coll = Arv::Collection.new()
+ dst_coll.cp_r("f1", "./", src_coll)
+ toks = dst_coll.manifest_text.split(" ")
+ assert_equal(". #{blocks[0]} #{blocks[1]} #{blocks[2]} 0:27:f1\n", dst_coll.manifest_text, "mangled by cp_r")
+ end
+
+ def test_copy_with_repeated_split_blocks
+ blocks = abcde_blocks
+ src_coll = Arv::Collection.new(". #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[3]} #{blocks[4]} 20:27:f1\n")
+ dst_coll = Arv::Collection.new()
+ src_coll.normalize
+ assert_equal(". #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} 2:27:f1\n", src_coll.manifest_text, "mangled by normalize()")
+ dst_coll.cp_r("f1", "./", src_coll)
+ toks = dst_coll.manifest_text.split(" ")
+ assert_equal(". #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} 2:27:f1\n", dst_coll.manifest_text, "mangled by cp_r")
+ end
+
def test_copy_empty_source_path_raises_ArgumentError(src="", dst="./s1")
coll = Arv::Collection.new(SIMPLEST_MANIFEST)
assert_raises(ArgumentError) do
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list