[ARVADOS] created: 1.3.0-1718-ga50fab630

Git user git at public.curoverse.com
Fri Oct 11 05:23:59 UTC 2019


        at  a50fab63068c1e8d67ce1d477c6f2c2429464b5c (commit)


commit a50fab63068c1e8d67ce1d477c6f2c2429464b5c
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Oct 11 01:11:45 2019 -0400

    15699: Fix handling of streams with multiple refs to a block ID.
    
    The manifest normalization code in the Ruby SDK (and therefore
    Workbench) was based on an incorrect assumption that each block
    locator could only appear once in a given stream.
    
    If a manifest referenced the same block more than once, copying a file
    from that manifest into a new one would produce a new file with the
    correct size, but wrong data.
    
    The new code uses a different strategy that deduplicates block
    references in common cases, although not in all possible cases.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/sdk/ruby/lib/arvados/collection.rb b/sdk/ruby/lib/arvados/collection.rb
index f236ce83a..796d1785a 100644
--- a/sdk/ruby/lib/arvados/collection.rb
+++ b/sdk/ruby/lib/arvados/collection.rb
@@ -207,7 +207,7 @@ module Arv
         loop do
           ii = (lo + hi) / 2
           range = @ranges[ii]
-          if range.include?(target)
+          if range.include?(target) && (target < range.end || ii == hi)
             return ii
           elsif ii == lo
             raise RangeError.new("%i not in segment" % target)
@@ -481,14 +481,13 @@ module Arv
 
       def initialize(name)
         @name = name
-        @loc_ranges = {}
+        @loc_ranges = []
         @loc_range_start = 0
         @file_specs = []
       end
 
       def add_file(coll_file)
         coll_file.each_segment do |segment|
-          extend_locator_ranges(segment.locators)
           extend_file_specs(coll_file.name, segment)
         end
       end
@@ -498,48 +497,53 @@ module Arv
           ""
         else
           "%s %s %s\n" % [escape_name(@name),
-                          @loc_ranges.keys.join(" "),
+                          @loc_ranges.collect(&:locator).join(" "),
                           @file_specs.join(" ")]
         end
       end
 
       private
 
-      def extend_locator_ranges(locators)
-        locators.
-            select { |loc_s| not @loc_ranges.include?(loc_s) }.
-            each do |loc_s|
-          @loc_ranges[loc_s] = LocatorRange.new(loc_s, @loc_range_start)
-          @loc_range_start = @loc_ranges[loc_s].end
+      def extend_file_specs(filename, segment)
+        found_overlap = false
+        # Find the longest prefix of segment.locators that's a suffix
+        # of the existing @loc_ranges. If we find one, drop those
+        # locators (they'll be added back below, when we're handling
+        # the normal/no-overlap case).
+        (1..segment.locators.length).each do |overlap|
+          if @loc_ranges.length >= overlap && @loc_ranges[-overlap..-1].collect(&:locator) == segment.locators[0..overlap-1]
+            (1..overlap).each do
+              discarded = @loc_ranges.pop
+              @loc_range_start -= (discarded.end - discarded.begin)
+            end
+            found_overlap = true
+            break
+          end
         end
-      end
 
-      def extend_file_specs(filename, segment)
-        # Given a filename and a LocatorSegment, add the smallest
-        # possible array of file spec strings to @file_specs that
-        # builds the file from available locators.
-        filename = escape_name(filename)
-        start_pos = segment.start_pos
-        length = segment.length
-        start_loc = segment.locators.first
-        prev_loc = start_loc
-        # Build a list of file specs by iterating through the segment's
-        # locators and preparing a file spec for each contiguous range.
-        segment.locators[1..-1].each do |loc_s|
-          range = @loc_ranges[loc_s]
-          if range.begin != @loc_ranges[prev_loc].end
-            range_start, range_length =
-              start_and_length_at(start_loc, prev_loc, start_pos, length)
-            @file_specs << "#{range_start}:#{range_length}:#{filename}"
-            start_pos = 0
-            length -= range_length
-            start_loc = loc_s
+        # If there was no overlap at the end of our existing
+        # @loc_ranges, check whether the full set of segment.locators
+        # appears earlier in @loc_ranges. If so, use those instead of
+        # appending the same locators again.
+        if !found_overlap && segment.locators.length < @loc_ranges.length
+          segment_start = 0
+          (0.. at loc_ranges.length-1).each do |ri|
+            if @loc_ranges[ri..ri+segment.locators.length-1].collect(&:locator) == segment.locators
+              @file_specs << "#{segment.start_pos + @loc_ranges[ri].begin}:#{segment.length}:#{escape_name(filename)}"
+              return
+            end
           end
-          prev_loc = loc_s
         end
-        range_start, range_length =
-          start_and_length_at(start_loc, prev_loc, start_pos, length)
-        @file_specs << "#{range_start}:#{range_length}:#{filename}"
+
+        segment_start = @loc_range_start
+        segment_end = segment_start
+        segment.locators.each do |loc_s|
+          r = LocatorRange.new(loc_s, @loc_range_start)
+          @loc_ranges << r
+          @loc_range_start = r.end
+          segment_end += (r.end - r.begin)
+        end
+        @file_specs << "#{segment.start_pos + segment_start}:#{segment.length}:#{escape_name(filename)}"
       end
 
       def escape_name(name)
@@ -547,12 +551,6 @@ module Arv
           s.each_byte.map { |c| "\\%03o" % c }.join("")
         end
       end
-
-      def start_and_length_at(start_key, end_key, start_pos, length)
-        range_begin = @loc_ranges[start_key].begin + start_pos
-        range_length = [@loc_ranges[end_key].end - range_begin, length].min
-        [range_begin, range_length]
-      end
     end
   end
 end
diff --git a/sdk/ruby/test/test_collection.rb b/sdk/ruby/test/test_collection.rb
index 288fd263f..29ec6d418 100644
--- a/sdk/ruby/test/test_collection.rb
+++ b/sdk/ruby/test/test_collection.rb
@@ -15,6 +15,10 @@ class CollectionTest < Minitest::Test
      "./s1 #{TWO_BY_TWO_BLOCKS.last} 0:5:f1 5:4:f3\n"]
   TWO_BY_TWO_MANIFEST_S = TWO_BY_TWO_MANIFEST_A.join("")
 
+  def abcde_blocks
+    ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+9", "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb+9", "cccccccccccccccccccccccccccccccc+9", "dddddddddddddddddddddddddddddddd+9", "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee+9"]
+  end
+
   ### .new
 
   def test_empty_construction
@@ -145,12 +149,12 @@ class CollectionTest < Minitest::Test
     test_normalization_file_spans_two_whole_blocks("2:3:f1 2:3:f1", 1)
   end
 
-  def test_normalization_dedups_locators
+  def test_normalization_handles_duplicate_locator
     blocks = random_blocks(2, 5)
     coll = Arv::Collection.new(". %s %s 1:8:f1 11:8:f1\n" %
                                [blocks.join(" "), blocks.reverse.join(" ")])
     coll.normalize
-    assert_equal(". #{blocks.join(' ')} 1:8:f1 6:4:f1 0:4:f1\n",
+    assert_equal(". #{blocks.join(' ')} #{blocks[0]} 1:8:f1 6:8:f1\n",
                  coll.manifest_text)
   end
 
@@ -395,6 +399,26 @@ class CollectionTest < Minitest::Test
                  dst_coll.manifest_text)
   end
 
+  def test_copy_with_repeated_blocks
+    blocks = abcde_blocks
+    src_coll = Arv::Collection.new(". #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[3]} #{blocks[4]} 27:27:f1\n")
+    dst_coll = Arv::Collection.new()
+    dst_coll.cp_r("f1", "./", src_coll)
+    toks = dst_coll.manifest_text.split(" ")
+    assert_equal(". #{blocks[0]} #{blocks[1]} #{blocks[2]} 0:27:f1\n", dst_coll.manifest_text, "mangled by cp_r")
+  end
+
+  def test_copy_with_repeated_split_blocks
+    blocks = abcde_blocks
+    src_coll = Arv::Collection.new(". #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[3]} #{blocks[4]} 20:27:f1\n")
+    dst_coll = Arv::Collection.new()
+    src_coll.normalize
+    assert_equal(". #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} 2:27:f1\n", src_coll.manifest_text, "mangled by normalize()")
+    dst_coll.cp_r("f1", "./", src_coll)
+    toks = dst_coll.manifest_text.split(" ")
+    assert_equal(". #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} 2:27:f1\n", dst_coll.manifest_text, "mangled by cp_r")
+  end
+
   def test_copy_empty_source_path_raises_ArgumentError(src="", dst="./s1")
     coll = Arv::Collection.new(SIMPLEST_MANIFEST)
     assert_raises(ArgumentError) do

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list