[ARVADOS] updated: b3fb8f097b06ab96c17078cfd24dfd36fd8ca6f3

Git user git at public.curoverse.com
Tue Feb 7 18:52:38 EST 2017


Summary of changes:
 sdk/go/manifest/manifest.go      | 79 +++++++++++++++++++---------------------
 sdk/go/manifest/manifest_test.go | 16 ++++----
 2 files changed, 45 insertions(+), 50 deletions(-)

       via  b3fb8f097b06ab96c17078cfd24dfd36fd8ca6f3 (commit)
       via  da3ed065bf8f2a09d3c09e4b67592d6ff5ffec12 (commit)
      from  559f3e22645f92ba9d5b13c5a55bf495f38e9b8e (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit b3fb8f097b06ab96c17078cfd24dfd36fd8ca6f3
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Feb 7 18:47:17 2017 -0500

    9397: Make FileSegmentIterByName channel buffered so it isn't necessary to do a
    goroutine context switch on every segment.

diff --git a/sdk/go/manifest/manifest.go b/sdk/go/manifest/manifest.go
index 0441731..3bd855f 100644
--- a/sdk/go/manifest/manifest.go
+++ b/sdk/go/manifest/manifest.go
@@ -129,7 +129,7 @@ func parseFileStreamSegment(tok string) (ft FileStreamSegment, err error) {
 }
 
 func (s *ManifestStream) FileSegmentIterByName(filepath string) <-chan *FileSegment {
-	ch := make(chan *FileSegment)
+	ch := make(chan *FileSegment, 64)
 	go func() {
 		s.sendFileSegmentIterByName(filepath, ch)
 		close(ch)
@@ -478,7 +478,7 @@ func (m *Manifest) StreamIter() <-chan ManifestStream {
 }
 
 func (m *Manifest) FileSegmentIterByName(filepath string) <-chan *FileSegment {
-	ch := make(chan *FileSegment)
+	ch := make(chan *FileSegment, 64)
 	if !strings.HasPrefix(filepath, "./") {
 		filepath = "./" + filepath
 	}

commit da3ed065bf8f2a09d3c09e4b67592d6ff5ffec12
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Feb 7 17:47:57 2017 -0500

    9397: Use binary search to find the first block.  Store BlockOffsets efficiently.

diff --git a/sdk/go/manifest/manifest.go b/sdk/go/manifest/manifest.go
index d1c95b7..0441731 100644
--- a/sdk/go/manifest/manifest.go
+++ b/sdk/go/manifest/manifest.go
@@ -43,13 +43,11 @@ type FileStreamSegment struct {
 	Name   string
 }
 
-type BlockRange struct{ Begin, End int64 }
-
 // Represents a single line from a manifest.
 type ManifestStream struct {
 	StreamName         string
 	Blocks             []string
-	BlockRanges        []BlockRange
+	BlockOffsets       []uint64
 	FileStreamSegments []FileStreamSegment
 	Err                error
 }
@@ -139,15 +137,15 @@ func (s *ManifestStream) FileSegmentIterByName(filepath string) <-chan *FileSegm
 	return ch
 }
 
-func (s *ManifestStream) FirstBlock(range_start int64) int64 {
+func (s *ManifestStream) FirstBlock(range_start uint64) int {
 	// range_start/block_start is the inclusive lower bound
 	// range_end/block_end is the exclusive upper bound
 
-	hi := int64(len(s.BlockRanges))
-	var lo int64
+	hi := len(s.BlockOffsets) - 1
+	var lo int
 	i := ((hi + lo) / 2)
-	block_start := s.BlockRanges[i].Begin
-	block_end := s.BlockRanges[i].End
+	block_start := s.BlockOffsets[i]
+	block_end := s.BlockOffsets[i+1]
 
 	// perform a binary search for the first block
 	// assumes that all of the blocks are contiguous, so range_start is guaranteed
@@ -162,15 +160,14 @@ func (s *ManifestStream) FirstBlock(range_start int64) int64 {
 		} else {
 			hi = i
 			i = ((hi + lo) / 2)
-			block_start = s.BlockRanges[i].Begin
-			block_end = s.BlockRanges[i].End
+			block_start = s.BlockOffsets[i]
+			block_end = s.BlockOffsets[i+1]
 		}
 	}
 	return i
 }
 
 func (s *ManifestStream) sendFileSegmentIterByName(filepath string, ch chan<- *FileSegment) {
-	blockLens := make([]int, 0, len(s.Blocks))
 	// This is what streamName+"/"+fileName will look like:
 	target := filepath
 	if !strings.HasPrefix(target, "./") {
@@ -188,43 +185,41 @@ func (s *ManifestStream) sendFileSegmentIterByName(filepath string, ch chan<- *F
 			ch <- &FileSegment{Locator: "d41d8cd98f00b204e9800998ecf8427e+0", Offset: 0, Len: 0}
 			continue
 		}
-		// Linear search for blocks containing data for this
-		// file
-		var blockPos uint64 = 0 // position of block in stream
-		for i, loc := range s.Blocks {
-			if blockPos >= wantPos+wantLen {
+
+		// Binary search to determine first block in the stream
+		i := s.FirstBlock(wantPos)
+		if i == -1 {
+			// error
+			break
+		}
+		for i < len(s.Blocks) {
+			blockPos := s.BlockOffsets[i]
+			blockEnd := s.BlockOffsets[i+1]
+			if blockEnd <= wantPos {
+				// current block comes before current file span
+				// (shouldn't happen, FirstBlock() should start us
+				// on the right block)
 				break
 			}
-			if len(blockLens) <= i {
-				blockLens = blockLens[:i+1]
-				b, err := ParseBlockLocator(loc)
-				if err != nil {
-					// Unparseable locator -> unusable
-					// stream.
-					ch <- nil
-					return
-				}
-				blockLens[i] = b.Size
-			}
-			blockLen := uint64(blockLens[i])
-			if blockPos+blockLen <= wantPos {
-				blockPos += blockLen
-				continue
+			if blockPos >= wantPos+wantLen {
+				// current block comes after current file span
+				break
 			}
+
 			fseg := FileSegment{
-				Locator: loc,
+				Locator: s.Blocks[i],
 				Offset:  0,
-				Len:     blockLens[i],
+				Len:     int(blockEnd - blockPos),
 			}
 			if blockPos < wantPos {
 				fseg.Offset = int(wantPos - blockPos)
 				fseg.Len -= fseg.Offset
 			}
-			if blockPos+blockLen > wantPos+wantLen {
+			if blockEnd > wantPos+wantLen {
 				fseg.Len = int(wantPos+wantLen-blockPos) - fseg.Offset
 			}
 			ch <- &fseg
-			blockPos += blockLen
+			i += 1
 		}
 	}
 }
@@ -253,18 +248,18 @@ func parseManifestStream(s string) (m ManifestStream) {
 		return
 	}
 
-	m.BlockRanges = make([]BlockRange, len(m.Blocks))
-	var streamoffset int64
+	m.BlockOffsets = make([]uint64, len(m.Blocks)+1)
+	var streamoffset uint64
 	for i, b := range m.Blocks {
 		bl, err := ParseBlockLocator(b)
 		if err != nil {
 			m.Err = err
 			return
 		}
-		m.BlockRange[i].Begin = streamoffset
-		m.BlockRange[i].End = streamoffset + int64(bl.Size)
-		streamoffset = m.BlockRange[i].End
+		m.BlockOffsets[i] = streamoffset
+		streamoffset += uint64(bl.Size)
 	}
+	m.BlockOffsets[len(m.Blocks)] = streamoffset
 
 	if len(fileTokens) == 0 {
 		m.Err = fmt.Errorf("No file tokens found")
diff --git a/sdk/go/manifest/manifest_test.go b/sdk/go/manifest/manifest_test.go
index fe83a73..f9dda3e 100644
--- a/sdk/go/manifest/manifest_test.go
+++ b/sdk/go/manifest/manifest_test.go
@@ -303,14 +303,6 @@ func TestNormalizeManifest(t *testing.T) {
 ./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
 `)
 
-	//         with self.data_file('1000G_ref_manifest') as f6:
-	//             m6 = f6.read()
-	//             self.assertEqual(arvados.CollectionReader(m6, self.api_client).manifest_text(normalize=True), m6)
-
-	//         with self.data_file('jlake_manifest') as f7:
-	//             m7 = f7.read()
-	//             self.assertEqual(arvados.CollectionReader(m7, self.api_client).manifest_text(normalize=True), m7)
-
 	m8 := Manifest{Text: `./a\040b\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\040world.txt
 `}
 	expectEqual(t, m8.NormalizeManifest(), m8.Text)
@@ -321,4 +313,12 @@ func TestNormalizeManifest(t *testing.T) {
 	m10 := Manifest{Text: ". acbd18db4cc2f85cedef654fccc4a4d8+40 0:10:one 20:10:two 10:10:one 30:10:two\n"}
 	expectEqual(t, m10.ManifestForPath("./two", "./three"), ". acbd18db4cc2f85cedef654fccc4a4d8+40 20:20:three\n")
 
+	m11 := Manifest{Text: arvadostest.PathologicalManifest}
+	expectEqual(t, m11.NormalizeManifest(), `. acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 73feffa4b7f6bb68e44cf984c85f6e88+3+Z+K@xyzzy d41d8cd98f00b204e9800998ecf8427e+0 0:1:f 1:4:ooba 5:1:r 5:4:rbaz 9:0:zero@0 9:0:zero@1 9:0:zero@4 9:0:zero@9
+./foo acbd18db4cc2f85cedef654fccc4a4d8+3 d41d8cd98f00b204e9800998ecf8427e+0 0:3:foo 0:3:foo 3:0:zero
+./foo\040bar acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:baz 0:3:baz\040waz
+./overlapReverse acbd18db4cc2f85cedef654fccc4a4d8+3 2:1:o 2:1:ofoo 0:3:ofoo 1:2:oo
+./segmented acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 d41d8cd98f00b204e9800998ecf8427e+0 0:1:frob 5:1:frob 1:1:frob 6:0:frob 3:1:frob 1:2:oof 0:1:oof
+`)
+
 }

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list