[ARVADOS] updated: b3fb8f097b06ab96c17078cfd24dfd36fd8ca6f3
Git user
git at public.curoverse.com
Tue Feb 7 18:52:38 EST 2017
Summary of changes:
sdk/go/manifest/manifest.go | 79 +++++++++++++++++++---------------------
sdk/go/manifest/manifest_test.go | 16 ++++----
2 files changed, 45 insertions(+), 50 deletions(-)
via b3fb8f097b06ab96c17078cfd24dfd36fd8ca6f3 (commit)
via da3ed065bf8f2a09d3c09e4b67592d6ff5ffec12 (commit)
from 559f3e22645f92ba9d5b13c5a55bf495f38e9b8e (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit b3fb8f097b06ab96c17078cfd24dfd36fd8ca6f3
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Feb 7 18:47:17 2017 -0500
9397: Make FileSegmentIterByName channel buffered so it isn't necessary to do a
goroutine context switch on every segment.
diff --git a/sdk/go/manifest/manifest.go b/sdk/go/manifest/manifest.go
index 0441731..3bd855f 100644
--- a/sdk/go/manifest/manifest.go
+++ b/sdk/go/manifest/manifest.go
@@ -129,7 +129,7 @@ func parseFileStreamSegment(tok string) (ft FileStreamSegment, err error) {
}
func (s *ManifestStream) FileSegmentIterByName(filepath string) <-chan *FileSegment {
- ch := make(chan *FileSegment)
+ ch := make(chan *FileSegment, 64)
go func() {
s.sendFileSegmentIterByName(filepath, ch)
close(ch)
@@ -478,7 +478,7 @@ func (m *Manifest) StreamIter() <-chan ManifestStream {
}
func (m *Manifest) FileSegmentIterByName(filepath string) <-chan *FileSegment {
- ch := make(chan *FileSegment)
+ ch := make(chan *FileSegment, 64)
if !strings.HasPrefix(filepath, "./") {
filepath = "./" + filepath
}
commit da3ed065bf8f2a09d3c09e4b67592d6ff5ffec12
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Feb 7 17:47:57 2017 -0500
9397: Use binary search to find the first block. Store BlockOffsets efficiently.
diff --git a/sdk/go/manifest/manifest.go b/sdk/go/manifest/manifest.go
index d1c95b7..0441731 100644
--- a/sdk/go/manifest/manifest.go
+++ b/sdk/go/manifest/manifest.go
@@ -43,13 +43,11 @@ type FileStreamSegment struct {
Name string
}
-type BlockRange struct{ Begin, End int64 }
-
// Represents a single line from a manifest.
type ManifestStream struct {
StreamName string
Blocks []string
- BlockRanges []BlockRange
+ BlockOffsets []uint64
FileStreamSegments []FileStreamSegment
Err error
}
@@ -139,15 +137,15 @@ func (s *ManifestStream) FileSegmentIterByName(filepath string) <-chan *FileSegm
return ch
}
-func (s *ManifestStream) FirstBlock(range_start int64) int64 {
+func (s *ManifestStream) FirstBlock(range_start uint64) int {
// range_start/block_start is the inclusive lower bound
// range_end/block_end is the exclusive upper bound
- hi := int64(len(s.BlockRanges))
- var lo int64
+ hi := len(s.BlockOffsets) - 1
+ var lo int
i := ((hi + lo) / 2)
- block_start := s.BlockRanges[i].Begin
- block_end := s.BlockRanges[i].End
+ block_start := s.BlockOffsets[i]
+ block_end := s.BlockOffsets[i+1]
// perform a binary search for the first block
// assumes that all of the blocks are contiguous, so range_start is guaranteed
@@ -162,15 +160,14 @@ func (s *ManifestStream) FirstBlock(range_start int64) int64 {
} else {
hi = i
i = ((hi + lo) / 2)
- block_start = s.BlockRanges[i].Begin
- block_end = s.BlockRanges[i].End
+ block_start = s.BlockOffsets[i]
+ block_end = s.BlockOffsets[i+1]
}
}
return i
}
func (s *ManifestStream) sendFileSegmentIterByName(filepath string, ch chan<- *FileSegment) {
- blockLens := make([]int, 0, len(s.Blocks))
// This is what streamName+"/"+fileName will look like:
target := filepath
if !strings.HasPrefix(target, "./") {
@@ -188,43 +185,41 @@ func (s *ManifestStream) sendFileSegmentIterByName(filepath string, ch chan<- *F
ch <- &FileSegment{Locator: "d41d8cd98f00b204e9800998ecf8427e+0", Offset: 0, Len: 0}
continue
}
- // Linear search for blocks containing data for this
- // file
- var blockPos uint64 = 0 // position of block in stream
- for i, loc := range s.Blocks {
- if blockPos >= wantPos+wantLen {
+
+ // Binary search to determine first block in the stream
+ i := s.FirstBlock(wantPos)
+ if i == -1 {
+ // error
+ break
+ }
+ for i < len(s.Blocks) {
+ blockPos := s.BlockOffsets[i]
+ blockEnd := s.BlockOffsets[i+1]
+ if blockEnd <= wantPos {
+ // current block comes before current file span
+ // (shouldn't happen, FirstBlock() should start us
+ // on the right block)
break
}
- if len(blockLens) <= i {
- blockLens = blockLens[:i+1]
- b, err := ParseBlockLocator(loc)
- if err != nil {
- // Unparseable locator -> unusable
- // stream.
- ch <- nil
- return
- }
- blockLens[i] = b.Size
- }
- blockLen := uint64(blockLens[i])
- if blockPos+blockLen <= wantPos {
- blockPos += blockLen
- continue
+ if blockPos >= wantPos+wantLen {
+ // current block comes after current file span
+ break
}
+
fseg := FileSegment{
- Locator: loc,
+ Locator: s.Blocks[i],
Offset: 0,
- Len: blockLens[i],
+ Len: int(blockEnd - blockPos),
}
if blockPos < wantPos {
fseg.Offset = int(wantPos - blockPos)
fseg.Len -= fseg.Offset
}
- if blockPos+blockLen > wantPos+wantLen {
+ if blockEnd > wantPos+wantLen {
fseg.Len = int(wantPos+wantLen-blockPos) - fseg.Offset
}
ch <- &fseg
- blockPos += blockLen
+ i += 1
}
}
}
@@ -253,18 +248,18 @@ func parseManifestStream(s string) (m ManifestStream) {
return
}
- m.BlockRanges = make([]BlockRange, len(m.Blocks))
- var streamoffset int64
+ m.BlockOffsets = make([]uint64, len(m.Blocks)+1)
+ var streamoffset uint64
for i, b := range m.Blocks {
bl, err := ParseBlockLocator(b)
if err != nil {
m.Err = err
return
}
- m.BlockRange[i].Begin = streamoffset
- m.BlockRange[i].End = streamoffset + int64(bl.Size)
- streamoffset = m.BlockRange[i].End
+ m.BlockOffsets[i] = streamoffset
+ streamoffset += uint64(bl.Size)
}
+ m.BlockOffsets[len(m.Blocks)] = streamoffset
if len(fileTokens) == 0 {
m.Err = fmt.Errorf("No file tokens found")
diff --git a/sdk/go/manifest/manifest_test.go b/sdk/go/manifest/manifest_test.go
index fe83a73..f9dda3e 100644
--- a/sdk/go/manifest/manifest_test.go
+++ b/sdk/go/manifest/manifest_test.go
@@ -303,14 +303,6 @@ func TestNormalizeManifest(t *testing.T) {
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
`)
- // with self.data_file('1000G_ref_manifest') as f6:
- // m6 = f6.read()
- // self.assertEqual(arvados.CollectionReader(m6, self.api_client).manifest_text(normalize=True), m6)
-
- // with self.data_file('jlake_manifest') as f7:
- // m7 = f7.read()
- // self.assertEqual(arvados.CollectionReader(m7, self.api_client).manifest_text(normalize=True), m7)
-
m8 := Manifest{Text: `./a\040b\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\040world.txt
`}
expectEqual(t, m8.NormalizeManifest(), m8.Text)
@@ -321,4 +313,12 @@ func TestNormalizeManifest(t *testing.T) {
m10 := Manifest{Text: ". acbd18db4cc2f85cedef654fccc4a4d8+40 0:10:one 20:10:two 10:10:one 30:10:two\n"}
expectEqual(t, m10.ManifestForPath("./two", "./three"), ". acbd18db4cc2f85cedef654fccc4a4d8+40 20:20:three\n")
+ m11 := Manifest{Text: arvadostest.PathologicalManifest}
+ expectEqual(t, m11.NormalizeManifest(), `. acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 73feffa4b7f6bb68e44cf984c85f6e88+3+Z+K@xyzzy d41d8cd98f00b204e9800998ecf8427e+0 0:1:f 1:4:ooba 5:1:r 5:4:rbaz 9:0:zero@0 9:0:zero@1 9:0:zero@4 9:0:zero@9
+./foo acbd18db4cc2f85cedef654fccc4a4d8+3 d41d8cd98f00b204e9800998ecf8427e+0 0:3:foo 0:3:foo 3:0:zero
+./foo\040bar acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:baz 0:3:baz\040waz
+./overlapReverse acbd18db4cc2f85cedef654fccc4a4d8+3 2:1:o 2:1:ofoo 0:3:ofoo 1:2:oo
+./segmented acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 d41d8cd98f00b204e9800998ecf8427e+0 0:1:frob 5:1:frob 1:1:frob 6:0:frob 3:1:frob 1:2:oof 0:1:oof
+`)
+
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list