[arvados] updated: 2.7.0-6425-g386c2d7358

git repository hosting git at public.arvados.org
Sat Apr 20 17:11:39 UTC 2024


Summary of changes:
 sdk/go/arvados/fs_collection.go      | 59 ++++++++++++++++++++----------------
 sdk/go/arvados/fs_collection_test.go | 34 ++++++++++++++++++---
 2 files changed, 63 insertions(+), 30 deletions(-)

       via  386c2d7358cd20a693a9cfd34d15e5a2ac406760 (commit)
       via  353246cb47b60d485aa32469edba2b7aa0b7049f (commit)
       via  37e566e4a6246a1d9ca59489c3be6ec9f494e3b8 (commit)
       via  55e05745561fbd409e1aeef155f75ad90846e24b (commit)
      from  2213910357aaccf3797b34e1f821e5ffab0679fe (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 386c2d7358cd20a693a9cfd34d15e5a2ac406760
Author: Tom Clegg <tom at curii.com>
Date:   Sat Apr 20 13:10:35 2024 -0400

    21696: Add test for non-contiguous file data segments.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index e02ef9ee3e..367e67c0c3 100644
--- a/sdk/go/arvados/fs_collection_test.go
+++ b/sdk/go/arvados/fs_collection_test.go
@@ -1643,17 +1643,32 @@ func (s *CollectionFSUnitSuite) TestLargeManifest_ManyFiles(c *check.C) {
 	if testing.Short() {
 		c.Skip("slow")
 	}
-	s.testLargeManifest(c, 512, 512, 1)
+	s.testLargeManifest(c, 512, 512, 1, 0)
 }
 
 func (s *CollectionFSUnitSuite) TestLargeManifest_LargeFiles(c *check.C) {
 	if testing.Short() {
 		c.Skip("slow")
 	}
-	s.testLargeManifest(c, 1, 800, 1000)
+	s.testLargeManifest(c, 1, 800, 1000, 0)
 }
 
-func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPerDir, blocksPerFile int) {
+func (s *CollectionFSUnitSuite) TestLargeManifest_InterleavedFiles(c *check.C) {
+	if testing.Short() {
+		c.Skip("slow")
+	}
+	// Timing figures here are from a dev host, before->after
+	// commit 353246cb47b60d485aa32469edba2b7aa0b7049f
+	s.testLargeManifest(c, 1, 50, 1000, 4<<20) // 44s -> 10s
+	// s.testLargeManifest(c, 1, 200, 100, 4<<20) // 13s -> 4s
+	// s.testLargeManifest(c, 1, 200, 150, 4<<20) // 26s -> 4s
+	// s.testLargeManifest(c, 1, 200, 200, 4<<20) // 38s -> 6s
+	// s.testLargeManifest(c, 1, 200, 225, 4<<20) // 46s -> 7s
+	// s.testLargeManifest(c, 1, 400, 400, 4<<20) // 477s -> 24s
+	// s.testLargeManifest(c, 1, 800, 1000, 4<<20) // timeout -> 186s
+}
+
+func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPerDir, blocksPerFile, interleaveChunk int) {
 	const blksize = 1 << 26
 	c.Logf("%s building manifest with dirCount=%d filesPerDir=%d blocksPerFile=%d", time.Now(), dirCount, filesPerDir, blocksPerFile)
 	mb := bytes.NewBuffer(make([]byte, 0, 40000000))
@@ -1667,7 +1682,18 @@ func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPer
 			}
 		}
 		for j := 0; j < filesPerDir; j++ {
-			fmt.Fprintf(mb, " %d:%d:dir%d/file%d", (filesPerDir-j-1)*blocksPerFile*blksize, blocksPerFile*blksize, j, j)
+			if interleaveChunk == 0 {
+				fmt.Fprintf(mb, " %d:%d:dir%d/file%d", (filesPerDir-j-1)*blocksPerFile*blksize, blocksPerFile*blksize, j, j)
+				continue
+			}
+			for todo := int64(blocksPerFile) * int64(blksize); todo > 0; todo -= int64(interleaveChunk) {
+				size := int64(interleaveChunk)
+				if size > todo {
+					size = todo
+				}
+				offset := rand.Int63n(int64(blocksPerFile)*int64(blksize)*int64(filesPerDir) - size)
+				fmt.Fprintf(mb, " %d:%d:dir%d/file%d", offset, size, j, j)
+			}
 		}
 		mb.Write([]byte{'\n'})
 	}

commit 353246cb47b60d485aa32469edba2b7aa0b7049f
Author: Tom Clegg <tom at curii.com>
Date:   Sat Apr 20 13:03:17 2024 -0400

    21696: Fix slow loading of non-contiguous file data segments.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 7903b4737c..164559fbab 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -1358,6 +1358,10 @@ func (dn *dirnode) loadManifest(txt string) error {
 	}
 	streams = streams[:len(streams)-1]
 	segments := []storedSegment{}
+	// streamoffset[n] is the position in the stream of the nth
+	// block, i.e., ∑ segments[j].size ∀ 0≤j<n. We ensure
+	// len(streamoffset) == len(segments) + 1.
+	streamoffset := []int64{0}
 	// To reduce allocs, we reuse a single "pathparts" slice
 	// (pre-split on "/" separators) for the duration of this
 	// func.
@@ -1386,9 +1390,9 @@ func (dn *dirnode) loadManifest(txt string) error {
 	for i, stream := range streams {
 		lineno := i + 1
 		var anyFileTokens bool
-		var pos int64
 		var segIdx int
 		segments = segments[:0]
+		streamoffset = streamoffset[:1]
 		pathparts = nil
 		streamparts := 0
 		for i, token := range bytes.Split(stream, []byte{' '}) {
@@ -1408,6 +1412,7 @@ func (dn *dirnode) loadManifest(txt string) error {
 				if err != nil || length < 0 {
 					return fmt.Errorf("line %d: bad locator %q", lineno, token)
 				}
+				streamoffset = append(streamoffset, streamoffset[len(segments)]+int64(length))
 				segments = append(segments, storedSegment{
 					locator: string(token),
 					size:    int(length),
@@ -1450,30 +1455,37 @@ func (dn *dirnode) loadManifest(txt string) error {
 			// Map the stream offset/range coordinates to
 			// block/offset/range coordinates and add
 			// corresponding storedSegments to the filenode
-			if pos > offset {
-				// Can't continue where we left off.
-				// TODO: binary search instead of
-				// rewinding all the way (but this
-				// situation might be rare anyway)
-				segIdx, pos = 0, 0
+			if segIdx < len(segments) && streamoffset[segIdx] <= offset && streamoffset[segIdx+1] > offset {
+				// common case with an easy
+				// optimization: start where the
+				// previous segment ended
+			} else if guess := int(offset >> 26); guess >= 0 && guess < len(segments) && streamoffset[guess] <= offset && streamoffset[guess+1] > offset {
+				// another common case with an easy
+				// optimization: all blocks are 64 MiB
+				// (or close enough)
+				segIdx = guess
+			} else {
+				// general case
+				segIdx = sort.Search(len(segments), func(i int) bool {
+					return streamoffset[i+1] > offset
+				})
 			}
 			for ; segIdx < len(segments); segIdx++ {
+				blkStart := streamoffset[segIdx]
+				if blkStart >= offset+length {
+					break
+				}
 				seg := &segments[segIdx]
-				next := pos + int64(seg.Len())
-				if next <= offset || seg.Len() == 0 {
-					pos = next
+				if seg.size == 0 {
 					continue
 				}
-				if pos >= offset+length {
-					break
-				}
 				var blkOff int
-				if pos < offset {
-					blkOff = int(offset - pos)
+				if blkStart < offset {
+					blkOff = int(offset - blkStart)
 				}
-				blkLen := seg.Len() - blkOff
-				if pos+int64(blkOff+blkLen) > offset+length {
-					blkLen = int(offset + length - pos - int64(blkOff))
+				blkLen := seg.size - blkOff
+				if blkStart+int64(seg.size) > offset+length {
+					blkLen = int(offset + length - blkStart - int64(blkOff))
 				}
 				fnode.appendSegment(storedSegment{
 					kc:      dn.fs,
@@ -1482,14 +1494,9 @@ func (dn *dirnode) loadManifest(txt string) error {
 					offset:  blkOff,
 					length:  blkLen,
 				})
-				if next > offset+length {
-					break
-				} else {
-					pos = next
-				}
 			}
-			if segIdx == len(segments) && pos < offset+length {
-				return fmt.Errorf("line %d: invalid segment in %d-byte stream: %q", lineno, pos, token)
+			if segIdx == len(segments) && streamoffset[segIdx] < offset+length {
+				return fmt.Errorf("line %d: invalid segment in %d-byte stream: %q", lineno, streamoffset[segIdx], token)
 			}
 		}
 		if !anyFileTokens {

commit 37e566e4a6246a1d9ca59489c3be6ec9f494e3b8
Author: Tom Clegg <tom at curii.com>
Date:   Tue Apr 16 14:18:03 2024 -0400

    21696: Fix unnecessary copy-by-value.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 052cc1aa37..7903b4737c 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -1458,7 +1458,7 @@ func (dn *dirnode) loadManifest(txt string) error {
 				segIdx, pos = 0, 0
 			}
 			for ; segIdx < len(segments); segIdx++ {
-				seg := segments[segIdx]
+				seg := &segments[segIdx]
 				next := pos + int64(seg.Len())
 				if next <= offset || seg.Len() == 0 {
 					pos = next

commit 55e05745561fbd409e1aeef155f75ad90846e24b
Author: Tom Clegg <tom at curii.com>
Date:   Tue Apr 16 14:07:43 2024 -0400

    21696: Exercise less efficient code path by mixing up block order.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index 5d9c52c860..e02ef9ee3e 100644
--- a/sdk/go/arvados/fs_collection_test.go
+++ b/sdk/go/arvados/fs_collection_test.go
@@ -1667,7 +1667,7 @@ func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPer
 			}
 		}
 		for j := 0; j < filesPerDir; j++ {
-			fmt.Fprintf(mb, " %d:%d:dir%d/file%d", j*blocksPerFile*blksize, blocksPerFile*blksize, j, j)
+			fmt.Fprintf(mb, " %d:%d:dir%d/file%d", (filesPerDir-j-1)*blocksPerFile*blksize, blocksPerFile*blksize, j, j)
 		}
 		mb.Write([]byte{'\n'})
 	}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list