[arvados] created: 2.7.0-6425-g4534a1b7f4

git repository hosting git at public.arvados.org
Sat Apr 20 17:54:37 UTC 2024


        at  4534a1b7f48538344959baf56bf4464b6098e82f (commit)


commit 4534a1b7f48538344959baf56bf4464b6098e82f
Author: Tom Clegg <tom at curii.com>
Date:   Sat Apr 20 13:50:45 2024 -0400

    21696: Add test for non-contiguous file data segments.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index e02ef9ee3e..aa4aa682c5 100644
--- a/sdk/go/arvados/fs_collection_test.go
+++ b/sdk/go/arvados/fs_collection_test.go
@@ -1643,17 +1643,33 @@ func (s *CollectionFSUnitSuite) TestLargeManifest_ManyFiles(c *check.C) {
 	if testing.Short() {
 		c.Skip("slow")
 	}
-	s.testLargeManifest(c, 512, 512, 1)
+	s.testLargeManifest(c, 512, 512, 1, 0)
 }
 
 func (s *CollectionFSUnitSuite) TestLargeManifest_LargeFiles(c *check.C) {
 	if testing.Short() {
 		c.Skip("slow")
 	}
-	s.testLargeManifest(c, 1, 800, 1000)
+	s.testLargeManifest(c, 1, 800, 1000, 0)
 }
 
-func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPerDir, blocksPerFile int) {
+func (s *CollectionFSUnitSuite) TestLargeManifest_InterleavedFiles(c *check.C) {
+	if testing.Short() {
+		c.Skip("slow")
+	}
+	// Timing figures here are from a dev host, before->after
+	// commit 353246cb47b60d485aa32469edba2b7aa0b7049f
+	s.testLargeManifest(c, 1, 800, 100, 4<<20) // 127s -> 12s
+	// s.testLargeManifest(c, 1, 50, 1000, 4<<20) // 44s -> 10s
+	// s.testLargeManifest(c, 1, 200, 100, 4<<20) // 13s -> 4s
+	// s.testLargeManifest(c, 1, 200, 150, 4<<20) // 26s -> 4s
+	// s.testLargeManifest(c, 1, 200, 200, 4<<20) // 38s -> 6s
+	// s.testLargeManifest(c, 1, 200, 225, 4<<20) // 46s -> 7s
+	// s.testLargeManifest(c, 1, 400, 400, 4<<20) // 477s -> 24s
+	// s.testLargeManifest(c, 1, 800, 1000, 4<<20) // timeout -> 186s
+}
+
+func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPerDir, blocksPerFile, interleaveChunk int) {
 	const blksize = 1 << 26
 	c.Logf("%s building manifest with dirCount=%d filesPerDir=%d blocksPerFile=%d", time.Now(), dirCount, filesPerDir, blocksPerFile)
 	mb := bytes.NewBuffer(make([]byte, 0, 40000000))
@@ -1667,7 +1683,18 @@ func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPer
 			}
 		}
 		for j := 0; j < filesPerDir; j++ {
-			fmt.Fprintf(mb, " %d:%d:dir%d/file%d", (filesPerDir-j-1)*blocksPerFile*blksize, blocksPerFile*blksize, j, j)
+			if interleaveChunk == 0 {
+				fmt.Fprintf(mb, " %d:%d:dir%d/file%d", (filesPerDir-j-1)*blocksPerFile*blksize, blocksPerFile*blksize, j, j)
+				continue
+			}
+			for todo := int64(blocksPerFile) * int64(blksize); todo > 0; todo -= int64(interleaveChunk) {
+				size := int64(interleaveChunk)
+				if size > todo {
+					size = todo
+				}
+				offset := rand.Int63n(int64(blocksPerFile)*int64(blksize)*int64(filesPerDir) - size)
+				fmt.Fprintf(mb, " %d:%d:dir%d/file%d", offset, size, j, j)
+			}
 		}
 		mb.Write([]byte{'\n'})
 	}

commit 353246cb47b60d485aa32469edba2b7aa0b7049f
Author: Tom Clegg <tom at curii.com>
Date:   Sat Apr 20 13:03:17 2024 -0400

    21696: Fix slow loading of non-contiguous file data segments.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 7903b4737c..164559fbab 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -1358,6 +1358,10 @@ func (dn *dirnode) loadManifest(txt string) error {
 	}
 	streams = streams[:len(streams)-1]
 	segments := []storedSegment{}
+	// streamoffset[n] is the position in the stream of the nth
+	// block, i.e., ∑ segments[j].size ∀ 0≤j<n. We ensure
+	// len(streamoffset) == len(segments) + 1.
+	streamoffset := []int64{0}
 	// To reduce allocs, we reuse a single "pathparts" slice
 	// (pre-split on "/" separators) for the duration of this
 	// func.
@@ -1386,9 +1390,9 @@ func (dn *dirnode) loadManifest(txt string) error {
 	for i, stream := range streams {
 		lineno := i + 1
 		var anyFileTokens bool
-		var pos int64
 		var segIdx int
 		segments = segments[:0]
+		streamoffset = streamoffset[:1]
 		pathparts = nil
 		streamparts := 0
 		for i, token := range bytes.Split(stream, []byte{' '}) {
@@ -1408,6 +1412,7 @@ func (dn *dirnode) loadManifest(txt string) error {
 				if err != nil || length < 0 {
 					return fmt.Errorf("line %d: bad locator %q", lineno, token)
 				}
+				streamoffset = append(streamoffset, streamoffset[len(segments)]+int64(length))
 				segments = append(segments, storedSegment{
 					locator: string(token),
 					size:    int(length),
@@ -1450,30 +1455,37 @@ func (dn *dirnode) loadManifest(txt string) error {
 			// Map the stream offset/range coordinates to
 			// block/offset/range coordinates and add
 			// corresponding storedSegments to the filenode
-			if pos > offset {
-				// Can't continue where we left off.
-				// TODO: binary search instead of
-				// rewinding all the way (but this
-				// situation might be rare anyway)
-				segIdx, pos = 0, 0
+			if segIdx < len(segments) && streamoffset[segIdx] <= offset && streamoffset[segIdx+1] > offset {
+				// common case with an easy
+				// optimization: start where the
+				// previous segment ended
+			} else if guess := int(offset >> 26); guess >= 0 && guess < len(segments) && streamoffset[guess] <= offset && streamoffset[guess+1] > offset {
+				// another common case with an easy
+				// optimization: all blocks are 64 MiB
+				// (or close enough)
+				segIdx = guess
+			} else {
+				// general case
+				segIdx = sort.Search(len(segments), func(i int) bool {
+					return streamoffset[i+1] > offset
+				})
 			}
 			for ; segIdx < len(segments); segIdx++ {
+				blkStart := streamoffset[segIdx]
+				if blkStart >= offset+length {
+					break
+				}
 				seg := &segments[segIdx]
-				next := pos + int64(seg.Len())
-				if next <= offset || seg.Len() == 0 {
-					pos = next
+				if seg.size == 0 {
 					continue
 				}
-				if pos >= offset+length {
-					break
-				}
 				var blkOff int
-				if pos < offset {
-					blkOff = int(offset - pos)
+				if blkStart < offset {
+					blkOff = int(offset - blkStart)
 				}
-				blkLen := seg.Len() - blkOff
-				if pos+int64(blkOff+blkLen) > offset+length {
-					blkLen = int(offset + length - pos - int64(blkOff))
+				blkLen := seg.size - blkOff
+				if blkStart+int64(seg.size) > offset+length {
+					blkLen = int(offset + length - blkStart - int64(blkOff))
 				}
 				fnode.appendSegment(storedSegment{
 					kc:      dn.fs,
@@ -1482,14 +1494,9 @@ func (dn *dirnode) loadManifest(txt string) error {
 					offset:  blkOff,
 					length:  blkLen,
 				})
-				if next > offset+length {
-					break
-				} else {
-					pos = next
-				}
 			}
-			if segIdx == len(segments) && pos < offset+length {
-				return fmt.Errorf("line %d: invalid segment in %d-byte stream: %q", lineno, pos, token)
+			if segIdx == len(segments) && streamoffset[segIdx] < offset+length {
+				return fmt.Errorf("line %d: invalid segment in %d-byte stream: %q", lineno, streamoffset[segIdx], token)
 			}
 		}
 		if !anyFileTokens {

commit 37e566e4a6246a1d9ca59489c3be6ec9f494e3b8
Author: Tom Clegg <tom at curii.com>
Date:   Tue Apr 16 14:18:03 2024 -0400

    21696: Fix unnecessary copy-by-value.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 052cc1aa37..7903b4737c 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -1458,7 +1458,7 @@ func (dn *dirnode) loadManifest(txt string) error {
 				segIdx, pos = 0, 0
 			}
 			for ; segIdx < len(segments); segIdx++ {
-				seg := segments[segIdx]
+				seg := &segments[segIdx]
 				next := pos + int64(seg.Len())
 				if next <= offset || seg.Len() == 0 {
 					pos = next

commit 55e05745561fbd409e1aeef155f75ad90846e24b
Author: Tom Clegg <tom at curii.com>
Date:   Tue Apr 16 14:07:43 2024 -0400

    21696: Exercise less efficient code path by mixing up block order.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index 5d9c52c860..e02ef9ee3e 100644
--- a/sdk/go/arvados/fs_collection_test.go
+++ b/sdk/go/arvados/fs_collection_test.go
@@ -1667,7 +1667,7 @@ func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPer
 			}
 		}
 		for j := 0; j < filesPerDir; j++ {
-			fmt.Fprintf(mb, " %d:%d:dir%d/file%d", j*blocksPerFile*blksize, blocksPerFile*blksize, j, j)
+			fmt.Fprintf(mb, " %d:%d:dir%d/file%d", (filesPerDir-j-1)*blocksPerFile*blksize, blocksPerFile*blksize, j, j)
 		}
 		mb.Write([]byte{'\n'})
 	}

commit 2213910357aaccf3797b34e1f821e5ffab0679fe
Author: Tom Clegg <tom at curii.com>
Date:   Mon Apr 15 10:15:57 2024 -0400

    21696: Add "large collection" test with large files.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index a29371b76c..5d9c52c860 100644
--- a/sdk/go/arvados/fs_collection_test.go
+++ b/sdk/go/arvados/fs_collection_test.go
@@ -1639,29 +1639,40 @@ type CollectionFSUnitSuite struct{}
 var _ = check.Suite(&CollectionFSUnitSuite{})
 
 // expect ~2 seconds to load a manifest with 256K files
-func (s *CollectionFSUnitSuite) TestLargeManifest(c *check.C) {
+func (s *CollectionFSUnitSuite) TestLargeManifest_ManyFiles(c *check.C) {
 	if testing.Short() {
 		c.Skip("slow")
 	}
+	s.testLargeManifest(c, 512, 512, 1)
+}
 
-	const (
-		dirCount  = 512
-		fileCount = 512
-	)
+func (s *CollectionFSUnitSuite) TestLargeManifest_LargeFiles(c *check.C) {
+	if testing.Short() {
+		c.Skip("slow")
+	}
+	s.testLargeManifest(c, 1, 800, 1000)
+}
 
+func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPerDir, blocksPerFile int) {
+	const blksize = 1 << 26
+	c.Logf("%s building manifest with dirCount=%d filesPerDir=%d blocksPerFile=%d", time.Now(), dirCount, filesPerDir, blocksPerFile)
 	mb := bytes.NewBuffer(make([]byte, 0, 40000000))
+	blkid := 0
 	for i := 0; i < dirCount; i++ {
 		fmt.Fprintf(mb, "./dir%d", i)
-		for j := 0; j <= fileCount; j++ {
-			fmt.Fprintf(mb, " %032x+42+A%040x@%08x", j, j, j)
+		for j := 0; j < filesPerDir; j++ {
+			for k := 0; k < blocksPerFile; k++ {
+				blkid++
+				fmt.Fprintf(mb, " %032x+%d+A%040x@%08x", blkid, blksize, blkid, blkid)
+			}
 		}
-		for j := 0; j < fileCount; j++ {
-			fmt.Fprintf(mb, " %d:%d:dir%d/file%d", j*42+21, 42, j, j)
+		for j := 0; j < filesPerDir; j++ {
+			fmt.Fprintf(mb, " %d:%d:dir%d/file%d", j*blocksPerFile*blksize, blocksPerFile*blksize, j, j)
 		}
 		mb.Write([]byte{'\n'})
 	}
 	coll := Collection{ManifestText: mb.String()}
-	c.Logf("%s built", time.Now())
+	c.Logf("%s built manifest size=%d", time.Now(), mb.Len())
 
 	var memstats runtime.MemStats
 	runtime.ReadMemStats(&memstats)
@@ -1670,17 +1681,27 @@ func (s *CollectionFSUnitSuite) TestLargeManifest(c *check.C) {
 	f, err := coll.FileSystem(NewClientFromEnv(), &keepClientStub{})
 	c.Check(err, check.IsNil)
 	c.Logf("%s loaded", time.Now())
-	c.Check(f.Size(), check.Equals, int64(42*dirCount*fileCount))
+	c.Check(f.Size(), check.Equals, int64(dirCount*filesPerDir*blocksPerFile*blksize))
 
+	// Stat() and OpenFile() each file. This mimics the behavior
+	// of webdav propfind, which opens each file even when just
+	// listing directory entries.
 	for i := 0; i < dirCount; i++ {
-		for j := 0; j < fileCount; j++ {
-			f.Stat(fmt.Sprintf("./dir%d/dir%d/file%d", i, j, j))
+		for j := 0; j < filesPerDir; j++ {
+			fnm := fmt.Sprintf("./dir%d/dir%d/file%d", i, j, j)
+			fi, err := f.Stat(fnm)
+			c.Assert(err, check.IsNil)
+			c.Check(fi.IsDir(), check.Equals, false)
+			f, err := f.OpenFile(fnm, os.O_RDONLY, 0)
+			c.Assert(err, check.IsNil)
+			f.Close()
 		}
 	}
-	c.Logf("%s Stat() x %d", time.Now(), dirCount*fileCount)
+	c.Logf("%s OpenFile() x %d", time.Now(), dirCount*filesPerDir)
 
 	runtime.ReadMemStats(&memstats)
 	c.Logf("%s Alloc=%d Sys=%d", time.Now(), memstats.Alloc, memstats.Sys)
+	c.Logf("%s MemorySize=%d", time.Now(), f.MemorySize())
 }
 
 // Gocheck boilerplate

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list