[arvados] updated: 2.7.0-6429-g0ab78942d8

git repository hosting git at public.arvados.org
Sun Apr 21 22:35:39 UTC 2024


Summary of changes:
 sdk/go/arvados/collection.go         | 59 +++++++++++++++++++++++++++---------
 sdk/go/arvados/fs_collection.go      | 29 ++++++++++++------
 sdk/go/arvados/fs_collection_test.go | 25 +++++++++------
 3 files changed, 78 insertions(+), 35 deletions(-)

       via  0ab78942d824ad28b69d695538a80c8800f9a4c2 (commit)
       via  ec90aba4d11becc3505b7948f32b6654255575de (commit)
       via  a9d73c7a0820f4c0997ceaac8143caa0d67e3a65 (commit)
       via  db51f3f783c766492e4b436a752f0ab4e24cdc73 (commit)
      from  4534a1b7f48538344959baf56bf4464b6098e82f (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 0ab78942d824ad28b69d695538a80c8800f9a4c2
Author: Tom Clegg <tom at curii.com>
Date:   Sun Apr 21 18:35:27 2024 -0400

    21696: Comment optimized PortableDataHash func.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/collection.go b/sdk/go/arvados/collection.go
index b157c4c8a8..1e9616c428 100644
--- a/sdk/go/arvados/collection.go
+++ b/sdk/go/arvados/collection.go
@@ -107,27 +107,49 @@ type CollectionList struct {
 // PortableDataHash computes the portable data hash of the given
 // manifest.
 func PortableDataHash(mt string) string {
+	// To calculate the PDH, we write the manifest to an md5 hash
+	// func, except we skip the "extra" part of block tokens that
+	// look like "abcdef0123456789abcdef0123456789+12345+extra".
+	//
+	// This code is simplified by the facts that (A) all block
+	// tokens -- even the first and last in a stream -- are
+	// preceded and followed by a space character; and (B) all
+	// non-block tokens either start with '.'  or contain ':'.
+	//
+	// A regexp-based approach (like the one this replaced) would
+	// be more readable, but very slow.
 	h := md5.New()
 	size := 0
 	todo := []byte(mt)
 	for len(todo) > 0 {
+		// sp is the end of the current token (note that if
+		// the current token is the last file token in a
+		// stream, we'll also include the \n and the dirname
+		// token on the next line, which is perfectly fine for
+		// our purposes).
 		sp := bytes.IndexByte(todo, ' ')
 		if sp < 0 {
+			// Last token of the manifest, which is never
+			// a block token.
 			n, _ := h.Write(todo)
 			size += n
 			break
 		}
-		if sp >= 34 && todo[32] == '+' && bytes.IndexByte(todo[:32], ':') == -1 {
+		if sp >= 34 && todo[32] == '+' && bytes.IndexByte(todo[:32], ':') == -1 && todo[0] != '.' {
+			// todo[:sp] is a block token.
 			sizeend := bytes.IndexByte(todo[33:sp], '+')
 			if sizeend < 0 {
+				// "hash+size"
 				sizeend = sp
 			} else {
+				// "hash+size+extra"
 				sizeend += 33
 			}
 			n, _ := h.Write(todo[:sizeend])
 			h.Write([]byte{' '})
 			size += n + 1
 		} else {
+			// todo[:sp] is not a block token.
 			n, _ := h.Write(todo[:sp+1])
 			size += n
 		}

commit ec90aba4d11becc3505b7948f32b6654255575de
Author: Tom Clegg <tom at curii.com>
Date:   Sun Apr 21 18:16:45 2024 -0400

    21696: Update test case & timing results for ill-packed manifests.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index aa4aa682c5..b57f9aa30f 100644
--- a/sdk/go/arvados/fs_collection_test.go
+++ b/sdk/go/arvados/fs_collection_test.go
@@ -1657,19 +1657,23 @@ func (s *CollectionFSUnitSuite) TestLargeManifest_InterleavedFiles(c *check.C) {
 	if testing.Short() {
 		c.Skip("slow")
 	}
-	// Timing figures here are from a dev host, before->after
-	// commit 353246cb47b60d485aa32469edba2b7aa0b7049f
-	s.testLargeManifest(c, 1, 800, 100, 4<<20) // 127s -> 12s
-	// s.testLargeManifest(c, 1, 50, 1000, 4<<20) // 44s -> 10s
-	// s.testLargeManifest(c, 1, 200, 100, 4<<20) // 13s -> 4s
-	// s.testLargeManifest(c, 1, 200, 150, 4<<20) // 26s -> 4s
-	// s.testLargeManifest(c, 1, 200, 200, 4<<20) // 38s -> 6s
-	// s.testLargeManifest(c, 1, 200, 225, 4<<20) // 46s -> 7s
-	// s.testLargeManifest(c, 1, 400, 400, 4<<20) // 477s -> 24s
-	// s.testLargeManifest(c, 1, 800, 1000, 4<<20) // timeout -> 186s
+	// Timing figures here are from a dev host, (0)->(1)->(2)->(3)
+	// (0) no optimizations (main branch commit ea697fb1e8)
+	// (1) resolve streampos->blkidx with binary search
+	// (2) ...and rewrite PortableDataHash() without regexp
+	// (3) ...and use fnodeCache in loadManifest
+	s.testLargeManifest(c, 1, 800, 100, 4<<20) // 127s    -> 12s  -> 2.5s -> 1.5s
+	s.testLargeManifest(c, 1, 50, 1000, 4<<20) // 44s     -> 10s  -> 1.5s -> 0.8s
+	s.testLargeManifest(c, 1, 200, 100, 4<<20) // 13s     -> 4s   -> 0.6s -> 0.3s
+	s.testLargeManifest(c, 1, 200, 150, 4<<20) // 26s     -> 4s   -> 1s   -> 0.5s
+	s.testLargeManifest(c, 1, 200, 200, 4<<20) // 38s     -> 6s   -> 1.3s -> 0.7s
+	s.testLargeManifest(c, 1, 200, 225, 4<<20) // 46s     -> 7s   -> 1.5s -> 1s
+	s.testLargeManifest(c, 1, 400, 400, 4<<20) // 477s    -> 24s  -> 5s   -> 3s
+	// s.testLargeManifest(c, 1, 800, 1000, 4<<20) // timeout -> 186s -> 28s  -> 17s
 }
 
 func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPerDir, blocksPerFile, interleaveChunk int) {
+	t0 := time.Now()
 	const blksize = 1 << 26
 	c.Logf("%s building manifest with dirCount=%d filesPerDir=%d blocksPerFile=%d", time.Now(), dirCount, filesPerDir, blocksPerFile)
 	mb := bytes.NewBuffer(make([]byte, 0, 40000000))
@@ -1729,6 +1733,7 @@ func (s *CollectionFSUnitSuite) testLargeManifest(c *check.C, dirCount, filesPer
 	runtime.ReadMemStats(&memstats)
 	c.Logf("%s Alloc=%d Sys=%d", time.Now(), memstats.Alloc, memstats.Sys)
 	c.Logf("%s MemorySize=%d", time.Now(), f.MemorySize())
+	c.Logf("%s ... test duration %s", time.Now(), time.Now().Sub(t0))
 }
 
 // Gocheck boilerplate

commit a9d73c7a0820f4c0997ceaac8143caa0d67e3a65
Author: Tom Clegg <tom at curii.com>
Date:   Sun Apr 21 18:09:22 2024 -0400

    21696: Cache subpath->fnode lookups.
    
    Avoids extra locks/lookups when loading non-contiguous chunks.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 164559fbab..101fade74b 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -1389,6 +1389,7 @@ func (dn *dirnode) loadManifest(txt string) error {
 	}
 	for i, stream := range streams {
 		lineno := i + 1
+		fnodeCache := make(map[string]*filenode)
 		var anyFileTokens bool
 		var segIdx int
 		segments = segments[:0]
@@ -1436,22 +1437,30 @@ func (dn *dirnode) loadManifest(txt string) error {
 			if err != nil || length < 0 {
 				return fmt.Errorf("line %d: bad file segment %q", lineno, token)
 			}
-			if !bytes.ContainsAny(toks[2], `\/`) {
-				// optimization for a common case
-				pathparts = append(pathparts[:streamparts], string(toks[2]))
-			} else {
-				pathparts = append(pathparts[:streamparts], strings.Split(manifestUnescape(string(toks[2])), "/")...)
+			fnode, cached := fnodeCache[string(toks[2])]
+			if !cached {
+				if !bytes.ContainsAny(toks[2], `\/`) {
+					// optimization for a common case
+					pathparts = append(pathparts[:streamparts], string(toks[2]))
+				} else {
+					pathparts = append(pathparts[:streamparts], strings.Split(manifestUnescape(string(toks[2])), "/")...)
+				}
+				fnode, err = dn.createFileAndParents(pathparts)
+				if err != nil {
+					return fmt.Errorf("line %d: cannot use name %q with length %d: %s", lineno, toks[2], length, err)
+				}
+				fnodeCache[string(toks[2])] = fnode
 			}
-			fnode, err := dn.createFileAndParents(pathparts)
-			if fnode == nil && err == nil && length == 0 {
+			if fnode == nil {
+				// name matches an existing directory
+				if length != 0 {
+					return fmt.Errorf("line %d: cannot use name %q with length %d: is a directory", lineno, toks[2], length)
+				}
 				// Special case: an empty file used as
 				// a marker to preserve an otherwise
 				// empty directory in a manifest.
 				continue
 			}
-			if err != nil || (fnode == nil && length != 0) {
-				return fmt.Errorf("line %d: cannot use name %q with length %d: %s", lineno, toks[2], length, err)
-			}
 			// Map the stream offset/range coordinates to
 			// block/offset/range coordinates and add
 			// corresponding storedSegments to the filenode

commit db51f3f783c766492e4b436a752f0ab4e24cdc73
Author: Tom Clegg <tom at curii.com>
Date:   Sun Apr 21 18:08:48 2024 -0400

    21696: Speed up PortableDataHash by not using regexp.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/collection.go b/sdk/go/arvados/collection.go
index 389fe4e484..b157c4c8a8 100644
--- a/sdk/go/arvados/collection.go
+++ b/sdk/go/arvados/collection.go
@@ -104,28 +104,35 @@ type CollectionList struct {
 	Limit          int          `json:"limit"`
 }
 
-var (
-	blkRe = regexp.MustCompile(`^ [0-9a-f]{32}\+\d+`)
-	tokRe = regexp.MustCompile(` ?[^ ]*`)
-)
-
 // PortableDataHash computes the portable data hash of the given
 // manifest.
 func PortableDataHash(mt string) string {
 	h := md5.New()
 	size := 0
-	_ = tokRe.ReplaceAllFunc([]byte(mt), func(tok []byte) []byte {
-		if m := blkRe.Find(tok); m != nil {
-			// write hash+size, ignore remaining block hints
-			tok = m
+	todo := []byte(mt)
+	for len(todo) > 0 {
+		sp := bytes.IndexByte(todo, ' ')
+		if sp < 0 {
+			n, _ := h.Write(todo)
+			size += n
+			break
 		}
-		n, err := h.Write(tok)
-		if err != nil {
-			panic(err)
+		if sp >= 34 && todo[32] == '+' && bytes.IndexByte(todo[:32], ':') == -1 {
+			sizeend := bytes.IndexByte(todo[33:sp], '+')
+			if sizeend < 0 {
+				sizeend = sp
+			} else {
+				sizeend += 33
+			}
+			n, _ := h.Write(todo[:sizeend])
+			h.Write([]byte{' '})
+			size += n + 1
+		} else {
+			n, _ := h.Write(todo[:sp+1])
+			size += n
 		}
-		size += n
-		return nil
-	})
+		todo = todo[sp+1:]
+	}
 	return fmt.Sprintf("%x+%d", h.Sum(nil), size)
 }
 

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list