[ARVADOS] created: 2.1.0-1337-g34239884a

Git user git at public.arvados.org
Sat Sep 11 18:17:42 UTC 2021


        at  34239884a3eabd19ce02445c5582ba1102bbf4e8 (commit)


commit 34239884a3eabd19ce02445c5582ba1102bbf4e8
Author: Tom Clegg <tom at curii.com>
Date:   Sat Sep 11 13:47:15 2021 -0400

    18051: Optimizations for collections with millions of files.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 647b2c405..2b5df76ad 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -1041,30 +1041,56 @@ func (dn *dirnode) marshalManifest(ctx context.Context, prefix string) (string,
 }
 
 func (dn *dirnode) loadManifest(txt string) error {
-	var dirname string
 	streams := bytes.Split([]byte(txt), []byte{'\n'})
 	if len(streams[len(streams)-1]) != 0 {
 		return fmt.Errorf("line %d: no trailing newline", len(streams))
 	}
 	streams = streams[:len(streams)-1]
 	segments := []storedSegment{}
+	// To reduce allocs, we reuse a single "pathparts" slice
+	// (pre-split on "/" separators) for the duration of this
+	// func.
+	var pathparts []string
+	// To reduce allocs, we reuse a single "toks" slice of 3 byte
+	// slices.
+	var toks = make([][]byte, 3)
+	// Similar to bytes.SplitN(token, []byte{c}, 3), but splits
+	// into the toks slice rather than allocating a new one, and
+	// returns the number of toks (1, 2, or 3).
+	splitToToks := func(src []byte, c rune) int {
+		c1 := bytes.IndexRune(src, c)
+		if c1 < 0 {
+			toks[0] = src
+			return 1
+		}
+		toks[0], src = src[:c1], src[c1+1:]
+		c2 := bytes.IndexRune(src, c)
+		if c2 < 0 {
+			toks[1] = src
+			return 2
+		}
+		toks[1], toks[2] = src[:c2], src[c2+1:]
+		return 3
+	}
 	for i, stream := range streams {
 		lineno := i + 1
 		var anyFileTokens bool
 		var pos int64
 		var segIdx int
 		segments = segments[:0]
+		pathparts = nil
+		streamparts := 0
 		for i, token := range bytes.Split(stream, []byte{' '}) {
 			if i == 0 {
-				dirname = manifestUnescape(string(token))
+				pathparts = strings.Split(manifestUnescape(string(token)), "/")
+				streamparts = len(pathparts)
 				continue
 			}
 			if !bytes.ContainsRune(token, ':') {
 				if anyFileTokens {
 					return fmt.Errorf("line %d: bad file segment %q", lineno, token)
 				}
-				toks := bytes.SplitN(token, []byte{'+'}, 3)
-				if len(toks) < 2 {
+				if splitToToks(token, '+') < 2 {
 					return fmt.Errorf("line %d: bad locator %q", lineno, token)
 				}
 				length, err := strconv.ParseInt(string(toks[1]), 10, 32)
@@ -1081,9 +1107,7 @@ func (dn *dirnode) loadManifest(txt string) error {
 			} else if len(segments) == 0 {
 				return fmt.Errorf("line %d: bad locator %q", lineno, token)
 			}
-
-			toks := bytes.SplitN(token, []byte{':'}, 3)
-			if len(toks) != 3 {
+			if splitToToks(token, ':') != 3 {
 				return fmt.Errorf("line %d: bad file segment %q", lineno, token)
 			}
 			anyFileTokens = true
@@ -1096,8 +1120,13 @@ func (dn *dirnode) loadManifest(txt string) error {
 			if err != nil || length < 0 {
 				return fmt.Errorf("line %d: bad file segment %q", lineno, token)
 			}
-			name := dirname + "/" + manifestUnescape(string(toks[2]))
-			fnode, err := dn.createFileAndParents(name)
+			if !bytes.ContainsAny(toks[2], `\/`) {
+				// optimization for a common case
+				pathparts = append(pathparts[:streamparts], string(toks[2]))
+			} else {
+				pathparts = append(pathparts[:streamparts], strings.Split(manifestUnescape(string(toks[2])), "/")...)
+			}
+			fnode, err := dn.createFileAndParents(pathparts)
 			if fnode == nil && err == nil && length == 0 {
 				// Special case: an empty file used as
 				// a marker to preserve an otherwise
@@ -1105,7 +1134,7 @@ func (dn *dirnode) loadManifest(txt string) error {
 				continue
 			}
 			if err != nil || (fnode == nil && length != 0) {
-				return fmt.Errorf("line %d: cannot use path %q with length %d: %s", lineno, name, length, err)
+				return fmt.Errorf("line %d: cannot use name %q with length %d: %s", lineno, toks[2], length, err)
 			}
 			// Map the stream offset/range coordinates to
 			// block/offset/range coordinates and add
@@ -1156,7 +1185,7 @@ func (dn *dirnode) loadManifest(txt string) error {
 			return fmt.Errorf("line %d: no file segments", lineno)
 		} else if len(segments) == 0 {
 			return fmt.Errorf("line %d: no locators", lineno)
-		} else if dirname == "" {
+		} else if streamparts == 0 {
 			return fmt.Errorf("line %d: no stream name", lineno)
 		}
 	}
@@ -1167,9 +1196,11 @@ func (dn *dirnode) loadManifest(txt string) error {
 //
 // If path is a "parent directory exists" marker (the last path
 // component is "."), the returned values are both nil.
-func (dn *dirnode) createFileAndParents(path string) (fn *filenode, err error) {
+//
+// Newly added nodes have modtime==0. Caller is responsible for fixing
+// them with backdateTree.
+func (dn *dirnode) createFileAndParents(names []string) (fn *filenode, err error) {
 	var node inode = dn
-	names := strings.Split(path, "/")
 	basename := names[len(names)-1]
 	for _, name := range names[:len(names)-1] {
 		switch name {
@@ -1183,12 +1214,12 @@ func (dn *dirnode) createFileAndParents(path string) (fn *filenode, err error) {
 			node = node.Parent()
 			continue
 		}
-		modtime := node.Parent().FileInfo().ModTime()
 		node.Lock()
-		locked := node
+		unlock := node.Unlock
 		node, err = node.Child(name, func(child inode) (inode, error) {
 			if child == nil {
-				child, err := node.FS().newNode(name, 0755|os.ModeDir, modtime)
+				// note modtime will be fixed later in backdateTree()
+				child, err := node.FS().newNode(name, 0755|os.ModeDir, time.Time{})
 				if err != nil {
 					return nil, err
 				}
@@ -1200,7 +1231,7 @@ func (dn *dirnode) createFileAndParents(path string) (fn *filenode, err error) {
 				return child, nil
 			}
 		})
-		locked.Unlock()
+		unlock()
 		if err != nil {
 			return
 		}
@@ -1208,16 +1239,15 @@ func (dn *dirnode) createFileAndParents(path string) (fn *filenode, err error) {
 	if basename == "." {
 		return
 	} else if !permittedName(basename) {
-		err = fmt.Errorf("invalid file part %q in path %q", basename, path)
+		err = fmt.Errorf("invalid file part %q in path %q", basename, names)
 		return
 	}
-	modtime := node.FileInfo().ModTime()
 	node.Lock()
 	defer node.Unlock()
 	_, err = node.Child(basename, func(child inode) (inode, error) {
 		switch child := child.(type) {
 		case nil:
-			child, err = node.FS().newNode(basename, 0755, modtime)
+			child, err = node.FS().newNode(basename, 0755, time.Time{})
 			if err != nil {
 				return nil, err
 			}

commit 6ded063537e1d035f5484a45b7fb089239424882
Author: Tom Clegg <tom at curii.com>
Date:   Fri Sep 10 11:24:52 2021 -0400

    18051: Speed up manifest parsing with bytes.Split.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 4d9db421f..647b2c405 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -5,6 +5,7 @@
 package arvados
 
 import (
+	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
@@ -1041,8 +1042,8 @@ func (dn *dirnode) marshalManifest(ctx context.Context, prefix string) (string,
 
 func (dn *dirnode) loadManifest(txt string) error {
 	var dirname string
-	streams := strings.Split(txt, "\n")
-	if streams[len(streams)-1] != "" {
+	streams := bytes.Split([]byte(txt), []byte{'\n'})
+	if len(streams[len(streams)-1]) != 0 {
 		return fmt.Errorf("line %d: no trailing newline", len(streams))
 	}
 	streams = streams[:len(streams)-1]
@@ -1053,25 +1054,25 @@ func (dn *dirnode) loadManifest(txt string) error {
 		var pos int64
 		var segIdx int
 		segments = segments[:0]
-		for i, token := range strings.Split(stream, " ") {
+		for i, token := range bytes.Split(stream, []byte{' '}) {
 			if i == 0 {
-				dirname = manifestUnescape(token)
+				dirname = manifestUnescape(string(token))
 				continue
 			}
-			if !strings.Contains(token, ":") {
+			if !bytes.ContainsRune(token, ':') {
 				if anyFileTokens {
 					return fmt.Errorf("line %d: bad file segment %q", lineno, token)
 				}
-				toks := strings.SplitN(token, "+", 3)
+				toks := bytes.SplitN(token, []byte{'+'}, 3)
 				if len(toks) < 2 {
 					return fmt.Errorf("line %d: bad locator %q", lineno, token)
 				}
-				length, err := strconv.ParseInt(toks[1], 10, 32)
+				length, err := strconv.ParseInt(string(toks[1]), 10, 32)
 				if err != nil || length < 0 {
 					return fmt.Errorf("line %d: bad locator %q", lineno, token)
 				}
 				segments = append(segments, storedSegment{
-					locator: token,
+					locator: string(token),
 					size:    int(length),
 					offset:  0,
 					length:  int(length),
@@ -1081,21 +1082,21 @@ func (dn *dirnode) loadManifest(txt string) error {
 				return fmt.Errorf("line %d: bad locator %q", lineno, token)
 			}
 
-			toks := strings.SplitN(token, ":", 3)
+			toks := bytes.SplitN(token, []byte{':'}, 3)
 			if len(toks) != 3 {
 				return fmt.Errorf("line %d: bad file segment %q", lineno, token)
 			}
 			anyFileTokens = true
 
-			offset, err := strconv.ParseInt(toks[0], 10, 64)
+			offset, err := strconv.ParseInt(string(toks[0]), 10, 64)
 			if err != nil || offset < 0 {
 				return fmt.Errorf("line %d: bad file segment %q", lineno, token)
 			}
-			length, err := strconv.ParseInt(toks[1], 10, 64)
+			length, err := strconv.ParseInt(string(toks[1]), 10, 64)
 			if err != nil || length < 0 {
 				return fmt.Errorf("line %d: bad file segment %q", lineno, token)
 			}
-			name := dirname + "/" + manifestUnescape(toks[2])
+			name := dirname + "/" + manifestUnescape(string(toks[2]))
 			fnode, err := dn.createFileAndParents(name)
 			if fnode == nil && err == nil && length == 0 {
 				// Special case: an empty file used as

commit 88c34bf1acd79b727c052d172f9f13e48d73f824
Author: Tom Clegg <tom at curii.com>
Date:   Fri Sep 10 11:24:26 2021 -0400

    18051: Add manifest-parsing / fs-building benchmark.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index c032b0716..beb4d61fc 100644
--- a/sdk/go/arvados/fs_collection_test.go
+++ b/sdk/go/arvados/fs_collection_test.go
@@ -1433,6 +1433,31 @@ func (s *CollectionFSSuite) TestEdgeCaseManifests(c *check.C) {
 	}
 }
 
+var bigmanifest = func() string {
+	var buf bytes.Buffer
+	for i := 0; i < 2000; i++ {
+		fmt.Fprintf(&buf, "./dir%d", i)
+		for i := 0; i < 100; i++ {
+			fmt.Fprintf(&buf, " d41d8cd98f00b204e9800998ecf8427e+99999")
+		}
+		for i := 0; i < 2000; i++ {
+			fmt.Fprintf(&buf, " 1200000:300000:file%d", i)
+		}
+		fmt.Fprintf(&buf, "\n")
+	}
+	return buf.String()
+}()
+
+func (s *CollectionFSSuite) BenchmarkParseManifest(c *check.C) {
+	DebugLocksPanicMode = false
+	c.Logf("test manifest is %d bytes", len(bigmanifest))
+	for i := 0; i < c.N; i++ {
+		fs, err := (&Collection{ManifestText: bigmanifest}).FileSystem(s.client, s.kc)
+		c.Check(err, check.IsNil)
+		c.Check(fs, check.NotNil)
+	}
+}
+
 func (s *CollectionFSSuite) checkMemSize(c *check.C, f File) {
 	fn := f.(*filehandle).inode.(*filenode)
 	var memsize int64

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list