[ARVADOS] created: 2.1.0-1337-g34239884a
Git user
git at public.arvados.org
Sat Sep 11 18:17:42 UTC 2021
at 34239884a3eabd19ce02445c5582ba1102bbf4e8 (commit)
commit 34239884a3eabd19ce02445c5582ba1102bbf4e8
Author: Tom Clegg <tom at curii.com>
Date: Sat Sep 11 13:47:15 2021 -0400
18051: Optimizations for collections with millions of files.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 647b2c405..2b5df76ad 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -1041,30 +1041,56 @@ func (dn *dirnode) marshalManifest(ctx context.Context, prefix string) (string,
}
func (dn *dirnode) loadManifest(txt string) error {
- var dirname string
streams := bytes.Split([]byte(txt), []byte{'\n'})
if len(streams[len(streams)-1]) != 0 {
return fmt.Errorf("line %d: no trailing newline", len(streams))
}
streams = streams[:len(streams)-1]
segments := []storedSegment{}
+ // To reduce allocs, we reuse a single "pathparts" slice
+ // (pre-split on "/" separators) for the duration of this
+ // func.
+ var pathparts []string
+ // To reduce allocs, we reuse a single "toks" slice of 3 byte
+ // slices.
+ var toks = make([][]byte, 3)
+ // Similar to bytes.SplitN(token, []byte{c}, 3), but splits
+ // into the toks slice rather than allocating a new one, and
+ // returns the number of toks (1, 2, or 3).
+ splitToToks := func(src []byte, c rune) int {
+ c1 := bytes.IndexRune(src, c)
+ if c1 < 0 {
+ toks[0] = src
+ return 1
+ }
+ toks[0], src = src[:c1], src[c1+1:]
+ c2 := bytes.IndexRune(src, c)
+ if c2 < 0 {
+ toks[1] = src
+ return 2
+ }
+ toks[1], toks[2] = src[:c2], src[c2+1:]
+ return 3
+ }
for i, stream := range streams {
lineno := i + 1
var anyFileTokens bool
var pos int64
var segIdx int
segments = segments[:0]
+ pathparts = nil
+ streamparts := 0
for i, token := range bytes.Split(stream, []byte{' '}) {
if i == 0 {
- dirname = manifestUnescape(string(token))
+ pathparts = strings.Split(manifestUnescape(string(token)), "/")
+ streamparts = len(pathparts)
continue
}
if !bytes.ContainsRune(token, ':') {
if anyFileTokens {
return fmt.Errorf("line %d: bad file segment %q", lineno, token)
}
- toks := bytes.SplitN(token, []byte{'+'}, 3)
- if len(toks) < 2 {
+ if splitToToks(token, '+') < 2 {
return fmt.Errorf("line %d: bad locator %q", lineno, token)
}
length, err := strconv.ParseInt(string(toks[1]), 10, 32)
@@ -1081,9 +1107,7 @@ func (dn *dirnode) loadManifest(txt string) error {
} else if len(segments) == 0 {
return fmt.Errorf("line %d: bad locator %q", lineno, token)
}
-
- toks := bytes.SplitN(token, []byte{':'}, 3)
- if len(toks) != 3 {
+ if splitToToks(token, ':') != 3 {
return fmt.Errorf("line %d: bad file segment %q", lineno, token)
}
anyFileTokens = true
@@ -1096,8 +1120,13 @@ func (dn *dirnode) loadManifest(txt string) error {
if err != nil || length < 0 {
return fmt.Errorf("line %d: bad file segment %q", lineno, token)
}
- name := dirname + "/" + manifestUnescape(string(toks[2]))
- fnode, err := dn.createFileAndParents(name)
+ if !bytes.ContainsAny(toks[2], `\/`) {
+ // optimization for a common case
+ pathparts = append(pathparts[:streamparts], string(toks[2]))
+ } else {
+ pathparts = append(pathparts[:streamparts], strings.Split(manifestUnescape(string(toks[2])), "/")...)
+ }
+ fnode, err := dn.createFileAndParents(pathparts)
if fnode == nil && err == nil && length == 0 {
// Special case: an empty file used as
// a marker to preserve an otherwise
@@ -1105,7 +1134,7 @@ func (dn *dirnode) loadManifest(txt string) error {
continue
}
if err != nil || (fnode == nil && length != 0) {
- return fmt.Errorf("line %d: cannot use path %q with length %d: %s", lineno, name, length, err)
+ return fmt.Errorf("line %d: cannot use name %q with length %d: %s", lineno, toks[2], length, err)
}
// Map the stream offset/range coordinates to
// block/offset/range coordinates and add
@@ -1156,7 +1185,7 @@ func (dn *dirnode) loadManifest(txt string) error {
return fmt.Errorf("line %d: no file segments", lineno)
} else if len(segments) == 0 {
return fmt.Errorf("line %d: no locators", lineno)
- } else if dirname == "" {
+ } else if streamparts == 0 {
return fmt.Errorf("line %d: no stream name", lineno)
}
}
@@ -1167,9 +1196,11 @@ func (dn *dirnode) loadManifest(txt string) error {
//
// If path is a "parent directory exists" marker (the last path
// component is "."), the returned values are both nil.
-func (dn *dirnode) createFileAndParents(path string) (fn *filenode, err error) {
+//
+// Newly added nodes have modtime==0. Caller is responsible for fixing
+// them with backdateTree.
+func (dn *dirnode) createFileAndParents(names []string) (fn *filenode, err error) {
var node inode = dn
- names := strings.Split(path, "/")
basename := names[len(names)-1]
for _, name := range names[:len(names)-1] {
switch name {
@@ -1183,12 +1214,12 @@ func (dn *dirnode) createFileAndParents(path string) (fn *filenode, err error) {
node = node.Parent()
continue
}
- modtime := node.Parent().FileInfo().ModTime()
node.Lock()
- locked := node
+ unlock := node.Unlock
node, err = node.Child(name, func(child inode) (inode, error) {
if child == nil {
- child, err := node.FS().newNode(name, 0755|os.ModeDir, modtime)
+ // note modtime will be fixed later in backdateTree()
+ child, err := node.FS().newNode(name, 0755|os.ModeDir, time.Time{})
if err != nil {
return nil, err
}
@@ -1200,7 +1231,7 @@ func (dn *dirnode) createFileAndParents(path string) (fn *filenode, err error) {
return child, nil
}
})
- locked.Unlock()
+ unlock()
if err != nil {
return
}
@@ -1208,16 +1239,15 @@ func (dn *dirnode) createFileAndParents(path string) (fn *filenode, err error) {
if basename == "." {
return
} else if !permittedName(basename) {
- err = fmt.Errorf("invalid file part %q in path %q", basename, path)
+ err = fmt.Errorf("invalid file part %q in path %q", basename, names)
return
}
- modtime := node.FileInfo().ModTime()
node.Lock()
defer node.Unlock()
_, err = node.Child(basename, func(child inode) (inode, error) {
switch child := child.(type) {
case nil:
- child, err = node.FS().newNode(basename, 0755, modtime)
+ child, err = node.FS().newNode(basename, 0755, time.Time{})
if err != nil {
return nil, err
}
commit 6ded063537e1d035f5484a45b7fb089239424882
Author: Tom Clegg <tom at curii.com>
Date: Fri Sep 10 11:24:52 2021 -0400
18051: Speed up manifest parsing with bytes.Split.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 4d9db421f..647b2c405 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -5,6 +5,7 @@
package arvados
import (
+ "bytes"
"context"
"encoding/json"
"fmt"
@@ -1041,8 +1042,8 @@ func (dn *dirnode) marshalManifest(ctx context.Context, prefix string) (string,
func (dn *dirnode) loadManifest(txt string) error {
var dirname string
- streams := strings.Split(txt, "\n")
- if streams[len(streams)-1] != "" {
+ streams := bytes.Split([]byte(txt), []byte{'\n'})
+ if len(streams[len(streams)-1]) != 0 {
return fmt.Errorf("line %d: no trailing newline", len(streams))
}
streams = streams[:len(streams)-1]
@@ -1053,25 +1054,25 @@ func (dn *dirnode) loadManifest(txt string) error {
var pos int64
var segIdx int
segments = segments[:0]
- for i, token := range strings.Split(stream, " ") {
+ for i, token := range bytes.Split(stream, []byte{' '}) {
if i == 0 {
- dirname = manifestUnescape(token)
+ dirname = manifestUnescape(string(token))
continue
}
- if !strings.Contains(token, ":") {
+ if !bytes.ContainsRune(token, ':') {
if anyFileTokens {
return fmt.Errorf("line %d: bad file segment %q", lineno, token)
}
- toks := strings.SplitN(token, "+", 3)
+ toks := bytes.SplitN(token, []byte{'+'}, 3)
if len(toks) < 2 {
return fmt.Errorf("line %d: bad locator %q", lineno, token)
}
- length, err := strconv.ParseInt(toks[1], 10, 32)
+ length, err := strconv.ParseInt(string(toks[1]), 10, 32)
if err != nil || length < 0 {
return fmt.Errorf("line %d: bad locator %q", lineno, token)
}
segments = append(segments, storedSegment{
- locator: token,
+ locator: string(token),
size: int(length),
offset: 0,
length: int(length),
@@ -1081,21 +1082,21 @@ func (dn *dirnode) loadManifest(txt string) error {
return fmt.Errorf("line %d: bad locator %q", lineno, token)
}
- toks := strings.SplitN(token, ":", 3)
+ toks := bytes.SplitN(token, []byte{':'}, 3)
if len(toks) != 3 {
return fmt.Errorf("line %d: bad file segment %q", lineno, token)
}
anyFileTokens = true
- offset, err := strconv.ParseInt(toks[0], 10, 64)
+ offset, err := strconv.ParseInt(string(toks[0]), 10, 64)
if err != nil || offset < 0 {
return fmt.Errorf("line %d: bad file segment %q", lineno, token)
}
- length, err := strconv.ParseInt(toks[1], 10, 64)
+ length, err := strconv.ParseInt(string(toks[1]), 10, 64)
if err != nil || length < 0 {
return fmt.Errorf("line %d: bad file segment %q", lineno, token)
}
- name := dirname + "/" + manifestUnescape(toks[2])
+ name := dirname + "/" + manifestUnescape(string(toks[2]))
fnode, err := dn.createFileAndParents(name)
if fnode == nil && err == nil && length == 0 {
// Special case: an empty file used as
commit 88c34bf1acd79b727c052d172f9f13e48d73f824
Author: Tom Clegg <tom at curii.com>
Date: Fri Sep 10 11:24:26 2021 -0400
18051: Add manifest-parsing / fs-building benchmark.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index c032b0716..beb4d61fc 100644
--- a/sdk/go/arvados/fs_collection_test.go
+++ b/sdk/go/arvados/fs_collection_test.go
@@ -1433,6 +1433,31 @@ func (s *CollectionFSSuite) TestEdgeCaseManifests(c *check.C) {
}
}
+var bigmanifest = func() string {
+ var buf bytes.Buffer
+ for i := 0; i < 2000; i++ {
+ fmt.Fprintf(&buf, "./dir%d", i)
+ for i := 0; i < 100; i++ {
+ fmt.Fprintf(&buf, " d41d8cd98f00b204e9800998ecf8427e+99999")
+ }
+ for i := 0; i < 2000; i++ {
+ fmt.Fprintf(&buf, " 1200000:300000:file%d", i)
+ }
+ fmt.Fprintf(&buf, "\n")
+ }
+ return buf.String()
+}()
+
+func (s *CollectionFSSuite) BenchmarkParseManifest(c *check.C) {
+ DebugLocksPanicMode = false
+ c.Logf("test manifest is %d bytes", len(bigmanifest))
+ for i := 0; i < c.N; i++ {
+ fs, err := (&Collection{ManifestText: bigmanifest}).FileSystem(s.client, s.kc)
+ c.Check(err, check.IsNil)
+ c.Check(fs, check.NotNil)
+ }
+}
+
func (s *CollectionFSSuite) checkMemSize(c *check.C, f File) {
fn := f.(*filehandle).inode.(*filenode)
var memsize int64
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list