[arvados] updated: 2.7.0-6265-gec0b74da39
git repository hosting
git at public.arvados.org
Tue Apr 30 15:19:34 UTC 2024
Summary of changes:
go.mod | 2 +-
go.sum | 4 +-
lib/crunchrun/copier.go | 88 +++++++++++++++++++++++++++++++++++++++-----
lib/crunchrun/copier_test.go | 76 +++++++++++++++++++++++++++++++++++---
4 files changed, 153 insertions(+), 17 deletions(-)
via ec0b74da392f739d4340b7a9120f2f90f5871f47 (commit)
from a32d74c4179f0b17640c0623daaac240ea9ec38b (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit ec0b74da392f739d4340b7a9120f2f90f5871f47
Author: Tom Clegg <tom at curii.com>
Date: Tue Apr 30 11:18:31 2024 -0400
12430: Avoid loading collections excluded by globs.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/go.mod b/go.mod
index 62a31c1c58..e26a8699a4 100644
--- a/go.mod
+++ b/go.mod
@@ -11,6 +11,7 @@ require (
github.com/arvados/cgofuse v1.2.0-arvados1
github.com/aws/aws-sdk-go v1.44.174
github.com/aws/aws-sdk-go-v2 v0.23.0
+ github.com/bmatcuk/doublestar/v4 v4.6.1
github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092
github.com/coreos/go-oidc/v3 v3.5.0
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e
@@ -65,7 +66,6 @@ require (
github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bgentry/speakeasy v0.1.0 // indirect
- github.com/bmatcuk/doublestar v1.3.4 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dimchansky/utfbom v1.1.1 // indirect
diff --git a/go.sum b/go.sum
index fe6febb043..9d445185fb 100644
--- a/go.sum
+++ b/go.sum
@@ -59,8 +59,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQkY=
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
-github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0=
-github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
+github.com/bmatcuk/doublestar/v4 v4.6.1 h1:FH9SifrbvJhnlQpztAx++wlkk70QBf0iBWDwNy7PA4I=
+github.com/bmatcuk/doublestar/v4 v4.6.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc=
github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092 h1:0Di2onNnlN5PAyWPbqlPyN45eOQ+QW/J9eqLynt4IV4=
github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092/go.mod h1:8IzBjZCRSnsvM6MJMG8HNNtnzMl48H22rbJL2kRUJ0Y=
diff --git a/lib/crunchrun/copier.go b/lib/crunchrun/copier.go
index 8d986d7066..507029b36e 100644
--- a/lib/crunchrun/copier.go
+++ b/lib/crunchrun/copier.go
@@ -18,7 +18,7 @@ import (
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/keepclient"
"git.arvados.org/arvados.git/sdk/go/manifest"
- "github.com/bmatcuk/doublestar"
+ "github.com/bmatcuk/doublestar/v4"
)
type printfer interface {
@@ -115,14 +115,44 @@ func (cp *copier) Copy() (string, error) {
return collfs.MarshalManifest(".")
}
-func (cp *copier) matchGlobs(path string) bool {
+func (cp *copier) matchGlobs(path string, isDir bool) bool {
// An entry in the top level of the output directory looks
// like "/foo", but globs look like "foo", so we strip the
// leading "/" before matching.
path = strings.TrimLeft(path, "/")
for _, glob := range cp.globs {
- if match, _ := doublestar.Match(glob, path); match {
+ if !isDir && strings.HasSuffix(glob, "/**") {
+ // doublestar.Match("f*/**", "ff") and
+ // doublestar.Match("f*/**", "ff/gg") both
+ // return true, but (to be compatible with
+ // bash shopt) "ff" should match only if it is
+ // a directory.
+ //
+ // To avoid errant matches, we add the file's
+ // basename to the end of the pattern:
+ //
+ // Match("f*/**/ff", "ff") => false
+ // Match("f*/**/gg", "ff/gg") => true
+ //
+ // Of course, we need to escape basename in
+ // case it contains *, ?, \, etc.
+ _, name := filepath.Split(path)
+ escapedName := strings.TrimSuffix(strings.Replace(name, "", "\\", -1), "\\")
+ if match, _ := doublestar.Match(glob+"/"+escapedName, path); match {
+ return true
+ }
+ } else if match, _ := doublestar.Match(glob, path); match {
return true
+ } else if isDir {
+ // Workaround doublestar bug (v4.6.1).
+ // "foo*/**" should match "foo", but does not,
+ // because isZeroLengthPattern does not accept
+ // "*/**" as a zero length pattern.
+ if trunc := strings.TrimSuffix(glob, "*/**"); trunc != glob {
+ if match, _ := doublestar.Match(trunc, path); match {
+ return true
+ }
+ }
}
}
return false
@@ -141,7 +171,7 @@ func (cp *copier) applyGlobsToFilesAndDirs() {
}
keepdirs := make(map[string]bool)
for _, path := range cp.dirs {
- if cp.matchGlobs(path) {
+ if cp.matchGlobs(path, true) {
keepdirs[path] = true
}
}
@@ -154,7 +184,7 @@ func (cp *copier) applyGlobsToFilesAndDirs() {
}
var keepfiles []filetodo
for _, file := range cp.files {
- if cp.matchGlobs(file.dst) {
+ if cp.matchGlobs(file.dst, false) {
keepfiles = append(keepfiles, file)
}
}
@@ -182,7 +212,7 @@ func (cp *copier) applyGlobsToCollectionFS(collfs arvados.CollectionFileSystem)
}
include := make(map[string]bool)
err := fs.WalkDir(arvados.FS(collfs), "", func(path string, ent fs.DirEntry, err error) error {
- if cp.matchGlobs(path) {
+ if cp.matchGlobs(path, ent.IsDir()) {
for i, c := range path {
if i > 0 && c == '/' {
include[path[:i]] = true
@@ -213,6 +243,42 @@ func (cp *copier) applyGlobsToCollectionFS(collfs arvados.CollectionFileSystem)
return err
}
+// Return true if it's possible for any descendant of the given path
+// to match anything in cp.globs. Used by walkMount to avoid loading
+// collections that are mounted underneath ctrOutputPath but excluded
+// by globs.
+func (cp *copier) subtreeCouldMatch(path string) bool {
+ if len(cp.globs) == 0 {
+ return true
+ }
+ pathdepth := 1 + strings.Count(path, "/")
+ for _, glob := range cp.globs {
+ globdepth := 0
+ lastsep := 0
+ for i, c := range glob {
+ if c != '/' || !doublestar.ValidatePattern(glob[:i]) {
+ // Escaped "/", or "/" in a character
+ // class, is not a path separator.
+ continue
+ }
+ if glob[lastsep:i] == "**" {
+ return true
+ }
+ lastsep = i + 1
+ if globdepth++; globdepth == pathdepth {
+ if match, _ := doublestar.Match(glob[:i]+"/*", path+"/z"); match {
+ return true
+ }
+ break
+ }
+ }
+ if globdepth < pathdepth && glob[lastsep:] == "**" {
+ return true
+ }
+ }
+ return false
+}
+
func (cp *copier) copyFile(fs arvados.CollectionFileSystem, f filetodo) (int64, error) {
cp.logger.Printf("copying %q (%d bytes)", strings.TrimLeft(f.dst, "/"), f.size)
dst, err := fs.OpenFile(f.dst, os.O_CREATE|os.O_WRONLY, 0666)
@@ -267,9 +333,8 @@ func (cp *copier) walkMount(dest, src string, maxSymlinks int, walkMountsBelow b
// copy, relative to its mount point -- ".", "./foo.txt", ...
srcRelPath := filepath.Join(".", srcMount.Path, src[len(srcRoot):])
- // outputRelPath is the path relative in the output directory
- // that corresponds to the path in the output collection where
- // the file will go, for logging
+ // outputRelPath is the destination path relative to the
+ // output directory. Used for logging and glob matching.
var outputRelPath = ""
if strings.HasPrefix(src, cp.ctrOutputDir) {
outputRelPath = strings.TrimPrefix(src[len(cp.ctrOutputDir):], "/")
@@ -283,6 +348,9 @@ func (cp *copier) walkMount(dest, src string, maxSymlinks int, walkMountsBelow b
switch {
case srcMount.ExcludeFromOutput:
+ case outputRelPath != "*" && !cp.subtreeCouldMatch(outputRelPath):
+ cp.logger.Printf("not copying %q because contents cannot match output globs", outputRelPath)
+ return nil
case srcMount.Kind == "tmp":
// Handle by walking the host filesystem.
return cp.walkHostFS(dest, src, maxSymlinks, walkMountsBelow)
@@ -434,6 +502,8 @@ func (cp *copier) walkHostFS(dest, src string, maxSymlinks int, includeMounts bo
// (...except mount types that are
// handled as regular files.)
continue
+ } else if isMount && !cp.subtreeCouldMatch(src[len(cp.ctrOutputDir)+1:]) {
+ continue
}
err = cp.walkHostFS(dest, src, maxSymlinks, false)
if err != nil {
diff --git a/lib/crunchrun/copier_test.go b/lib/crunchrun/copier_test.go
index a1fc81c716..9deba783b2 100644
--- a/lib/crunchrun/copier_test.go
+++ b/lib/crunchrun/copier_test.go
@@ -214,6 +214,68 @@ func (s *copierSuite) TestWritableMountBelow(c *check.C) {
})
}
+// Check some glob-matching edge cases. In particular, check that
+// patterns like "foo/**" do not match regular files named "foo"
+// (unless of course they are inside a directory named "foo").
+func (s *copierSuite) TestMatchGlobs(c *check.C) {
+ s.cp.globs = []string{"foo*/**"}
+ c.Check(s.cp.matchGlobs("foo", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("food", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("foo", false), check.Equals, false)
+ c.Check(s.cp.matchGlobs("food", false), check.Equals, false)
+
+ s.cp.globs = []string{"ba[!/]/foo*/**"}
+ c.Check(s.cp.matchGlobs("bar/foo", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("bar/food", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("bar/foo", false), check.Equals, false)
+ c.Check(s.cp.matchGlobs("bar/food", false), check.Equals, false)
+ c.Check(s.cp.matchGlobs("bar/foo/z\\[", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("bar/food/z\\[", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("bar/foo/z\\[", false), check.Equals, true)
+ c.Check(s.cp.matchGlobs("bar/food/z\\[", false), check.Equals, true)
+
+ s.cp.globs = []string{"waz/**/foo*/**"}
+ c.Check(s.cp.matchGlobs("waz/quux/foo", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("waz/quux/food", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("waz/quux/foo", false), check.Equals, false)
+ c.Check(s.cp.matchGlobs("waz/quux/food", false), check.Equals, false)
+ c.Check(s.cp.matchGlobs("waz/quux/foo/foo", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("waz/quux/food/foo", true), check.Equals, true)
+ c.Check(s.cp.matchGlobs("waz/quux/foo/foo", false), check.Equals, true)
+ c.Check(s.cp.matchGlobs("waz/quux/food/foo", false), check.Equals, true)
+}
+
+func (s *copierSuite) TestSubtreeCouldMatch(c *check.C) {
+ for _, trial := range []struct {
+ mount string // relative to output dir
+ glob string
+ could bool
+ }{
+ {mount: "abc", glob: "*"},
+ {mount: "abc", glob: "abc/*", could: true},
+ {mount: "abc", glob: "a*/**", could: true},
+ {mount: "abc", glob: "**", could: true},
+ {mount: "abc", glob: "*/*", could: true},
+ {mount: "abc", glob: "**/*.txt", could: true},
+ {mount: "abc/def", glob: "*"},
+ {mount: "abc/def", glob: "*/*"},
+ {mount: "abc/def", glob: "*/*.txt"},
+ {mount: "abc/def", glob: "*/*/*", could: true},
+ {mount: "abc/def", glob: "**", could: true},
+ {mount: "abc/def", glob: "**/bar", could: true},
+ {mount: "abc/def", glob: "abc/**", could: true},
+ {mount: "abc/def/ghi", glob: "*c/**/bar", could: true},
+ {mount: "abc/def/ghi", glob: "*c/*f/bar"},
+ {mount: "abc/def/ghi", glob: "abc/d[^/]f/ghi/*", could: true},
+ } {
+ c.Logf("=== %+v", trial)
+ got := (&copier{
+ globs: []string{trial.glob},
+ }).subtreeCouldMatch(trial.mount)
+ c.Check(got, check.Equals, trial.could)
+ }
+}
+
func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
bindtmp := c.MkDir()
s.cp.mounts["/ctr/outdir/include/includer"] = arvados.Mount{
@@ -234,7 +296,7 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
PortableDataHash: arvadostest.FooCollectionPDH,
Writable: true,
}
- s.cp.mounts["/ctr/outdir/nonexistent-collection"] = arvados.Mount{
+ s.cp.mounts["/ctr/outdir/nonexistent/collection"] = arvados.Mount{
// As extra assurance, plant a collection that will
// fail if copier attempts to load its manifest. (For
// performance reasons it's important that copier
@@ -244,8 +306,8 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
PortableDataHash: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+1234",
}
s.cp.globs = []string{
- "?ncl*/*r",
- "*/?ncl*",
+ "?ncl*/*r/*",
+ "*/?ncl*/**",
}
c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includer", 0755), check.IsNil)
c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includew", 0755), check.IsNil)
@@ -264,14 +326,18 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
c.Check(err, check.IsNil)
c.Log(s.log.String())
- c.Check(s.cp.dirs, check.DeepEquals, []string{"/include", "/include/includew"})
+ // Note it's OK that "/exclude" is not excluded by walkMount:
+ // it is just a local filesystem directory, not a mount point
+ // that's expensive to walk. In real-life usage, it will be
+ // removed from cp.dirs before any copying happens.
+ c.Check(s.cp.dirs, check.DeepEquals, []string{"/exclude", "/include", "/include/includew"})
c.Check(s.cp.files, check.DeepEquals, []filetodo{
{src: s.cp.hostOutputDir + "/include/includew/foo", dst: "/include/includew/foo", size: 3},
})
c.Check(s.cp.manifest, check.Matches, `(?ms).*\./include/includer .*`)
c.Check(s.cp.manifest, check.Not(check.Matches), `(?ms).*exclude.*`)
c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"exclude/excluder\\".*`)
- c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"exclude/excludew\\".*`)
+ c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"nonexistent/collection\\".*`)
}
func (s *copierSuite) writeFileInOutputDir(c *check.C, path, data string) {
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list