[arvados] updated: 2.7.0-6265-gec0b74da39

git repository hosting git at public.arvados.org
Tue Apr 30 15:19:34 UTC 2024


Summary of changes:
 go.mod                       |  2 +-
 go.sum                       |  4 +-
 lib/crunchrun/copier.go      | 88 +++++++++++++++++++++++++++++++++++++++-----
 lib/crunchrun/copier_test.go | 76 +++++++++++++++++++++++++++++++++++---
 4 files changed, 153 insertions(+), 17 deletions(-)

       via  ec0b74da392f739d4340b7a9120f2f90f5871f47 (commit)
      from  a32d74c4179f0b17640c0623daaac240ea9ec38b (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit ec0b74da392f739d4340b7a9120f2f90f5871f47
Author: Tom Clegg <tom at curii.com>
Date:   Tue Apr 30 11:18:31 2024 -0400

    12430: Avoid loading collections excluded by globs.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/go.mod b/go.mod
index 62a31c1c58..e26a8699a4 100644
--- a/go.mod
+++ b/go.mod
@@ -11,6 +11,7 @@ require (
 	github.com/arvados/cgofuse v1.2.0-arvados1
 	github.com/aws/aws-sdk-go v1.44.174
 	github.com/aws/aws-sdk-go-v2 v0.23.0
+	github.com/bmatcuk/doublestar/v4 v4.6.1
 	github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092
 	github.com/coreos/go-oidc/v3 v3.5.0
 	github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e
@@ -65,7 +66,6 @@ require (
 	github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/bgentry/speakeasy v0.1.0 // indirect
-	github.com/bmatcuk/doublestar v1.3.4 // indirect
 	github.com/cespare/xxhash/v2 v2.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/dimchansky/utfbom v1.1.1 // indirect
diff --git a/go.sum b/go.sum
index fe6febb043..9d445185fb 100644
--- a/go.sum
+++ b/go.sum
@@ -59,8 +59,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQkY=
 github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
-github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0=
-github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
+github.com/bmatcuk/doublestar/v4 v4.6.1 h1:FH9SifrbvJhnlQpztAx++wlkk70QBf0iBWDwNy7PA4I=
+github.com/bmatcuk/doublestar/v4 v4.6.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc=
 github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
 github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092 h1:0Di2onNnlN5PAyWPbqlPyN45eOQ+QW/J9eqLynt4IV4=
 github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092/go.mod h1:8IzBjZCRSnsvM6MJMG8HNNtnzMl48H22rbJL2kRUJ0Y=
diff --git a/lib/crunchrun/copier.go b/lib/crunchrun/copier.go
index 8d986d7066..507029b36e 100644
--- a/lib/crunchrun/copier.go
+++ b/lib/crunchrun/copier.go
@@ -18,7 +18,7 @@ import (
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"git.arvados.org/arvados.git/sdk/go/keepclient"
 	"git.arvados.org/arvados.git/sdk/go/manifest"
-	"github.com/bmatcuk/doublestar"
+	"github.com/bmatcuk/doublestar/v4"
 )
 
 type printfer interface {
@@ -115,14 +115,44 @@ func (cp *copier) Copy() (string, error) {
 	return collfs.MarshalManifest(".")
 }
 
-func (cp *copier) matchGlobs(path string) bool {
+func (cp *copier) matchGlobs(path string, isDir bool) bool {
 	// An entry in the top level of the output directory looks
 	// like "/foo", but globs look like "foo", so we strip the
 	// leading "/" before matching.
 	path = strings.TrimLeft(path, "/")
 	for _, glob := range cp.globs {
-		if match, _ := doublestar.Match(glob, path); match {
+		if !isDir && strings.HasSuffix(glob, "/**") {
+			// doublestar.Match("f*/**", "ff") and
+			// doublestar.Match("f*/**", "ff/gg") both
+			// return true, but (to be compatible with
+			// bash shopt) "ff" should match only if it is
+			// a directory.
+			//
+			// To avoid errant matches, we add the file's
+			// basename to the end of the pattern:
+			//
+			// Match("f*/**/ff", "ff") => false
+			// Match("f*/**/gg", "ff/gg") => true
+			//
+			// Of course, we need to escape basename in
+			// case it contains *, ?, \, etc.
+			_, name := filepath.Split(path)
+			escapedName := strings.TrimSuffix(strings.Replace(name, "", "\\", -1), "\\")
+			if match, _ := doublestar.Match(glob+"/"+escapedName, path); match {
+				return true
+			}
+		} else if match, _ := doublestar.Match(glob, path); match {
 			return true
+		} else if isDir {
+			// Workaround doublestar bug (v4.6.1).
+			// "foo*/**" should match "foo", but does not,
+			// because isZeroLengthPattern does not accept
+			// "*/**" as a zero length pattern.
+			if trunc := strings.TrimSuffix(glob, "*/**"); trunc != glob {
+				if match, _ := doublestar.Match(trunc, path); match {
+					return true
+				}
+			}
 		}
 	}
 	return false
@@ -141,7 +171,7 @@ func (cp *copier) applyGlobsToFilesAndDirs() {
 	}
 	keepdirs := make(map[string]bool)
 	for _, path := range cp.dirs {
-		if cp.matchGlobs(path) {
+		if cp.matchGlobs(path, true) {
 			keepdirs[path] = true
 		}
 	}
@@ -154,7 +184,7 @@ func (cp *copier) applyGlobsToFilesAndDirs() {
 	}
 	var keepfiles []filetodo
 	for _, file := range cp.files {
-		if cp.matchGlobs(file.dst) {
+		if cp.matchGlobs(file.dst, false) {
 			keepfiles = append(keepfiles, file)
 		}
 	}
@@ -182,7 +212,7 @@ func (cp *copier) applyGlobsToCollectionFS(collfs arvados.CollectionFileSystem)
 	}
 	include := make(map[string]bool)
 	err := fs.WalkDir(arvados.FS(collfs), "", func(path string, ent fs.DirEntry, err error) error {
-		if cp.matchGlobs(path) {
+		if cp.matchGlobs(path, ent.IsDir()) {
 			for i, c := range path {
 				if i > 0 && c == '/' {
 					include[path[:i]] = true
@@ -213,6 +243,42 @@ func (cp *copier) applyGlobsToCollectionFS(collfs arvados.CollectionFileSystem)
 	return err
 }
 
+// Return true if it's possible for any descendant of the given path
+// to match anything in cp.globs.  Used by walkMount to avoid loading
+// collections that are mounted underneath ctrOutputPath but excluded
+// by globs.
+func (cp *copier) subtreeCouldMatch(path string) bool {
+	if len(cp.globs) == 0 {
+		return true
+	}
+	pathdepth := 1 + strings.Count(path, "/")
+	for _, glob := range cp.globs {
+		globdepth := 0
+		lastsep := 0
+		for i, c := range glob {
+			if c != '/' || !doublestar.ValidatePattern(glob[:i]) {
+				// Escaped "/", or "/" in a character
+				// class, is not a path separator.
+				continue
+			}
+			if glob[lastsep:i] == "**" {
+				return true
+			}
+			lastsep = i + 1
+			if globdepth++; globdepth == pathdepth {
+				if match, _ := doublestar.Match(glob[:i]+"/*", path+"/z"); match {
+					return true
+				}
+				break
+			}
+		}
+		if globdepth < pathdepth && glob[lastsep:] == "**" {
+			return true
+		}
+	}
+	return false
+}
+
 func (cp *copier) copyFile(fs arvados.CollectionFileSystem, f filetodo) (int64, error) {
 	cp.logger.Printf("copying %q (%d bytes)", strings.TrimLeft(f.dst, "/"), f.size)
 	dst, err := fs.OpenFile(f.dst, os.O_CREATE|os.O_WRONLY, 0666)
@@ -267,9 +333,8 @@ func (cp *copier) walkMount(dest, src string, maxSymlinks int, walkMountsBelow b
 	// copy, relative to its mount point -- ".", "./foo.txt", ...
 	srcRelPath := filepath.Join(".", srcMount.Path, src[len(srcRoot):])
 
-	// outputRelPath is the path relative in the output directory
-	// that corresponds to the path in the output collection where
-	// the file will go, for logging
+	// outputRelPath is the destination path relative to the
+	// output directory. Used for logging and glob matching.
 	var outputRelPath = ""
 	if strings.HasPrefix(src, cp.ctrOutputDir) {
 		outputRelPath = strings.TrimPrefix(src[len(cp.ctrOutputDir):], "/")
@@ -283,6 +348,9 @@ func (cp *copier) walkMount(dest, src string, maxSymlinks int, walkMountsBelow b
 
 	switch {
 	case srcMount.ExcludeFromOutput:
+	case outputRelPath != "*" && !cp.subtreeCouldMatch(outputRelPath):
+		cp.logger.Printf("not copying %q because contents cannot match output globs", outputRelPath)
+		return nil
 	case srcMount.Kind == "tmp":
 		// Handle by walking the host filesystem.
 		return cp.walkHostFS(dest, src, maxSymlinks, walkMountsBelow)
@@ -434,6 +502,8 @@ func (cp *copier) walkHostFS(dest, src string, maxSymlinks int, includeMounts bo
 				// (...except mount types that are
 				// handled as regular files.)
 				continue
+			} else if isMount && !cp.subtreeCouldMatch(src[len(cp.ctrOutputDir)+1:]) {
+				continue
 			}
 			err = cp.walkHostFS(dest, src, maxSymlinks, false)
 			if err != nil {
diff --git a/lib/crunchrun/copier_test.go b/lib/crunchrun/copier_test.go
index a1fc81c716..9deba783b2 100644
--- a/lib/crunchrun/copier_test.go
+++ b/lib/crunchrun/copier_test.go
@@ -214,6 +214,68 @@ func (s *copierSuite) TestWritableMountBelow(c *check.C) {
 	})
 }
 
+// Check some glob-matching edge cases. In particular, check that
+// patterns like "foo/**" do not match regular files named "foo"
+// (unless of course they are inside a directory named "foo").
+func (s *copierSuite) TestMatchGlobs(c *check.C) {
+	s.cp.globs = []string{"foo*/**"}
+	c.Check(s.cp.matchGlobs("foo", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("food", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("foo", false), check.Equals, false)
+	c.Check(s.cp.matchGlobs("food", false), check.Equals, false)
+
+	s.cp.globs = []string{"ba[!/]/foo*/**"}
+	c.Check(s.cp.matchGlobs("bar/foo", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("bar/food", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("bar/foo", false), check.Equals, false)
+	c.Check(s.cp.matchGlobs("bar/food", false), check.Equals, false)
+	c.Check(s.cp.matchGlobs("bar/foo/z\\[", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("bar/food/z\\[", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("bar/foo/z\\[", false), check.Equals, true)
+	c.Check(s.cp.matchGlobs("bar/food/z\\[", false), check.Equals, true)
+
+	s.cp.globs = []string{"waz/**/foo*/**"}
+	c.Check(s.cp.matchGlobs("waz/quux/foo", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("waz/quux/food", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("waz/quux/foo", false), check.Equals, false)
+	c.Check(s.cp.matchGlobs("waz/quux/food", false), check.Equals, false)
+	c.Check(s.cp.matchGlobs("waz/quux/foo/foo", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("waz/quux/food/foo", true), check.Equals, true)
+	c.Check(s.cp.matchGlobs("waz/quux/foo/foo", false), check.Equals, true)
+	c.Check(s.cp.matchGlobs("waz/quux/food/foo", false), check.Equals, true)
+}
+
+func (s *copierSuite) TestSubtreeCouldMatch(c *check.C) {
+	for _, trial := range []struct {
+		mount string // relative to output dir
+		glob  string
+		could bool
+	}{
+		{mount: "abc", glob: "*"},
+		{mount: "abc", glob: "abc/*", could: true},
+		{mount: "abc", glob: "a*/**", could: true},
+		{mount: "abc", glob: "**", could: true},
+		{mount: "abc", glob: "*/*", could: true},
+		{mount: "abc", glob: "**/*.txt", could: true},
+		{mount: "abc/def", glob: "*"},
+		{mount: "abc/def", glob: "*/*"},
+		{mount: "abc/def", glob: "*/*.txt"},
+		{mount: "abc/def", glob: "*/*/*", could: true},
+		{mount: "abc/def", glob: "**", could: true},
+		{mount: "abc/def", glob: "**/bar", could: true},
+		{mount: "abc/def", glob: "abc/**", could: true},
+		{mount: "abc/def/ghi", glob: "*c/**/bar", could: true},
+		{mount: "abc/def/ghi", glob: "*c/*f/bar"},
+		{mount: "abc/def/ghi", glob: "abc/d[^/]f/ghi/*", could: true},
+	} {
+		c.Logf("=== %+v", trial)
+		got := (&copier{
+			globs: []string{trial.glob},
+		}).subtreeCouldMatch(trial.mount)
+		c.Check(got, check.Equals, trial.could)
+	}
+}
+
 func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
 	bindtmp := c.MkDir()
 	s.cp.mounts["/ctr/outdir/include/includer"] = arvados.Mount{
@@ -234,7 +296,7 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
 		PortableDataHash: arvadostest.FooCollectionPDH,
 		Writable:         true,
 	}
-	s.cp.mounts["/ctr/outdir/nonexistent-collection"] = arvados.Mount{
+	s.cp.mounts["/ctr/outdir/nonexistent/collection"] = arvados.Mount{
 		// As extra assurance, plant a collection that will
 		// fail if copier attempts to load its manifest.  (For
 		// performance reasons it's important that copier
@@ -244,8 +306,8 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
 		PortableDataHash: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+1234",
 	}
 	s.cp.globs = []string{
-		"?ncl*/*r",
-		"*/?ncl*",
+		"?ncl*/*r/*",
+		"*/?ncl*/**",
 	}
 	c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includer", 0755), check.IsNil)
 	c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includew", 0755), check.IsNil)
@@ -264,14 +326,18 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
 	c.Check(err, check.IsNil)
 	c.Log(s.log.String())
 
-	c.Check(s.cp.dirs, check.DeepEquals, []string{"/include", "/include/includew"})
+	// Note it's OK that "/exclude" is not excluded by walkMount:
+	// it is just a local filesystem directory, not a mount point
+	// that's expensive to walk.  In real-life usage, it will be
+	// removed from cp.dirs before any copying happens.
+	c.Check(s.cp.dirs, check.DeepEquals, []string{"/exclude", "/include", "/include/includew"})
 	c.Check(s.cp.files, check.DeepEquals, []filetodo{
 		{src: s.cp.hostOutputDir + "/include/includew/foo", dst: "/include/includew/foo", size: 3},
 	})
 	c.Check(s.cp.manifest, check.Matches, `(?ms).*\./include/includer .*`)
 	c.Check(s.cp.manifest, check.Not(check.Matches), `(?ms).*exclude.*`)
 	c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"exclude/excluder\\".*`)
-	c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"exclude/excludew\\".*`)
+	c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"nonexistent/collection\\".*`)
 }
 
 func (s *copierSuite) writeFileInOutputDir(c *check.C, path, data string) {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list