[arvados] created: 2.6.0-369-g0c881f2fa

git repository hosting git at public.arvados.org
Wed Jul 12 04:20:19 UTC 2023


        at  0c881f2fa420fbe98be2431d763a13f249f8edfc (commit)


commit 0c881f2fa420fbe98be2431d763a13f249f8edfc
Author: Tom Clegg <tom at curii.com>
Date:   Wed Jul 12 00:19:08 2023 -0400

    20726: Fix traversing projects/collections that precede page marker.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/keep-web/s3.go b/services/keep-web/s3.go
index fd8764da8..8615ceaf0 100644
--- a/services/keep-web/s3.go
+++ b/services/keep-web/s3.go
@@ -790,6 +790,13 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 		params.marker = r.FormValue("marker")
 	}
 
+	// startAfter is params.marker or params.startAfter, whichever
+	// comes last.
+	startAfter := params.startAfter
+	if startAfter < params.marker {
+		startAfter = params.marker
+	}
+
 	bucketdir := "by_id/" + bucket
 	// walkpath is the directory (relative to bucketdir) we need
 	// to walk: the innermost directory that is guaranteed to
@@ -821,6 +828,7 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 	nextMarker := ""
 
 	commonPrefixes := map[string]bool{}
+	full := false
 	err := walkFS(fs, strings.TrimSuffix(bucketdir+"/"+walkpath, "/"), true, func(path string, fi os.FileInfo) error {
 		if path == bucketdir {
 			return nil
@@ -831,36 +839,29 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 			path += "/"
 			filesize = 0
 		}
-		if len(path) <= len(params.prefix) {
-			if path > params.prefix[:len(path)] {
-				// with prefix "foobar", walking "fooz" means we're done
-				return errDone
-			}
-			if path < params.prefix[:len(path)] {
-				// with prefix "foobar", walking "foobag" is pointless
-				return filepath.SkipDir
-			}
-			if fi.IsDir() && !strings.HasPrefix(params.prefix+"/", path) {
-				// with prefix "foo/bar", walking "fo"
-				// is pointless (but walking "foo" or
-				// "foo/bar" is necessary)
-				return filepath.SkipDir
-			}
-			if len(path) < len(params.prefix) {
-				// can't skip anything, and this entry
-				// isn't in the results, so just
-				// continue descent
-				return nil
-			}
-		} else {
-			if path[:len(params.prefix)] > params.prefix {
-				// with prefix "foobar", nothing we
-				// see after "foozzz" is relevant
-				return errDone
-			}
-		}
-		if path <= params.marker || path < params.prefix || path <= params.startAfter {
+		if strings.HasPrefix(params.prefix, path) && params.prefix != path {
+			// Descend into subtree until we reach desired prefix
 			return nil
+		} else if path < params.prefix {
+			// Not an ancestor or descendant of desired
+			// prefix, therefore none of its descendants
+			// can be either -- skip
+			return filepath.SkipDir
+		} else if path > params.prefix && !strings.HasPrefix(path, params.prefix) {
+			// We must have traversed everything under
+			// desired prefix
+			return errDone
+		} else if path == startAfter {
+			// Skip startAfter itself, just descend into
+			// subtree
+			return nil
+		} else if strings.HasPrefix(startAfter, path) {
+			// Descend into subtree in case it contains
+			// something after startAfter
+			return nil
+		} else if path < startAfter {
+			// Skip ahead until we reach startAfter
+			return filepath.SkipDir
 		}
 		if fi.IsDir() && !h.Cluster.Collections.S3FolderObjects {
 			// Note we don't add anything to
@@ -870,10 +871,6 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 			// finding a regular file inside it.
 			return nil
 		}
-		if len(resp.Contents)+len(commonPrefixes) >= params.maxKeys {
-			resp.IsTruncated = true
-			return errDone
-		}
 		if params.delimiter != "" {
 			idx := strings.Index(path[len(params.prefix):], params.delimiter)
 			if idx >= 0 {
@@ -882,19 +879,32 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 				// add "/baz" to commonPrefixes and
 				// stop descending.
 				prefix := path[:len(params.prefix)+idx+1]
-				if prefix != params.marker {
+				if prefix == startAfter {
+					return nil
+				} else if prefix < startAfter && !strings.HasPrefix(startAfter, prefix) {
+					return nil
+				} else if full {
+					resp.IsTruncated = true
+					return errDone
+				} else {
 					commonPrefixes[prefix] = true
 					nextMarker = prefix
+					full = len(resp.Contents)+len(commonPrefixes) >= params.maxKeys
+					return filepath.SkipDir
 				}
-				return filepath.SkipDir
 			}
 		}
+		if full {
+			resp.IsTruncated = true
+			return errDone
+		}
 		resp.Contents = append(resp.Contents, s3Key{
 			Key:          path,
 			LastModified: fi.ModTime().UTC().Format("2006-01-02T15:04:05.999") + "Z",
 			Size:         filesize,
 		})
 		nextMarker = path
+		full = len(resp.Contents)+len(commonPrefixes) >= params.maxKeys
 		return nil
 	})
 	if err != nil && err != errDone {
diff --git a/services/keep-web/s3_test.go b/services/keep-web/s3_test.go
index 556f8ba2c..9f8650b8e 100644
--- a/services/keep-web/s3_test.go
+++ b/services/keep-web/s3_test.go
@@ -943,7 +943,8 @@ func (s *IntegrationSuite) testS3CollectionListRollup(c *check.C) {
 		{"dir0", "", ""},
 		{"dir0/", "", ""},
 		{"dir0/f", "", ""},
-		{"dir0", "/", "dir0/file14.txt"},       // no commonprefixes
+		{"dir0", "/", "dir0/file14.txt"},       // one commonprefix, "dir0/"
+		{"dir0", "/", "dir0/zzzzfile.txt"},     // no commonprefixes
 		{"", "", "dir0/file14.txt"},            // middle page, skip walking dir1
 		{"", "", "dir1/file14.txt"},            // middle page, skip walking dir0
 		{"", "", "dir1/file498.txt"},           // last page of results
@@ -1017,6 +1018,61 @@ func (s *IntegrationSuite) testS3CollectionListRollup(c *check.C) {
 	}
 }
 
+func (s *IntegrationSuite) TestS3ListObjectsV2ManySubprojects(c *check.C) {
+	stage := s.s3setup(c)
+	defer stage.teardown(c)
+	projects := 50
+	collectionsPerProject := 2
+	for i := 0; i < projects; i++ {
+		var subproj arvados.Group
+		err := stage.arv.RequestAndDecode(&subproj, "POST", "arvados/v1/groups", nil, map[string]interface{}{
+			"group": map[string]interface{}{
+				"owner_uuid":  stage.subproj.UUID,
+				"group_class": "project",
+				"name":        fmt.Sprintf("keep-web s3 test subproject %d", i),
+			},
+		})
+		c.Assert(err, check.IsNil)
+		for j := 0; j < collectionsPerProject; j++ {
+			err = stage.arv.RequestAndDecode(nil, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{
+				"owner_uuid":    subproj.UUID,
+				"name":          fmt.Sprintf("keep-web s3 test collection %d", j),
+				"manifest_text": ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:emptyfile\n./emptydir d41d8cd98f00b204e9800998ecf8427e+0 0:0:.\n",
+			}})
+			c.Assert(err, check.IsNil)
+		}
+	}
+	c.Logf("setup complete")
+
+	sess := aws_session.Must(aws_session.NewSession(&aws_aws.Config{
+		Region:           aws_aws.String("auto"),
+		Endpoint:         aws_aws.String(s.testServer.URL),
+		Credentials:      aws_credentials.NewStaticCredentials(url.QueryEscape(arvadostest.ActiveTokenV2), url.QueryEscape(arvadostest.ActiveTokenV2), ""),
+		S3ForcePathStyle: aws_aws.Bool(true),
+	}))
+	client := aws_s3.New(sess)
+	ctx := context.Background()
+	params := aws_s3.ListObjectsV2Input{
+		Bucket:    aws_aws.String(stage.proj.UUID),
+		Delimiter: aws_aws.String("/"),
+		Prefix:    aws_aws.String("keep-web s3 test subproject/"),
+		MaxKeys:   aws_aws.Int64(int64(projects / 2)),
+	}
+	for page := 1; ; page++ {
+		t0 := time.Now()
+		result, err := client.ListObjectsV2WithContext(ctx, &params)
+		if !c.Check(err, check.IsNil) {
+			break
+		}
+		c.Logf("got page %d in %v with len(Contents) == %d, len(CommonPrefixes) == %d", page, time.Since(t0), len(result.Contents), len(result.CommonPrefixes))
+		if !*result.IsTruncated {
+			break
+		}
+		params.ContinuationToken = result.NextContinuationToken
+		*params.MaxKeys = *params.MaxKeys/2 + 1
+	}
+}
+
 func (s *IntegrationSuite) TestS3ListObjectsV2(c *check.C) {
 	stage := s.s3setup(c)
 	defer stage.teardown(c)

commit 8e57b2534bbf5ece2518c3c24ce79c5bb07adf3a
Author: Tom Clegg <tom at curii.com>
Date:   Tue Jul 11 15:25:22 2023 -0400

    20726: Fix ListObjects[V2] pages duplicating last item on next page.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/keep-web/s3.go b/services/keep-web/s3.go
index 38428cdab..fd8764da8 100644
--- a/services/keep-web/s3.go
+++ b/services/keep-web/s3.go
@@ -772,6 +772,9 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 			http.Error(w, "invalid continuation token", http.StatusBadRequest)
 			return
 		}
+		// marker and start-after perform the same function,
+		// but we keep them separate so we can repeat them
+		// back to the client in the response.
 		params.marker = string(marker)
 		params.startAfter = r.FormValue("start-after")
 		switch r.FormValue("encoding-type") {
@@ -783,6 +786,7 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 			return
 		}
 	} else {
+		// marker is functionally equivalent to start-after.
 		params.marker = r.FormValue("marker")
 	}
 
@@ -809,6 +813,11 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 		ContinuationToken: r.FormValue("continuation-token"),
 		StartAfter:        params.startAfter,
 	}
+
+	// nextMarker will be the last path we add to either
+	// resp.Contents or commonPrefixes.  It will be included in
+	// the response as NextMarker or NextContinuationToken if
+	// needed.
 	nextMarker := ""
 
 	commonPrefixes := map[string]bool{}
@@ -850,7 +859,7 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 				return errDone
 			}
 		}
-		if path < params.marker || path < params.prefix || path <= params.startAfter {
+		if path <= params.marker || path < params.prefix || path <= params.startAfter {
 			return nil
 		}
 		if fi.IsDir() && !h.Cluster.Collections.S3FolderObjects {
@@ -863,9 +872,6 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 		}
 		if len(resp.Contents)+len(commonPrefixes) >= params.maxKeys {
 			resp.IsTruncated = true
-			if params.delimiter != "" || params.v2 {
-				nextMarker = path
-			}
 			return errDone
 		}
 		if params.delimiter != "" {
@@ -875,7 +881,11 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 				// "z", when we hit "foobar/baz", we
 				// add "/baz" to commonPrefixes and
 				// stop descending.
-				commonPrefixes[path[:len(params.prefix)+idx+1]] = true
+				prefix := path[:len(params.prefix)+idx+1]
+				if prefix != params.marker {
+					commonPrefixes[prefix] = true
+					nextMarker = prefix
+				}
 				return filepath.SkipDir
 			}
 		}
@@ -884,12 +894,16 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
 			LastModified: fi.ModTime().UTC().Format("2006-01-02T15:04:05.999") + "Z",
 			Size:         filesize,
 		})
+		nextMarker = path
 		return nil
 	})
 	if err != nil && err != errDone {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
 	}
+	if params.delimiter == "" && !params.v2 || !resp.IsTruncated {
+		nextMarker = ""
+	}
 	if params.delimiter != "" {
 		resp.CommonPrefixes = make([]commonPrefix, 0, len(commonPrefixes))
 		for prefix := range commonPrefixes {
diff --git a/services/keep-web/s3_test.go b/services/keep-web/s3_test.go
index bfe8bb862..556f8ba2c 100644
--- a/services/keep-web/s3_test.go
+++ b/services/keep-web/s3_test.go
@@ -851,6 +851,9 @@ func (s *IntegrationSuite) testS3List(c *check.C, bucket *s3.Bucket, prefix stri
 			break
 		}
 		for _, key := range resp.Contents {
+			if _, dup := gotKeys[key.Key]; dup {
+				c.Errorf("got duplicate key %q on page %d", key.Key, pages)
+			}
 			gotKeys[key.Key] = key
 			if strings.Contains(key.Key, "sailboat.txt") {
 				c.Check(key.Size, check.Equals, int64(4))
@@ -971,28 +974,31 @@ func (s *IntegrationSuite) testS3CollectionListRollup(c *check.C) {
 		var expectTruncated bool
 		for _, key := range allfiles {
 			full := len(expectKeys)+len(expectPrefixes) >= maxKeys
-			if !strings.HasPrefix(key, trial.prefix) || key < trial.marker {
+			if !strings.HasPrefix(key, trial.prefix) || key <= trial.marker {
 				continue
 			} else if idx := strings.Index(key[len(trial.prefix):], trial.delimiter); trial.delimiter != "" && idx >= 0 {
 				prefix := key[:len(trial.prefix)+idx+1]
 				if len(expectPrefixes) > 0 && expectPrefixes[len(expectPrefixes)-1] == prefix {
 					// same prefix as previous key
 				} else if full {
-					expectNextMarker = key
 					expectTruncated = true
 				} else {
 					expectPrefixes = append(expectPrefixes, prefix)
+					expectNextMarker = prefix
 				}
 			} else if full {
-				if trial.delimiter != "" {
-					expectNextMarker = key
-				}
 				expectTruncated = true
 				break
 			} else {
 				expectKeys = append(expectKeys, key)
+				if trial.delimiter != "" {
+					expectNextMarker = key
+				}
 			}
 		}
+		if !expectTruncated {
+			expectNextMarker = ""
+		}
 
 		var gotKeys []string
 		for _, key := range resp.Contents {

commit 6f28783ebb3c0ba0f74c59879e091bc46a874b28
Author: Tom Clegg <tom at curii.com>
Date:   Tue Jul 11 13:38:38 2023 -0400

    20726: Update s3ListObjects paging test.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/keep-web/s3_test.go b/services/keep-web/s3_test.go
index aa91d82ae..bfe8bb862 100644
--- a/services/keep-web/s3_test.go
+++ b/services/keep-web/s3_test.go
@@ -17,6 +17,7 @@ import (
 	"net/url"
 	"os"
 	"os/exec"
+	"sort"
 	"strings"
 	"sync"
 	"time"
@@ -817,8 +818,8 @@ func (s *IntegrationSuite) TestS3CollectionList(c *check.C) {
 
 	var markers int
 	for markers, s.handler.Cluster.Collections.S3FolderObjects = range []bool{false, true} {
-		dirs := 2
-		filesPerDir := 1001
+		dirs := 2000
+		filesPerDir := 2
 		stage.writeBigDirs(c, dirs, filesPerDir)
 		// Total # objects is:
 		//                 2 file entries from s3setup (emptyfile and sailboat.txt)
@@ -827,6 +828,7 @@ func (s *IntegrationSuite) TestS3CollectionList(c *check.C) {
 		// +filesPerDir*dirs file entries from writeBigDirs (dir0/file0.txt, etc.)
 		s.testS3List(c, stage.collbucket, "", 4000, markers+2+(filesPerDir+markers)*dirs)
 		s.testS3List(c, stage.collbucket, "", 131, markers+2+(filesPerDir+markers)*dirs)
+		s.testS3List(c, stage.collbucket, "", 51, markers+2+(filesPerDir+markers)*dirs)
 		s.testS3List(c, stage.collbucket, "dir0/", 71, filesPerDir+markers)
 	}
 }
@@ -863,7 +865,16 @@ func (s *IntegrationSuite) testS3List(c *check.C, bucket *s3.Bucket, prefix stri
 		}
 		nextMarker = resp.NextMarker
 	}
-	c.Check(len(gotKeys), check.Equals, expectFiles)
+	if !c.Check(len(gotKeys), check.Equals, expectFiles) {
+		var sorted []string
+		for k := range gotKeys {
+			sorted = append(sorted, k)
+		}
+		sort.Strings(sorted)
+		for _, k := range sorted {
+			c.Logf("got %s", k)
+		}
+	}
 }
 
 func (s *IntegrationSuite) TestS3CollectionListRollup(c *check.C) {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list