[ARVADOS] updated: 2.3.1-12-g56c37ef9b

Git user git@public.arvados.org
Tue Dec 7 20:08:48 UTC 2021


Summary of changes:
 services/keep-balance/balance.go          | 54 ++++++++++++++++++++-----------
 services/keep-balance/balance_run_test.go | 26 +++++++++++++++
 services/keep-balance/balance_test.go     | 27 ++++++++--------
 3 files changed, 76 insertions(+), 31 deletions(-)

       via  56c37ef9b76b992dad59524cae6c34b86bb911d0 (commit)
       via  11864d817434e1f3e36cf3c0ef9ab37736938f65 (commit)
      from  b008c44eaf5c6b45c9f36116601918748aeb8323 (commit)

The revisions listed above that are new to this repository have
not appeared in any other notification email, so they are listed
in full below.


commit 56c37ef9b76b992dad59524cae6c34b86bb911d0
Author: Tom Clegg <tom@curii.com>
Date:   Mon Dec 6 19:32:34 2021 -0500

    18547: Error out if two volumes return the same non-empty DeviceID.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
index fa01d512b..eb6f580f4 100644
--- a/services/keep-balance/balance.go
+++ b/services/keep-balance/balance.go
@@ -8,6 +8,7 @@ import (
 	"bytes"
 	"context"
 	"crypto/md5"
+	"errors"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -266,6 +267,29 @@ func (bal *Balancer) CheckSanityEarly(c *arvados.Client) error {
 		}
 	}
 
+	mountProblem := false
+	type deviceMount struct {
+		srv *KeepService
+		mnt *KeepMount
+	}
+	deviceMounted := map[string]deviceMount{} // DeviceID -> mount
+	for _, srv := range bal.KeepServices {
+		for _, mnt := range srv.mounts {
+			if first, dup := deviceMounted[mnt.DeviceID]; dup && first.mnt.UUID != mnt.UUID && mnt.DeviceID != "" {
+				bal.logf("config error: device %s is mounted with multiple volume UUIDs: %s on %s, and %s on %s",
+					mnt.DeviceID,
+					first.mnt.UUID, first.srv,
+					mnt.UUID, srv)
+				mountProblem = true
+				continue
+			}
+			deviceMounted[mnt.DeviceID] = deviceMount{srv, mnt}
+		}
+	}
+	if mountProblem {
+		return errors.New("cannot continue with config errors (see above)")
+	}
+
 	var checkPage arvados.CollectionList
 	if err = c.RequestAndDecode(&checkPage, "GET", "arvados/v1/collections", nil, arvados.ResourceListParams{
 		Limit:              new(int),
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index 4e2c6803c..0d1b6b591 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -397,6 +397,32 @@ func (s *runSuite) TestRefuseNonAdmin(c *check.C) {
 	c.Check(pullReqs.Count(), check.Equals, 0)
 }
 
+func (s *runSuite) TestRefuseSameDeviceDifferentVolumes(c *check.C) {
+	opts := RunOptions{
+		CommitPulls: true,
+		CommitTrash: true,
+		Logger:      ctxlog.TestLogger(c),
+	}
+	s.stub.serveCurrentUserAdmin()
+	s.stub.serveZeroCollections()
+	s.stub.serveKeepServices(stubServices)
+	s.stub.mux.HandleFunc("/mounts", func(w http.ResponseWriter, r *http.Request) {
+		hostid := r.Host[:5] // "keep0.zzzzz.arvadosapi.com:25107" => "keep0"
+		json.NewEncoder(w).Encode([]arvados.KeepMount{{
+			UUID:           "zzzzz-ivpuk-0000000000" + hostid,
+			DeviceID:       "keep0-vol0",
+			StorageClasses: map[string]bool{"default": true},
+		}})
+	})
+	trashReqs := s.stub.serveKeepstoreTrash()
+	pullReqs := s.stub.serveKeepstorePull()
+	srv := s.newServer(&opts)
+	_, err := srv.runOnce()
+	c.Check(err, check.ErrorMatches, "cannot continue with config errors.*")
+	c.Check(trashReqs.Count(), check.Equals, 0)
+	c.Check(pullReqs.Count(), check.Equals, 0)
+}
+
 func (s *runSuite) TestWriteLostBlocks(c *check.C) {
 	lostf, err := ioutil.TempFile("", "keep-balance-lost-blocks-test-")
 	c.Assert(err, check.IsNil)

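For readability outside the diff format, the duplicate-device check added to CheckSanityEarly above boils down to the following pattern: remember the first mount seen for each non-empty DeviceID, and report a config error when a later mount has the same DeviceID but a different volume UUID. This is a minimal standalone sketch; the mount type, its Server field, and the checkDuplicateDevices helper are illustrative stand-ins, not the real keep-balance structs.

package main

import (
	"errors"
	"fmt"
)

type mount struct {
	UUID     string
	DeviceID string
	Server   string
}

func checkDuplicateDevices(mounts []mount) error {
	seen := map[string]mount{} // DeviceID -> first mount seen with that ID
	problem := false
	for _, mnt := range mounts {
		if first, dup := seen[mnt.DeviceID]; dup && first.UUID != mnt.UUID && mnt.DeviceID != "" {
			// Same backend device reported under two different volume UUIDs.
			fmt.Printf("config error: device %s is mounted with multiple volume UUIDs: %s on %s, and %s on %s\n",
				mnt.DeviceID, first.UUID, first.Server, mnt.UUID, mnt.Server)
			problem = true
			continue
		}
		seen[mnt.DeviceID] = mnt
	}
	if problem {
		return errors.New("cannot continue with config errors (see above)")
	}
	return nil
}

func main() {
	// Two mounts claim the same DeviceID with different UUIDs, as in the
	// TestRefuseSameDeviceDifferentVolumes stub above, so an error is returned.
	err := checkDuplicateDevices([]mount{
		{UUID: "zzzzz-ivpuk-000000000000000", DeviceID: "keep0-vol0", Server: "keep0"},
		{UUID: "zzzzz-ivpuk-000000000000001", DeviceID: "keep0-vol0", Server: "keep1"},
	})
	fmt.Println(err) // cannot continue with config errors (see above)
}
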
commit 11864d817434e1f3e36cf3c0ef9ab37736938f65
Author: Tom Clegg <tom@curii.com>
Date:   Fri Dec 3 10:54:59 2021 -0500

    18547: Use volume UUID instead of DeviceID to deduplicate mounts.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
index bb590e13b..fa01d512b 100644
--- a/services/keep-balance/balance.go
+++ b/services/keep-balance/balance.go
@@ -217,8 +217,8 @@ func (bal *Balancer) cleanupMounts() {
 	rwdev := map[string]*KeepService{}
 	for _, srv := range bal.KeepServices {
 		for _, mnt := range srv.mounts {
-			if !mnt.ReadOnly && mnt.DeviceID != "" {
-				rwdev[mnt.DeviceID] = srv
+			if !mnt.ReadOnly {
+				rwdev[mnt.UUID] = srv
 			}
 		}
 	}
@@ -227,8 +227,8 @@ func (bal *Balancer) cleanupMounts() {
 	for _, srv := range bal.KeepServices {
 		var dedup []*KeepMount
 		for _, mnt := range srv.mounts {
-			if mnt.ReadOnly && rwdev[mnt.DeviceID] != nil {
-				bal.logf("skipping srv %s readonly mount %q because same device %q is mounted read-write on srv %s", srv, mnt.UUID, mnt.DeviceID, rwdev[mnt.DeviceID])
+			if mnt.ReadOnly && rwdev[mnt.UUID] != nil {
+				bal.logf("skipping srv %s readonly mount %q because same volume is mounted read-write on srv %s", srv, mnt.UUID, rwdev[mnt.UUID])
 			} else {
 				dedup = append(dedup, mnt)
 			}
@@ -357,12 +357,10 @@ func (bal *Balancer) GetCurrentState(ctx context.Context, c *arvados.Client, pag
 	deviceMount := map[string]*KeepMount{}
 	for _, srv := range bal.KeepServices {
 		for _, mnt := range srv.mounts {
-			equiv := deviceMount[mnt.DeviceID]
+			equiv := deviceMount[mnt.UUID]
 			if equiv == nil {
 				equiv = mnt
-				if mnt.DeviceID != "" {
-					deviceMount[mnt.DeviceID] = equiv
-				}
+				deviceMount[mnt.UUID] = equiv
 			}
 			equivMount[equiv] = append(equivMount[equiv], mnt)
 		}
@@ -667,7 +665,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
 				// new/remaining replicas uniformly
 				// across qualifying mounts on a given
 				// server.
-				return rendezvousLess(si.mnt.DeviceID, sj.mnt.DeviceID, blkid)
+				return rendezvousLess(si.mnt.UUID, sj.mnt.UUID, blkid)
 			}
 		})
 
@@ -692,7 +690,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
 		// and returns true if all requirements are met.
 		trySlot := func(i int) bool {
 			slot := slots[i]
-			if wantMnt[slot.mnt] || wantDev[slot.mnt.DeviceID] {
+			if wantMnt[slot.mnt] || wantDev[slot.mnt.UUID] {
 				// Already allocated a replica to this
 				// backend device, possibly on a
 				// different server.
@@ -707,9 +705,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
 				slots[i].want = true
 				wantSrv[slot.mnt.KeepService] = true
 				wantMnt[slot.mnt] = true
-				if slot.mnt.DeviceID != "" {
-					wantDev[slot.mnt.DeviceID] = true
-				}
+				wantDev[slot.mnt.UUID] = true
 				replWant += slot.mnt.Replication
 			}
 			return replProt >= desired && replWant >= desired
@@ -751,7 +747,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
 		// haven't already been added to unsafeToDelete
 		// because the servers report different Mtimes.
 		for _, slot := range slots {
-			if slot.repl != nil && wantDev[slot.mnt.DeviceID] {
+			if slot.repl != nil && wantDev[slot.mnt.UUID] {
 				unsafeToDelete[slot.repl.Mtime] = true
 			}
 		}
@@ -834,7 +830,7 @@ func computeBlockState(slots []slot, onlyCount map[*KeepMount]bool, have, needRe
 		if onlyCount != nil && !onlyCount[slot.mnt] {
 			continue
 		}
-		if countedDev[slot.mnt.DeviceID] {
+		if countedDev[slot.mnt.UUID] {
 			continue
 		}
 		switch {
@@ -848,9 +844,7 @@ func computeBlockState(slots []slot, onlyCount map[*KeepMount]bool, have, needRe
 			bbs.pulling++
 			repl += slot.mnt.Replication
 		}
-		if slot.mnt.DeviceID != "" {
-			countedDev[slot.mnt.DeviceID] = true
-		}
+		countedDev[slot.mnt.UUID] = true
 	}
 	if repl < needRepl {
 		bbs.unachievable = true
diff --git a/services/keep-balance/balance_test.go b/services/keep-balance/balance_test.go
index c529ac150..df04145b9 100644
--- a/services/keep-balance/balance_test.go
+++ b/services/keep-balance/balance_test.go
@@ -167,8 +167,8 @@ func (bal *balancerSuite) testMultipleViews(c *check.C, readonly bool) {
 		srv.mounts[0].KeepMount.DeviceID = fmt.Sprintf("writable-by-srv-%x", i)
 		srv.mounts = append(srv.mounts, &KeepMount{
 			KeepMount: arvados.KeepMount{
-				DeviceID:       fmt.Sprintf("writable-by-srv-%x", (i+1)%len(bal.srvs)),
-				UUID:           fmt.Sprintf("zzzzz-mount-%015x", i<<16),
+				DeviceID:       bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.DeviceID,
+				UUID:           bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.UUID,
 				ReadOnly:       readonly,
 				Replication:    1,
 				StorageClasses: map[string]bool{"default": true},
@@ -347,6 +347,7 @@ func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
 func (bal *balancerSuite) TestCleanupMounts(c *check.C) {
 	bal.srvs[3].mounts[0].KeepMount.ReadOnly = true
 	bal.srvs[3].mounts[0].KeepMount.DeviceID = "abcdef"
+	bal.srvs[14].mounts[0].KeepMount.UUID = bal.srvs[3].mounts[0].KeepMount.UUID
 	bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
 	c.Check(len(bal.srvs[3].mounts), check.Equals, 1)
 	bal.cleanupMounts()
@@ -485,32 +486,32 @@ func (bal *balancerSuite) TestVolumeReplication(c *check.C) {
 }
 
 func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
-	bal.srvs[0].mounts[0].KeepMount.DeviceID = "abcdef"
-	bal.srvs[9].mounts[0].KeepMount.DeviceID = "abcdef"
-	bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
+	dupUUID := bal.srvs[0].mounts[0].KeepMount.UUID
+	bal.srvs[9].mounts[0].KeepMount.UUID = dupUUID
+	bal.srvs[14].mounts[0].KeepMount.UUID = dupUUID
 	// block 0 belongs on servers 3 and e, which have different
-	// device IDs.
+	// UUIDs.
 	bal.try(c, tester{
 		known:      0,
 		desired:    map[string]int{"default": 2},
 		current:    slots{1},
 		shouldPull: slots{0}})
 	// block 1 belongs on servers 0 and 9, which both report
-	// having a replica, but the replicas are on the same device
-	// ID -- so we should pull to the third position (7).
+	// having a replica, but the replicas are on the same volume
+	// -- so we should pull to the third position (7).
 	bal.try(c, tester{
 		known:      1,
 		desired:    map[string]int{"default": 2},
 		current:    slots{0, 1},
 		shouldPull: slots{2}})
-	// block 1 can be pulled to the doubly-mounted device, but the
+	// block 1 can be pulled to the doubly-mounted volume, but the
 	// pull should only be done on the first of the two servers.
 	bal.try(c, tester{
 		known:      1,
 		desired:    map[string]int{"default": 2},
 		current:    slots{2},
 		shouldPull: slots{0}})
-	// block 0 has one replica on a single device mounted on two
+	// block 0 has one replica on a single volume mounted on two
 	// servers (e,9 at positions 1,9). Trashing the replica on 9
 	// would lose the block.
 	bal.try(c, tester{
@@ -523,7 +524,7 @@ func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
 			pulling: 1,
 		}})
 	// block 0 is overreplicated, but the second and third
-	// replicas are the same replica according to DeviceID
+	// replicas are the same replica according to volume UUID
 	// (despite different Mtimes). Don't trash the third replica.
 	bal.try(c, tester{
 		known:   0,
@@ -595,7 +596,7 @@ func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
 		desired:          map[string]int{"default": 2, "special": 1},
 		current:          slots{0, 1},
 		shouldPull:       slots{9},
-		shouldPullMounts: []string{"zzzzz-mount-special00000009"}})
+		shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
 	// If some storage classes are not satisfied, don't trash any
 	// excess replicas. (E.g., if someone desires repl=1 on
 	// class=durable, and we have two copies on class=volatile, we
@@ -605,7 +606,7 @@ func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
 		desired:          map[string]int{"special": 1},
 		current:          slots{0, 1},
 		shouldPull:       slots{9},
-		shouldPullMounts: []string{"zzzzz-mount-special00000009"}})
+		shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
 	// Once storage classes are satisfied, trash excess replicas
 	// that appear earlier in probe order but aren't needed to
 	// satisfy the desired classes.

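The core idea of this second commit, treating mounts that share a volume UUID as a single backend so replicas on that volume are only counted once no matter how many servers expose it, can be sketched in isolation as follows. The keepMount type and groupByVolume helper are hypothetical simplifications of the equivMount bookkeeping in GetCurrentState, not the actual balancer code.

package main

import "fmt"

type keepMount struct {
	UUID   string
	Server string
}

// groupByVolume picks one representative mount per volume UUID and collects
// every mount of that volume under it, so later accounting can treat all of
// them as a single backend volume.
func groupByVolume(mounts []*keepMount) map[*keepMount][]*keepMount {
	first := map[string]*keepMount{}        // volume UUID -> representative mount
	groups := map[*keepMount][]*keepMount{} // representative -> all mounts of that volume
	for _, mnt := range mounts {
		rep := first[mnt.UUID]
		if rep == nil {
			rep = mnt
			first[mnt.UUID] = rep
		}
		groups[rep] = append(groups[rep], mnt)
	}
	return groups
}

func main() {
	a := &keepMount{UUID: "zzzzz-ivpuk-000000000000000", Server: "keep0"}
	b := &keepMount{UUID: "zzzzz-ivpuk-000000000000000", Server: "keep1"} // same volume on a second server
	c := &keepMount{UUID: "zzzzz-ivpuk-000000000000001", Server: "keep2"}
	for rep, mnts := range groupByVolume([]*keepMount{a, b, c}) {
		fmt.Printf("volume %s exposed by %d mount(s)\n", rep.UUID, len(mnts))
	}
}
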
-----------------------------------------------------------------------


hooks/post-receive