[ARVADOS] updated: 2.3.1-12-g56c37ef9b
Git user
git at public.arvados.org
Tue Dec 7 20:08:48 UTC 2021
Summary of changes:
services/keep-balance/balance.go | 54 ++++++++++++++++++++-----------
services/keep-balance/balance_run_test.go | 26 +++++++++++++++
services/keep-balance/balance_test.go | 27 ++++++++--------
3 files changed, 76 insertions(+), 31 deletions(-)
via 56c37ef9b76b992dad59524cae6c34b86bb911d0 (commit)
via 11864d817434e1f3e36cf3c0ef9ab37736938f65 (commit)
from b008c44eaf5c6b45c9f36116601918748aeb8323 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 56c37ef9b76b992dad59524cae6c34b86bb911d0
Author: Tom Clegg <tom at curii.com>
Date: Mon Dec 6 19:32:34 2021 -0500
18547: Error out if two volumes return the same non-empty DeviceID.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
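For context: the check added below in CheckSanityEarly walks every mount on every keep service, remembers the first mount seen for each DeviceID, and aborts balancing if a later mount reports the same non-empty DeviceID under a different volume UUID. A rough standalone sketch of the same idea, using simplified stand-in types rather than the real keep-balance structs:

package main

import (
	"errors"
	"fmt"
)

// Simplified stand-ins for the real KeepMount / KeepService types.
type mount struct {
	UUID     string
	DeviceID string
}

type server struct {
	Name   string
	Mounts []mount
}

// checkDeviceIDs returns an error if two mounts with different volume UUIDs
// report the same non-empty DeviceID.
func checkDeviceIDs(servers []server) error {
	type deviceMount struct {
		srv string
		mnt mount
	}
	deviceMounted := map[string]deviceMount{} // DeviceID -> first mount seen
	problem := false
	for _, srv := range servers {
		for _, mnt := range srv.Mounts {
			first, dup := deviceMounted[mnt.DeviceID]
			if dup && first.mnt.UUID != mnt.UUID && mnt.DeviceID != "" {
				fmt.Printf("config error: device %s is mounted with multiple volume UUIDs: %s on %s, and %s on %s\n",
					mnt.DeviceID, first.mnt.UUID, first.srv, mnt.UUID, srv.Name)
				problem = true
				continue
			}
			deviceMounted[mnt.DeviceID] = deviceMount{srv.Name, mnt}
		}
	}
	if problem {
		return errors.New("cannot continue with config errors (see above)")
	}
	return nil
}

func main() {
	// Two different volume UUIDs claiming the same DeviceID -> error.
	err := checkDeviceIDs([]server{
		{Name: "keep0", Mounts: []mount{{UUID: "zzzzz-ivpuk-000000000keep0", DeviceID: "keep0-vol0"}}},
		{Name: "keep1", Mounts: []mount{{UUID: "zzzzz-ivpuk-000000000keep1", DeviceID: "keep0-vol0"}}},
	})
	fmt.Println(err)
}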
diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
index fa01d512b..eb6f580f4 100644
--- a/services/keep-balance/balance.go
+++ b/services/keep-balance/balance.go
@@ -8,6 +8,7 @@ import (
"bytes"
"context"
"crypto/md5"
+ "errors"
"fmt"
"io"
"io/ioutil"
@@ -266,6 +267,29 @@ func (bal *Balancer) CheckSanityEarly(c *arvados.Client) error {
}
}
+ mountProblem := false
+ type deviceMount struct {
+ srv *KeepService
+ mnt *KeepMount
+ }
+ deviceMounted := map[string]deviceMount{} // DeviceID -> mount
+ for _, srv := range bal.KeepServices {
+ for _, mnt := range srv.mounts {
+ if first, dup := deviceMounted[mnt.DeviceID]; dup && first.mnt.UUID != mnt.UUID && mnt.DeviceID != "" {
+ bal.logf("config error: device %s is mounted with multiple volume UUIDs: %s on %s, and %s on %s",
+ mnt.DeviceID,
+ first.mnt.UUID, first.srv,
+ mnt.UUID, srv)
+ mountProblem = true
+ continue
+ }
+ deviceMounted[mnt.DeviceID] = deviceMount{srv, mnt}
+ }
+ }
+ if mountProblem {
+ return errors.New("cannot continue with config errors (see above)")
+ }
+
var checkPage arvados.CollectionList
if err = c.RequestAndDecode(&checkPage, "GET", "arvados/v1/collections", nil, arvados.ResourceListParams{
Limit: new(int),
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index 4e2c6803c..0d1b6b591 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -397,6 +397,32 @@ func (s *runSuite) TestRefuseNonAdmin(c *check.C) {
c.Check(pullReqs.Count(), check.Equals, 0)
}
+func (s *runSuite) TestRefuseSameDeviceDifferentVolumes(c *check.C) {
+ opts := RunOptions{
+ CommitPulls: true,
+ CommitTrash: true,
+ Logger: ctxlog.TestLogger(c),
+ }
+ s.stub.serveCurrentUserAdmin()
+ s.stub.serveZeroCollections()
+ s.stub.serveKeepServices(stubServices)
+ s.stub.mux.HandleFunc("/mounts", func(w http.ResponseWriter, r *http.Request) {
+ hostid := r.Host[:5] // "keep0.zzzzz.arvadosapi.com:25107" => "keep0"
+ json.NewEncoder(w).Encode([]arvados.KeepMount{{
+ UUID: "zzzzz-ivpuk-0000000000" + hostid,
+ DeviceID: "keep0-vol0",
+ StorageClasses: map[string]bool{"default": true},
+ }})
+ })
+ trashReqs := s.stub.serveKeepstoreTrash()
+ pullReqs := s.stub.serveKeepstorePull()
+ srv := s.newServer(&opts)
+ _, err := srv.runOnce()
+ c.Check(err, check.ErrorMatches, "cannot continue with config errors.*")
+ c.Check(trashReqs.Count(), check.Equals, 0)
+ c.Check(pullReqs.Count(), check.Equals, 0)
+}
+
func (s *runSuite) TestWriteLostBlocks(c *check.C) {
lostf, err := ioutil.TempFile("", "keep-balance-lost-blocks-test-")
c.Assert(err, check.IsNil)
commit 11864d817434e1f3e36cf3c0ef9ab37736938f65
Author: Tom Clegg <tom at curii.com>
Date: Fri Dec 3 10:54:59 2021 -0500
18547: Use volume UUID instead of DeviceID to deduplicate mounts.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
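The change below is largely mechanical: every place that used mnt.DeviceID as the key for "same backend volume" (read-only dedup, equivalent-mount grouping, rendezvous ordering, wantDev/countedDev bookkeeping) now keys on mnt.UUID, so mounts of one volume on several servers still collapse to a single entry even when DeviceID is empty. A minimal illustration of the keying idea, with made-up types rather than the real KeepMount:

package main

import "fmt"

type mount struct {
	UUID   string // volume UUID, now the dedup key
	Server string
}

func main() {
	mounts := []mount{
		{UUID: "zzzzz-ivpuk-000000000000000", Server: "keep0"},
		{UUID: "zzzzz-ivpuk-000000000000000", Server: "keep1"}, // same volume, second server
		{UUID: "zzzzz-ivpuk-000000000000001", Server: "keep1"},
	}

	// Group by volume UUID: each key represents one physical volume,
	// however many servers happen to mount it.
	byVolume := map[string][]mount{}
	for _, m := range mounts {
		byVolume[m.UUID] = append(byVolume[m.UUID], m)
	}
	for uuid, group := range byVolume {
		fmt.Printf("volume %s reachable via %d mount(s)\n", uuid, len(group))
	}
}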
diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
index bb590e13b..fa01d512b 100644
--- a/services/keep-balance/balance.go
+++ b/services/keep-balance/balance.go
@@ -217,8 +217,8 @@ func (bal *Balancer) cleanupMounts() {
rwdev := map[string]*KeepService{}
for _, srv := range bal.KeepServices {
for _, mnt := range srv.mounts {
- if !mnt.ReadOnly && mnt.DeviceID != "" {
- rwdev[mnt.DeviceID] = srv
+ if !mnt.ReadOnly {
+ rwdev[mnt.UUID] = srv
}
}
}
@@ -227,8 +227,8 @@ func (bal *Balancer) cleanupMounts() {
for _, srv := range bal.KeepServices {
var dedup []*KeepMount
for _, mnt := range srv.mounts {
- if mnt.ReadOnly && rwdev[mnt.DeviceID] != nil {
- bal.logf("skipping srv %s readonly mount %q because same device %q is mounted read-write on srv %s", srv, mnt.UUID, mnt.DeviceID, rwdev[mnt.DeviceID])
+ if mnt.ReadOnly && rwdev[mnt.UUID] != nil {
+ bal.logf("skipping srv %s readonly mount %q because same volume is mounted read-write on srv %s", srv, mnt.UUID, rwdev[mnt.UUID])
} else {
dedup = append(dedup, mnt)
}
@@ -357,12 +357,10 @@ func (bal *Balancer) GetCurrentState(ctx context.Context, c *arvados.Client, pag
deviceMount := map[string]*KeepMount{}
for _, srv := range bal.KeepServices {
for _, mnt := range srv.mounts {
- equiv := deviceMount[mnt.DeviceID]
+ equiv := deviceMount[mnt.UUID]
if equiv == nil {
equiv = mnt
- if mnt.DeviceID != "" {
- deviceMount[mnt.DeviceID] = equiv
- }
+ deviceMount[mnt.UUID] = equiv
}
equivMount[equiv] = append(equivMount[equiv], mnt)
}
@@ -667,7 +665,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
// new/remaining replicas uniformly
// across qualifying mounts on a given
// server.
- return rendezvousLess(si.mnt.DeviceID, sj.mnt.DeviceID, blkid)
+ return rendezvousLess(si.mnt.UUID, sj.mnt.UUID, blkid)
}
})
@@ -692,7 +690,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
// and returns true if all requirements are met.
trySlot := func(i int) bool {
slot := slots[i]
- if wantMnt[slot.mnt] || wantDev[slot.mnt.DeviceID] {
+ if wantMnt[slot.mnt] || wantDev[slot.mnt.UUID] {
// Already allocated a replica to this
// backend device, possibly on a
// different server.
@@ -707,9 +705,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
slots[i].want = true
wantSrv[slot.mnt.KeepService] = true
wantMnt[slot.mnt] = true
- if slot.mnt.DeviceID != "" {
- wantDev[slot.mnt.DeviceID] = true
- }
+ wantDev[slot.mnt.UUID] = true
replWant += slot.mnt.Replication
}
return replProt >= desired && replWant >= desired
@@ -751,7 +747,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
// haven't already been added to unsafeToDelete
// because the servers report different Mtimes.
for _, slot := range slots {
- if slot.repl != nil && wantDev[slot.mnt.DeviceID] {
+ if slot.repl != nil && wantDev[slot.mnt.UUID] {
unsafeToDelete[slot.repl.Mtime] = true
}
}
@@ -834,7 +830,7 @@ func computeBlockState(slots []slot, onlyCount map[*KeepMount]bool, have, needRe
if onlyCount != nil && !onlyCount[slot.mnt] {
continue
}
- if countedDev[slot.mnt.DeviceID] {
+ if countedDev[slot.mnt.UUID] {
continue
}
switch {
@@ -848,9 +844,7 @@ func computeBlockState(slots []slot, onlyCount map[*KeepMount]bool, have, needRe
bbs.pulling++
repl += slot.mnt.Replication
}
- if slot.mnt.DeviceID != "" {
- countedDev[slot.mnt.DeviceID] = true
- }
+ countedDev[slot.mnt.UUID] = true
}
if repl < needRepl {
bbs.unachievable = true
diff --git a/services/keep-balance/balance_test.go b/services/keep-balance/balance_test.go
index c529ac150..df04145b9 100644
--- a/services/keep-balance/balance_test.go
+++ b/services/keep-balance/balance_test.go
@@ -167,8 +167,8 @@ func (bal *balancerSuite) testMultipleViews(c *check.C, readonly bool) {
srv.mounts[0].KeepMount.DeviceID = fmt.Sprintf("writable-by-srv-%x", i)
srv.mounts = append(srv.mounts, &KeepMount{
KeepMount: arvados.KeepMount{
- DeviceID: fmt.Sprintf("writable-by-srv-%x", (i+1)%len(bal.srvs)),
- UUID: fmt.Sprintf("zzzzz-mount-%015x", i<<16),
+ DeviceID: bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.DeviceID,
+ UUID: bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.UUID,
ReadOnly: readonly,
Replication: 1,
StorageClasses: map[string]bool{"default": true},
@@ -347,6 +347,7 @@ func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
func (bal *balancerSuite) TestCleanupMounts(c *check.C) {
bal.srvs[3].mounts[0].KeepMount.ReadOnly = true
bal.srvs[3].mounts[0].KeepMount.DeviceID = "abcdef"
+ bal.srvs[14].mounts[0].KeepMount.UUID = bal.srvs[3].mounts[0].KeepMount.UUID
bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
c.Check(len(bal.srvs[3].mounts), check.Equals, 1)
bal.cleanupMounts()
@@ -485,32 +486,32 @@ func (bal *balancerSuite) TestVolumeReplication(c *check.C) {
}
func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
- bal.srvs[0].mounts[0].KeepMount.DeviceID = "abcdef"
- bal.srvs[9].mounts[0].KeepMount.DeviceID = "abcdef"
- bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
+ dupUUID := bal.srvs[0].mounts[0].KeepMount.UUID
+ bal.srvs[9].mounts[0].KeepMount.UUID = dupUUID
+ bal.srvs[14].mounts[0].KeepMount.UUID = dupUUID
// block 0 belongs on servers 3 and e, which have different
- // device IDs.
+ // UUIDs.
bal.try(c, tester{
known: 0,
desired: map[string]int{"default": 2},
current: slots{1},
shouldPull: slots{0}})
// block 1 belongs on servers 0 and 9, which both report
- // having a replica, but the replicas are on the same device
- // ID -- so we should pull to the third position (7).
+ // having a replica, but the replicas are on the same volume
+ // -- so we should pull to the third position (7).
bal.try(c, tester{
known: 1,
desired: map[string]int{"default": 2},
current: slots{0, 1},
shouldPull: slots{2}})
- // block 1 can be pulled to the doubly-mounted device, but the
+ // block 1 can be pulled to the doubly-mounted volume, but the
// pull should only be done on the first of the two servers.
bal.try(c, tester{
known: 1,
desired: map[string]int{"default": 2},
current: slots{2},
shouldPull: slots{0}})
- // block 0 has one replica on a single device mounted on two
+ // block 0 has one replica on a single volume mounted on two
// servers (e,9 at positions 1,9). Trashing the replica on 9
// would lose the block.
bal.try(c, tester{
@@ -523,7 +524,7 @@ func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
pulling: 1,
}})
// block 0 is overreplicated, but the second and third
- // replicas are the same replica according to DeviceID
+ // replicas are the same replica according to volume UUID
// (despite different Mtimes). Don't trash the third replica.
bal.try(c, tester{
known: 0,
@@ -595,7 +596,7 @@ func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
desired: map[string]int{"default": 2, "special": 1},
current: slots{0, 1},
shouldPull: slots{9},
- shouldPullMounts: []string{"zzzzz-mount-special00000009"}})
+ shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
// If some storage classes are not satisfied, don't trash any
// excess replicas. (E.g., if someone desires repl=1 on
// class=durable, and we have two copies on class=volatile, we
@@ -605,7 +606,7 @@ func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
desired: map[string]int{"special": 1},
current: slots{0, 1},
shouldPull: slots{9},
- shouldPullMounts: []string{"zzzzz-mount-special00000009"}})
+ shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
// Once storage classes are satisfied, trash excess replicas
// that appear earlier in probe order but aren't needed to
// satisfy the desired classes.
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list