[ARVADOS] updated: ac9cacc92f66867673e186716616452cb628352f
Git user
git at public.curoverse.com
Tue May 24 16:38:14 EDT 2016
Summary of changes:
services/keep-balance/balance.go | 29 ++++++++++++++++++++++++++---
services/keep-balance/balance_run_test.go | 17 ++++++++++-------
2 files changed, 36 insertions(+), 10 deletions(-)
via ac9cacc92f66867673e186716616452cb628352f (commit)
from c4c44a33aabe63ad95d3bd02899390ae269cd1c8 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit ac9cacc92f66867673e186716616452cb628352f
Author: Tom Clegg <tom at curoverse.com>
Date: Tue May 24 16:38:08 2016 -0400
9162: Fix race: clear trash lists before getting indexes.
diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
index a0ba995..bbb4509 100644
--- a/services/keep-balance/balance.go
+++ b/services/keep-balance/balance.go
@@ -65,6 +65,11 @@ func (bal *Balancer) Run(config Config, runOptions RunOptions) (err error) {
if err = bal.CheckSanityEarly(&config.Client); err != nil {
return
}
+ if runOptions.CommitTrash {
+ if err = bal.ClearTrashLists(&config.Client); err != nil {
+ return
+ }
+ }
if err = bal.GetCurrentState(&config.Client); err != nil {
return
}
@@ -131,6 +136,27 @@ func (bal *Balancer) CheckSanityEarly(c *arvados.Client) error {
return nil
}
+// ClearTrashLists sends an empty trash list to each keep
+// service. Calling this before GetCurrentState avoids races.
+//
+// When a block appears in an index, we assume that replica will still
+// exist after we delete other replicas on other servers. However,
+// it's possible that a previous rebalancing operation made different
+// decisions (e.g., servers were added/removed, and rendezvous order
+// changed). In this case, the replica might already be on that
+// server's trash list, and it might be deleted before we send a
+// replacement trash list.
+//
+// We avoid this problem if we clear all trash lists before getting
+// indexes. (We also assume there is only one rebalancing process
+// running at a time.)
+func (bal *Balancer) ClearTrashLists(c *arvados.Client) error {
+ for _, srv := range bal.KeepServices {
+ srv.ChangeSet = ChangeSet{}
+ }
+ return bal.CommitTrash(c)
+}
+
// GetCurrentState determines the current replication state, and the
// desired replication level, for every block that is either
// retrievable or referenced.
@@ -546,9 +572,6 @@ func (bal *Balancer) CommitTrash(c *arvados.Client) error {
}
func (bal *Balancer) commitAsync(c *arvados.Client, label string, f func(srv *KeepService) error) error {
- if err := bal.CheckSanityLate(); err != nil {
- return err
- }
errs := make(chan error)
for _, srv := range bal.KeepServices {
go func(srv *KeepService) {
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index e6c2ea0..c3e8420 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -216,9 +216,11 @@ func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
s.stub.serveFourDiskKeepServices()
s.stub.serveKeepstoreIndexFoo4Bar1()
trashReqs := s.stub.serveKeepstoreTrash()
+ pullReqs := s.stub.serveKeepstorePull()
err := (&Balancer{}).Run(s.config, opts)
c.Check(err, check.ErrorMatches, "received zero collections")
- c.Check(trashReqs.Count(), check.Equals, 0)
+ c.Check(trashReqs.Count(), check.Equals, 4)
+ c.Check(pullReqs.Count(), check.Equals, 0)
}
func (s *runSuite) TestServiceTypes(c *check.C) {
@@ -293,7 +295,7 @@ func (s *runSuite) TestCommit(c *check.C) {
var bal Balancer
err := bal.Run(s.config, opts)
c.Check(err, check.IsNil)
- c.Check(trashReqs.Count(), check.Equals, 4)
+ c.Check(trashReqs.Count(), check.Equals, 8)
c.Check(pullReqs.Count(), check.Equals, 4)
stats := bal.getStatistics()
// "foo" block is overreplicated by 2
@@ -321,12 +323,13 @@ func (s *runSuite) TestRunForever(c *check.C) {
s.config.RunPeriod = duration(time.Millisecond)
go RunForever(s.config, opts, stop)
- // Each run should send 4 pull + 4 trash lists. We should
- // complete four runs in much less than 1 minute.
- for t0 := time.Now(); trashReqs.Count() < 16 && time.Since(t0) < time.Minute; {
+ // Each run should send 4 clear trash lists + 4 pull lists + 4
+ // trash lists. We should complete four runs in much less than
+ // a second.
+ for t0 := time.Now(); pullReqs.Count() < 16 && time.Since(t0) < 10*time.Second; {
time.Sleep(time.Millisecond)
}
stop <- true
- c.Check(trashReqs.Count() >= 16, check.Equals, true)
- c.Check(pullReqs.Count(), check.Equals, trashReqs.Count())
+ c.Check(pullReqs.Count() >= 16, check.Equals, true)
+ c.Check(trashReqs.Count(), check.Equals, 2*pullReqs.Count())
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list