[ARVADOS] updated: ac9cacc92f66867673e186716616452cb628352f

Git user git at public.curoverse.com
Tue May 24 16:38:14 EDT 2016


Summary of changes:
 services/keep-balance/balance.go          | 29 ++++++++++++++++++++++++++---
 services/keep-balance/balance_run_test.go | 17 ++++++++++-------
 2 files changed, 36 insertions(+), 10 deletions(-)

       via  ac9cacc92f66867673e186716616452cb628352f (commit)
      from  c4c44a33aabe63ad95d3bd02899390ae269cd1c8 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit ac9cacc92f66867673e186716616452cb628352f
Author: Tom Clegg <tom at curoverse.com>
Date:   Tue May 24 16:38:08 2016 -0400

    9162: Fix race: clear trash lists before getting indexes.

diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
index a0ba995..bbb4509 100644
--- a/services/keep-balance/balance.go
+++ b/services/keep-balance/balance.go
@@ -65,6 +65,11 @@ func (bal *Balancer) Run(config Config, runOptions RunOptions) (err error) {
 	if err = bal.CheckSanityEarly(&config.Client); err != nil {
 		return
 	}
+	if runOptions.CommitTrash {
+		if err = bal.ClearTrashLists(&config.Client); err != nil {
+			return
+		}
+	}
 	if err = bal.GetCurrentState(&config.Client); err != nil {
 		return
 	}
@@ -131,6 +136,27 @@ func (bal *Balancer) CheckSanityEarly(c *arvados.Client) error {
 	return nil
 }
 
+// ClearTrashLists sends an empty trash list to each keep
+// service. Calling this before GetCurrentState avoids races.
+//
+// When a block appears in an index, we assume that replica will still
+// exist after we delete other replicas on other servers. However,
+// it's possible that a previous rebalancing operation made different
+// decisions (e.g., servers were added/removed, and rendezvous order
+// changed). In this case, the replica might already be on that
+// server's trash list, and it might be deleted before we send a
+// replacement trash list.
+//
+// We avoid this problem if we clear all trash lists before getting
+// indexes. (We also assume there is only one rebalancing process
+// running at a time.)
+func (bal *Balancer) ClearTrashLists(c *arvados.Client) error {
+	for _, srv := range bal.KeepServices {
+		srv.ChangeSet = ChangeSet{}
+	}
+	return bal.CommitTrash(c)
+}
+
 // GetCurrentState determines the current replication state, and the
 // desired replication level, for every block that is either
 // retrievable or referenced.
@@ -546,9 +572,6 @@ func (bal *Balancer) CommitTrash(c *arvados.Client) error {
 }
 
 func (bal *Balancer) commitAsync(c *arvados.Client, label string, f func(srv *KeepService) error) error {
-	if err := bal.CheckSanityLate(); err != nil {
-		return err
-	}
 	errs := make(chan error)
 	for _, srv := range bal.KeepServices {
 		go func(srv *KeepService) {
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index e6c2ea0..c3e8420 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -216,9 +216,11 @@ func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
 	s.stub.serveFourDiskKeepServices()
 	s.stub.serveKeepstoreIndexFoo4Bar1()
 	trashReqs := s.stub.serveKeepstoreTrash()
+	pullReqs := s.stub.serveKeepstorePull()
 	err := (&Balancer{}).Run(s.config, opts)
 	c.Check(err, check.ErrorMatches, "received zero collections")
-	c.Check(trashReqs.Count(), check.Equals, 0)
+	c.Check(trashReqs.Count(), check.Equals, 4)
+	c.Check(pullReqs.Count(), check.Equals, 0)
 }
 
 func (s *runSuite) TestServiceTypes(c *check.C) {
@@ -293,7 +295,7 @@ func (s *runSuite) TestCommit(c *check.C) {
 	var bal Balancer
 	err := bal.Run(s.config, opts)
 	c.Check(err, check.IsNil)
-	c.Check(trashReqs.Count(), check.Equals, 4)
+	c.Check(trashReqs.Count(), check.Equals, 8)
 	c.Check(pullReqs.Count(), check.Equals, 4)
 	stats := bal.getStatistics()
 	// "foo" block is overreplicated by 2
@@ -321,12 +323,13 @@ func (s *runSuite) TestRunForever(c *check.C) {
 	s.config.RunPeriod = duration(time.Millisecond)
 	go RunForever(s.config, opts, stop)
 
-	// Each run should send 4 pull + 4 trash lists. We should
-	// complete four runs in much less than 1 minute.
-	for t0 := time.Now(); trashReqs.Count() < 16 && time.Since(t0) < time.Minute; {
+	// Each run should send 4 clear trash lists + 4 pull lists + 4
+	// trash lists. We should complete four runs in much less than
+	// a second.
+	for t0 := time.Now(); pullReqs.Count() < 16 && time.Since(t0) < 10*time.Second; {
 		time.Sleep(time.Millisecond)
 	}
 	stop <- true
-	c.Check(trashReqs.Count() >= 16, check.Equals, true)
-	c.Check(pullReqs.Count(), check.Equals, trashReqs.Count())
+	c.Check(pullReqs.Count() >= 16, check.Equals, true)
+	c.Check(trashReqs.Count(), check.Equals, 2*pullReqs.Count())
 }

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list