[ARVADOS] updated: 1.3.0-197-gc1aa581b3

Git user git at public.curoverse.com
Fri Jan 25 16:32:16 EST 2019


Summary of changes:
 build/run-build-packages.sh                   |  5 +-
 lib/dispatchcloud/scheduler/interfaces.go     |  5 +-
 lib/dispatchcloud/scheduler/run_queue_test.go | 46 +++++++++----
 lib/dispatchcloud/worker/pool.go              |  6 ++
 lib/dispatchcloud/worker/pool_test.go         | 96 +++++++++++++++++++++++++--
 lib/dispatchcloud/worker/worker.go            |  4 +-
 6 files changed, 136 insertions(+), 26 deletions(-)

       via  c1aa581b3511b89527f185fd3fac7447aa33c9fc (commit)
       via  8808efaa1c87688b6b89e60c0337b6f0589df779 (commit)
       via  804ed25e843c38c4a5bf381f70dbaa0a61072a86 (commit)
       via  9668d19ed127b01f986ae7defc657c0fd23e604b (commit)
       via  c88ffa1a163c929ffa963af3eb1bcdbca1f6b6f2 (commit)
       via  be8ed479042df4fdefe1fd18c1e2e984e1c99bc0 (commit)
      from  630601173bda46a7c02b5fbf43eaf5422a95b7d7 (commit)

The revisions listed above that are new to this repository have
not appeared in any other notification email, so they are listed
in full below.


commit c1aa581b3511b89527f185fd3fac7447aa33c9fc
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Jan 25 16:31:48 2019 -0500

    14325: Build crunch-dispatch-cloud package.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/build/run-build-packages.sh b/build/run-build-packages.sh
index f316c563b..d81f2de39 100755
--- a/build/run-build-packages.sh
+++ b/build/run-build-packages.sh
@@ -294,9 +294,8 @@ package_go_binary cmd/arvados-server arvados-server \
     "Arvados server daemons"
 package_go_binary cmd/arvados-server arvados-controller \
     "Arvados cluster controller daemon"
-# No package until #14325
-#package_go_binary cmd/arvados-server crunch-dispatch-cloud \
-#    "Arvados cluster cloud dispatch"
+package_go_binary cmd/arvados-server crunch-dispatch-cloud \
+    "Arvados cluster cloud dispatch"
 package_go_binary sdk/go/crunchrunner crunchrunner \
     "Crunchrunner executes a command inside a container and uploads the output"
 package_go_binary services/arv-git-httpd arvados-git-httpd \

commit 8808efaa1c87688b6b89e60c0337b6f0589df779
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Jan 25 16:28:31 2019 -0500

    14325: Point to API details in interface doc comments.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/scheduler/interfaces.go b/lib/dispatchcloud/scheduler/interfaces.go
index e1b575c8a..18cdc94fa 100644
--- a/lib/dispatchcloud/scheduler/interfaces.go
+++ b/lib/dispatchcloud/scheduler/interfaces.go
@@ -13,7 +13,8 @@ import (
 )
 
 // A ContainerQueue is a set of containers that need to be started or
-// stopped. Implemented by container.Queue and test stubs.
+// stopped. Implemented by container.Queue and test stubs. See
+// container.Queue method documentation for details.
 type ContainerQueue interface {
 	Entries() (entries map[string]container.QueueEnt, updated time.Time)
 	Lock(uuid string) error
@@ -28,7 +29,7 @@ type ContainerQueue interface {
 
 // A WorkerPool asynchronously starts and stops worker VMs, and starts
 // and stops containers on them. Implemented by worker.Pool and test
-// stubs.
+// stubs. See worker.Pool method documentation for details.
 type WorkerPool interface {
 	Running() map[string]time.Time
 	Unallocated() map[arvados.InstanceType]int

commit 804ed25e843c38c4a5bf381f70dbaa0a61072a86
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Jan 25 15:24:19 2019 -0500

    14325: Test resuming worker pool state after restart.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
index 6a6cdc423..60e21b716 100644
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -5,11 +5,14 @@
 package worker
 
 import (
+	"sort"
+	"strings"
 	"time"
 
 	"git.curoverse.com/arvados.git/lib/cloud"
 	"git.curoverse.com/arvados.git/lib/dispatchcloud/test"
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 	check "gopkg.in/check.v1"
 )
@@ -34,6 +37,97 @@ func (suite *PoolSuite) SetUpSuite(c *check.C) {
 	logrus.StandardLogger().SetLevel(logrus.DebugLevel)
 }
 
+func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) {
+	type1 := test.InstanceType(1)
+	type2 := test.InstanceType(2)
+	type3 := test.InstanceType(3)
+	waitForIdle := func(pool *Pool, notify <-chan struct{}) {
+		timeout := time.NewTimer(time.Second)
+		for {
+			instances := pool.Instances()
+			sort.Slice(instances, func(i, j int) bool {
+				return strings.Compare(instances[i].ArvadosInstanceType, instances[j].ArvadosInstanceType) < 0
+			})
+			if len(instances) == 3 &&
+				instances[0].ArvadosInstanceType == type1.Name &&
+				instances[0].WorkerState == StateIdle.String() &&
+				instances[1].ArvadosInstanceType == type1.Name &&
+				instances[1].WorkerState == StateIdle.String() &&
+				instances[2].ArvadosInstanceType == type2.Name &&
+				instances[2].WorkerState == StateIdle.String() {
+				return
+			}
+			select {
+			case <-timeout.C:
+				c.Logf("pool.Instances() == %#v", instances)
+				c.Error("timed out")
+				return
+			case <-notify:
+			}
+		}
+	}
+
+	logger := logrus.StandardLogger()
+	driver := &test.StubDriver{}
+	is, err := driver.InstanceSet(nil, "", logger)
+	c.Assert(err, check.IsNil)
+
+	newExecutor := func(cloud.Instance) Executor {
+		return stubExecutor{
+			"crunch-run --list": stubResp{},
+			"true":              stubResp{},
+		}
+	}
+
+	cluster := &arvados.Cluster{
+		Dispatch: arvados.Dispatch{
+			MaxProbesPerSecond: 1000,
+			ProbeInterval:      arvados.Duration(time.Millisecond * 10),
+		},
+		CloudVMs: arvados.CloudVMs{
+			BootProbeCommand: "true",
+			SyncInterval:     arvados.Duration(time.Millisecond * 10),
+		},
+		InstanceTypes: arvados.InstanceTypeMap{
+			type1.Name: type1,
+			type2.Name: type2,
+			type3.Name: type3,
+		},
+	}
+
+	pool := NewPool(logger, arvados.NewClientFromEnv(), prometheus.NewRegistry(), is, newExecutor, cluster)
+	notify := pool.Subscribe()
+	defer pool.Unsubscribe(notify)
+	pool.Create(type1)
+	pool.Create(type1)
+	pool.Create(type2)
+	waitForIdle(pool, notify)
+	var heldInstanceID cloud.InstanceID
+	for _, inst := range pool.Instances() {
+		if inst.ArvadosInstanceType == type2.Name {
+			heldInstanceID = cloud.InstanceID(inst.Instance)
+			pool.SetIdleBehavior(heldInstanceID, IdleBehaviorHold)
+		}
+	}
+	pool.Stop()
+
+	c.Log("------- starting new pool, waiting to recover state")
+
+	pool2 := NewPool(logger, arvados.NewClientFromEnv(), prometheus.NewRegistry(), is, newExecutor, cluster)
+	notify2 := pool2.Subscribe()
+	defer pool2.Unsubscribe(notify2)
+	waitForIdle(pool2, notify2)
+	for _, inst := range pool2.Instances() {
+		if inst.ArvadosInstanceType == type2.Name {
+			c.Check(inst.Instance, check.Equals, heldInstanceID)
+			c.Check(inst.IdleBehavior, check.Equals, IdleBehaviorHold)
+		} else {
+			c.Check(inst.IdleBehavior, check.Equals, IdleBehaviorRun)
+		}
+	}
+	pool2.Stop()
+}
+
 func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
 	lameInstanceSet := &test.LameInstanceSet{Hold: make(chan bool)}
 	type1 := arvados.InstanceType{Name: "a1s", ProviderType: "a1.small", VCPUs: 1, RAM: 1 * GiB, Price: .01}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index a75d2bbb8..baa56adde 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -57,8 +57,8 @@ type IdleBehavior string
 
 const (
 	IdleBehaviorRun   IdleBehavior = "run"   // run containers, or shutdown on idle timeout
-	IdleBehaviorHold               = "hold"  // don't shutdown or run more containers
-	IdleBehaviorDrain              = "drain" // shutdown immediately when idle
+	IdleBehaviorHold  IdleBehavior = "hold"  // don't shutdown or run more containers
+	IdleBehaviorDrain IdleBehavior = "drain" // shutdown immediately when idle
 )
 
 var validIdleBehavior = map[IdleBehavior]bool{

commit 9668d19ed127b01f986ae7defc657c0fd23e604b
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Jan 25 13:28:15 2019 -0500

    14325: Remove obsolete test stubs.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
index aa8195843..6a6cdc423 100644
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -34,14 +34,6 @@ func (suite *PoolSuite) SetUpSuite(c *check.C) {
 	logrus.StandardLogger().SetLevel(logrus.DebugLevel)
 }
 
-func (suite *PoolSuite) TestStartContainer(c *check.C) {
-	// TODO: use an instanceSet stub with an SSH server
-}
-
-func (suite *PoolSuite) TestVerifyHostKey(c *check.C) {
-	// TODO: use an instanceSet stub with an SSH server
-}
-
 func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
 	lameInstanceSet := &test.LameInstanceSet{Hold: make(chan bool)}
 	type1 := arvados.InstanceType{Name: "a1s", ProviderType: "a1.small", VCPUs: 1, RAM: 1 * GiB, Price: .01}

commit c88ffa1a163c929ffa963af3eb1bcdbca1f6b6f2
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Jan 25 13:22:25 2019 -0500

    14325: Clean up unsafe concurrency in tests.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index b586d62c9..090a53435 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -5,6 +5,7 @@
 package scheduler
 
 import (
+	"sync"
 	"time"
 
 	"git.curoverse.com/arvados.git/lib/dispatchcloud/test"
@@ -42,13 +43,24 @@ type stubPool struct {
 	creates   []arvados.InstanceType
 	starts    []string
 	shutdowns int
+	sync.Mutex
 }
 
-func (p *stubPool) AtQuota() bool                 { return p.atQuota }
-func (p *stubPool) Subscribe() <-chan struct{}    { return p.notify }
-func (p *stubPool) Unsubscribe(<-chan struct{})   {}
-func (p *stubPool) Running() map[string]time.Time { return p.running }
+func (p *stubPool) AtQuota() bool               { return p.atQuota }
+func (p *stubPool) Subscribe() <-chan struct{}  { return p.notify }
+func (p *stubPool) Unsubscribe(<-chan struct{}) {}
+func (p *stubPool) Running() map[string]time.Time {
+	p.Lock()
+	defer p.Unlock()
+	r := map[string]time.Time{}
+	for k, v := range p.running {
+		r[k] = v
+	}
+	return r
+}
 func (p *stubPool) Unallocated() map[arvados.InstanceType]int {
+	p.Lock()
+	defer p.Unlock()
 	r := map[arvados.InstanceType]int{}
 	for it, n := range p.unalloc {
 		r[it] = n
@@ -56,6 +68,8 @@ func (p *stubPool) Unallocated() map[arvados.InstanceType]int {
 	return r
 }
 func (p *stubPool) Create(it arvados.InstanceType) bool {
+	p.Lock()
+	defer p.Unlock()
 	p.creates = append(p.creates, it)
 	if p.canCreate < 1 {
 		return false
@@ -65,13 +79,17 @@ func (p *stubPool) Create(it arvados.InstanceType) bool {
 	return true
 }
 func (p *stubPool) KillContainer(uuid string) {
-	p.running[uuid] = time.Now()
+	p.Lock()
+	defer p.Unlock()
+	delete(p.running, uuid)
 }
 func (p *stubPool) Shutdown(arvados.InstanceType) bool {
 	p.shutdowns++
 	return false
 }
 func (p *stubPool) CountWorkers() map[worker.State]int {
+	p.Lock()
+	defer p.Unlock()
 	return map[worker.State]int{
 		worker.StateBooting: len(p.unalloc) - len(p.idle),
 		worker.StateIdle:    len(p.idle),
@@ -79,6 +97,8 @@ func (p *stubPool) CountWorkers() map[worker.State]int {
 	}
 }
 func (p *stubPool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
+	p.Lock()
+	defer p.Unlock()
 	p.starts = append(p.starts, ctr.UUID)
 	if p.idle[it] == 0 {
 		return false
@@ -89,6 +109,10 @@ func (p *stubPool) StartContainer(it arvados.InstanceType, ctr arvados.Container
 	return true
 }
 
+func chooseType(ctr *arvados.Container) (arvados.InstanceType, error) {
+	return test.InstanceType(ctr.RuntimeConstraints.VCPUs), nil
+}
+
 var _ = check.Suite(&SchedulerSuite{})
 
 type SchedulerSuite struct{}
@@ -100,9 +124,7 @@ type SchedulerSuite struct{}
 // create.
 func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
 	queue := test.Queue{
-		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
-			return test.InstanceType(ctr.RuntimeConstraints.VCPUs), nil
-		},
+		ChooseType: chooseType,
 		Containers: []arvados.Container{
 			{
 				UUID:     test.ContainerUUID(1),
@@ -174,9 +196,7 @@ func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) {
 			shouldCreate = append(shouldCreate, test.InstanceType(3))
 		}
 		queue := test.Queue{
-			ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
-				return test.InstanceType(ctr.RuntimeConstraints.VCPUs), nil
-			},
+			ChooseType: chooseType,
 			Containers: []arvados.Container{
 				{
 					UUID:     test.ContainerUUID(2),
@@ -235,9 +255,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 		canCreate: 4,
 	}
 	queue := test.Queue{
-		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
-			return test.InstanceType(ctr.RuntimeConstraints.VCPUs), nil
-		},
+		ChooseType: chooseType,
 		Containers: []arvados.Container{
 			{
 				// create a new worker

commit be8ed479042df4fdefe1fd18c1e2e984e1c99bc0
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Jan 25 13:20:34 2019 -0500

    14325: Document Running() return value.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 2e6cdb162..1665a1e43 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -404,6 +404,12 @@ func (wp *Pool) CountWorkers() map[State]int {
 }
 
 // Running returns the container UUIDs being prepared/run on workers.
+//
+// In the returned map, the time value indicates when the Pool
+// observed that the container process had exited. A container that
+// has not yet exited has a zero time value. The caller should use
+// KillContainer() to garbage-collect the entries for exited
+// containers.
 func (wp *Pool) Running() map[string]time.Time {
 	wp.setupOnce.Do(wp.setup)
 	wp.mtx.Lock()

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list