[ARVADOS] updated: 1.2.0-300-g3877bdae2

Git user git at public.curoverse.com
Thu Nov 15 11:00:27 EST 2018


Summary of changes:
 lib/dispatchcloud/cmd.go                           |   4 +-
 lib/dispatchcloud/container/queue.go               |  89 +++-
 lib/dispatchcloud/dispatcher.go                    |  80 ++--
 lib/dispatchcloud/dispatcher_test.go               |  31 +-
 lib/dispatchcloud/node_size.go                     |  62 ---
 lib/dispatchcloud/readme.go                        |  71 ++-
 lib/dispatchcloud/scheduler/fix_stale_locks.go     |  24 +-
 lib/dispatchcloud/scheduler/interfaces.go          |   3 +
 lib/dispatchcloud/scheduler/map.go                 | 144 ------
 lib/dispatchcloud/scheduler/run_queue.go           | 153 +++++++
 .../scheduler/{map_test.go => run_queue_test.go}   | 240 +++++-----
 lib/dispatchcloud/scheduler/scheduler.go           | 111 +++++
 lib/dispatchcloud/scheduler/sync.go                |  49 +-
 lib/dispatchcloud/ssh_executor/executor.go         |  18 +
 lib/dispatchcloud/test/fixtures.go                 |   1 +
 lib/dispatchcloud/test/queue.go                    |  55 ++-
 lib/dispatchcloud/test/stub_driver.go              |   8 +-
 lib/dispatchcloud/worker/pool.go                   | 504 +++++++--------------
 lib/dispatchcloud/worker/pool_test.go              |  21 +-
 lib/dispatchcloud/worker/worker.go                 | 272 ++++++++++-
 .../crunch-dispatch-slurm/crunch-dispatch-slurm.go |   2 +-
 .../crunch-dispatch-slurm/node_type.go             |  68 +--
 services/crunch-run/background.go                  |   9 +-
 23 files changed, 1129 insertions(+), 890 deletions(-)
 delete mode 100644 lib/dispatchcloud/scheduler/map.go
 create mode 100644 lib/dispatchcloud/scheduler/run_queue.go
 rename lib/dispatchcloud/scheduler/{map_test.go => run_queue_test.go} (52%)
 create mode 100644 lib/dispatchcloud/scheduler/scheduler.go
 copy lib/dispatchcloud/node_size.go => services/crunch-dispatch-slurm/node_type.go (53%)

       via  3877bdae211807bd4c8cc5d824ffbede288dd708 (commit)
       via  0f65ce49f183f7936d694961ce0f5980a8abffcc (commit)
       via  ff8c3cfde1e73c1c0adccb670040e98d82120e99 (commit)
       via  15b7d1d4fa484641d846c1750487e640bfe4be09 (commit)
       via  93671366e7633cbf0ca3cab68395e211e3afc31c (commit)
       via  d6bf3b66c2c938ca2ce2bda3276c0c0f4e54f1b6 (commit)
       via  49f46db107c8226d70ed6d489cd000bfba82ca2f (commit)
       via  3cb857be0aad14d0ebd631305a7a5c0f847811d4 (commit)
       via  3654f222637b00728158300e700d615c4c913b29 (commit)
       via  1c1e245bc3163a7841859d5ff75d0cff59dfc90c (commit)
       via  f33f8084159f701b8a1435e8cf77b7ab7f3417ae (commit)
       via  e5e2dc5e6bf1bce9893517470632295d42bb5f1c (commit)
       via  359dc0dba7e93744c583b431d428de62fb8e0c90 (commit)
       via  17479bd75a29c52470abe0049cb447e114eb39e9 (commit)
       via  67464732a18e0aac11d89a75987151543b704cf6 (commit)
       via  a5ef124795af5b28a97c98e69a89c4c6e99cf0b7 (commit)
       via  a5fe34a8cd05c4e55deaec599347f65e6d662d22 (commit)
       via  8ef9e211fa1eb91deb1b6aa7245cd7e5b8dd4c33 (commit)
       via  468a33e9931c3dbadc83332a3b85d7677c9e4dc3 (commit)
       via  0f159e5eb304e08a0704156046762709bd6e1eba (commit)
       via  3ebb991101ef62387268d594e3e4307a0f5c4e9e (commit)
       via  968519edf4c9150ac65991aea5f977356551b4e4 (commit)
       via  71460cf96c9b43c8ab0a38118c3745a4c0e6d7e9 (commit)
       via  438ce6f86261376880b186775d7ba5fbade560a6 (commit)
       via  c11e1e9b7e04b819a8a1794acc3ffb0a816715b7 (commit)
       via  b10284a496cd2d953b865144a5f28265fc277938 (commit)
       via  8dff5fe14f23f3db9efaf5d71a21668738a9f164 (commit)
       via  c81a653c1e800c40a0c6e1a5d94cddd6620b5e52 (commit)
       via  8162f74b2611b686cc8bcec225c6724c71b8926d (commit)
       via  1b47c040b802c09d8652b952704e2ad09f31aa27 (commit)
       via  8c44ed01463d5e0f0d167b37474b0e78fe11bd8d (commit)
      from  c1b959750a78509a2f5182dbd17685d010826b64 (commit)

The revisions listed above that are new to this repository have
not appeared in any other notification email, so we list them in
full below.


commit 3877bdae211807bd4c8cc5d824ffbede288dd708
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Thu Nov 15 10:59:28 2018 -0500

    14360: Retry shutdown after configured timeout.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
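
Below is a minimal, hypothetical Go sketch of the retry-after-timeout idea this
commit describes; the standalone names (instance, worker, timeoutShutdown,
destroyed) only mirror the diff that follows and are not the actual
lib/dispatchcloud/worker types.

    package main

    import (
    	"errors"
    	"fmt"
    	"time"
    )

    type instance struct{ destroyAttempts int }

    // Destroy fails on the first call to simulate a cloud API error.
    func (inst *instance) Destroy() error {
    	inst.destroyAttempts++
    	if inst.destroyAttempts == 1 {
    		return errors.New("instance could not be destroyed")
    	}
    	return nil
    }

    type worker struct {
    	inst      *instance
    	destroyed time.Time // time of the most recent shutdown attempt
    }

    // shutdown records the attempt time and tries to destroy the instance.
    func (wkr *worker) shutdown() {
    	wkr.destroyed = time.Now()
    	if err := wkr.inst.Destroy(); err != nil {
    		fmt.Println("shutdown failed:", err)
    	}
    }

    func main() {
    	timeoutShutdown := 10 * time.Millisecond
    	wkr := &worker{inst: &instance{}}
    	wkr.shutdown() // first attempt fails

    	// Periodic sync: the instance is still listed after the
    	// configured timeout, so retry the shutdown.
    	time.Sleep(2 * timeoutShutdown)
    	if time.Since(wkr.destroyed) > timeoutShutdown {
    		fmt.Println("worker still listed after shutdown; retrying")
    		wkr.shutdown()
    	}
    }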

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 3ac9370a3..83797e984 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -188,6 +188,8 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 		},
 		HostKey:        hostpriv,
 		AuthorizedKeys: []ssh.PublicKey{dispatchpub},
+
+		ErrorRateDestroy: 0.1,
 	}
 
 	s.cluster = &arvados.Cluster{
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 53e312ea2..bfe0f8f26 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -27,7 +27,10 @@ type StubDriver struct {
 	Exec           StubExecFunc
 	HostKey        ssh.Signer
 	AuthorizedKeys []ssh.PublicKey
-	instanceSets   []*StubInstanceSet
+
+	ErrorRateDestroy float64
+
+	instanceSets []*StubInstanceSet
 }
 
 // InstanceSet returns a new *StubInstanceSet.
@@ -142,6 +145,9 @@ func (si stubInstance) Address() string {
 }
 
 func (si stubInstance) Destroy() error {
+	if math_rand.Float64() < si.ss.sis.driver.ErrorRateDestroy {
+		return errors.New("instance could not be destroyed")
+	}
 	si.ss.SSHService.Close()
 	sis := si.ss.sis
 	sis.mtx.Lock()
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index be66895a9..37add6d3d 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -60,6 +60,7 @@ const (
 	defaultTimeoutIdle        = time.Minute
 	defaultTimeoutBooting     = time.Minute * 10
 	defaultTimeoutProbe       = time.Minute * 10
+	defaultTimeoutShutdown    = time.Second * 10
 )
 
 func duration(conf arvados.Duration, def time.Duration) time.Duration {
@@ -88,6 +89,7 @@ func NewPool(logger logrus.FieldLogger, reg *prometheus.Registry, instanceSet cl
 		timeoutIdle:        duration(cluster.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
 		timeoutBooting:     duration(cluster.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
 		timeoutProbe:       duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
+		timeoutShutdown:    duration(cluster.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
 	}
 	wp.registerMetrics(reg)
 	go func() {
@@ -115,6 +117,7 @@ type Pool struct {
 	timeoutIdle        time.Duration
 	timeoutBooting     time.Duration
 	timeoutProbe       time.Duration
+	timeoutShutdown    time.Duration
 
 	// private state
 	subscribers  map[<-chan struct{}]chan<- struct{}
@@ -230,19 +233,21 @@ func (wp *Pool) AtQuota() bool {
 }
 
 // Add or update worker attached to the given instance. Use
-// initialState if a new worker is created. Caller must have lock.
+// initialState if a new worker is created.
 //
-// Returns true when a new worker is created.
-func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initialState State) bool {
+// The second return value is true if a new worker is created.
+//
+// Caller must have lock.
+func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initialState State) (*worker, bool) {
 	id := inst.ID()
-	if wp.workers[id] != nil {
-		wp.workers[id].executor.SetTarget(inst)
-		wp.workers[id].instance = inst
-		wp.workers[id].updated = time.Now()
-		if initialState == StateBooting && wp.workers[id].state == StateUnknown {
-			wp.workers[id].state = StateBooting
+	if wkr := wp.workers[id]; wkr != nil {
+		wkr.executor.SetTarget(inst)
+		wkr.instance = inst
+		wkr.updated = time.Now()
+		if initialState == StateBooting && wkr.state == StateUnknown {
+			wkr.state = StateBooting
 		}
-		return false
+		return wkr, false
 	}
 	if initialState == StateUnknown && inst.Tags()[tagKeyHold] != "" {
 		initialState = StateHold
@@ -253,7 +258,7 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initi
 	})
 	logger.WithField("State", initialState).Infof("instance appeared in cloud")
 	now := time.Now()
-	wp.workers[id] = &worker{
+	wkr := &worker{
 		mtx:      &wp.mtx,
 		wp:       wp,
 		logger:   logger,
@@ -268,7 +273,8 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initi
 		starting: make(map[string]struct{}),
 		probing:  make(chan struct{}, 1),
 	}
-	return true
+	wp.workers[id] = wkr
+	return wkr, true
 }
 
 // caller must have lock.
@@ -619,8 +625,11 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 			wp.logger.WithField("Instance", inst).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
 			continue
 		}
-		if wp.updateWorker(inst, it, StateUnknown) {
+		if wkr, isNew := wp.updateWorker(inst, it, StateUnknown); isNew {
 			notify = true
+		} else if wkr.state == StateShutdown && time.Since(wkr.destroyed) > wp.timeoutShutdown {
+			wp.logger.WithField("Instance", inst).Info("worker still listed after shutdown; retrying")
+			wkr.shutdown()
 		}
 	}
 
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index c3825db52..c8e6e86f8 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -58,19 +58,20 @@ type worker struct {
 	executor Executor
 	wp       *Pool
 
-	mtx      sync.Locker // must be wp's Locker.
-	state    State
-	instance cloud.Instance
-	instType arvados.InstanceType
-	vcpus    int64
-	memory   int64
-	probed   time.Time
-	updated  time.Time
-	busy     time.Time
-	lastUUID string
-	running  map[string]struct{} // remember to update state idle<->running when this changes
-	starting map[string]struct{} // remember to update state idle<->running when this changes
-	probing  chan struct{}
+	mtx       sync.Locker // must be wp's Locker.
+	state     State
+	instance  cloud.Instance
+	instType  arvados.InstanceType
+	vcpus     int64
+	memory    int64
+	probed    time.Time
+	updated   time.Time
+	busy      time.Time
+	destroyed time.Time
+	lastUUID  string
+	running   map[string]struct{} // remember to update state idle<->running when this changes
+	starting  map[string]struct{} // remember to update state idle<->running when this changes
+	probing   chan struct{}
 }
 
 // caller must have lock.
@@ -300,7 +301,9 @@ func (wkr *worker) shutdownIfIdle() bool {
 
 // caller must have lock
 func (wkr *worker) shutdown() {
-	wkr.updated = time.Now()
+	now := time.Now()
+	wkr.updated = now
+	wkr.destroyed = now
 	wkr.state = StateShutdown
 	go func() {
 		err := wkr.instance.Destroy()

commit 0f65ce49f183f7936d694961ce0f5980a8abffcc
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Thu Nov 8 16:48:19 2018 -0500

    14360: Fix false negative in test suite.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 248986bf3..3ac9370a3 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -113,6 +113,10 @@ func (fvm *fakeVM) exec(queue *test.Queue, onComplete, onCancel func(uuid string
 				logger.Print("[test] container was killed")
 				return
 			}
+			// TODO: Check whether the stub instance has
+			// been destroyed, and if so, don't call
+			// onComplete. Then "container finished twice"
+			// can be classified as a bug.
 			if crashluck < fvm.crunchRunCrashRate {
 				logger.Print("[test] crashing crunch-run stub")
 				if onCancel != nil && ctr.State == arvados.ContainerStateRunning {
@@ -259,7 +263,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		mtx.Lock()
 		defer mtx.Unlock()
 		if _, ok := waiting[uuid]; !ok {
-			c.Errorf("container completed twice: %s", uuid)
+			c.Logf("container completed twice: %s -- perhaps completed after stub instance was killed?", uuid)
 		}
 		delete(waiting, uuid)
 		if len(waiting) == 0 {

commit ff8c3cfde1e73c1c0adccb670040e98d82120e99
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Thu Nov 8 16:48:17 2018 -0500

    14360: Don't block scheduling while locking containers.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
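
The sketch below illustrates the pattern this commit applies: start the slow
lock API call in a goroutine, return to the scheduler immediately, and skip
duplicate requests for the same container while one is in flight. It is a
simplified, hypothetical standalone program, not the real sch.bgLock in
run_queue.go.

    package main

    import (
    	"fmt"
    	"sync"
    	"time"
    )

    type scheduler struct {
    	mtx     sync.Mutex
    	locking map[string]bool
    	lock    func(uuid string) error // slow API call
    }

    // bgLock starts the lock call in a goroutine and returns immediately;
    // a second call for the same uuid is a no-op while the first is in flight.
    func (sch *scheduler) bgLock(uuid string) {
    	sch.mtx.Lock()
    	defer sch.mtx.Unlock()
    	if sch.locking[uuid] {
    		return // locking already in progress, do nothing
    	}
    	sch.locking[uuid] = true
    	go func() {
    		defer func() {
    			sch.mtx.Lock()
    			defer sch.mtx.Unlock()
    			delete(sch.locking, uuid)
    		}()
    		if err := sch.lock(uuid); err != nil {
    			fmt.Println("error locking container:", err)
    		}
    	}()
    }

    func main() {
    	sch := &scheduler{
    		locking: map[string]bool{},
    		lock: func(uuid string) error {
    			time.Sleep(50 * time.Millisecond) // simulate API latency
    			fmt.Println("locked", uuid)
    			return nil
    		},
    	}
    	sch.bgLock("ctr-1")
    	sch.bgLock("ctr-1") // duplicate request is ignored
    	time.Sleep(100 * time.Millisecond)
    }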

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 9fc1a1658..7b432adc6 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -34,6 +34,7 @@ func (sch *Scheduler) runQueue() {
 	dontstart := map[arvados.InstanceType]bool{}
 	var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
 
+tryrun:
 	for i, ctr := range sorted {
 		ctr, it := ctr.Container, ctr.InstanceType
 		logger := sch.logger.WithFields(logrus.Fields{
@@ -43,60 +44,53 @@ func (sch *Scheduler) runQueue() {
 		if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
 			continue
 		}
-		if ctr.State == arvados.ContainerStateQueued {
+		switch ctr.State {
+		case arvados.ContainerStateQueued:
 			if unalloc[it] < 1 && sch.pool.AtQuota() {
-				logger.Debugf("not locking: AtQuota and no unalloc workers")
+				logger.Debug("not locking: AtQuota and no unalloc workers")
 				overquota = sorted[i:]
-				break
+				break tryrun
 			}
-			logger.Debugf("locking")
-			err := sch.queue.Lock(ctr.UUID)
-			if err != nil {
-				logger.WithError(err).Warnf("lock error")
+			sch.bgLock(logger, ctr.UUID)
+			unalloc[it]--
+		case arvados.ContainerStateLocked:
+			if unalloc[it] < 1 {
+				if sch.pool.AtQuota() {
+					logger.Debug("not starting: AtQuota and no unalloc workers")
+					overquota = sorted[i:]
+					break tryrun
+				}
+				logger.Info("creating new instance")
+				err := sch.pool.Create(it)
+				if err != nil {
+					if _, ok := err.(cloud.QuotaError); !ok {
+						logger.WithError(err).Warn("error creating worker")
+					}
+					sch.queue.Unlock(ctr.UUID)
+					// Don't let lower-priority
+					// containers starve this one
+					// by keeping idle
+					// workers alive on different
+					// instance types.  TODO:
+					// avoid getting starved here
+					// if instances of a specific
+					// type always fail.
+					overquota = sorted[i:]
+					break tryrun
+				}
 				unalloc[it]++
-				continue
 			}
-			var ok bool
-			ctr, ok = sch.queue.Get(ctr.UUID)
-			if !ok {
-				logger.Error("(BUG?) container disappeared from queue after Lock succeeded")
-				continue
-			}
-			if ctr.State != arvados.ContainerStateLocked {
-				logger.Warnf("(race?) container has state=%q after Lock succeeded", ctr.State)
-			}
-		}
-		if ctr.State != arvados.ContainerStateLocked {
-			continue
-		}
-		if unalloc[it] < 1 {
-			logger.Info("creating new instance")
-			err := sch.pool.Create(it)
-			if err != nil {
-				if _, ok := err.(cloud.QuotaError); !ok {
-					logger.WithError(err).Warn("error creating worker")
-				}
-				sch.queue.Unlock(ctr.UUID)
-				// Don't let lower-priority containers
-				// starve this one by using keeping
-				// idle workers alive on different
-				// instance types.  TODO: avoid
-				// getting starved here if instances
-				// of a specific type always fail.
-				overquota = sorted[i:]
-				break
+
+			if dontstart[it] {
+				// We already tried & failed to start
+				// a higher-priority container on the
+				// same instance type. Don't let this
+				// one sneak in ahead of it.
+			} else if sch.pool.StartContainer(it, ctr) {
+				unalloc[it]--
+			} else {
+				dontstart[it] = true
 			}
-			unalloc[it]++
-		}
-		if dontstart[it] {
-			// We already tried & failed to start a
-			// higher-priority container on the same
-			// instance type. Don't let this one sneak in
-			// ahead of it.
-		} else if sch.pool.StartContainer(it, ctr) {
-			unalloc[it]--
-		} else {
-			dontstart[it] = true
 		}
 	}
 
@@ -124,3 +118,36 @@ func (sch *Scheduler) runQueue() {
 		}
 	}
 }
+
+// Start an API call to lock the given container, and return
+// immediately while waiting for the response in a new goroutine. Do
+// nothing if a lock request is already in progress for this
+// container.
+func (sch *Scheduler) bgLock(logger logrus.FieldLogger, uuid string) {
+	logger.Debug("locking")
+	sch.mtx.Lock()
+	defer sch.mtx.Unlock()
+	if sch.locking[uuid] {
+		logger.Debug("locking in progress, doing nothing")
+		return
+	}
+	sch.locking[uuid] = true
+	go func() {
+		defer func() {
+			sch.mtx.Lock()
+			defer sch.mtx.Unlock()
+			delete(sch.locking, uuid)
+		}()
+		err := sch.queue.Lock(uuid)
+		if err != nil {
+			logger.WithError(err).Warn("error locking container")
+			return
+		}
+		ctr, ok := sch.queue.Get(uuid)
+		if !ok {
+			logger.Error("(BUG?) container disappeared from queue after Lock succeeded")
+		} else if ctr.State != arvados.ContainerStateLocked {
+			logger.Warnf("(race?) container has state=%q after Lock succeeded", ctr.State)
+		}
+	}()
+}
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index b8a03275f..35db70522 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -108,7 +108,7 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
 			{
 				UUID:     test.ContainerUUID(1),
 				Priority: 1,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 1,
 					RAM:   1 << 30,
@@ -117,7 +117,7 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
 			{
 				UUID:     test.ContainerUUID(2),
 				Priority: 2,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 1,
 					RAM:   1 << 30,
@@ -126,7 +126,7 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
 			{
 				UUID:     test.ContainerUUID(3),
 				Priority: 3,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 1,
 					RAM:   1 << 30,
@@ -135,7 +135,7 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
 			{
 				UUID:     test.ContainerUUID(4),
 				Priority: 4,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 1,
 					RAM:   1 << 30,
@@ -154,11 +154,11 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
 			test.InstanceType(2): 2,
 		},
 		running:   map[string]time.Time{},
-		canCreate: 1,
+		canCreate: 0,
 	}
 	New(logger, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
 	c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(1)})
-	c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(4), test.ContainerUUID(3)})
+	c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(4)})
 	c.Check(pool.running, check.HasLen, 1)
 	for uuid := range pool.running {
 		c.Check(uuid, check.Equals, uuids[4])
@@ -169,9 +169,10 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
 // Create(), if AtQuota() is true.
 func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) {
 	for quota := 0; quota < 2; quota++ {
+		c.Logf("quota=%d", quota)
 		shouldCreate := []arvados.InstanceType{}
-		for i := 1; i < 1+quota; i++ {
-			shouldCreate = append(shouldCreate, test.InstanceType(i))
+		for i := 0; i < quota; i++ {
+			shouldCreate = append(shouldCreate, test.InstanceType(1))
 		}
 		queue := test.Queue{
 			ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
@@ -181,7 +182,7 @@ func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) {
 				{
 					UUID:     test.ContainerUUID(1),
 					Priority: 1,
-					State:    arvados.ContainerStateQueued,
+					State:    arvados.ContainerStateLocked,
 					RuntimeConstraints: arvados.RuntimeConstraints{
 						VCPUs: 1,
 						RAM:   1 << 30,
@@ -223,7 +224,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 			test.InstanceType(2): 1,
 		},
 		running:   map[string]time.Time{},
-		canCreate: 2,
+		canCreate: 4,
 	}
 	queue := test.Queue{
 		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
@@ -234,7 +235,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 				// create a new worker
 				UUID:     test.ContainerUUID(1),
 				Priority: 1,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 1,
 					RAM:   1 << 30,
@@ -244,7 +245,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 				// tentatively map to unalloc worker
 				UUID:     test.ContainerUUID(2),
 				Priority: 2,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 1,
 					RAM:   1 << 30,
@@ -254,7 +255,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 				// start now on idle worker
 				UUID:     test.ContainerUUID(3),
 				Priority: 3,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 1,
 					RAM:   1 << 30,
@@ -264,7 +265,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 				// create a new worker
 				UUID:     test.ContainerUUID(4),
 				Priority: 4,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 2,
 					RAM:   2 << 30,
@@ -274,7 +275,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 				// tentatively map to unalloc worker
 				UUID:     test.ContainerUUID(5),
 				Priority: 5,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 2,
 					RAM:   2 << 30,
@@ -284,7 +285,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 				// start now on idle worker
 				UUID:     test.ContainerUUID(6),
 				Priority: 6,
-				State:    arvados.ContainerStateQueued,
+				State:    arvados.ContainerStateLocked,
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 2,
 					RAM:   2 << 30,
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 0be8edb7b..9a5fb10d5 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -32,6 +32,9 @@ type Scheduler struct {
 	staleLockTimeout    time.Duration
 	queueUpdateInterval time.Duration
 
+	locking map[string]bool
+	mtx     sync.Mutex
+
 	runOnce sync.Once
 	stop    chan struct{}
 }
@@ -48,6 +51,7 @@ func New(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool, stale
 		staleLockTimeout:    staleLockTimeout,
 		queueUpdateInterval: queueUpdateInterval,
 		stop:                make(chan struct{}),
+		locking:             map[string]bool{},
 	}
 }
 
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index 152094f52..fda04d52b 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -108,7 +108,7 @@ func (q *Queue) notify() {
 func (q *Queue) changeState(uuid string, from, to arvados.ContainerState) error {
 	ent := q.entries[uuid]
 	if ent.Container.State != from {
-		return fmt.Errorf("lock failed: state=%q", ent.Container.State)
+		return fmt.Errorf("changeState failed: state=%q", ent.Container.State)
 	}
 	ent.Container.State = to
 	q.entries[uuid] = ent

commit 15b7d1d4fa484641d846c1750487e640bfe4be09
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Nov 7 14:10:01 2018 -0500

    14360: Move single-worker Pool methods to worker type.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
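
One detail worth noting in the diff below is the ProbeAndUpdate wrapper, which
uses a 1-buffered channel as a try-lock so at most one probe per worker is in
flight. The following is a minimal, self-contained Go sketch of that pattern
with hypothetical names; it is not the actual worker implementation.

    package main

    import (
    	"fmt"
    	"time"
    )

    type worker struct {
    	probing chan struct{} // buffered with capacity 1
    }

    func (wkr *worker) probeAndUpdate() {
    	time.Sleep(50 * time.Millisecond) // simulate a slow SSH probe
    	fmt.Println("probe finished")
    }

    // ProbeAndUpdate runs a probe unless one is already in progress.
    func (wkr *worker) ProbeAndUpdate() {
    	select {
    	case wkr.probing <- struct{}{}:
    		wkr.probeAndUpdate()
    		<-wkr.probing
    	default:
    		fmt.Println("still waiting for last probe to finish")
    	}
    }

    func main() {
    	wkr := &worker{probing: make(chan struct{}, 1)}
    	go wkr.ProbeAndUpdate()
    	time.Sleep(10 * time.Millisecond)
    	wkr.ProbeAndUpdate() // skipped: the first probe is still running
    	time.Sleep(100 * time.Millisecond)
    }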

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 45e9da37a..be66895a9 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -5,7 +5,6 @@
 package worker
 
 import (
-	"bytes"
 	"io"
 	"sort"
 	"strings"
@@ -137,22 +136,6 @@ type Pool struct {
 	mMemoryInuse       prometheus.Gauge
 }
 
-type worker struct {
-	state    State
-	instance cloud.Instance
-	executor Executor
-	instType arvados.InstanceType
-	vcpus    int64
-	memory   int64
-	probed   time.Time
-	updated  time.Time
-	busy     time.Time
-	lastUUID string
-	running  map[string]struct{} // remember to update state idle<->running when this changes
-	starting map[string]struct{} // remember to update state idle<->running when this changes
-	probing  chan struct{}
-}
-
 // Subscribe returns a channel that becomes ready whenever a worker's
 // state changes.
 //
@@ -264,13 +247,16 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initi
 	if initialState == StateUnknown && inst.Tags()[tagKeyHold] != "" {
 		initialState = StateHold
 	}
-	wp.logger.WithFields(logrus.Fields{
+	logger := wp.logger.WithFields(logrus.Fields{
 		"InstanceType": it.Name,
 		"Instance":     inst,
-		"State":        initialState,
-	}).Infof("instance appeared in cloud")
+	})
+	logger.WithField("State", initialState).Infof("instance appeared in cloud")
 	now := time.Now()
 	wp.workers[id] = &worker{
+		mtx:      &wp.mtx,
+		wp:       wp,
+		logger:   logger,
 		executor: wp.newExecutor(inst),
 		state:    initialState,
 		instance: inst,
@@ -285,6 +271,11 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initi
 	return true
 }
 
+// caller must have lock.
+func (wp *Pool) notifyExited(uuid string, t time.Time) {
+	wp.exited[uuid] = t
+}
+
 // Shutdown shuts down a worker with the given type, or returns false
 // if all workers with the given type are busy.
 func (wp *Pool) Shutdown(it arvados.InstanceType) bool {
@@ -298,9 +289,8 @@ func (wp *Pool) Shutdown(it arvados.InstanceType) bool {
 		// time (Idle) or the earliest create time (Booting)
 		for _, wkr := range wp.workers {
 			if wkr.state == tryState && wkr.instType == it {
-				logger = logger.WithField("Instance", wkr.instance)
-				logger.Info("shutting down")
-				wp.shutdown(wkr, logger)
+				logger.WithField("Instance", wkr.instance).Info("shutting down")
+				wkr.shutdown()
 				return true
 			}
 		}
@@ -308,23 +298,6 @@ func (wp *Pool) Shutdown(it arvados.InstanceType) bool {
 	return false
 }
 
-// caller must have lock
-func (wp *Pool) shutdown(wkr *worker, logger logrus.FieldLogger) {
-	wkr.updated = time.Now()
-	wkr.state = StateShutdown
-	go func() {
-		err := wkr.instance.Destroy()
-		if err != nil {
-			logger.WithError(err).WithField("Instance", wkr.instance).Warn("shutdown failed")
-			return
-		}
-		wp.mtx.Lock()
-		wp.atQuotaUntil = time.Now()
-		wp.mtx.Unlock()
-		wp.notify()
-	}()
-}
-
 // Workers returns the current number of workers in each state.
 func (wp *Pool) Workers() map[State]int {
 	wp.setupOnce.Do(wp.setup)
@@ -360,11 +333,6 @@ func (wp *Pool) Running() map[string]time.Time {
 // StartContainer starts a container on an idle worker immediately if
 // possible, otherwise returns false.
 func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
-	logger := wp.logger.WithFields(logrus.Fields{
-		"InstanceType":  it.Name,
-		"ContainerUUID": ctr.UUID,
-		"Priority":      ctr.Priority,
-	})
 	wp.setupOnce.Do(wp.setup)
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
@@ -379,34 +347,7 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
 	if wkr == nil {
 		return false
 	}
-	logger = logger.WithField("Instance", wkr.instance)
-	logger.Debug("starting container")
-	wkr.starting[ctr.UUID] = struct{}{}
-	wkr.state = StateRunning
-	go func() {
-		stdout, stderr, err := wkr.executor.Execute("crunch-run --detach '"+ctr.UUID+"'", nil)
-		wp.mtx.Lock()
-		defer wp.mtx.Unlock()
-		now := time.Now()
-		wkr.updated = now
-		wkr.busy = now
-		delete(wkr.starting, ctr.UUID)
-		wkr.running[ctr.UUID] = struct{}{}
-		wkr.lastUUID = ctr.UUID
-		if err != nil {
-			logger.WithField("stdout", string(stdout)).
-				WithField("stderr", string(stderr)).
-				WithError(err).
-				Error("error starting crunch-run process")
-			// Leave uuid in wkr.running, though: it's
-			// possible the error was just a communication
-			// failure and the process was in fact
-			// started.  Wait for next probe to find out.
-			return
-		}
-		logger.Info("crunch-run process started")
-		wkr.lastUUID = ctr.UUID
-	}()
+	wkr.startContainer(ctr)
 	return true
 }
 
@@ -556,7 +497,7 @@ func (wp *Pool) runProbes() {
 		workers = workers[:0]
 		wp.mtx.Lock()
 		for id, wkr := range wp.workers {
-			if wkr.state == StateShutdown || wp.shutdownIfIdle(wkr) {
+			if wkr.state == StateShutdown || wkr.shutdownIfIdle() {
 				continue
 			}
 			workers = append(workers, id)
@@ -567,20 +508,12 @@ func (wp *Pool) runProbes() {
 			wp.mtx.Lock()
 			wkr, ok := wp.workers[id]
 			wp.mtx.Unlock()
-			if !ok || wkr.state == StateShutdown {
-				// Deleted/shutdown while we
-				// were probing others
+			if !ok {
+				// Deleted while we were probing
+				// others
 				continue
 			}
-			select {
-			case wkr.probing <- struct{}{}:
-				go func() {
-					wp.probeAndUpdate(wkr)
-					<-wkr.probing
-				}()
-			default:
-				wp.logger.WithField("Instance", wkr.instance).Debug("still waiting for last probe to finish")
-			}
+			go wkr.ProbeAndUpdate()
 			select {
 			case <-wp.stop:
 				return
@@ -609,45 +542,6 @@ func (wp *Pool) runSync() {
 	}
 }
 
-// caller must have lock.
-func (wp *Pool) shutdownIfBroken(wkr *worker, dur time.Duration) {
-	if wkr.state == StateHold {
-		return
-	}
-	label, threshold := "", wp.timeoutProbe
-	if wkr.state == StateBooting {
-		label, threshold = "new ", wp.timeoutBooting
-	}
-	if dur < threshold {
-		return
-	}
-	wp.logger.WithFields(logrus.Fields{
-		"Instance": wkr.instance,
-		"Duration": dur,
-		"Since":    wkr.probed,
-		"State":    wkr.state,
-	}).Warnf("%sinstance unresponsive, shutting down", label)
-	wp.shutdown(wkr, wp.logger)
-}
-
-// caller must have lock.
-func (wp *Pool) shutdownIfIdle(wkr *worker) bool {
-	if wkr.state != StateIdle {
-		return false
-	}
-	age := time.Since(wkr.busy)
-	if age < wp.timeoutIdle {
-		return false
-	}
-	logger := wp.logger.WithFields(logrus.Fields{
-		"Age":      age,
-		"Instance": wkr.instance,
-	})
-	logger.Info("shutdown idle worker")
-	wp.shutdown(wkr, logger)
-	return true
-}
-
 // Stop synchronizing with the InstanceSet.
 func (wp *Pool) Stop() {
 	wp.setupOnce.Do(wp.setup)
@@ -753,146 +647,3 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 		go wp.notify()
 	}
 }
-
-// should be called in a new goroutine
-func (wp *Pool) probeAndUpdate(wkr *worker) {
-	logger := wp.logger.WithField("Instance", wkr.instance)
-	wp.mtx.Lock()
-	updated := wkr.updated
-	needProbeRunning := wkr.state == StateRunning || wkr.state == StateIdle
-	needProbeBooted := wkr.state == StateUnknown || wkr.state == StateBooting
-	wp.mtx.Unlock()
-	if !needProbeBooted && !needProbeRunning {
-		return
-	}
-
-	var (
-		ctrUUIDs []string
-		ok       bool
-		stderr   []byte
-	)
-	if needProbeBooted {
-		ok, stderr = wp.probeBooted(wkr)
-		wp.mtx.Lock()
-		if ok || wkr.state == StateRunning || wkr.state == StateIdle {
-			logger.Info("instance booted; will try probeRunning")
-			needProbeRunning = true
-		}
-		wp.mtx.Unlock()
-	}
-	if needProbeRunning {
-		ctrUUIDs, ok, stderr = wp.probeRunning(wkr)
-	}
-	logger = logger.WithField("stderr", string(stderr))
-	wp.mtx.Lock()
-	defer wp.mtx.Unlock()
-	if !ok {
-		if wkr.state == StateShutdown && wkr.updated.After(updated) {
-			// Skip the logging noise if shutdown was
-			// initiated during probe.
-			return
-		}
-		dur := time.Since(wkr.probed)
-		logger := logger.WithFields(logrus.Fields{
-			"Duration": dur,
-			"State":    wkr.state,
-		})
-		if wkr.state == StateBooting {
-			logger.Debug("new instance not responding")
-		} else {
-			logger.Info("instance not responding")
-		}
-		wp.shutdownIfBroken(wkr, dur)
-		return
-	}
-
-	updateTime := time.Now()
-	wkr.probed = updateTime
-
-	if updated != wkr.updated {
-		// Worker was updated after the probe began, so
-		// wkr.running might have a container UUID that was
-		// not yet running when ctrUUIDs was generated. Leave
-		// wkr.running alone and wait for the next probe to
-		// catch up on any changes.
-		return
-	}
-
-	if len(ctrUUIDs) > 0 {
-		wkr.busy = updateTime
-		wkr.lastUUID = ctrUUIDs[0]
-	} else if len(wkr.running) > 0 {
-		// Actual last-busy time was sometime between wkr.busy
-		// and now. Now is the earliest opportunity to take
-		// advantage of the non-busy state, though.
-		wkr.busy = updateTime
-	}
-	running := map[string]struct{}{}
-	changed := false
-	for _, uuid := range ctrUUIDs {
-		running[uuid] = struct{}{}
-		if _, ok := wkr.running[uuid]; !ok {
-			changed = true
-		}
-	}
-	for uuid := range wkr.running {
-		if _, ok := running[uuid]; !ok {
-			logger.WithField("ContainerUUID", uuid).Info("crunch-run process ended")
-			wp.exited[uuid] = updateTime
-			changed = true
-		}
-	}
-	if wkr.state == StateUnknown || wkr.state == StateBooting {
-		wkr.state = StateIdle
-		changed = true
-	}
-	if changed {
-		wkr.running = running
-		if wkr.state == StateIdle && len(wkr.starting)+len(wkr.running) > 0 {
-			wkr.state = StateRunning
-		} else if wkr.state == StateRunning && len(wkr.starting)+len(wkr.running) == 0 {
-			wkr.state = StateIdle
-		}
-		wkr.updated = updateTime
-		go wp.notify()
-	}
-}
-
-func (wp *Pool) probeRunning(wkr *worker) (running []string, ok bool, stderr []byte) {
-	cmd := "crunch-run --list"
-	stdout, stderr, err := wkr.executor.Execute(cmd, nil)
-	if err != nil {
-		wp.logger.WithFields(logrus.Fields{
-			"Instance": wkr.instance,
-			"Command":  cmd,
-			"stdout":   string(stdout),
-			"stderr":   string(stderr),
-		}).WithError(err).Warn("probe failed")
-		return nil, false, stderr
-	}
-	stdout = bytes.TrimRight(stdout, "\n")
-	if len(stdout) == 0 {
-		return nil, true, stderr
-	}
-	return strings.Split(string(stdout), "\n"), true, stderr
-}
-
-func (wp *Pool) probeBooted(wkr *worker) (ok bool, stderr []byte) {
-	cmd := wp.bootProbeCommand
-	if cmd == "" {
-		cmd = "true"
-	}
-	stdout, stderr, err := wkr.executor.Execute(cmd, nil)
-	logger := wp.logger.WithFields(logrus.Fields{
-		"Instance": wkr.instance,
-		"Command":  cmd,
-		"stdout":   string(stdout),
-		"stderr":   string(stderr),
-	})
-	if err != nil {
-		logger.WithError(err).Debug("boot probe failed")
-		return false, stderr
-	}
-	logger.Info("boot probe succeeded")
-	return true, stderr
-}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index c24efeb3c..c3825db52 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -5,7 +5,14 @@
 package worker
 
 import (
+	"bytes"
+	"strings"
+	"sync"
 	"time"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
 )
 
 // State indicates whether a worker is available to do work, and (if
@@ -45,3 +52,261 @@ func (s State) String() string {
 func (s State) MarshalText() ([]byte, error) {
 	return []byte(stateString[s]), nil
 }
+
+type worker struct {
+	logger   logrus.FieldLogger
+	executor Executor
+	wp       *Pool
+
+	mtx      sync.Locker // must be wp's Locker.
+	state    State
+	instance cloud.Instance
+	instType arvados.InstanceType
+	vcpus    int64
+	memory   int64
+	probed   time.Time
+	updated  time.Time
+	busy     time.Time
+	lastUUID string
+	running  map[string]struct{} // remember to update state idle<->running when this changes
+	starting map[string]struct{} // remember to update state idle<->running when this changes
+	probing  chan struct{}
+}
+
+// caller must have lock.
+func (wkr *worker) startContainer(ctr arvados.Container) {
+	logger := wkr.logger.WithFields(logrus.Fields{
+		"ContainerUUID": ctr.UUID,
+		"Priority":      ctr.Priority,
+	})
+	logger = logger.WithField("Instance", wkr.instance)
+	logger.Debug("starting container")
+	wkr.starting[ctr.UUID] = struct{}{}
+	wkr.state = StateRunning
+	go func() {
+		stdout, stderr, err := wkr.executor.Execute("crunch-run --detach '"+ctr.UUID+"'", nil)
+		wkr.mtx.Lock()
+		defer wkr.mtx.Unlock()
+		now := time.Now()
+		wkr.updated = now
+		wkr.busy = now
+		delete(wkr.starting, ctr.UUID)
+		wkr.running[ctr.UUID] = struct{}{}
+		wkr.lastUUID = ctr.UUID
+		if err != nil {
+			logger.WithField("stdout", string(stdout)).
+				WithField("stderr", string(stderr)).
+				WithError(err).
+				Error("error starting crunch-run process")
+			// Leave uuid in wkr.running, though: it's
+			// possible the error was just a communication
+			// failure and the process was in fact
+			// started.  Wait for next probe to find out.
+			return
+		}
+		logger.Info("crunch-run process started")
+		wkr.lastUUID = ctr.UUID
+	}()
+}
+
+// ProbeAndUpdate conducts appropriate boot/running probes (if any)
+// for the worker's current state. If a previous probe is still
+// running, it does nothing.
+//
+// It should be called in a new goroutine.
+func (wkr *worker) ProbeAndUpdate() {
+	select {
+	case wkr.probing <- struct{}{}:
+		wkr.probeAndUpdate()
+		<-wkr.probing
+	default:
+		wkr.logger.Debug("still waiting for last probe to finish")
+	}
+}
+
+// should be called in a new goroutine
+func (wkr *worker) probeAndUpdate() {
+	wkr.mtx.Lock()
+	updated := wkr.updated
+	needProbeRunning := wkr.state == StateRunning || wkr.state == StateIdle
+	needProbeBooted := wkr.state == StateUnknown || wkr.state == StateBooting
+	wkr.mtx.Unlock()
+	if !needProbeBooted && !needProbeRunning {
+		return
+	}
+
+	var (
+		ctrUUIDs []string
+		ok       bool
+		stderr   []byte
+	)
+	if needProbeBooted {
+		ok, stderr = wkr.probeBooted()
+		wkr.mtx.Lock()
+		if ok || wkr.state == StateRunning || wkr.state == StateIdle {
+			wkr.logger.Info("instance booted; will try probeRunning")
+			needProbeRunning = true
+		}
+		wkr.mtx.Unlock()
+	}
+	if needProbeRunning {
+		ctrUUIDs, ok, stderr = wkr.probeRunning()
+	}
+	logger := wkr.logger.WithField("stderr", string(stderr))
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	if !ok {
+		if wkr.state == StateShutdown && wkr.updated.After(updated) {
+			// Skip the logging noise if shutdown was
+			// initiated during probe.
+			return
+		}
+		dur := time.Since(wkr.probed)
+		logger := logger.WithFields(logrus.Fields{
+			"Duration": dur,
+			"State":    wkr.state,
+		})
+		if wkr.state == StateBooting {
+			logger.Debug("new instance not responding")
+		} else {
+			logger.Info("instance not responding")
+		}
+		wkr.shutdownIfBroken(dur)
+		return
+	}
+
+	updateTime := time.Now()
+	wkr.probed = updateTime
+
+	if updated != wkr.updated {
+		// Worker was updated after the probe began, so
+		// wkr.running might have a container UUID that was
+		// not yet running when ctrUUIDs was generated. Leave
+		// wkr.running alone and wait for the next probe to
+		// catch up on any changes.
+		return
+	}
+
+	if len(ctrUUIDs) > 0 {
+		wkr.busy = updateTime
+		wkr.lastUUID = ctrUUIDs[0]
+	} else if len(wkr.running) > 0 {
+		// Actual last-busy time was sometime between wkr.busy
+		// and now. Now is the earliest opportunity to take
+		// advantage of the non-busy state, though.
+		wkr.busy = updateTime
+	}
+	running := map[string]struct{}{}
+	changed := false
+	for _, uuid := range ctrUUIDs {
+		running[uuid] = struct{}{}
+		if _, ok := wkr.running[uuid]; !ok {
+			changed = true
+		}
+	}
+	for uuid := range wkr.running {
+		if _, ok := running[uuid]; !ok {
+			logger.WithField("ContainerUUID", uuid).Info("crunch-run process ended")
+			wkr.wp.notifyExited(uuid, updateTime)
+			changed = true
+		}
+	}
+	if wkr.state == StateUnknown || wkr.state == StateBooting {
+		wkr.state = StateIdle
+		changed = true
+	}
+	if changed {
+		wkr.running = running
+		if wkr.state == StateIdle && len(wkr.starting)+len(wkr.running) > 0 {
+			wkr.state = StateRunning
+		} else if wkr.state == StateRunning && len(wkr.starting)+len(wkr.running) == 0 {
+			wkr.state = StateIdle
+		}
+		wkr.updated = updateTime
+		go wkr.wp.notify()
+	}
+}
+
+func (wkr *worker) probeRunning() (running []string, ok bool, stderr []byte) {
+	cmd := "crunch-run --list"
+	stdout, stderr, err := wkr.executor.Execute(cmd, nil)
+	if err != nil {
+		wkr.logger.WithFields(logrus.Fields{
+			"Command": cmd,
+			"stdout":  string(stdout),
+			"stderr":  string(stderr),
+		}).WithError(err).Warn("probe failed")
+		return nil, false, stderr
+	}
+	stdout = bytes.TrimRight(stdout, "\n")
+	if len(stdout) == 0 {
+		return nil, true, stderr
+	}
+	return strings.Split(string(stdout), "\n"), true, stderr
+}
+
+func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
+	cmd := wkr.wp.bootProbeCommand
+	if cmd == "" {
+		cmd = "true"
+	}
+	stdout, stderr, err := wkr.executor.Execute(cmd, nil)
+	logger := wkr.logger.WithFields(logrus.Fields{
+		"Command": cmd,
+		"stdout":  string(stdout),
+		"stderr":  string(stderr),
+	})
+	if err != nil {
+		logger.WithError(err).Debug("boot probe failed")
+		return false, stderr
+	}
+	logger.Info("boot probe succeeded")
+	return true, stderr
+}
+
+// caller must have lock.
+func (wkr *worker) shutdownIfBroken(dur time.Duration) {
+	if wkr.state == StateHold {
+		return
+	}
+	label, threshold := "", wkr.wp.timeoutProbe
+	if wkr.state == StateBooting {
+		label, threshold = "new ", wkr.wp.timeoutBooting
+	}
+	if dur < threshold {
+		return
+	}
+	wkr.logger.WithFields(logrus.Fields{
+		"Duration": dur,
+		"Since":    wkr.probed,
+		"State":    wkr.state,
+	}).Warnf("%sinstance unresponsive, shutting down", label)
+	wkr.shutdown()
+}
+
+// caller must have lock.
+func (wkr *worker) shutdownIfIdle() bool {
+	if wkr.state != StateIdle {
+		return false
+	}
+	age := time.Since(wkr.busy)
+	if age < wkr.wp.timeoutIdle {
+		return false
+	}
+	wkr.logger.WithField("Age", age).Info("shutdown idle worker")
+	wkr.shutdown()
+	return true
+}
+
+// caller must have lock
+func (wkr *worker) shutdown() {
+	wkr.updated = time.Now()
+	wkr.state = StateShutdown
+	go func() {
+		err := wkr.instance.Destroy()
+		if err != nil {
+			wkr.logger.WithError(err).Warn("shutdown failed")
+			return
+		}
+	}()
+}

commit 93671366e7633cbf0ca3cab68395e211e3afc31c
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Nov 6 17:01:06 2018 -0500

    14360: Note race in docs, fix flaky test.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 4e331256f..45e9da37a 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -185,6 +185,11 @@ func (wp *Pool) Unsubscribe(ch <-chan struct{}) {
 
 // Unallocated returns the number of unallocated (creating + booting +
 // idle + unknown) workers for each instance type.
+//
+// The returned counts should be interpreted as upper bounds, rather
+// than exact counts: they are sometimes artificially high when a
+// newly created instance appears in the driver's Instances() list
+// before the Create() call returns.
 func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
 	wp.setupOnce.Do(wp.setup)
 	wp.mtx.RLock()
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
index 790ee851e..f8667e06c 100644
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -19,6 +19,16 @@ const GiB arvados.ByteSize = 1 << 30
 
 var _ = check.Suite(&PoolSuite{})
 
+type lessChecker struct {
+	*check.CheckerInfo
+}
+
+func (*lessChecker) Check(params []interface{}, names []string) (result bool, error string) {
+	return params[0].(int) < params[1].(int), ""
+}
+
+var less = &lessChecker{&check.CheckerInfo{Name: "less", Params: []string{"obtained", "expected"}}}
+
 type PoolSuite struct{}
 
 func (suite *PoolSuite) SetUpSuite(c *check.C) {
@@ -69,8 +79,13 @@ func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
 	c.Check(pool.Unallocated()[type1], check.Equals, 1)
 	c.Check(pool.Unallocated()[type2], check.Equals, 2)
 	pool.getInstancesAndSync()
-	c.Check(pool.Unallocated()[type1], check.Equals, 1)
-	c.Check(pool.Unallocated()[type2], check.Equals, 2)
+	// Returned counts can be temporarily higher than 1 and 2 if
+	// poll ran before Create() returned.
+	c.Check(pool.Unallocated()[type1], check.Not(less), 1)
+	c.Check(pool.Unallocated()[type2], check.Not(less), 2)
+	suite.wait(c, pool, notify, func() bool {
+		return pool.Unallocated()[type1] == 1 && pool.Unallocated()[type2] == 2
+	})
 
 	c.Check(pool.Shutdown(type2), check.Equals, true)
 	suite.wait(c, pool, notify, func() bool {

commit d6bf3b66c2c938ca2ce2bda3276c0c0f4e54f1b6
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Nov 6 16:17:58 2018 -0500

    14360: Add "idle" state.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
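
The idle/running bookkeeping added here boils down to recomputing the worker
state whenever the sets of starting or running containers change. A minimal,
hypothetical Go sketch of that state machine (simplified names, not the real
worker type) follows.

    package main

    import "fmt"

    type State int

    const (
    	StateIdle State = iota
    	StateRunning
    )

    type worker struct {
    	state    State
    	running  map[string]struct{}
    	starting map[string]struct{}
    }

    // updateState flips between Idle and Running based on container counts;
    // it must be called whenever running or starting changes.
    func (wkr *worker) updateState() {
    	if len(wkr.starting)+len(wkr.running) > 0 {
    		wkr.state = StateRunning
    	} else {
    		wkr.state = StateIdle
    	}
    }

    func main() {
    	wkr := &worker{running: map[string]struct{}{}, starting: map[string]struct{}{}}
    	wkr.updateState()
    	fmt.Println(wkr.state == StateIdle) // true: nothing running

    	wkr.running["ctr-1"] = struct{}{}
    	wkr.updateState()
    	fmt.Println(wkr.state == StateRunning) // true

    	delete(wkr.running, "ctr-1")
    	wkr.updateState()
    	fmt.Println(wkr.state == StateIdle) // true again
    }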

diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index e5e8c7ecf..b8a03275f 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -75,7 +75,8 @@ func (p *stubPool) Shutdown(arvados.InstanceType) bool {
 func (p *stubPool) Workers() map[worker.State]int {
 	return map[worker.State]int{
 		worker.StateBooting: len(p.unalloc) - len(p.idle),
-		worker.StateRunning: len(p.idle) - len(p.running),
+		worker.StateIdle:    len(p.idle),
+		worker.StateRunning: len(p.running),
 	}
 }
 func (p *stubPool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index b1b570bd9..4e331256f 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -148,8 +148,8 @@ type worker struct {
 	updated  time.Time
 	busy     time.Time
 	lastUUID string
-	running  map[string]struct{}
-	starting map[string]struct{}
+	running  map[string]struct{} // remember to update state idle<->running when this changes
+	starting map[string]struct{} // remember to update state idle<->running when this changes
 	probing  chan struct{}
 }
 
@@ -194,7 +194,7 @@ func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
 		u[it] = c
 	}
 	for _, wkr := range wp.workers {
-		if len(wkr.running)+len(wkr.starting) == 0 && (wkr.state == StateRunning || wkr.state == StateBooting || wkr.state == StateUnknown) {
+		if wkr.state == StateIdle || wkr.state == StateBooting || wkr.state == StateUnknown {
 			u[wkr.instType]++
 		}
 	}
@@ -288,21 +288,16 @@ func (wp *Pool) Shutdown(it arvados.InstanceType) bool {
 	defer wp.mtx.Unlock()
 	logger := wp.logger.WithField("InstanceType", it.Name)
 	logger.Info("shutdown requested")
-	for _, tryState := range []State{StateBooting, StateRunning} {
+	for _, tryState := range []State{StateBooting, StateIdle} {
 		// TODO: shutdown the worker with the longest idle
-		// time (Running) or the earliest create time
-		// (Booting)
+		// time (Idle) or the earliest create time (Booting)
 		for _, wkr := range wp.workers {
-			if wkr.state != tryState || len(wkr.running)+len(wkr.starting) > 0 {
-				continue
-			}
-			if wkr.instType != it {
-				continue
+			if wkr.state == tryState && wkr.instType == it {
+				logger = logger.WithField("Instance", wkr.instance)
+				logger.Info("shutting down")
+				wp.shutdown(wkr, logger)
+				return true
 			}
-			logger = logger.WithField("Instance", wkr.instance)
-			logger.Info("shutting down")
-			wp.shutdown(wkr, logger)
-			return true
 		}
 	}
 	return false
@@ -370,7 +365,7 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
 	defer wp.mtx.Unlock()
 	var wkr *worker
 	for _, w := range wp.workers {
-		if w.instType == it && w.state == StateRunning && len(w.running)+len(w.starting) == 0 {
+		if w.instType == it && w.state == StateIdle {
 			if wkr == nil || w.busy.After(wkr.busy) {
 				wkr = w
 			}
@@ -382,6 +377,7 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
 	logger = logger.WithField("Instance", wkr.instance)
 	logger.Debug("starting container")
 	wkr.starting[ctr.UUID] = struct{}{}
+	wkr.state = StateRunning
 	go func() {
 		stdout, stderr, err := wkr.executor.Execute("crunch-run --detach '"+ctr.UUID+"'", nil)
 		wp.mtx.Lock()
@@ -451,6 +447,9 @@ func (wp *Pool) kill(wkr *worker, uuid string) {
 	defer wp.mtx.Unlock()
 	if _, ok := wkr.running[uuid]; ok {
 		delete(wkr.running, uuid)
+		if wkr.state == StateRunning && len(wkr.running)+len(wkr.starting) == 0 {
+			wkr.state = StateIdle
+		}
 		wkr.updated = time.Now()
 		go wp.notify()
 	}
@@ -628,7 +627,7 @@ func (wp *Pool) shutdownIfBroken(wkr *worker, dur time.Duration) {
 
 // caller must have lock.
 func (wp *Pool) shutdownIfIdle(wkr *worker) bool {
-	if len(wkr.running)+len(wkr.starting) > 0 || wkr.state != StateRunning {
+	if wkr.state != StateIdle {
 		return false
 	}
 	age := time.Since(wkr.busy)
@@ -755,7 +754,7 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 	logger := wp.logger.WithField("Instance", wkr.instance)
 	wp.mtx.Lock()
 	updated := wkr.updated
-	needProbeRunning := wkr.state == StateRunning
+	needProbeRunning := wkr.state == StateRunning || wkr.state == StateIdle
 	needProbeBooted := wkr.state == StateUnknown || wkr.state == StateBooting
 	wp.mtx.Unlock()
 	if !needProbeBooted && !needProbeRunning {
@@ -770,13 +769,10 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 	if needProbeBooted {
 		ok, stderr = wp.probeBooted(wkr)
 		wp.mtx.Lock()
-		if ok && (wkr.state == StateUnknown || wkr.state == StateBooting) {
-			wkr.state = StateRunning
-			wkr.probed = time.Now()
-			logger.Info("instance booted")
-			go wp.notify()
+		if ok || wkr.state == StateRunning || wkr.state == StateIdle {
+			logger.Info("instance booted; will try probeRunning")
+			needProbeRunning = true
 		}
-		needProbeRunning = wkr.state == StateRunning
 		wp.mtx.Unlock()
 	}
 	if needProbeRunning {
@@ -786,7 +782,7 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
 	if !ok {
-		if wkr.state == StateShutdown {
+		if wkr.state == StateShutdown && wkr.updated.After(updated) {
 			// Skip the logging noise if shutdown was
 			// initiated during probe.
 			return
@@ -841,8 +837,17 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 			changed = true
 		}
 	}
+	if wkr.state == StateUnknown || wkr.state == StateBooting {
+		wkr.state = StateIdle
+		changed = true
+	}
 	if changed {
 		wkr.running = running
+		if wkr.state == StateIdle && len(wkr.starting)+len(wkr.running) > 0 {
+			wkr.state = StateRunning
+		} else if wkr.state == StateRunning && len(wkr.starting)+len(wkr.running) == 0 {
+			wkr.state = StateIdle
+		}
 		wkr.updated = updateTime
 		go wp.notify()
 	}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 7828d4f69..c24efeb3c 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -15,7 +15,8 @@ type State int
 const (
 	StateUnknown  State = iota // might be running a container already
 	StateBooting               // instance is booting
-	StateRunning               // instance is running
+	StateIdle                  // instance booted, no containers are running
+	StateRunning               // instance is running one or more containers
 	StateShutdown              // worker has stopped monitoring the instance
 	StateHold                  // running, but not available to run new containers
 )
@@ -28,6 +29,7 @@ const (
 var stateString = map[State]string{
 	StateUnknown:  "unknown",
 	StateBooting:  "booting",
+	StateIdle:     "idle",
 	StateRunning:  "running",
 	StateShutdown: "shutdown",
 	StateHold:     "hold",

commit 49f46db107c8226d70ed6d489cd000bfba82ca2f
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Oct 30 16:37:31 2018 -0400

    14360: Fix concurrent map access in tests.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
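
The fix below follows the usual convention: exported methods acquire the mutex,
while the internal helper is documented as "caller must have lock". A small,
self-contained sketch of that convention (hypothetical simplified queue, not
the real test.Queue) is shown here.

    package main

    import (
    	"fmt"
    	"sync"
    )

    type Queue struct {
    	mtx     sync.Mutex
    	entries map[string]string // uuid -> state
    }

    // Lock and Unlock take the mutex before touching the map.
    func (q *Queue) Lock(uuid string) error {
    	q.mtx.Lock()
    	defer q.mtx.Unlock()
    	return q.changeState(uuid, "Queued", "Locked")
    }

    func (q *Queue) Unlock(uuid string) error {
    	q.mtx.Lock()
    	defer q.mtx.Unlock()
    	return q.changeState(uuid, "Locked", "Queued")
    }

    // caller must have lock.
    func (q *Queue) changeState(uuid, from, to string) error {
    	if q.entries[uuid] != from {
    		return fmt.Errorf("changeState failed: state=%q", q.entries[uuid])
    	}
    	q.entries[uuid] = to
    	return nil
    }

    func main() {
    	q := &Queue{entries: map[string]string{"ctr-1": "Queued"}}
    	fmt.Println(q.Lock("ctr-1"))   // <nil>
    	fmt.Println(q.Unlock("ctr-1")) // <nil>
    }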

diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index 0bad5a391..152094f52 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -61,14 +61,20 @@ func (q *Queue) Forget(uuid string) {
 }
 
 func (q *Queue) Lock(uuid string) error {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
 	return q.changeState(uuid, arvados.ContainerStateQueued, arvados.ContainerStateLocked)
 }
 
 func (q *Queue) Unlock(uuid string) error {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
 	return q.changeState(uuid, arvados.ContainerStateLocked, arvados.ContainerStateQueued)
 }
 
 func (q *Queue) Cancel(uuid string) error {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
 	return q.changeState(uuid, q.entries[uuid].Container.State, arvados.ContainerStateCancelled)
 }
 
@@ -98,9 +104,8 @@ func (q *Queue) notify() {
 	}
 }
 
+// caller must have lock.
 func (q *Queue) changeState(uuid string, from, to arvados.ContainerState) error {
-	q.mtx.Lock()
-	defer q.mtx.Unlock()
 	ent := q.entries[uuid]
 	if ent.Container.State != from {
 		return fmt.Errorf("lock failed: state=%q", ent.Container.State)

commit 3cb857be0aad14d0ebd631305a7a5c0f847811d4
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Oct 30 16:35:34 2018 -0400

    14360: Move slurm-only code to crunch-dispatch-slurm.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index 3bada3baf..e77c862b3 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -6,11 +6,7 @@ package dispatchcloud
 
 import (
 	"errors"
-	"log"
-	"os/exec"
 	"sort"
-	"strings"
-	"time"
 
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
 )
@@ -78,61 +74,3 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
 	}
 	return
 }
-
-// SlurmNodeTypeFeatureKludge ensures SLURM accepts every instance
-// type name as a valid feature name, even if no instances of that
-// type have appeared yet.
-//
-// It takes advantage of some SLURM peculiarities:
-//
-// (1) A feature is valid after it has been offered by a node, even if
-// it is no longer offered by any node. So, to make a feature name
-// valid, we can add it to a dummy node ("compute0"), then remove it.
-//
-// (2) To test whether a set of feature names are valid without
-// actually submitting a job, we can call srun --test-only with the
-// desired features.
-//
-// SlurmNodeTypeFeatureKludge does a test-and-fix operation
-// immediately, and then periodically, in case slurm restarts and
-// forgets the list of valid features. It never returns (unless there
-// are no node types configured, in which case it returns
-// immediately), so it should generally be invoked with "go".
-func SlurmNodeTypeFeatureKludge(cc *arvados.Cluster) {
-	if len(cc.InstanceTypes) == 0 {
-		return
-	}
-	var features []string
-	for _, it := range cc.InstanceTypes {
-		features = append(features, "instancetype="+it.Name)
-	}
-	for {
-		slurmKludge(features)
-		time.Sleep(2 * time.Second)
-	}
-}
-
-const slurmDummyNode = "compute0"
-
-func slurmKludge(features []string) {
-	allFeatures := strings.Join(features, ",")
-
-	cmd := exec.Command("sinfo", "--nodes="+slurmDummyNode, "--format=%f", "--noheader")
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		log.Printf("running %q %q: %s (output was %q)", cmd.Path, cmd.Args, err, out)
-		return
-	}
-	if string(out) == allFeatures+"\n" {
-		// Already configured correctly, nothing to do.
-		return
-	}
-
-	log.Printf("configuring node %q with all node type features", slurmDummyNode)
-	cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+allFeatures)
-	log.Printf("running: %q %q", cmd.Path, cmd.Args)
-	out, err = cmd.CombinedOutput()
-	if err != nil {
-		log.Printf("error: scontrol: %s (output was %q)", err, out)
-	}
-}
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
index 084700d39..30e88bf92 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
@@ -197,7 +197,7 @@ func (disp *Dispatcher) run() error {
 	defer disp.sqCheck.Stop()
 
 	if disp.cluster != nil && len(disp.cluster.InstanceTypes) > 0 {
-		go dispatchcloud.SlurmNodeTypeFeatureKludge(disp.cluster)
+		go SlurmNodeTypeFeatureKludge(disp.cluster)
 	}
 
 	if _, err := daemon.SdNotify(false, "READY=1"); err != nil {
diff --git a/lib/dispatchcloud/node_size.go b/services/crunch-dispatch-slurm/node_type.go
similarity index 53%
copy from lib/dispatchcloud/node_size.go
copy to services/crunch-dispatch-slurm/node_type.go
index 3bada3baf..62a96932f 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/services/crunch-dispatch-slurm/node_type.go
@@ -2,83 +2,17 @@
 //
 // SPDX-License-Identifier: AGPL-3.0
 
-package dispatchcloud
+package main
 
 import (
-	"errors"
 	"log"
 	"os/exec"
-	"sort"
 	"strings"
 	"time"
 
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
 )
 
-var ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types")
-
-var discountConfiguredRAMPercent = 5
-
-// ConstraintsNotSatisfiableError includes a list of available instance types
-// to be reported back to the user.
-type ConstraintsNotSatisfiableError struct {
-	error
-	AvailableTypes []arvados.InstanceType
-}
-
-// ChooseInstanceType returns the cheapest available
-// arvados.InstanceType big enough to run ctr.
-func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) {
-	if len(cc.InstanceTypes) == 0 {
-		err = ErrInstanceTypesNotConfigured
-		return
-	}
-
-	needScratch := int64(0)
-	for _, m := range ctr.Mounts {
-		if m.Kind == "tmp" {
-			needScratch += m.Capacity
-		}
-	}
-
-	needVCPUs := ctr.RuntimeConstraints.VCPUs
-
-	needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM
-	needRAM = (needRAM * 100) / int64(100-discountConfiguredRAMPercent)
-
-	ok := false
-	for _, it := range cc.InstanceTypes {
-		switch {
-		case ok && it.Price > best.Price:
-		case int64(it.Scratch) < needScratch:
-		case int64(it.RAM) < needRAM:
-		case it.VCPUs < needVCPUs:
-		case it.Preemptible != ctr.SchedulingParameters.Preemptible:
-		case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
-			// Equal price, but worse specs
-		default:
-			// Lower price || (same price && better specs)
-			best = it
-			ok = true
-		}
-	}
-	if !ok {
-		availableTypes := make([]arvados.InstanceType, 0, len(cc.InstanceTypes))
-		for _, t := range cc.InstanceTypes {
-			availableTypes = append(availableTypes, t)
-		}
-		sort.Slice(availableTypes, func(a, b int) bool {
-			return availableTypes[a].Price < availableTypes[b].Price
-		})
-		err = ConstraintsNotSatisfiableError{
-			errors.New("constraints not satisfiable by any configured instance type"),
-			availableTypes,
-		}
-		return
-	}
-	return
-}
-
 // SlurmNodeTypeFeatureKludge ensures SLURM accepts every instance
 // type name as a valid feature name, even if no instances of that
 // type have appeared yet.

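Illustrative sketch, not part of this commit: peculiarity (2) in the comment above says a set of feature names can be validated with "srun --test-only" without submitting a real job. A minimal standalone check along those lines might look like the following; the exact srun flags and the instance type name are assumptions based on that comment, not code from the Arvados tree.

    package main

    import (
    	"log"
    	"os/exec"
    )

    // featureIsValid asks SLURM for a test-only allocation constrained to
    // the given feature; a non-zero exit means the feature is not (yet)
    // accepted as a valid constraint.
    func featureIsValid(feature string) bool {
    	cmd := exec.Command("srun", "--test-only", "--constraint="+feature, "true")
    	out, err := cmd.CombinedOutput()
    	if err != nil {
    		log.Printf("feature %q not accepted: %s (output %q)", feature, err, out)
    		return false
    	}
    	return true
    }

    func main() {
    	featureIsValid("instancetype=m4.large") // hypothetical instance type name
    }
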
commit 3654f222637b00728158300e700d615c4c913b29
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Oct 30 16:12:11 2018 -0400

    14360: Fail Create calls fast during quota quarantine.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 68174e6b7..b1b570bd9 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -124,6 +124,7 @@ type Pool struct {
 	loaded       bool                 // loaded list of instances from InstanceSet at least once
 	exited       map[string]time.Time // containers whose crunch-run proc has exited, but KillContainer has not been called
 	atQuotaUntil time.Time
+	atQuotaErr   cloud.QuotaError
 	stop         chan bool
 	mtx          sync.RWMutex
 	setupOnce    sync.Once
@@ -208,6 +209,9 @@ func (wp *Pool) Create(it arvados.InstanceType) error {
 	wp.setupOnce.Do(wp.setup)
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
+	if time.Now().Before(wp.atQuotaUntil) {
+		return wp.atQuotaErr
+	}
 	tags := cloud.InstanceTags{tagKeyInstanceType: it.Name}
 	wp.creating[it]++
 	go func() {
@@ -217,6 +221,7 @@ func (wp *Pool) Create(it arvados.InstanceType) error {
 		defer wp.mtx.Unlock()
 		wp.creating[it]--
 		if err, ok := err.(cloud.QuotaError); ok && err.IsQuotaError() {
+			wp.atQuotaErr = err
 			wp.atQuotaUntil = time.Now().Add(time.Minute)
 		}
 		if err != nil {

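Standalone sketch, not from the commit, of the fail-fast behaviour added above. QuotaError, quotaError, and pool here are stand-ins for the cloud.QuotaError interface and the worker pool; the point is that a remembered quota error is returned immediately while the one-minute quarantine window is open.

    package main

    import (
    	"errors"
    	"fmt"
    	"time"
    )

    // QuotaError stands in for cloud.QuotaError.
    type QuotaError interface {
    	IsQuotaError() bool
    	error
    }

    type quotaError struct{ error }

    func (quotaError) IsQuotaError() bool { return true }

    type pool struct {
    	atQuotaErr   error
    	atQuotaUntil time.Time
    }

    // create mimics Pool.Create: while the quarantine window is open, fail
    // fast with the remembered error instead of calling the provider again.
    func (p *pool) create() error {
    	if time.Now().Before(p.atQuotaUntil) {
    		return p.atQuotaErr
    	}
    	var err error = quotaError{errors.New("instance quota exceeded")} // pretend the provider refused
    	if qe, ok := err.(QuotaError); ok && qe.IsQuotaError() {
    		p.atQuotaErr = err
    		p.atQuotaUntil = time.Now().Add(time.Minute)
    	}
    	return err
    }

    func main() {
    	p := &pool{}
    	fmt.Println(p.create()) // reaches the "provider" and opens the quarantine
    	fmt.Println(p.create()) // fails fast with the same error
    }
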
commit 1c1e245bc3163a7841859d5ff75d0cff59dfc90c
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Oct 30 16:08:47 2018 -0400

    14360: Remove redundant wp.unallocated field.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 4aa4f089a..68174e6b7 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -30,7 +30,7 @@ type InstanceView struct {
 	ArvadosInstanceType  string
 	ProviderInstanceType string
 	LastContainerUUID    string
-	Unallocated          time.Time
+	LastBusy             time.Time
 	WorkerState          string
 }
 
@@ -137,20 +137,19 @@ type Pool struct {
 }
 
 type worker struct {
-	state       State
-	instance    cloud.Instance
-	executor    Executor
-	instType    arvados.InstanceType
-	vcpus       int64
-	memory      int64
-	probed      time.Time
-	updated     time.Time
-	busy        time.Time
-	unallocated time.Time
-	lastUUID    string
-	running     map[string]struct{}
-	starting    map[string]struct{}
-	probing     chan struct{}
+	state    State
+	instance cloud.Instance
+	executor Executor
+	instType arvados.InstanceType
+	vcpus    int64
+	memory   int64
+	probed   time.Time
+	updated  time.Time
+	busy     time.Time
+	lastUUID string
+	running  map[string]struct{}
+	starting map[string]struct{}
+	probing  chan struct{}
 }
 
 // Subscribe returns a channel that becomes ready whenever a worker's
@@ -262,17 +261,16 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initi
 	}).Infof("instance appeared in cloud")
 	now := time.Now()
 	wp.workers[id] = &worker{
-		executor:    wp.newExecutor(inst),
-		state:       initialState,
-		instance:    inst,
-		instType:    it,
-		probed:      now,
-		busy:        now,
-		updated:     now,
-		unallocated: now,
-		running:     make(map[string]struct{}),
-		starting:    make(map[string]struct{}),
-		probing:     make(chan struct{}, 1),
+		executor: wp.newExecutor(inst),
+		state:    initialState,
+		instance: inst,
+		instType: it,
+		probed:   now,
+		busy:     now,
+		updated:  now,
+		running:  make(map[string]struct{}),
+		starting: make(map[string]struct{}),
+		probing:  make(chan struct{}, 1),
 	}
 	return true
 }
@@ -628,7 +626,7 @@ func (wp *Pool) shutdownIfIdle(wkr *worker) bool {
 	if len(wkr.running)+len(wkr.starting) > 0 || wkr.state != StateRunning {
 		return false
 	}
-	age := time.Since(wkr.unallocated)
+	age := time.Since(wkr.busy)
 	if age < wp.timeoutIdle {
 		return false
 	}
@@ -660,7 +658,7 @@ func (wp *Pool) Instances() []InstanceView {
 			ArvadosInstanceType:  w.instType.Name,
 			ProviderInstanceType: w.instType.ProviderType,
 			LastContainerUUID:    w.lastUUID,
-			Unallocated:          w.unallocated,
+			LastBusy:             w.busy,
 			WorkerState:          w.state.String(),
 		})
 	}
@@ -818,7 +816,10 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 		wkr.busy = updateTime
 		wkr.lastUUID = ctrUUIDs[0]
 	} else if len(wkr.running) > 0 {
-		wkr.unallocated = updateTime
+		// Actual last-busy time was sometime between wkr.busy
+		// and now. Now is the earliest opportunity to take
+		// advantage of the non-busy state, though.
+		wkr.busy = updateTime
 	}
 	running := map[string]struct{}{}
 	changed := false

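Standalone sketch, not from the commit, of the idle-shutdown rule after this change: with the separate unallocated timestamp gone, a worker is shut down once it has no running or starting containers and its single busy timestamp is older than the idle timeout.

    package main

    import (
    	"fmt"
    	"time"
    )

    type worker struct {
    	busy     time.Time
    	running  map[string]struct{}
    	starting map[string]struct{}
    }

    // shutdownIfIdle reports whether wkr has been idle longer than timeoutIdle.
    func shutdownIfIdle(wkr *worker, timeoutIdle time.Duration) bool {
    	if len(wkr.running)+len(wkr.starting) > 0 {
    		return false // still busy; probes keep refreshing wkr.busy
    	}
    	return time.Since(wkr.busy) >= timeoutIdle
    }

    func main() {
    	w := &worker{busy: time.Now().Add(-2 * time.Minute)}
    	fmt.Println(shutdownIfIdle(w, time.Minute)) // true: idle past the timeout
    }
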
commit f33f8084159f701b8a1435e8cf77b7ab7f3417ae
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Oct 30 15:35:07 2018 -0400

    14360: Remove redundant wkr.booted flag.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 1e759e38f..4aa4f089a 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -143,7 +143,6 @@ type worker struct {
 	instType    arvados.InstanceType
 	vcpus       int64
 	memory      int64
-	booted      bool
 	probed      time.Time
 	updated     time.Time
 	busy        time.Time
@@ -753,26 +752,31 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 	logger := wp.logger.WithField("Instance", wkr.instance)
 	wp.mtx.Lock()
 	updated := wkr.updated
-	booted := wkr.booted
+	needProbeRunning := wkr.state == StateRunning
+	needProbeBooted := wkr.state == StateUnknown || wkr.state == StateBooting
 	wp.mtx.Unlock()
+	if !needProbeBooted && !needProbeRunning {
+		return
+	}
 
 	var (
 		ctrUUIDs []string
 		ok       bool
 		stderr   []byte
 	)
-	if !booted {
-		booted, stderr = wp.probeBooted(wkr)
+	if needProbeBooted {
+		ok, stderr = wp.probeBooted(wkr)
 		wp.mtx.Lock()
-		if booted && !wkr.booted {
-			wkr.booted = booted
+		if ok && (wkr.state == StateUnknown || wkr.state == StateBooting) {
+			wkr.state = StateRunning
+			wkr.probed = time.Now()
 			logger.Info("instance booted")
-		} else {
-			booted = wkr.booted
+			go wp.notify()
 		}
+		needProbeRunning = wkr.state == StateRunning
 		wp.mtx.Unlock()
 	}
-	if booted {
+	if needProbeRunning {
 		ctrUUIDs, ok, stderr = wp.probeRunning(wkr)
 	}
 	logger = logger.WithField("stderr", string(stderr))
@@ -780,6 +784,8 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 	defer wp.mtx.Unlock()
 	if !ok {
 		if wkr.state == StateShutdown {
+			// Skip the logging noise if shutdown was
+			// initiated during probe.
 			return
 		}
 		dur := time.Since(wkr.probed)
@@ -798,15 +804,6 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 
 	updateTime := time.Now()
 	wkr.probed = updateTime
-	if wkr.state == StateShutdown || wkr.state == StateHold {
-	} else if booted {
-		if wkr.state != StateRunning {
-			wkr.state = StateRunning
-			go wp.notify()
-		}
-	} else {
-		wkr.state = StateBooting
-	}
 
 	if updated != wkr.updated {
 		// Worker was updated after the probe began, so

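Standalone sketch, not from the commit, of the probe selection that replaces the booted flag: the worker's state alone decides whether a boot probe, a run probe, or neither is needed. The State constants below are stand-ins for the worker package's.

    package main

    import "fmt"

    type State int

    const (
    	StateUnknown State = iota
    	StateBooting
    	StateRunning
    	StateShutdown
    )

    // probesNeeded mirrors the needProbeBooted/needProbeRunning logic above.
    func probesNeeded(s State) (bootProbe, runProbe bool) {
    	bootProbe = s == StateUnknown || s == StateBooting
    	runProbe = s == StateRunning
    	return
    }

    func main() {
    	for _, s := range []State{StateUnknown, StateBooting, StateRunning, StateShutdown} {
    		boot, run := probesNeeded(s)
    		fmt.Printf("state=%d bootProbe=%v runProbe=%v\n", s, boot, run)
    	}
    }
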
commit e5e2dc5e6bf1bce9893517470632295d42bb5f1c
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Oct 30 14:32:24 2018 -0400

    14360: Ensure signals don't pass to child in startup race.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-run/background.go b/services/crunch-run/background.go
index 57f770ae1..bd02db430 100644
--- a/services/crunch-run/background.go
+++ b/services/crunch-run/background.go
@@ -65,7 +65,12 @@ func detach(uuid string, args []string, stdout, stderr io.Writer) error {
 	cmd := exec.Command(args[0], append([]string{"-detached"}, args[1:]...)...)
 	cmd.Stdout = outfile
 	cmd.Stderr = errfile
+	// Child inherits lockfile.
 	cmd.ExtraFiles = []*os.File{lockfile}
+	// Ensure child isn't interrupted even if we receive signals
+	// from parent (sshd) while sending lockfile content to
+	// caller.
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
 	err = cmd.Start()
 	if err != nil {
 		os.Remove(outfile.Name())

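Standalone sketch, not the commit's code, of the same idea applied to a plain command: start the child in its own process group (Setpgid) so that a signal delivered to the parent's process group does not also interrupt the child. Unix-only.

    package main

    import (
    	"log"
    	"os/exec"
    	"syscall"
    )

    func main() {
    	cmd := exec.Command("sleep", "60")
    	// Put the child in a new process group; SIGINT/SIGTERM sent to the
    	// parent's group will no longer reach it.
    	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
    	if err := cmd.Start(); err != nil {
    		log.Fatal(err)
    	}
    	log.Printf("child pid %d started in its own process group", cmd.Process.Pid)
    }
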
commit 359dc0dba7e93744c583b431d428de62fb8e0c90
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Oct 30 14:18:35 2018 -0400

    14360: Fix mismatched flag name, -nodetach -> -detached.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-run/background.go b/services/crunch-run/background.go
index 1f98d3791..57f770ae1 100644
--- a/services/crunch-run/background.go
+++ b/services/crunch-run/background.go
@@ -32,7 +32,7 @@ type procinfo struct {
 }
 
 // Detach acquires a lock for the given uuid, and starts the current
-// program as a child process (with -nodetach prepended to the given
+// program as a child process (with -detached prepended to the given
 // arguments so the child knows not to detach again). The lock is
 // passed along to the child process.
 func Detach(uuid string, args []string, stdout, stderr io.Writer) int {
@@ -62,7 +62,7 @@ func detach(uuid string, args []string, stdout, stderr io.Writer) error {
 	}
 	defer errfile.Close()
 
-	cmd := exec.Command(args[0], append([]string{"-nodetach"}, args[1:]...)...)
+	cmd := exec.Command(args[0], append([]string{"-detached"}, args[1:]...)...)
 	cmd.Stdout = outfile
 	cmd.Stderr = errfile
 	cmd.ExtraFiles = []*os.File{lockfile}

commit 17479bd75a29c52470abe0049cb447e114eb39e9
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Oct 30 13:50:30 2018 -0400

    14360: Encapsulate scheduler object.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index e0e1cd0cd..432f4d488 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -69,6 +69,9 @@ type Queue struct {
 	// When no network update is in progress, this protection is
 	// not needed, and dontupdate is nil.
 	dontupdate map[string]struct{}
+
+	// active notification subscribers (see Subscribe)
+	subscribers map[<-chan struct{}]chan struct{}
 }
 
 // NewQueue returns a new Queue. When a new container appears in the
@@ -76,11 +79,46 @@ type Queue struct {
 // assign an appropriate arvados.InstanceType for the queue entry.
 func NewQueue(logger logrus.FieldLogger, reg *prometheus.Registry, chooseType typeChooser, client APIClient) *Queue {
 	return &Queue{
-		logger:     logger,
-		reg:        reg,
-		chooseType: chooseType,
-		client:     client,
-		current:    map[string]QueueEnt{},
+		logger:      logger,
+		reg:         reg,
+		chooseType:  chooseType,
+		client:      client,
+		current:     map[string]QueueEnt{},
+		subscribers: map[<-chan struct{}]chan struct{}{},
+	}
+}
+
+// Subscribe returns a channel that becomes ready to receive when an
+// entry in the Queue is updated.
+//
+//	ch := q.Subscribe()
+//	defer q.Unsubscribe(ch)
+//	for range ch {
+//		// ...
+//	}
+func (cq *Queue) Subscribe() <-chan struct{} {
+	cq.mtx.Lock()
+	defer cq.mtx.Unlock()
+	ch := make(chan struct{}, 1)
+	cq.subscribers[ch] = ch
+	return ch
+}
+
+// Unsubscribe stops sending updates to the given channel. See
+// Subscribe.
+func (cq *Queue) Unsubscribe(ch <-chan struct{}) {
+	cq.mtx.Lock()
+	defer cq.mtx.Unlock()
+	delete(cq.subscribers, ch)
+}
+
+// Caller must have lock.
+func (cq *Queue) notify() {
+	for _, ch := range cq.subscribers {
+		select {
+		case ch <- struct{}{}:
+		default:
+		}
 	}
 }
 
@@ -167,6 +205,7 @@ func (cq *Queue) Update() error {
 	}
 	cq.dontupdate = nil
 	cq.updated = updateStarted
+	cq.notify()
 	return nil
 }
 
@@ -192,9 +231,16 @@ func (cq *Queue) Unlock(uuid string) error {
 
 // Cancel cancels the given container.
 func (cq *Queue) Cancel(uuid string) error {
-	return cq.client.RequestAndDecode(nil, "PUT", "arvados/v1/containers/"+uuid, nil, map[string]map[string]interface{}{
+	err := cq.client.RequestAndDecode(nil, "PUT", "arvados/v1/containers/"+uuid, nil, map[string]map[string]interface{}{
 		"container": {"state": arvados.ContainerStateCancelled},
 	})
+	if err != nil {
+		return err
+	}
+	cq.mtx.Lock()
+	defer cq.mtx.Unlock()
+	cq.notify()
+	return nil
 }
 
 func (cq *Queue) apiUpdate(uuid, action string) error {
@@ -215,6 +261,7 @@ func (cq *Queue) apiUpdate(uuid, action string) error {
 		ent.Container.State, ent.Container.Priority, ent.Container.LockedByUUID = resp.State, resp.Priority, resp.LockedByUUID
 		cq.current[uuid] = ent
 	}
+	cq.notify()
 	return nil
 }
 
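Standalone sketch, not from the commit, of the subscribe/notify pattern used above: each subscriber gets a one-buffered channel, and notify() sends without blocking, so a slow subscriber only coalesces pending updates rather than stalling the queue.

    package main

    import (
    	"fmt"
    	"sync"
    )

    type notifier struct {
    	mtx         sync.Mutex
    	subscribers map[<-chan struct{}]chan struct{}
    }

    func (n *notifier) Subscribe() <-chan struct{} {
    	n.mtx.Lock()
    	defer n.mtx.Unlock()
    	if n.subscribers == nil {
    		n.subscribers = map[<-chan struct{}]chan struct{}{}
    	}
    	ch := make(chan struct{}, 1)
    	n.subscribers[ch] = ch
    	return ch
    }

    func (n *notifier) Unsubscribe(ch <-chan struct{}) {
    	n.mtx.Lock()
    	defer n.mtx.Unlock()
    	delete(n.subscribers, ch)
    }

    func (n *notifier) notify() {
    	n.mtx.Lock()
    	defer n.mtx.Unlock()
    	for _, ch := range n.subscribers {
    		select {
    		case ch <- struct{}{}:
    		default: // subscriber already has a notification pending
    		}
    	}
    }

    func main() {
    	n := &notifier{}
    	ch := n.Subscribe()
    	defer n.Unsubscribe(ch)
    	n.notify()
    	n.notify() // coalesced: only one value is pending
    	<-ch
    	fmt.Println("got notification")
    }
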
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index c441dc6fa..97aacf604 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -28,14 +28,10 @@ import (
 )
 
 const (
-	defaultPollInterval = time.Second
+	defaultPollInterval     = time.Second
+	defaultStaleLockTimeout = time.Minute
 )
 
-type containerQueue interface {
-	scheduler.ContainerQueue
-	Update() error
-}
-
 type pool interface {
 	scheduler.WorkerPool
 	Instances() []worker.InstanceView
@@ -45,14 +41,13 @@ type dispatcher struct {
 	Cluster       *arvados.Cluster
 	InstanceSetID cloud.InstanceSetID
 
-	logger       logrus.FieldLogger
-	reg          *prometheus.Registry
-	instanceSet  cloud.InstanceSet
-	pool         pool
-	queue        containerQueue
-	httpHandler  http.Handler
-	pollInterval time.Duration
-	sshKey       ssh.Signer
+	logger      logrus.FieldLogger
+	reg         *prometheus.Registry
+	instanceSet cloud.InstanceSet
+	pool        pool
+	queue       scheduler.ContainerQueue
+	httpHandler http.Handler
+	sshKey      ssh.Signer
 
 	setupOnce sync.Once
 	stop      chan struct{}
@@ -140,39 +135,24 @@ func (disp *dispatcher) initialize() {
 	mux.Handle("/metrics", metricsH)
 	mux.Handle("/metrics.json", metricsH)
 	disp.httpHandler = auth.RequireLiteralToken(disp.Cluster.ManagementToken, mux)
-
-	if d := disp.Cluster.Dispatch.PollInterval; d > 0 {
-		disp.pollInterval = time.Duration(d)
-	} else {
-		disp.pollInterval = defaultPollInterval
-	}
 }
 
 func (disp *dispatcher) run() {
 	defer disp.instanceSet.Stop()
 
-	t0 := time.Now()
-	disp.logger.Infof("FixStaleLocks starting.")
-	scheduler.FixStaleLocks(disp.logger, disp.queue, disp.pool, time.Duration(disp.Cluster.Dispatch.StaleLockTimeout))
-	disp.logger.Infof("FixStaleLocks finished (%s), starting scheduling.", time.Since(t0))
-
-	wp := disp.pool.Subscribe()
-	defer disp.pool.Unsubscribe(wp)
-	poll := time.NewTicker(disp.pollInterval)
-	for {
-		scheduler.Map(disp.logger, disp.queue, disp.pool)
-		scheduler.Sync(disp.logger, disp.queue, disp.pool)
-		select {
-		case <-disp.stop:
-			return
-		case <-wp:
-		case <-poll.C:
-			err := disp.queue.Update()
-			if err != nil {
-				disp.logger.Errorf("error updating queue: %s", err)
-			}
-		}
+	staleLockTimeout := time.Duration(disp.Cluster.Dispatch.StaleLockTimeout)
+	if staleLockTimeout == 0 {
+		staleLockTimeout = defaultStaleLockTimeout
 	}
+	pollInterval := time.Duration(disp.Cluster.Dispatch.PollInterval)
+	if pollInterval <= 0 {
+		pollInterval = defaultPollInterval
+	}
+	sched := scheduler.New(disp.logger, disp.queue, disp.pool, staleLockTimeout, pollInterval)
+	sched.Start()
+	defer sched.Stop()
+
+	<-disp.stop
 }
 
 // Management API: all active and queued containers.
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 362fb4942..248986bf3 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -199,6 +199,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 			PrivateKey:         dispatchprivraw,
 			PollInterval:       arvados.Duration(5 * time.Millisecond),
 			ProbeInterval:      arvados.Duration(5 * time.Millisecond),
+			StaleLockTimeout:   arvados.Duration(5 * time.Millisecond),
 			MaxProbesPerSecond: 1000,
 		},
 		InstanceTypes: arvados.InstanceTypeMap{
diff --git a/lib/dispatchcloud/scheduler/fix_stale_locks.go b/lib/dispatchcloud/scheduler/fix_stale_locks.go
index e9644aed2..985941090 100644
--- a/lib/dispatchcloud/scheduler/fix_stale_locks.go
+++ b/lib/dispatchcloud/scheduler/fix_stale_locks.go
@@ -9,17 +9,17 @@ import (
 
 	"git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
-	"github.com/Sirupsen/logrus"
 )
 
-// FixStaleLocks waits for any already-locked containers (i.e., locked
+// fixStaleLocks waits for any already-locked containers (i.e., locked
 // by a prior dispatcher process) to appear on workers as the worker
 // pool recovers its state. It unlocks any that still remain when all
-// workers are recovered or shutdown, or its timer expires.
-func FixStaleLocks(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool, limit time.Duration) {
-	wp := pool.Subscribe()
-	defer pool.Unsubscribe(wp)
-	timeout := time.NewTimer(limit)
+// workers are recovered or shutdown, or its timer
+// (sch.staleLockTimeout) expires.
+func (sch *Scheduler) fixStaleLocks() {
+	wp := sch.pool.Subscribe()
+	defer sch.pool.Unsubscribe(wp)
+	timeout := time.NewTimer(sch.staleLockTimeout)
 waiting:
 	for {
 		unlock := false
@@ -28,15 +28,15 @@ waiting:
 			// If all workers have been contacted, unlock
 			// containers that aren't claimed by any
 			// worker.
-			unlock = pool.Workers()[worker.StateUnknown] == 0
+			unlock = sch.pool.Workers()[worker.StateUnknown] == 0
 		case <-timeout.C:
 			// Give up and unlock the containers, even
 			// though they might be working.
 			unlock = true
 		}
 
-		running := pool.Running()
-		qEntries, _ := queue.Entries()
+		running := sch.pool.Running()
+		qEntries, _ := sch.queue.Entries()
 		for uuid, ent := range qEntries {
 			if ent.Container.State != arvados.ContainerStateLocked {
 				continue
@@ -47,9 +47,9 @@ waiting:
 			if !unlock {
 				continue waiting
 			}
-			err := queue.Unlock(uuid)
+			err := sch.queue.Unlock(uuid)
 			if err != nil {
-				logger.Warnf("Unlock %s: %s", uuid, err)
+				sch.logger.Warnf("Unlock %s: %s", uuid, err)
 			}
 		}
 		return
diff --git a/lib/dispatchcloud/scheduler/interfaces.go b/lib/dispatchcloud/scheduler/interfaces.go
index bdb8678e9..467247ad9 100644
--- a/lib/dispatchcloud/scheduler/interfaces.go
+++ b/lib/dispatchcloud/scheduler/interfaces.go
@@ -21,6 +21,9 @@ type ContainerQueue interface {
 	Cancel(uuid string) error
 	Forget(uuid string)
 	Get(uuid string) (arvados.Container, bool)
+	Subscribe() <-chan struct{}
+	Unsubscribe(<-chan struct{})
+	Update() error
 }
 
 // A WorkerPool asynchronously starts and stops worker VMs, and starts
diff --git a/lib/dispatchcloud/scheduler/map.go b/lib/dispatchcloud/scheduler/run_queue.go
similarity index 65%
rename from lib/dispatchcloud/scheduler/map.go
rename to lib/dispatchcloud/scheduler/run_queue.go
index aa92d0433..9fc1a1658 100644
--- a/lib/dispatchcloud/scheduler/map.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -2,11 +2,6 @@
 //
 // SPDX-License-Identifier: AGPL-3.0
 
-// Package scheduler uses a resizable worker pool to execute
-// containers in priority order.
-//
-// Scheduler functions must not be called concurrently using the same
-// queue or pool.
 package scheduler
 
 import (
@@ -18,28 +13,8 @@ import (
 	"github.com/Sirupsen/logrus"
 )
 
-// Map maps queued containers onto unallocated workers in priority
-// order, creating new workers if needed. It locks containers that can
-// be mapped onto existing/pending workers, and starts them if
-// possible.
-//
-// Map unlocks any containers that are locked but can't be
-// mapped. (For example, this happens when the cloud provider reaches
-// quota/capacity and a previously mappable container's priority is
-// surpassed by a newer container.)
-//
-// If it encounters errors while creating new workers, Map shuts down
-// idle workers, in case they are consuming quota.
-//
-// Map should not be called without first calling FixStaleLocks.
-//
-//	FixStaleLocks()
-//	for {
-//		Map()
-//		Sync()
-//	}
-func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
-	unsorted, _ := queue.Entries()
+func (sch *Scheduler) runQueue() {
+	unsorted, _ := sch.queue.Entries()
 	sorted := make([]container.QueueEnt, 0, len(unsorted))
 	for _, ent := range unsorted {
 		sorted = append(sorted, ent)
@@ -48,20 +23,20 @@ func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 		return sorted[i].Container.Priority > sorted[j].Container.Priority
 	})
 
-	running := pool.Running()
-	unalloc := pool.Unallocated()
+	running := sch.pool.Running()
+	unalloc := sch.pool.Unallocated()
 
-	logger.WithFields(logrus.Fields{
+	sch.logger.WithFields(logrus.Fields{
 		"Containers": len(sorted),
 		"Processes":  len(running),
-	}).Debug("mapping")
+	}).Debug("runQueue")
 
 	dontstart := map[arvados.InstanceType]bool{}
 	var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
 
 	for i, ctr := range sorted {
 		ctr, it := ctr.Container, ctr.InstanceType
-		logger := logger.WithFields(logrus.Fields{
+		logger := sch.logger.WithFields(logrus.Fields{
 			"ContainerUUID": ctr.UUID,
 			"InstanceType":  it.Name,
 		})
@@ -69,19 +44,20 @@ func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 			continue
 		}
 		if ctr.State == arvados.ContainerStateQueued {
-			logger.Debugf("locking")
-			if unalloc[it] < 1 && pool.AtQuota() {
+			if unalloc[it] < 1 && sch.pool.AtQuota() {
+				logger.Debugf("not locking: AtQuota and no unalloc workers")
 				overquota = sorted[i:]
 				break
 			}
-			err := queue.Lock(ctr.UUID)
+			logger.Debugf("locking")
+			err := sch.queue.Lock(ctr.UUID)
 			if err != nil {
 				logger.WithError(err).Warnf("lock error")
 				unalloc[it]++
 				continue
 			}
 			var ok bool
-			ctr, ok = queue.Get(ctr.UUID)
+			ctr, ok = sch.queue.Get(ctr.UUID)
 			if !ok {
 				logger.Error("(BUG?) container disappeared from queue after Lock succeeded")
 				continue
@@ -95,12 +71,12 @@ func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 		}
 		if unalloc[it] < 1 {
 			logger.Info("creating new instance")
-			err := pool.Create(it)
+			err := sch.pool.Create(it)
 			if err != nil {
 				if _, ok := err.(cloud.QuotaError); !ok {
 					logger.WithError(err).Warn("error creating worker")
 				}
-				queue.Unlock(ctr.UUID)
+				sch.queue.Unlock(ctr.UUID)
 				// Don't let lower-priority containers
 				// starve this one by keeping
 				// idle workers alive on different
@@ -117,7 +93,7 @@ func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 			// higher-priority container on the same
 			// instance type. Don't let this one sneak in
 			// ahead of it.
-		} else if pool.StartContainer(it, ctr) {
+		} else if sch.pool.StartContainer(it, ctr) {
 			unalloc[it]--
 		} else {
 			dontstart[it] = true
@@ -130,9 +106,9 @@ func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 		for _, ctr := range overquota {
 			ctr := ctr.Container
 			if ctr.State == arvados.ContainerStateLocked {
-				logger := logger.WithField("ContainerUUID", ctr.UUID)
+				logger := sch.logger.WithField("ContainerUUID", ctr.UUID)
 				logger.Debug("unlock because pool capacity is used by higher priority containers")
-				err := queue.Unlock(ctr.UUID)
+				err := sch.queue.Unlock(ctr.UUID)
 				if err != nil {
 					logger.WithError(err).Warn("error unlocking")
 				}
@@ -144,7 +120,7 @@ func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 			if n < 1 {
 				continue
 			}
-			pool.Shutdown(it)
+			sch.pool.Shutdown(it)
 		}
 	}
 }
diff --git a/lib/dispatchcloud/scheduler/map_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
similarity index 53%
rename from lib/dispatchcloud/scheduler/map_test.go
rename to lib/dispatchcloud/scheduler/run_queue_test.go
index f30520053..e5e8c7ecf 100644
--- a/lib/dispatchcloud/scheduler/map_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -6,10 +6,8 @@ package scheduler
 
 import (
 	"errors"
-	"fmt"
 	"time"
 
-	"git.curoverse.com/arvados.git/lib/dispatchcloud/container"
 	"git.curoverse.com/arvados.git/lib/dispatchcloud/test"
 	"git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
@@ -20,14 +18,6 @@ import (
 var (
 	logger = logrus.StandardLogger()
 
-	// arbitrary example instance types
-	types = func() (r []arvados.InstanceType) {
-		for i := 0; i < 16; i++ {
-			r = append(r, test.InstanceType(i))
-		}
-		return
-	}()
-
 	// arbitrary example container UUIDs
 	uuids = func() (r []string) {
 		for i := 0; i < 16; i++ {
@@ -37,38 +27,6 @@ var (
 	}()
 )
 
-type stubQueue struct {
-	ents map[string]container.QueueEnt
-}
-
-func (q *stubQueue) Entries() (map[string]container.QueueEnt, time.Time) {
-	return q.ents, time.Now()
-}
-func (q *stubQueue) Lock(uuid string) error {
-	return q.setState(uuid, arvados.ContainerStateLocked)
-}
-func (q *stubQueue) Unlock(uuid string) error {
-	return q.setState(uuid, arvados.ContainerStateQueued)
-}
-func (q *stubQueue) Cancel(uuid string) error {
-	return q.setState(uuid, arvados.ContainerStateCancelled)
-}
-func (q *stubQueue) Forget(uuid string) {
-}
-func (q *stubQueue) Get(uuid string) (arvados.Container, bool) {
-	ent, ok := q.ents[uuid]
-	return ent.Container, ok
-}
-func (q *stubQueue) setState(uuid string, state arvados.ContainerState) error {
-	ent, ok := q.ents[uuid]
-	if !ok {
-		return fmt.Errorf("no such ent: %q", uuid)
-	}
-	ent.Container.State = state
-	q.ents[uuid] = ent
-	return nil
-}
-
 type stubQuotaError struct {
 	error
 }
@@ -135,47 +93,71 @@ var _ = check.Suite(&SchedulerSuite{})
 
 type SchedulerSuite struct{}
 
-// Map priority=4 container to idle node. Create a new instance for
+// Assign priority=4 container to idle node. Create a new instance for
 // the priority=3 container. Don't try to start any priority<3
 // containers because the priority=3 container didn't start
 // immediately. Don't try to create any other nodes after the failed
 // create.
-func (*SchedulerSuite) TestMapIdle(c *check.C) {
-	queue := stubQueue{
-		ents: map[string]container.QueueEnt{
-			uuids[1]: {
-				Container:    arvados.Container{UUID: uuids[1], Priority: 1, State: arvados.ContainerStateQueued},
-				InstanceType: types[1],
+func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) {
+	queue := test.Queue{
+		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
+			return test.InstanceType(ctr.RuntimeConstraints.VCPUs), nil
+		},
+		Containers: []arvados.Container{
+			{
+				UUID:     test.ContainerUUID(1),
+				Priority: 1,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
 			},
-			uuids[2]: {
-				Container:    arvados.Container{UUID: uuids[2], Priority: 2, State: arvados.ContainerStateQueued},
-				InstanceType: types[1],
+			{
+				UUID:     test.ContainerUUID(2),
+				Priority: 2,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
 			},
-			uuids[3]: {
-				Container:    arvados.Container{UUID: uuids[3], Priority: 3, State: arvados.ContainerStateQueued},
-				InstanceType: types[1],
+			{
+				UUID:     test.ContainerUUID(3),
+				Priority: 3,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
 			},
-			uuids[4]: {
-				Container:    arvados.Container{UUID: uuids[4], Priority: 4, State: arvados.ContainerStateQueued},
-				InstanceType: types[1],
+			{
+				UUID:     test.ContainerUUID(4),
+				Priority: 4,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
 			},
 		},
 	}
+	queue.Update()
 	pool := stubPool{
 		unalloc: map[arvados.InstanceType]int{
-			types[1]: 1,
-			types[2]: 2,
+			test.InstanceType(1): 1,
+			test.InstanceType(2): 2,
 		},
 		idle: map[arvados.InstanceType]int{
-			types[1]: 1,
-			types[2]: 2,
+			test.InstanceType(1): 1,
+			test.InstanceType(2): 2,
 		},
 		running:   map[string]time.Time{},
 		canCreate: 1,
 	}
-	Map(logger, &queue, &pool)
-	c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{types[1]})
-	c.Check(pool.starts, check.DeepEquals, []string{uuids[4], uuids[3]})
+	New(logger, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+	c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(1)})
+	c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(4), test.ContainerUUID(3)})
 	c.Check(pool.running, check.HasLen, 1)
 	for uuid := range pool.running {
 		c.Check(uuid, check.Equals, uuids[4])
@@ -184,31 +166,43 @@ func (*SchedulerSuite) TestMapIdle(c *check.C) {
 
 // Shutdown some nodes if Create() fails -- and without even calling
 // Create(), if AtQuota() is true.
-func (*SchedulerSuite) TestMapShutdownAtQuota(c *check.C) {
+func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) {
 	for quota := 0; quota < 2; quota++ {
-		shouldCreate := types[1 : 1+quota]
-		queue := stubQueue{
-			ents: map[string]container.QueueEnt{
-				uuids[1]: {
-					Container:    arvados.Container{UUID: uuids[1], Priority: 1, State: arvados.ContainerStateQueued},
-					InstanceType: types[1],
+		shouldCreate := []arvados.InstanceType{}
+		for i := 1; i < 1+quota; i++ {
+			shouldCreate = append(shouldCreate, test.InstanceType(i))
+		}
+		queue := test.Queue{
+			ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
+				return test.InstanceType(ctr.RuntimeConstraints.VCPUs), nil
+			},
+			Containers: []arvados.Container{
+				{
+					UUID:     test.ContainerUUID(1),
+					Priority: 1,
+					State:    arvados.ContainerStateQueued,
+					RuntimeConstraints: arvados.RuntimeConstraints{
+						VCPUs: 1,
+						RAM:   1 << 30,
+					},
 				},
 			},
 		}
+		queue.Update()
 		pool := stubPool{
 			atQuota: quota == 0,
 			unalloc: map[arvados.InstanceType]int{
-				types[2]: 2,
+				test.InstanceType(2): 2,
 			},
 			idle: map[arvados.InstanceType]int{
-				types[2]: 2,
+				test.InstanceType(2): 2,
 			},
 			running:   map[string]time.Time{},
 			creates:   []arvados.InstanceType{},
 			starts:    []string{},
 			canCreate: 0,
 		}
-		Map(logger, &queue, &pool)
+		New(logger, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
 		c.Check(pool.creates, check.DeepEquals, shouldCreate)
 		c.Check(pool.starts, check.DeepEquals, []string{})
 		c.Check(pool.shutdowns, check.Not(check.Equals), 0)
@@ -217,55 +211,89 @@ func (*SchedulerSuite) TestMapShutdownAtQuota(c *check.C) {
 
 // Start lower-priority containers while waiting for new/existing
 // workers to come up for higher-priority containers.
-func (*SchedulerSuite) TestMapStartWhileCreating(c *check.C) {
+func (*SchedulerSuite) TestStartWhileCreating(c *check.C) {
 	pool := stubPool{
 		unalloc: map[arvados.InstanceType]int{
-			types[1]: 1,
-			types[2]: 1,
+			test.InstanceType(1): 1,
+			test.InstanceType(2): 1,
 		},
 		idle: map[arvados.InstanceType]int{
-			types[1]: 1,
-			types[2]: 1,
+			test.InstanceType(1): 1,
+			test.InstanceType(2): 1,
 		},
 		running:   map[string]time.Time{},
 		canCreate: 2,
 	}
-	queue := stubQueue{
-		ents: map[string]container.QueueEnt{
-			uuids[1]: {
+	queue := test.Queue{
+		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
+			return test.InstanceType(ctr.RuntimeConstraints.VCPUs), nil
+		},
+		Containers: []arvados.Container{
+			{
 				// create a new worker
-				Container:    arvados.Container{UUID: uuids[1], Priority: 1, State: arvados.ContainerStateQueued},
-				InstanceType: types[1],
+				UUID:     test.ContainerUUID(1),
+				Priority: 1,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
 			},
-			uuids[2]: {
+			{
 				// tentatively map to unalloc worker
-				Container:    arvados.Container{UUID: uuids[2], Priority: 2, State: arvados.ContainerStateQueued},
-				InstanceType: types[1],
+				UUID:     test.ContainerUUID(2),
+				Priority: 2,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
 			},
-			uuids[3]: {
+			{
 				// start now on idle worker
-				Container:    arvados.Container{UUID: uuids[3], Priority: 3, State: arvados.ContainerStateQueued},
-				InstanceType: types[1],
+				UUID:     test.ContainerUUID(3),
+				Priority: 3,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
 			},
-			uuids[4]: {
+			{
 				// create a new worker
-				Container:    arvados.Container{UUID: uuids[4], Priority: 4, State: arvados.ContainerStateQueued},
-				InstanceType: types[2],
+				UUID:     test.ContainerUUID(4),
+				Priority: 4,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 2,
+					RAM:   2 << 30,
+				},
 			},
-			uuids[5]: {
+			{
 				// tentatively map to unalloc worker
-				Container:    arvados.Container{UUID: uuids[5], Priority: 5, State: arvados.ContainerStateQueued},
-				InstanceType: types[2],
+				UUID:     test.ContainerUUID(5),
+				Priority: 5,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 2,
+					RAM:   2 << 30,
+				},
 			},
-			uuids[6]: {
+			{
 				// start now on idle worker
-				Container:    arvados.Container{UUID: uuids[6], Priority: 6, State: arvados.ContainerStateQueued},
-				InstanceType: types[2],
+				UUID:     test.ContainerUUID(6),
+				Priority: 6,
+				State:    arvados.ContainerStateQueued,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 2,
+					RAM:   2 << 30,
+				},
 			},
 		},
 	}
-	Map(logger, &queue, &pool)
-	c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{types[2], types[1]})
+	queue.Update()
+	New(logger, &queue, &pool, time.Millisecond, time.Millisecond).runQueue()
+	c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(2), test.InstanceType(1)})
 	c.Check(pool.starts, check.DeepEquals, []string{uuids[6], uuids[5], uuids[3], uuids[2]})
 	running := map[string]bool{}
 	for uuid, t := range pool.running {
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
new file mode 100644
index 000000000..0be8edb7b
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -0,0 +1,107 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+// Package scheduler uses a resizable worker pool to execute
+// containers in priority order.
+package scheduler
+
+import (
+	"sync"
+	"time"
+
+	"github.com/Sirupsen/logrus"
+)
+
+// A Scheduler maps queued containers onto unallocated workers in
+// priority order, creating new workers if needed. It locks containers
+// that can be mapped onto existing/pending workers, and starts them
+// if possible.
+//
+// A Scheduler unlocks any containers that are locked but can't be
+// mapped. (For example, this happens when the cloud provider reaches
+// quota/capacity and a previously mappable container's priority is
+// surpassed by a newer container.)
+//
+// If it encounters errors while creating new workers, a Scheduler
+// shuts down idle workers, in case they are consuming quota.
+type Scheduler struct {
+	logger              logrus.FieldLogger
+	queue               ContainerQueue
+	pool                WorkerPool
+	staleLockTimeout    time.Duration
+	queueUpdateInterval time.Duration
+
+	runOnce sync.Once
+	stop    chan struct{}
+}
+
+// New returns a new unstarted Scheduler.
+//
+// Any given queue and pool should not be used by more than one
+// scheduler at a time.
+func New(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool, staleLockTimeout, queueUpdateInterval time.Duration) *Scheduler {
+	return &Scheduler{
+		logger:              logger,
+		queue:               queue,
+		pool:                pool,
+		staleLockTimeout:    staleLockTimeout,
+		queueUpdateInterval: queueUpdateInterval,
+		stop:                make(chan struct{}),
+	}
+}
+
+// Start starts the scheduler.
+func (sch *Scheduler) Start() {
+	go sch.runOnce.Do(sch.run)
+}
+
+// Stop stops the scheduler. No other method should be called after
+// Stop.
+func (sch *Scheduler) Stop() {
+	close(sch.stop)
+}
+
+func (sch *Scheduler) run() {
+	// Ensure the queue is fetched once before attempting anything.
+	for err := sch.queue.Update(); err != nil; err = sch.queue.Update() {
+		sch.logger.Errorf("error updating queue: %s", err)
+		d := sch.queueUpdateInterval / 60
+		sch.logger.Infof("waiting %s before retry", d)
+		time.Sleep(d)
+	}
+
+	// Keep the queue up to date.
+	poll := time.NewTicker(sch.queueUpdateInterval)
+	defer poll.Stop()
+	go func() {
+		for range poll.C {
+			err := sch.queue.Update()
+			if err != nil {
+				sch.logger.Errorf("error updating queue: %s", err)
+			}
+		}
+	}()
+
+	t0 := time.Now()
+	sch.logger.Infof("FixStaleLocks starting.")
+	sch.fixStaleLocks()
+	sch.logger.Infof("FixStaleLocks finished (%s), starting scheduling.", time.Since(t0))
+
+	poolNotify := sch.pool.Subscribe()
+	defer sch.pool.Unsubscribe(poolNotify)
+
+	queueNotify := sch.queue.Subscribe()
+	defer sch.queue.Unsubscribe(queueNotify)
+
+	for {
+		sch.runQueue()
+		sch.sync()
+		select {
+		case <-sch.stop:
+			return
+		case <-queueNotify:
+		case <-poolNotify:
+		}
+	}
+}
diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index bd0e9b309..a85162deb 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -12,7 +12,7 @@ import (
 	"github.com/Sirupsen/logrus"
 )
 
-// Sync resolves discrepancies between the queue and the pool:
+// sync resolves discrepancies between the queue and the pool:
 //
 // Lingering crunch-run processes for finalized and unlocked/requeued
 // containers are killed.
@@ -22,27 +22,24 @@ import (
 //
 // Running containers whose crunch-run processes have exited are
 // cancelled.
-//
-// Sync must not be called concurrently with other calls to Map or
-// Sync using the same queue or pool.
-func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
-	running := pool.Running()
+func (sch *Scheduler) sync() {
+	running := sch.pool.Running()
 	cancel := func(ent container.QueueEnt, reason string) {
 		uuid := ent.Container.UUID
-		logger := logger.WithField("ContainerUUID", uuid)
+		logger := sch.logger.WithField("ContainerUUID", uuid)
 		logger.Infof("cancelling container because %s", reason)
-		err := queue.Cancel(uuid)
+		err := sch.queue.Cancel(uuid)
 		if err != nil {
 			logger.WithError(err).Print("error cancelling container")
 		}
 	}
 	kill := func(ent container.QueueEnt) {
 		uuid := ent.Container.UUID
-		logger := logger.WithField("ContainerUUID", uuid)
+		logger := sch.logger.WithField("ContainerUUID", uuid)
 		logger.Debugf("killing crunch-run process because state=%q", ent.Container.State)
-		pool.KillContainer(uuid)
+		sch.pool.KillContainer(uuid)
 	}
-	qEntries, qUpdated := queue.Entries()
+	qEntries, qUpdated := sch.queue.Entries()
 	for uuid, ent := range qEntries {
 		exited, running := running[uuid]
 		switch ent.Container.State {
@@ -64,11 +61,11 @@ func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 				// container.
 				go kill(ent)
 			} else {
-				logger.WithFields(logrus.Fields{
+				sch.logger.WithFields(logrus.Fields{
 					"ContainerUUID": uuid,
 					"State":         ent.Container.State,
 				}).Info("container finished")
-				queue.Forget(uuid)
+				sch.queue.Forget(uuid)
 			}
 		case arvados.ContainerStateQueued:
 			if running {
@@ -80,18 +77,18 @@ func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 			}
 		case arvados.ContainerStateLocked:
 			if running && !exited.IsZero() && qUpdated.After(exited) {
-				logger = logger.WithFields(logrus.Fields{
+				logger := sch.logger.WithFields(logrus.Fields{
 					"ContainerUUID": uuid,
 					"Exited":        time.Since(exited).Seconds(),
 				})
 				logger.Infof("requeueing container because state=%q after crunch-run exited", ent.Container.State)
-				err := queue.Unlock(uuid)
+				err := sch.queue.Unlock(uuid)
 				if err != nil {
 					logger.WithError(err).Info("error requeueing container")
 				}
 			}
 		default:
-			logger.WithField("ContainerUUID", uuid).Errorf("BUG: unexpected state %q", ent.Container.State)
+			sch.logger.WithField("ContainerUUID", uuid).Errorf("BUG: unexpected state %q", ent.Container.State)
 		}
 	}
 }
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index 909f56114..0bad5a391 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -6,6 +6,7 @@ package test
 
 import (
 	"fmt"
+	"sync"
 	"time"
 
 	"git.curoverse.com/arvados.git/lib/dispatchcloud/container"
@@ -22,13 +23,18 @@ type Queue struct {
 	// must not be nil.
 	ChooseType func(*arvados.Container) (arvados.InstanceType, error)
 
-	entries map[string]container.QueueEnt
-	updTime time.Time
+	entries     map[string]container.QueueEnt
+	updTime     time.Time
+	subscribers map[<-chan struct{}]chan struct{}
+
+	mtx sync.Mutex
 }
 
 // Entries returns the containers that were queued when Update was
 // last called.
 func (q *Queue) Entries() (map[string]container.QueueEnt, time.Time) {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
 	updTime := q.updTime
 	r := map[string]container.QueueEnt{}
 	for uuid, ent := range q.entries {
@@ -42,11 +48,15 @@ func (q *Queue) Entries() (map[string]container.QueueEnt, time.Time) {
 // the state has been changed (via Lock, Unlock, or Cancel) since the
 // last Update, the updated state is returned.
 func (q *Queue) Get(uuid string) (arvados.Container, bool) {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
 	ent, ok := q.entries[uuid]
 	return ent.Container, ok
 }
 
 func (q *Queue) Forget(uuid string) {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
 	delete(q.entries, uuid)
 }
 
@@ -62,7 +72,35 @@ func (q *Queue) Cancel(uuid string) error {
 	return q.changeState(uuid, q.entries[uuid].Container.State, arvados.ContainerStateCancelled)
 }
 
+func (q *Queue) Subscribe() <-chan struct{} {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
+	if q.subscribers == nil {
+		q.subscribers = map[<-chan struct{}]chan struct{}{}
+	}
+	ch := make(chan struct{}, 1)
+	q.subscribers[ch] = ch
+	return ch
+}
+
+func (q *Queue) Unsubscribe(ch <-chan struct{}) {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
+	delete(q.subscribers, ch)
+}
+
+func (q *Queue) notify() {
+	for _, ch := range q.subscribers {
+		select {
+		case ch <- struct{}{}:
+		default:
+		}
+	}
+}
+
 func (q *Queue) changeState(uuid string, from, to arvados.ContainerState) error {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
 	ent := q.entries[uuid]
 	if ent.Container.State != from {
 		return fmt.Errorf("lock failed: state=%q", ent.Container.State)
@@ -75,11 +113,14 @@ func (q *Queue) changeState(uuid string, from, to arvados.ContainerState) error
 			break
 		}
 	}
+	q.notify()
 	return nil
 }
 
 // Update rebuilds the current entries from the Containers slice.
 func (q *Queue) Update() error {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
 	updTime := time.Now()
 	upd := map[string]container.QueueEnt{}
 	for _, ctr := range q.Containers {
@@ -95,6 +136,7 @@ func (q *Queue) Update() error {
 	}
 	q.entries = upd
 	q.updTime = updTime
+	q.notify()
 	return nil
 }
 
@@ -106,6 +148,8 @@ func (q *Queue) Update() error {
 // The resulting changes are not exposed through Get() or Entries()
 // until the next call to Update().
 func (q *Queue) Notify(upd arvados.Container) {
+	q.mtx.Lock()
+	defer q.mtx.Unlock()
 	for i, ctr := range q.Containers {
 		if ctr.UUID == upd.UUID {
 			q.Containers[i] = upd

commit 67464732a18e0aac11d89a75987151543b704cf6
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Oct 29 13:13:37 2018 -0400

    14360: Cancel containers asynchronously in Sync cleanup.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index c9273b4be..bd0e9b309 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -48,9 +48,9 @@ func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 		switch ent.Container.State {
 		case arvados.ContainerStateRunning:
 			if !running {
-				cancel(ent, "not running on any worker")
+				go cancel(ent, "not running on any worker")
 			} else if !exited.IsZero() && qUpdated.After(exited) {
-				cancel(ent, "state=\"Running\" after crunch-run exited")
+				go cancel(ent, "state=\"Running\" after crunch-run exited")
 			}
 		case arvados.ContainerStateComplete, arvados.ContainerStateCancelled:
 			if running {
@@ -62,7 +62,7 @@ func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 				// of kill() will be to make the
 				// worker available for the next
 				// container.
-				kill(ent)
+				go kill(ent)
 			} else {
 				logger.WithFields(logrus.Fields{
 					"ContainerUUID": uuid,
@@ -76,7 +76,7 @@ func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 				// a network outage and is still
 				// preparing to run a container that
 				// has already been unlocked/requeued.
-				kill(ent)
+				go kill(ent)
 			}
 		case arvados.ContainerStateLocked:
 			if running && !exited.IsZero() && qUpdated.After(exited) {

commit a5ef124795af5b28a97c98e69a89c4c6e99cf0b7
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Oct 29 13:12:48 2018 -0400

    14360: Log level debug->warning for unexpected condition.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/scheduler/map.go b/lib/dispatchcloud/scheduler/map.go
index 75d8eeefb..aa92d0433 100644
--- a/lib/dispatchcloud/scheduler/map.go
+++ b/lib/dispatchcloud/scheduler/map.go
@@ -87,7 +87,7 @@ func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 				continue
 			}
 			if ctr.State != arvados.ContainerStateLocked {
-				logger.Debugf("(race?) container has state=%q after Lock succeeded", ctr.State)
+				logger.Warnf("(race?) container has state=%q after Lock succeeded", ctr.State)
 			}
 		}
 		if ctr.State != arvados.ContainerStateLocked {

commit a5fe34a8cd05c4e55deaec599347f65e6d662d22
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Oct 29 11:28:42 2018 -0400

    14360: Improve comments.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/scheduler/map.go b/lib/dispatchcloud/scheduler/map.go
index 5742c1f47..75d8eeefb 100644
--- a/lib/dispatchcloud/scheduler/map.go
+++ b/lib/dispatchcloud/scheduler/map.go
@@ -32,6 +32,12 @@ import (
 // idle workers, in case they are consuming quota.
 //
 // Map should not be called without first calling FixStaleLocks.
+//
+//	FixStaleLocks()
+//	for {
+//		Map()
+//		Sync()
+//	}
 func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 	unsorted, _ := queue.Entries()
 	sorted := make([]container.QueueEnt, 0, len(unsorted))
diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index 00e2a30f7..c9273b4be 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -54,6 +54,14 @@ func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 			}
 		case arvados.ContainerStateComplete, arvados.ContainerStateCancelled:
 			if running {
+				// Kill crunch-run in case it's stuck;
+				// nothing it does now will matter
+				// anyway. If crunch-run has already
+				// exited and we just haven't found
+				// out about it yet, the only effect
+				// of kill() will be to make the
+				// worker available for the next
+				// container.
 				kill(ent)
 			} else {
 				logger.WithFields(logrus.Fields{
@@ -64,6 +72,10 @@ func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
 			}
 		case arvados.ContainerStateQueued:
 			if running {
+				// Can happen if a worker returns from
+				// a network outage and is still
+				// preparing to run a container that
+				// has already been unlocked/requeued.
 				kill(ent)
 			}
 		case arvados.ContainerStateLocked:
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index d1e23eb6b..1e759e38f 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -409,6 +409,9 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
 
 // KillContainer kills the crunch-run process for the given container
 // UUID, if it's running on any worker.
+//
+// KillContainer returns immediately; the act of killing the container
+// takes some time, and runs in the background.
 func (wp *Pool) KillContainer(uuid string) {
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()

commit 8ef9e211fa1eb91deb1b6aa7245cd7e5b8dd4c33
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Oct 29 11:17:05 2018 -0400

    14360: Add missing mutex lock.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 28feb3ded..d1e23eb6b 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -233,6 +233,8 @@ func (wp *Pool) Create(it arvados.InstanceType) error {
 // AtQuota returns true if Create is not expected to work at the
 // moment.
 func (wp *Pool) AtQuota() bool {
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
 	return time.Now().Before(wp.atQuotaUntil)
 }
 

commit 468a33e9931c3dbadc83332a3b85d7677c9e4dc3
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Oct 29 11:08:45 2018 -0400

    14360: Rename View to InstanceView.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index 707c31f24..c441dc6fa 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -38,7 +38,7 @@ type containerQueue interface {
 
 type pool interface {
 	scheduler.WorkerPool
-	View() []worker.View
+	Instances() []worker.InstanceView
 }
 
 type dispatcher struct {
@@ -198,8 +198,8 @@ func (disp *dispatcher) apiInstances(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 	var resp struct {
-		Items []worker.View
+		Items []worker.InstanceView
 	}
-	resp.Items = disp.pool.View()
+	resp.Items = disp.pool.Instances()
 	json.NewEncoder(w).Encode(resp)
 }
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index a9531987c..28feb3ded 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -23,8 +23,8 @@ const (
 	tagKeyHold         = "Hold"
 )
 
-// A View shows a worker's current state and recent activity.
-type View struct {
+// An InstanceView shows a worker's current state and recent activity.
+type InstanceView struct {
 	Instance             string
 	Price                float64
 	ArvadosInstanceType  string
@@ -643,13 +643,14 @@ func (wp *Pool) Stop() {
 	close(wp.stop)
 }
 
-// View reports status information for every worker in the pool.
-func (wp *Pool) View() []View {
-	var r []View
+// Instances returns an InstanceView for each worker in the pool,
+// summarizing its current state and recent activity.
+func (wp *Pool) Instances() []InstanceView {
+	var r []InstanceView
 	wp.setupOnce.Do(wp.setup)
 	wp.mtx.Lock()
 	for _, w := range wp.workers {
-		r = append(r, View{
+		r = append(r, InstanceView{
 			Instance:             w.instance.String(),
 			Price:                w.instType.Price,
 			ArvadosInstanceType:  w.instType.Name,

commit 0f159e5eb304e08a0704156046762709bd6e1eba
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Oct 27 01:58:29 2018 -0400

    14360: Rename keeplocal -> dontupdate, add explanation.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 17a38259d..e0e1cd0cd 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -56,11 +56,19 @@ type Queue struct {
 	chooseType typeChooser
 	client     APIClient
 
-	auth      *arvados.APIClientAuthorization
-	current   map[string]QueueEnt
-	updated   time.Time
-	mtx       sync.Mutex
-	keeplocal map[string]struct{}
+	auth    *arvados.APIClientAuthorization
+	current map[string]QueueEnt
+	updated time.Time
+	mtx     sync.Mutex
+
+	// Methods that modify the Queue (like Lock) add the affected
+	// container UUIDs to dontupdate. When applying a batch of
+	// updates received from the network, anything appearing in
+	// dontupdate is skipped, in case the received update has
+	// already been superseded by the locally initiated change.
+	// When no network update is in progress, this protection is
+	// not needed, and dontupdate is nil.
+	dontupdate map[string]struct{}
 }
 
 // NewQueue returns a new Queue. When a new container appears in the
@@ -126,7 +134,7 @@ func (cq *Queue) Entries() (entries map[string]QueueEnt, threshold time.Time) {
 // containers.
 func (cq *Queue) Update() error {
 	cq.mtx.Lock()
-	cq.keeplocal = map[string]struct{}{}
+	cq.dontupdate = map[string]struct{}{}
 	updateStarted := time.Now()
 	cq.mtx.Unlock()
 
@@ -138,7 +146,7 @@ func (cq *Queue) Update() error {
 	cq.mtx.Lock()
 	defer cq.mtx.Unlock()
 	for uuid, ctr := range next {
-		if _, keep := cq.keeplocal[uuid]; keep {
+		if _, keep := cq.dontupdate[uuid]; keep {
 			continue
 		}
 		if cur, ok := cq.current[uuid]; !ok {
@@ -149,7 +157,7 @@ func (cq *Queue) Update() error {
 		}
 	}
 	for uuid := range cq.current {
-		if _, keep := cq.keeplocal[uuid]; keep {
+		if _, keep := cq.dontupdate[uuid]; keep {
 			continue
 		} else if _, keep = next[uuid]; keep {
 			continue
@@ -157,7 +165,7 @@ func (cq *Queue) Update() error {
 			delete(cq.current, uuid)
 		}
 	}
-	cq.keeplocal = nil
+	cq.dontupdate = nil
 	cq.updated = updateStarted
 	return nil
 }
@@ -198,8 +206,8 @@ func (cq *Queue) apiUpdate(uuid, action string) error {
 
 	cq.mtx.Lock()
 	defer cq.mtx.Unlock()
-	if cq.keeplocal != nil {
-		cq.keeplocal[uuid] = struct{}{}
+	if cq.dontupdate != nil {
+		cq.dontupdate[uuid] = struct{}{}
 	}
 	if ent, ok := cq.current[uuid]; !ok {
 		cq.addEnt(uuid, resp)
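
The dontupdate map described in the new comment is a general guard against lost updates: locally initiated changes record the affected keys while a batch refresh is in flight, and the refresh skips those keys. A standalone sketch of the idea, using illustrative names rather than the real Queue API:

    package main

    import (
        "fmt"
        "sync"
    )

    // cache holds items keyed by UUID and refreshes them in batches.
    // While a refresh is in flight, locally modified keys are recorded
    // in dontupdate so stale batch data cannot clobber them.
    type cache struct {
        mtx        sync.Mutex
        current    map[string]string
        dontupdate map[string]struct{}
    }

    // Modify applies a local change and, if a refresh is running,
    // marks the key so the refresh leaves it alone.
    func (c *cache) Modify(uuid, state string) {
        c.mtx.Lock()
        defer c.mtx.Unlock()
        c.current[uuid] = state
        if c.dontupdate != nil {
            c.dontupdate[uuid] = struct{}{}
        }
    }

    // Refresh installs a freshly fetched batch, skipping any key that
    // was modified locally after the fetch started.
    func (c *cache) Refresh(fetch func() map[string]string) {
        c.mtx.Lock()
        c.dontupdate = map[string]struct{}{}
        c.mtx.Unlock()

        next := fetch() // may take a while; lock is not held

        c.mtx.Lock()
        defer c.mtx.Unlock()
        for uuid, state := range next {
            if _, skip := c.dontupdate[uuid]; skip {
                continue
            }
            c.current[uuid] = state
        }
        c.dontupdate = nil
    }

    func main() {
        c := &cache{current: map[string]string{}}
        c.Refresh(func() map[string]string {
            return map[string]string{"ctr1": "Queued"}
        })
        c.Modify("ctr1", "Locked")
        fmt.Println(c.current["ctr1"]) // Locked
    }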

commit 3ebb991101ef62387268d594e3e4307a0f5c4e9e
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Oct 27 01:41:11 2018 -0400

    14360: Update busy/lastUUID/running together. Clarify race comment.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 4ddd3745e..a9531987c 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -382,9 +382,12 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
 		stdout, stderr, err := wkr.executor.Execute("crunch-run --detach '"+ctr.UUID+"'", nil)
 		wp.mtx.Lock()
 		defer wp.mtx.Unlock()
-		wkr.updated = time.Now()
+		now := time.Now()
+		wkr.updated = now
+		wkr.busy = now
 		delete(wkr.starting, ctr.UUID)
 		wkr.running[ctr.UUID] = struct{}{}
+		wkr.lastUUID = ctr.UUID
 		if err != nil {
 			logger.WithField("stdout", string(stdout)).
 				WithField("stderr", string(stderr)).
@@ -789,10 +792,6 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 
 	updateTime := time.Now()
 	wkr.probed = updateTime
-	if len(ctrUUIDs) > 0 {
-		wkr.busy = updateTime
-		wkr.lastUUID = ctrUUIDs[0]
-	}
 	if wkr.state == StateShutdown || wkr.state == StateHold {
 	} else if booted {
 		if wkr.state != StateRunning {
@@ -804,13 +803,18 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 	}
 
 	if updated != wkr.updated {
-		// Worker was updated (e.g., by starting a new
-		// container) after the probe began. Avoid clobbering
-		// those changes with the probe results.
+		// Worker was updated after the probe began, so
+		// wkr.running might have a container UUID that was
+		// not yet running when ctrUUIDs was generated. Leave
+		// wkr.running alone and wait for the next probe to
+		// catch up on any changes.
 		return
 	}
 
-	if len(ctrUUIDs) == 0 && len(wkr.running) > 0 {
+	if len(ctrUUIDs) > 0 {
+		wkr.busy = updateTime
+		wkr.lastUUID = ctrUUIDs[0]
+	} else if len(wkr.running) > 0 {
 		wkr.unallocated = updateTime
 	}
 	running := map[string]struct{}{}
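
The clarified race comment boils down to an optimistic-concurrency check: remember the worker's updated timestamp before the slow probe, and discard the probe results if the timestamp moved while the lock was released. A rough sketch of that pattern, with illustrative names:

    package main

    import (
        "fmt"
        "sync"
        "time"
    )

    // worker records when its in-memory state last changed.
    type worker struct {
        mtx     sync.Mutex
        updated time.Time
        running map[string]struct{}
    }

    // probeAndApply runs a slow probe and applies its result only if the
    // worker has not been modified since the probe started; otherwise the
    // result may be stale and is left for the next probe to sort out.
    func (w *worker) probeAndApply(probe func() []string) {
        w.mtx.Lock()
        updated := w.updated
        w.mtx.Unlock()

        ctrUUIDs := probe() // slow; lock not held

        w.mtx.Lock()
        defer w.mtx.Unlock()
        if !w.updated.Equal(updated) {
            // State changed while probing (e.g., a container was
            // started); don't clobber it with older probe data.
            return
        }
        running := map[string]struct{}{}
        for _, uuid := range ctrUUIDs {
            running[uuid] = struct{}{}
        }
        w.running = running
        w.updated = time.Now()
    }

    func main() {
        w := &worker{updated: time.Now(), running: map[string]struct{}{}}
        w.probeAndApply(func() []string { return []string{"ctr1"} })
        fmt.Println(len(w.running)) // 1
    }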

commit 968519edf4c9150ac65991aea5f977356551b4e4
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Oct 27 01:19:05 2018 -0400

    14360: Move monitoring API test from lame driver to stub driver.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index e6f536d24..362fb4942 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -318,10 +318,8 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 }
 
 func (s *DispatcherSuite) TestInstancesAPI(c *check.C) {
-	var lameSet test.LameInstanceSet
-	drivers["test"] = cloud.DriverFunc(func(params map[string]interface{}, id cloud.InstanceSetID) (cloud.InstanceSet, error) {
-		return &lameSet, nil
-	})
+	s.cluster.CloudVMs.TimeoutBooting = arvados.Duration(time.Second)
+	drivers["test"] = s.stubDriver
 
 	type instance struct {
 		Instance             string
@@ -349,22 +347,16 @@ func (s *DispatcherSuite) TestInstancesAPI(c *check.C) {
 
 	ch := s.disp.pool.Subscribe()
 	defer s.disp.pool.Unsubscribe(ch)
-	err := s.disp.pool.Create(arvados.InstanceType{
-		Name:         "a1.small-1",
-		ProviderType: "a1.small",
-		VCPUs:        1,
-		RAM:          1 << 30,
-		Price:        0.12,
-	})
+	err := s.disp.pool.Create(test.InstanceType(1))
 	c.Check(err, check.IsNil)
 	<-ch
 
 	sr = getInstances()
 	c.Assert(len(sr.Items), check.Equals, 1)
-	c.Check(sr.Items[0].Instance, check.Matches, "lame-.*")
+	c.Check(sr.Items[0].Instance, check.Matches, "stub.*")
 	c.Check(sr.Items[0].WorkerState, check.Equals, "booting")
-	c.Check(sr.Items[0].Price, check.Equals, 0.12)
+	c.Check(sr.Items[0].Price, check.Equals, 0.123)
 	c.Check(sr.Items[0].LastContainerUUID, check.Equals, "")
-	c.Check(sr.Items[0].ProviderInstanceType, check.Equals, "a1.small")
-	c.Check(sr.Items[0].ArvadosInstanceType, check.Equals, "a1.small-1")
+	c.Check(sr.Items[0].ProviderInstanceType, check.Equals, test.InstanceType(1).ProviderType)
+	c.Check(sr.Items[0].ArvadosInstanceType, check.Equals, test.InstanceType(1).Name)
 }
diff --git a/lib/dispatchcloud/test/fixtures.go b/lib/dispatchcloud/test/fixtures.go
index 7d65ca057..68bdb3db4 100644
--- a/lib/dispatchcloud/test/fixtures.go
+++ b/lib/dispatchcloud/test/fixtures.go
@@ -23,5 +23,6 @@ func InstanceType(i int) arvados.InstanceType {
 		ProviderType: fmt.Sprintf("providertype%d", i),
 		VCPUs:        i,
 		RAM:          arvados.ByteSize(i) << 30,
+		Price:        float64(i) * 0.123,
 	}
 }

commit 71460cf96c9b43c8ab0a38118c3745a4c0e6d7e9
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Oct 27 01:18:36 2018 -0400

    14360: Move shutdown-if-broken check to its own func.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index a7b5132a5..4ddd3745e 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -311,7 +311,7 @@ func (wp *Pool) shutdown(wkr *worker, logger logrus.FieldLogger) {
 	go func() {
 		err := wkr.instance.Destroy()
 		if err != nil {
-			logger.WithError(err).Warn("shutdown failed")
+			logger.WithError(err).WithField("Instance", wkr.instance).Warn("shutdown failed")
 			return
 		}
 		wp.mtx.Lock()
@@ -542,7 +542,7 @@ func (wp *Pool) runProbes() {
 		workers = workers[:0]
 		wp.mtx.Lock()
 		for id, wkr := range wp.workers {
-			if wkr.state == StateShutdown || wp.autoShutdown(wkr) {
+			if wkr.state == StateShutdown || wp.shutdownIfIdle(wkr) {
 				continue
 			}
 			workers = append(workers, id)
@@ -596,7 +596,28 @@ func (wp *Pool) runSync() {
 }
 
 // caller must have lock.
-func (wp *Pool) autoShutdown(wkr *worker) bool {
+func (wp *Pool) shutdownIfBroken(wkr *worker, dur time.Duration) {
+	if wkr.state == StateHold {
+		return
+	}
+	label, threshold := "", wp.timeoutProbe
+	if wkr.state == StateBooting {
+		label, threshold = "new ", wp.timeoutBooting
+	}
+	if dur < threshold {
+		return
+	}
+	wp.logger.WithFields(logrus.Fields{
+		"Instance": wkr.instance,
+		"Duration": dur,
+		"Since":    wkr.probed,
+		"State":    wkr.state,
+	}).Warnf("%sinstance unresponsive, shutting down", label)
+	wp.shutdown(wkr, wp.logger)
+}
+
+// caller must have lock.
+func (wp *Pool) shutdownIfIdle(wkr *worker) bool {
 	if len(wkr.running)+len(wkr.starting) > 0 || wkr.state != StateRunning {
 		return false
 	}
@@ -762,19 +783,7 @@ func (wp *Pool) probeAndUpdate(wkr *worker) {
 		} else {
 			logger.Info("instance not responding")
 		}
-
-		if wkr.state == StateHold {
-			return
-		}
-
-		label, threshold := "", wp.timeoutProbe
-		if wkr.state == StateBooting {
-			label, threshold = "new ", wp.timeoutBooting
-		}
-		if dur > threshold {
-			logger.WithField("Since", wkr.probed).Warnf("%sinstance unresponsive, shutting down", label)
-			wp.shutdown(wkr, logger)
-		}
+		wp.shutdownIfBroken(wkr, dur)
 		return
 	}
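
The extracted shutdownIfBroken check is essentially a threshold comparison that depends on worker state: held workers are never shut down, booting workers get the longer boot timeout, and everything else gets the probe timeout. A small sketch of the decision, with hypothetical parameter names:

    package main

    import (
        "fmt"
        "time"
    )

    type state int

    const (
        stateBooting state = iota
        stateRunning
        stateHold
    )

    // shouldShutdown reports whether a worker that has been unresponsive
    // for dur should be shut down, using a longer grace period for
    // instances that are still booting and never giving up on held ones.
    func shouldShutdown(st state, dur, timeoutProbe, timeoutBooting time.Duration) bool {
        if st == stateHold {
            return false
        }
        threshold := timeoutProbe
        if st == stateBooting {
            threshold = timeoutBooting
        }
        return dur >= threshold
    }

    func main() {
        fmt.Println(shouldShutdown(stateBooting, 5*time.Minute, time.Minute, 10*time.Minute)) // false
        fmt.Println(shouldShutdown(stateRunning, 5*time.Minute, time.Minute, 10*time.Minute)) // true
    }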
 

commit 438ce6f86261376880b186775d7ba5fbade560a6
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Oct 27 01:17:39 2018 -0400

    14360: Just notify once per sync.
    
    Sync holds the mutex so none of the notifications are delivered until
    sync is done anyway.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 74d3d5172..a7b5132a5 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -213,6 +213,7 @@ func (wp *Pool) Create(it arvados.InstanceType) error {
 	tags := cloud.InstanceTags{tagKeyInstanceType: it.Name}
 	wp.creating[it]++
 	go func() {
+		defer wp.notify()
 		inst, err := wp.instanceSet.Create(it, wp.imageID, tags, nil)
 		wp.mtx.Lock()
 		defer wp.mtx.Unlock()
@@ -222,7 +223,6 @@ func (wp *Pool) Create(it arvados.InstanceType) error {
 		}
 		if err != nil {
 			logger.WithError(err).Error("create failed")
-			go wp.notify()
 			return
 		}
 		wp.updateWorker(inst, it, StateBooting)
@@ -238,7 +238,9 @@ func (wp *Pool) AtQuota() bool {
 
 // Add or update worker attached to the given instance. Use
 // initialState if a new worker is created. Caller must have lock.
-func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initialState State) {
+//
+// Returns true when a new worker is created.
+func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initialState State) bool {
 	id := inst.ID()
 	if wp.workers[id] != nil {
 		wp.workers[id].executor.SetTarget(inst)
@@ -247,7 +249,7 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initi
 		if initialState == StateBooting && wp.workers[id].state == StateUnknown {
 			wp.workers[id].state = StateBooting
 		}
-		return
+		return false
 	}
 	if initialState == StateUnknown && inst.Tags()[tagKeyHold] != "" {
 		initialState = StateHold
@@ -271,7 +273,7 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initi
 		starting:    make(map[string]struct{}),
 		probing:     make(chan struct{}, 1),
 	}
-	go wp.notify()
+	return true
 }
 
 // Shutdown shuts down a worker with the given type, or returns false
@@ -678,6 +680,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
 	wp.logger.WithField("Instances", len(instances)).Debug("sync instances")
+	notify := false
 
 	for _, inst := range instances {
 		itTag := inst.Tags()[tagKeyInstanceType]
@@ -686,7 +689,9 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 			wp.logger.WithField("Instance", inst).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
 			continue
 		}
-		wp.updateWorker(inst, it, StateUnknown)
+		if wp.updateWorker(inst, it, StateUnknown) {
+			notify = true
+		}
 	}
 
 	for id, wkr := range wp.workers {
@@ -700,13 +705,17 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 		logger.Info("instance disappeared in cloud")
 		delete(wp.workers, id)
 		go wkr.executor.Close()
-		go wp.notify()
+		notify = true
 	}
 
 	if !wp.loaded {
 		wp.loaded = true
 		wp.logger.WithField("N", len(wp.workers)).Info("loaded initial instance list")
 	}
+
+	if notify {
+		go wp.notify()
+	}
 }
 
 // should be called in a new goroutine
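
The change collapses per-worker notifications into one: sync accumulates a boolean while it holds the mutex and notifies once at the end, since subscribers could not have observed the intermediate states anyway. A reduced sketch of the pattern:

    package main

    import "fmt"

    // syncWorkers reconciles the local worker set with the cloud's
    // instance list and sends at most one notification, no matter how
    // many workers were added or removed during the pass.
    func syncWorkers(workers map[string]bool, instances []string, notify func()) {
        changed := false

        seen := map[string]bool{}
        for _, id := range instances {
            seen[id] = true
            if !workers[id] {
                workers[id] = true // instance appeared in cloud
                changed = true
            }
        }
        for id := range workers {
            if !seen[id] {
                delete(workers, id) // instance disappeared in cloud
                changed = true
            }
        }

        if changed {
            notify()
        }
    }

    func main() {
        workers := map[string]bool{"old": true}
        syncWorkers(workers, []string{"new1", "new2"}, func() { fmt.Println("notify") })
        // prints "notify" exactly once even though three workers changed
    }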

commit c11e1e9b7e04b819a8a1794acc3ffb0a816715b7
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Oct 27 00:31:43 2018 -0400

    14360: Cut excess time.Now() calls.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index fc87ea421..74d3d5172 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -257,15 +257,16 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initi
 		"Instance":     inst,
 		"State":        initialState,
 	}).Infof("instance appeared in cloud")
+	now := time.Now()
 	wp.workers[id] = &worker{
 		executor:    wp.newExecutor(inst),
 		state:       initialState,
 		instance:    inst,
 		instType:    it,
-		probed:      time.Now(),
-		busy:        time.Now(),
-		updated:     time.Now(),
-		unallocated: time.Now(),
+		probed:      now,
+		busy:        now,
+		updated:     now,
+		unallocated: now,
 		running:     make(map[string]struct{}),
 		starting:    make(map[string]struct{}),
 		probing:     make(chan struct{}, 1),

commit b10284a496cd2d953b865144a5f28265fc277938
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Oct 27 00:30:16 2018 -0400

    14360: Split up long run func.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index f2a53acba..fc87ea421 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -91,7 +91,12 @@ func NewPool(logger logrus.FieldLogger, reg *prometheus.Registry, instanceSet cl
 		timeoutProbe:       duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
 	}
 	wp.registerMetrics(reg)
-	go wp.run()
+	go func() {
+		wp.setupOnce.Do(wp.setup)
+		go wp.runMetrics()
+		go wp.runProbes()
+		go wp.runSync()
+	}()
 	return wp
 }
 
@@ -487,6 +492,14 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 	reg.MustRegister(wp.mMemoryInuse)
 }
 
+func (wp *Pool) runMetrics() {
+	ch := wp.Subscribe()
+	defer wp.Unsubscribe(ch)
+	for range ch {
+		wp.updateMetrics()
+	}
+}
+
 func (wp *Pool) updateMetrics() {
 	wp.mtx.RLock()
 	defer wp.mtx.RUnlock()
@@ -510,67 +523,57 @@ func (wp *Pool) updateMetrics() {
 	wp.mMemoryInuse.Set(float64(memInuse))
 }
 
-func (wp *Pool) run() {
-	wp.setupOnce.Do(wp.setup)
+func (wp *Pool) runProbes() {
+	maxPPS := wp.maxProbesPerSecond
+	if maxPPS < 1 {
+		maxPPS = defaultMaxProbesPerSecond
+	}
+	limitticker := time.NewTicker(time.Second / time.Duration(maxPPS))
+	defer limitticker.Stop()
 
-	go func() {
-		ch := wp.Subscribe()
-		defer wp.Unsubscribe(ch)
-		for range ch {
-			wp.updateMetrics()
-		}
-	}()
+	probeticker := time.NewTicker(wp.probeInterval)
+	defer probeticker.Stop()
 
-	go func() {
-		maxPPS := wp.maxProbesPerSecond
-		if maxPPS < 1 {
-			maxPPS = defaultMaxProbesPerSecond
+	workers := []cloud.InstanceID{}
+	for range probeticker.C {
+		workers = workers[:0]
+		wp.mtx.Lock()
+		for id, wkr := range wp.workers {
+			if wkr.state == StateShutdown || wp.autoShutdown(wkr) {
+				continue
+			}
+			workers = append(workers, id)
 		}
-		limitticker := time.NewTicker(time.Second / time.Duration(maxPPS))
-		defer limitticker.Stop()
-
-		probeticker := time.NewTicker(wp.probeInterval)
-		defer probeticker.Stop()
+		wp.mtx.Unlock()
 
-		workers := []cloud.InstanceID{}
-		for range probeticker.C {
-			workers = workers[:0]
+		for _, id := range workers {
 			wp.mtx.Lock()
-			for id, wkr := range wp.workers {
-				if wkr.state == StateShutdown || wp.autoShutdown(wkr) {
-					continue
-				}
-				workers = append(workers, id)
-			}
+			wkr, ok := wp.workers[id]
 			wp.mtx.Unlock()
-
-			for _, id := range workers {
-				wp.mtx.Lock()
-				wkr, ok := wp.workers[id]
-				wp.mtx.Unlock()
-				if !ok || wkr.state == StateShutdown {
-					// Deleted/shutdown while we
-					// were probing others
-					continue
-				}
-				select {
-				case wkr.probing <- struct{}{}:
-					go func() {
-						wp.probeAndUpdate(wkr)
-						<-wkr.probing
-					}()
-				default:
-					wp.logger.WithField("Instance", wkr.instance).Debug("still waiting for last probe to finish")
-				}
-				select {
-				case <-wp.stop:
-					return
-				case <-limitticker.C:
-				}
+			if !ok || wkr.state == StateShutdown {
+				// Deleted/shutdown while we
+				// were probing others
+				continue
+			}
+			select {
+			case wkr.probing <- struct{}{}:
+				go func() {
+					wp.probeAndUpdate(wkr)
+					<-wkr.probing
+				}()
+			default:
+				wp.logger.WithField("Instance", wkr.instance).Debug("still waiting for last probe to finish")
+			}
+			select {
+			case <-wp.stop:
+				return
+			case <-limitticker.C:
 			}
 		}
-	}()
+	}
+}
 
+func (wp *Pool) runSync() {
 	// sync once immediately, then wait syncInterval, sync again,
 	// etc.
 	timer := time.NewTimer(1)
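
The extracted runProbes loop combines two tickers: an outer one that decides when a probe round starts, and an inner rate-limit ticker that spaces out the individual probes. A simplified sketch of that pacing structure, with the per-worker bookkeeping omitted:

    package main

    import (
        "fmt"
        "time"
    )

    // probeLoop wakes up every probeInterval, snapshots the worker IDs to
    // probe, then paces the individual probes with a second ticker so no
    // more than maxPPS probes are started per second.
    func probeLoop(stop <-chan struct{}, probeInterval time.Duration, maxPPS int,
        ids func() []string, probe func(string)) {
        limitticker := time.NewTicker(time.Second / time.Duration(maxPPS))
        defer limitticker.Stop()

        probeticker := time.NewTicker(probeInterval)
        defer probeticker.Stop()

        for range probeticker.C {
            for _, id := range ids() {
                go probe(id)
                select {
                case <-stop:
                    return
                case <-limitticker.C:
                    // rate limit reached its next slot; start the next probe
                }
            }
        }
    }

    func main() {
        stop := make(chan struct{})
        go probeLoop(stop, 100*time.Millisecond, 10,
            func() []string { return []string{"a", "b"} },
            func(id string) { fmt.Println("probe", id) })
        time.Sleep(300 * time.Millisecond)
        close(stop)
    }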

commit 8dff5fe14f23f3db9efaf5d71a21668738a9f164
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Oct 27 00:29:26 2018 -0400

    14360: Use consts for tag keys.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 364670544..f2a53acba 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -18,6 +18,11 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 )
 
+const (
+	tagKeyInstanceType = "InstanceType"
+	tagKeyHold         = "Hold"
+)
+
 // A View shows a worker's current state and recent activity.
 type View struct {
 	Instance             string
@@ -200,7 +205,7 @@ func (wp *Pool) Create(it arvados.InstanceType) error {
 	wp.setupOnce.Do(wp.setup)
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
-	tags := cloud.InstanceTags{"InstanceType": it.Name}
+	tags := cloud.InstanceTags{tagKeyInstanceType: it.Name}
 	wp.creating[it]++
 	go func() {
 		inst, err := wp.instanceSet.Create(it, wp.imageID, tags, nil)
@@ -239,7 +244,7 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initi
 		}
 		return
 	}
-	if initialState == StateUnknown && inst.Tags()["hold"] != "" {
+	if initialState == StateUnknown && inst.Tags()[tagKeyHold] != "" {
 		initialState = StateHold
 	}
 	wp.logger.WithFields(logrus.Fields{
@@ -671,7 +676,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 	wp.logger.WithField("Instances", len(instances)).Debug("sync instances")
 
 	for _, inst := range instances {
-		itTag := inst.Tags()["InstanceType"]
+		itTag := inst.Tags()[tagKeyInstanceType]
 		it, ok := wp.instanceTypes[itTag]
 		if !ok {
 			wp.logger.WithField("Instance", inst).Errorf("unknown InstanceType tag %q --- ignoring", itTag)

commit c81a653c1e800c40a0c6e1a5d94cddd6620b5e52
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Oct 27 00:28:17 2018 -0400

    14360: Close SSH connections when no longer needed.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/ssh_executor/executor.go b/lib/dispatchcloud/ssh_executor/executor.go
index 804ae6f15..b5dba9870 100644
--- a/lib/dispatchcloud/ssh_executor/executor.go
+++ b/lib/dispatchcloud/ssh_executor/executor.go
@@ -90,6 +90,19 @@ func (exr *Executor) Execute(cmd string, stdin io.Reader) ([]byte, []byte, error
 	return stdout.Bytes(), stderr.Bytes(), err
 }
 
+// Close shuts down any active connections.
+func (exr *Executor) Close() {
+	// Ensure exr is initialized
+	exr.sshClient(false)
+
+	exr.clientSetup <- true
+	if exr.client != nil {
+		defer exr.client.Close()
+	}
+	exr.client, exr.clientErr = nil, errors.New("closed")
+	<-exr.clientSetup
+}
+
 // Create a new SSH session. If session setup fails or the SSH client
 // hasn't been setup yet, setup a new SSH client and try again.
 func (exr *Executor) newSession() (*ssh.Session, error) {
@@ -121,6 +134,11 @@ func (exr *Executor) sshClient(create bool) (*ssh.Client, error) {
 		if create {
 			client, err := exr.setupSSHClient()
 			if err == nil || exr.client == nil {
+				if exr.client != nil {
+					// Hang up the previous
+					// (non-working) client
+					go exr.client.Close()
+				}
 				exr.client, exr.clientErr = client, err
 			}
 			if err != nil {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 85e080142..364670544 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -45,6 +45,8 @@ type Executor interface {
 	//
 	// SetTarget must not block on concurrent Execute calls.
 	SetTarget(cloud.ExecutorTarget)
+
+	Close()
 }
 
 const (
@@ -688,6 +690,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 		})
 		logger.Info("instance disappeared in cloud")
 		delete(wp.workers, id)
+		go wkr.executor.Close()
 		go wp.notify()
 	}
 
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
index 7ab4e1b6c..790ee851e 100644
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -123,3 +123,5 @@ func (*stubExecutor) SetTarget(cloud.ExecutorTarget) {}
 func (*stubExecutor) Execute(cmd string, stdin io.Reader) ([]byte, []byte, error) {
 	return nil, nil, nil
 }
+
+func (*stubExecutor) Close() {}
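
Close reuses the executor's clientSetup channel as a one-slot semaphore so teardown cannot race with lazy client setup. A much-reduced sketch of the channel-as-mutex idea, with illustrative types standing in for the SSH client:

    package main

    import (
        "errors"
        "fmt"
        "io"
    )

    // conn guards its underlying client with a 1-buffered channel used as
    // a mutex, so Close and lazy setup cannot race.
    type conn struct {
        setup  chan bool // buffered, capacity 1
        client io.Closer
        err    error
    }

    func newConn(client io.Closer) *conn {
        return &conn{setup: make(chan bool, 1), client: client}
    }

    // Close hangs up the current client (if any) and marks the conn so
    // later callers get an error instead of a dangling connection.
    func (c *conn) Close() {
        c.setup <- true              // acquire
        defer func() { <-c.setup }() // release
        if c.client != nil {
            c.client.Close()
        }
        c.client, c.err = nil, errors.New("closed")
    }

    type fakeClient struct{}

    func (fakeClient) Close() error { fmt.Println("client closed"); return nil }

    func main() {
        c := newConn(fakeClient{})
        c.Close()
        fmt.Println(c.err) // closed
    }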

commit 8162f74b2611b686cc8bcec225c6724c71b8926d
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Oct 26 20:27:20 2018 -0400

    14360: Clarify sync loop timing.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index cf8fac380..85e080142 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -564,24 +564,17 @@ func (wp *Pool) run() {
 		}
 	}()
 
-	timer := time.NewTimer(time.Nanosecond)
+	// sync once immediately, then wait syncInterval, sync again,
+	// etc.
+	timer := time.NewTimer(1)
 	for {
-		err := wp.getInstancesAndSync()
-		if err != nil {
-			wp.logger.WithError(err).Warn("sync failed")
-		}
-
-		// Reset timer to desired interval, and ignore the
-		// tick that might have already arrived.
-		timer.Stop()
-		select {
-		case <-timer.C:
-		default:
-		}
-		timer.Reset(wp.syncInterval)
-
 		select {
 		case <-timer.C:
+			err := wp.getInstancesAndSync()
+			if err != nil {
+				wp.logger.WithError(err).Warn("sync failed")
+			}
+			timer.Reset(wp.syncInterval)
 		case <-wp.stop:
 			wp.logger.Debug("worker.Pool stopped")
 			return
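
The reworked loop arms a timer that fires almost immediately, syncs, and only then resets the timer to the regular interval, so a slow sync delays the next one instead of stacking up behind it. A small sketch of the same timing, with a hypothetical sync callback:

    package main

    import (
        "fmt"
        "time"
    )

    // syncLoop runs sync once almost immediately, then once per interval,
    // and returns when stop is closed. Resetting the timer only after a
    // sync finishes keeps slow syncs from piling up.
    func syncLoop(stop <-chan struct{}, interval time.Duration, sync func() error) {
        timer := time.NewTimer(time.Nanosecond)
        defer timer.Stop()
        for {
            select {
            case <-timer.C:
                if err := sync(); err != nil {
                    fmt.Println("sync failed:", err)
                }
                timer.Reset(interval)
            case <-stop:
                return
            }
        }
    }

    func main() {
        stop := make(chan struct{})
        go syncLoop(stop, 50*time.Millisecond, func() error {
            fmt.Println("sync")
            return nil
        })
        time.Sleep(120 * time.Millisecond)
        close(stop)
        time.Sleep(10 * time.Millisecond)
    }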

commit 1b47c040b802c09d8652b952704e2ad09f31aa27
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Oct 26 20:18:27 2018 -0400

    14360: Update readme.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/readme.go b/lib/dispatchcloud/readme.go
index a4b005eb8..c8491fb1d 100644
--- a/lib/dispatchcloud/readme.go
+++ b/lib/dispatchcloud/readme.go
@@ -5,33 +5,35 @@
 package dispatchcloud
 
 // A dispatcher comprises a container queue, a scheduler, a worker
-// pool, a cloud provider, a stale-lock fixer, and a syncer.
+// pool, a remote command executor, and a cloud driver.
 // 1. Choose a provider.
 // 2. Start a worker pool.
 // 3. Start a container queue.
-// 4. Run a stale-lock fixer.
-// 5. Start a scheduler.
-// 6. Start a syncer.
+// 4. Run the scheduler's stale-lock fixer.
+// 5. Run the scheduler's mapper.
+// 6. Run the scheduler's syncer.
+// 7. Wait for updates to the container queue or worker pool.
+// 8. Repeat from 5.
 //
 //
-// A provider (cloud driver) creates new cloud VM instances and gets
-// the latest list of instances. The returned instances implement
-// proxies to the provider's metadata and control interfaces (get IP
-// address, update tags, shutdown).
+// A cloud driver creates new cloud VM instances and gets the latest
+// list of instances. The returned instances are caches/proxies for
+// the provider's metadata and control interfaces (get IP address,
+// update tags, shutdown).
 //
 //
-// A workerPool tracks workers' instance types and readiness states
+// A worker pool tracks workers' instance types and readiness states
 // (available to do work now, booting, suffering a temporary network
 // outage, shutting down). It loads internal state from the cloud
 // provider's list of instances at startup, and syncs periodically
 // after that.
 //
 //
-// A worker maintains a multiplexed SSH connection to a cloud
-// instance, retrying/reconnecting as needed, so the workerPool can
-// execute commands. It asks the provider's instance to verify its SSH
-// public key once when first connecting, and again later if the key
-// changes.
+// An executor maintains a multiplexed SSH connection to a cloud
+// instance, retrying/reconnecting as needed, so the worker pool can
+// execute commands. It asks the cloud driver's instance to verify its
+// SSH public key once when first connecting, and again later if the
+// key changes.
 //
 //
 // A container queue tracks the known state (according to
@@ -44,36 +46,25 @@ package dispatchcloud
 // lock/unlock/cancel operation.)
 //
 //
-// A stale-lock fixer waits for any already-locked containers (i.e.,
-// locked by a prior server process) to appear on workers as the
-// worker pool recovers its state. It unlocks/requeues any that still
-// remain when all workers are recovered or shutdown, or its timer
-// expires.
+// The scheduler's stale-lock fixer waits for any already-locked
+// containers (i.e., locked by a prior dispatcher process) to appear
+// on workers as the worker pool recovers its state. It
+// unlocks/requeues any that still remain when all workers are
+// recovered or shutdown, or its timer expires.
 //
 //
-// A scheduler chooses which containers to assign to which idle
-// workers, and decides what to do when there are not enough idle
+// The scheduler's mapper chooses which containers to assign to which
+// idle workers, and decides what to do when there are not enough idle
 // workers (including shutting down some idle nodes).
 //
 //
-// A syncer updates state to Cancelled when a running container
-// process dies without finalizing its entry in the controller
-// database. It also calls the worker pool to kill containers that
-// have priority=0 while locked or running.
+// The scheduler's syncer updates state to Cancelled when a running
+// container process dies without finalizing its entry in the
+// controller database. It also calls the worker pool to kill
+// containers that have priority=0 while locked or running.
 //
 //
-// A provider proxy wraps a provider with rate-limiting logic. After
-// the wrapped provider receives a cloud.RateLimitError, the proxy
-// starts returning errors to callers immediately without calling
-// through to the wrapped provider.
-//
-//
-// TBD: Bootstrapping script via SSH, too? Future version.
-//
-// TBD: drain instance, keep instance alive
-// TBD: metrics, diagnostics
-// TBD: why dispatch token currently passed to worker?
-//
-// Metrics: queue size, time job has been in queued, #idle/busy/booting nodes
-// Timing in each step, and end-to-end
-// Metrics: boot/idle/alloc time and cost
+// An instance set proxy wraps a driver's instance set with
+// rate-limiting logic. After the wrapped instance set receives a
+// cloud.RateLimitError, the proxy starts returning errors to callers
+// immediately without calling through to the wrapped instance set.
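
The instance set proxy described above fails fast after a rate-limit error instead of hammering the provider. A rough sketch of that back-off caching, using a hypothetical rateLimitError interface in place of the real cloud package types:

    package main

    import (
        "errors"
        "fmt"
        "time"
    )

    // rateLimitError is what a cloud driver might return when the
    // provider asks callers to back off until a given time. (Illustrative
    // only; the real interface lives in the Arvados cloud package.)
    type rateLimitError interface {
        error
        EarliestRetry() time.Time
    }

    // proxy wraps a backend call with back-off caching: after the backend
    // returns a rate-limit error, the proxy fails fast until the retry
    // time has passed, without calling through.
    type proxy struct {
        backend func() error
        until   time.Time
    }

    func (p *proxy) Call() error {
        if time.Now().Before(p.until) {
            return errors.New("rate limited, try again later")
        }
        err := p.backend()
        if rle, ok := err.(rateLimitError); ok {
            p.until = rle.EarliestRetry()
        }
        return err
    }

    type fakeRLE struct{ retry time.Time }

    func (e fakeRLE) Error() string            { return "429 slow down" }
    func (e fakeRLE) EarliestRetry() time.Time { return e.retry }

    func main() {
        p := &proxy{backend: func() error {
            return fakeRLE{retry: time.Now().Add(time.Minute)}
        }}
        fmt.Println(p.Call()) // 429 slow down (from the backend)
        fmt.Println(p.Call()) // rate limited, try again later (fail fast)
    }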

commit 8c44ed01463d5e0f0d167b37474b0e78fe11bd8d
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Oct 26 20:09:09 2018 -0400

    14360: Add explicit Start method to dispatcher.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/cmd.go b/lib/dispatchcloud/cmd.go
index a5a11d2fa..92948fb30 100644
--- a/lib/dispatchcloud/cmd.go
+++ b/lib/dispatchcloud/cmd.go
@@ -13,5 +13,7 @@ import (
 var Command cmd.Handler = service.Command(arvados.ServiceNameDispatchCloud, newHandler)
 
 func newHandler(cluster *arvados.Cluster, _ *arvados.NodeProfile) service.Handler {
-	return &dispatcher{Cluster: cluster}
+	d := &dispatcher{Cluster: cluster}
+	go d.Start()
+	return d
 }
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index e422b3963..707c31f24 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -58,22 +58,28 @@ type dispatcher struct {
 	stop      chan struct{}
 }
 
+// Start starts the dispatcher. Start can be called multiple times
+// with no ill effect.
+func (disp *dispatcher) Start() {
+	disp.setupOnce.Do(disp.setup)
+}
+
 // ServeHTTP implements service.Handler.
 func (disp *dispatcher) ServeHTTP(w http.ResponseWriter, r *http.Request) {
-	disp.setupOnce.Do(disp.setup)
+	disp.Start()
 	disp.httpHandler.ServeHTTP(w, r)
 }
 
 // CheckHealth implements service.Handler.
 func (disp *dispatcher) CheckHealth() error {
-	disp.setupOnce.Do(disp.setup)
+	disp.Start()
 	return nil
 }
 
 // Stop dispatching containers and release resources. Typically used
 // in tests.
 func (disp *dispatcher) Close() {
-	disp.setupOnce.Do(disp.setup)
+	disp.Start()
 	select {
 	case disp.stop <- struct{}{}:
 	default:
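
Start, ServeHTTP, CheckHealth, and Close all funnel through the same sync.Once, so initialization runs exactly once no matter which entry point is hit first. A tiny sketch of that idempotent-start pattern:

    package main

    import (
        "fmt"
        "sync"
    )

    // service initializes itself exactly once, whether initialization is
    // triggered explicitly by Start or implicitly by the first request.
    type service struct {
        setupOnce sync.Once
        ready     bool
    }

    func (s *service) setup() {
        s.ready = true
        fmt.Println("setup ran")
    }

    // Start is idempotent: calling it any number of times runs setup once.
    func (s *service) Start() { s.setupOnce.Do(s.setup) }

    // Handle lazily starts the service before doing work.
    func (s *service) Handle() {
        s.Start()
        fmt.Println("handled, ready =", s.ready)
    }

    func main() {
        s := &service{}
        s.Start()
        s.Start()  // no effect the second time
        s.Handle() // setup does not run again
    }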

-----------------------------------------------------------------------


hooks/post-receive
-- 



