[ARVADOS] created: 1.2.0-248-g2202b7952
Git user
git at public.curoverse.com
Fri Oct 26 00:05:17 EDT 2018
at 2202b7952c7246d65c6e9a5ce99a1a81e87dbc75 (commit)
commit 2202b7952c7246d65c6e9a5ce99a1a81e87dbc75
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Thu Oct 25 01:34:35 2018 -0400
14360: Initial version of dispatch-cloud.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/.licenseignore b/.licenseignore
index 51a1e7cbd..a0127cfa3 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -69,3 +69,4 @@ sdk/R/NAMESPACE
sdk/R/.Rbuildignore
sdk/R/ArvadosR.Rproj
*.Rd
+lib/dispatchcloud/test/sshkey_*
diff --git a/build/run-build-packages.sh b/build/run-build-packages.sh
index fe01e4b2e..1b486b40b 100755
--- a/build/run-build-packages.sh
+++ b/build/run-build-packages.sh
@@ -294,6 +294,9 @@ package_go_binary cmd/arvados-server arvados-server \
"Arvados server daemons"
package_go_binary cmd/arvados-server arvados-controller \
"Arvados cluster controller daemon"
+# No package until #14325
+#package_go_binary cmd/arvados-server crunch-dispatch-cloud \
+# "Arvados cluster cloud dispatch"
package_go_binary sdk/go/crunchrunner crunchrunner \
"Crunchrunner executes a command inside a container and uploads the output"
package_go_binary services/arv-git-httpd arvados-git-httpd \
diff --git a/build/run-tests.sh b/build/run-tests.sh
index 26a907fc2..9edc70cbc 100755
--- a/build/run-tests.sh
+++ b/build/run-tests.sh
@@ -77,6 +77,10 @@ lib/cmd
lib/controller
lib/crunchstat
lib/dispatchcloud
+lib/dispatchcloud/container
+lib/dispatchcloud/scheduler
+lib/dispatchcloud/ssh_executor
+lib/dispatchcloud/worker
services/api
services/arv-git-httpd
services/crunchstat
@@ -924,6 +928,10 @@ gostuff=(
lib/controller
lib/crunchstat
lib/dispatchcloud
+ lib/dispatchcloud/container
+ lib/dispatchcloud/scheduler
+ lib/dispatchcloud/ssh_executor
+ lib/dispatchcloud/worker
sdk/go/arvados
sdk/go/arvadosclient
sdk/go/auth
diff --git a/cmd/arvados-server/cmd.go b/cmd/arvados-server/cmd.go
index 1af3745df..cd15d25dd 100644
--- a/cmd/arvados-server/cmd.go
+++ b/cmd/arvados-server/cmd.go
@@ -9,6 +9,7 @@ import (
"git.curoverse.com/arvados.git/lib/cmd"
"git.curoverse.com/arvados.git/lib/controller"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud"
)
var (
@@ -18,7 +19,8 @@ var (
"-version": cmd.Version(version),
"--version": cmd.Version(version),
- "controller": controller.Command,
+ "controller": controller.Command,
+ "dispatch-cloud": dispatchcloud.Command,
})
)
diff --git a/cmd/arvados-server/crunch-dispatch-cloud.service b/cmd/arvados-server/crunch-dispatch-cloud.service
new file mode 100644
index 000000000..f8d71c975
--- /dev/null
+++ b/cmd/arvados-server/crunch-dispatch-cloud.service
@@ -0,0 +1,28 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+[Unit]
+Description=Arvados cloud dispatch
+Documentation=https://doc.arvados.org/
+After=network.target
+AssertPathExists=/etc/arvados/config.yml
+
+# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
+StartLimitInterval=0
+
+# systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
+StartLimitIntervalSec=0
+
+[Service]
+Type=notify
+EnvironmentFile=-/etc/arvados/environment
+ExecStart=/usr/bin/crunch-dispatch-cloud
+Restart=always
+RestartSec=1
+
+# systemd<=219 (centos:7, debian:8, ubuntu:trusty) obeys StartLimitInterval in the [Service] section
+StartLimitInterval=0
+
+[Install]
+WantedBy=multi-user.target
diff --git a/lib/cloud/interfaces.go b/lib/cloud/interfaces.go
new file mode 100644
index 000000000..e3a072582
--- /dev/null
+++ b/lib/cloud/interfaces.go
@@ -0,0 +1,179 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package cloud
+
+import (
+ "io"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "golang.org/x/crypto/ssh"
+)
+
+// A RateLimitError should be returned by an InstanceSet when the
+// cloud service indicates it is rejecting all API calls for some time
+// interval.
+type RateLimitError interface {
+ // Time before which the caller should expect requests to
+ // fail.
+ EarliestRetry() time.Time
+ error
+}
+
+// A QuotaError should be returned by an InstanceSet when the cloud
+// service indicates the account cannot create more VMs than already
+// exist.
+type QuotaError interface {
+ // If true, don't create more instances until some existing
+ // instances are destroyed. If false, don't handle the error
+ // as a quota error.
+ IsQuotaError() bool
+ error
+}
+
+type InstanceSetID string
+type InstanceTags map[string]string
+type InstanceID string
+type ImageID string
+
+// An Executor executes commands on an ExecutorTarget.
+type Executor interface {
+ // Update the set of private keys used to authenticate to
+ // targets.
+ SetSigners(...ssh.Signer)
+
+ // Set the target used for subsequent command executions.
+ SetTarget(ExecutorTarget)
+
+ // Return the current target.
+ Target() ExecutorTarget
+
+ // Execute a shell command and return the resulting stdout and
+ // stderr. stdin can be nil.
+ Execute(cmd string, stdin io.Reader) (stdout, stderr []byte, err error)
+}
+
+// An ExecutorTarget is a remote command execution service.
+type ExecutorTarget interface {
+ // SSH server hostname or IP address, or empty string if
+ // unknown while instance is booting.
+ Address() string
+
+ // Return nil if the given public key matches the instance's
+ // SSH server key. If the provided Dialer is not nil,
+ // VerifyHostKey can use it to make outgoing network
+ // connections from the instance -- e.g., to use the cloud's
+ // "this instance's metadata" API.
+ VerifyHostKey(ssh.PublicKey, *ssh.Client) error
+}
+
+// Instance is implemented by the provider-specific instance types.
+type Instance interface {
+ ExecutorTarget
+
+ // ID returns the provider's instance ID. It must be stable
+ // for the life of the instance.
+ ID() InstanceID
+
+ // String typically returns the cloud-provided instance ID.
+ String() string
+
+ // Cloud provider's "instance type" ID. Matches a ProviderType
+ // in the cluster's InstanceTypes configuration.
+ ProviderType() string
+
+ // Get current tags
+ Tags() InstanceTags
+
+ // Replace tags with the given tags
+ SetTags(InstanceTags) error
+
+ // Shut down the node
+ Destroy() error
+}
+
+// An InstanceSet manages a set of VM instances created by an elastic
+// cloud provider like AWS, GCE, or Azure.
+//
+// All public methods of an InstanceSet, and all public methods of the
+// instances it returns, are goroutine safe.
+type InstanceSet interface {
+ // Create a new instance. If supported by the driver, add the
+ // provided public key to /root/.ssh/authorized_keys.
+ //
+ // The returned error should implement RateLimitError and
+ // QuotaError where applicable.
+ Create(arvados.InstanceType, ImageID, InstanceTags, ssh.PublicKey) (Instance, error)
+
+ // Return all instances, including ones that are booting or
+ // shutting down. Optionally, filter out nodes that don't have
+ // all of the given InstanceTags (the caller will ignore these
+ // anyway).
+ //
+ // An instance returned by successive calls to Instances() may
+ // -- but does not need to -- be represented by the same
+ // Instance object each time. Thus, the caller is responsible
+ // for de-duplicating the returned instances by comparing the
+ // InstanceIDs returned by the instances' ID() methods.
+ Instances(InstanceTags) ([]Instance, error)
+
+ // Stop any background tasks and release other resources.
+ Stop()
+}
+
+// A Driver returns an InstanceSet that uses the given InstanceSetID
+// and driver-dependent configuration parameters.
+//
+// The supplied id will be of the form "zzzzz-zzzzz-zzzzzzzzzzzzzzz"
+// where each z can be any alphanum. The returned InstanceSet must use
+// this id to tag long-lived cloud resources that it creates, and must
+// assume control of any existing resources that are tagged with the
+// same id. Tagging can be accomplished by including the ID in
+// resource names, using the cloud provider's tagging feature, or any
+// other mechanism. The tags must be visible to another instance of
+// the same driver running on a different host.
+//
+// The returned InstanceSet must ignore existing resources that are
+// visible but not tagged with the given id, except that it should log
+// a summary of such resources -- only once -- when it starts
+// up. Thus, two identically configured InstanceSets running on
+// different hosts with different ids should log about the existence
+// of each other's resources at startup, but will not interfere with
+// each other.
+//
+// Example:
+//
+// type exampleInstanceSet struct {
+// ownID string
+// AccessKey string
+// }
+//
+// type exampleDriver struct {}
+//
+// func (*exampleDriver) InstanceSet(config map[string]interface{}, id InstanceSetID) (InstanceSet, error) {
+// var is exampleInstanceSet
+// if err := mapstructure.Decode(config, &is); err != nil {
+// return nil, err
+// }
+// is.ownID = id
+// return &is, nil
+// }
+//
+// var _ = registerCloudDriver("example", &exampleDriver{})
+type Driver interface {
+ InstanceSet(config map[string]interface{}, id InstanceSetID) (InstanceSet, error)
+}
+
+// DriverFunc makes a Driver using the provided function as its
+// InstanceSet method. This is similar to http.HandlerFunc.
+func DriverFunc(fn func(config map[string]interface{}, id InstanceSetID) (InstanceSet, error)) Driver {
+ return driverFunc(fn)
+}
+
+type driverFunc func(config map[string]interface{}, id InstanceSetID) (InstanceSet, error)
+
+func (df driverFunc) InstanceSet(config map[string]interface{}, id InstanceSetID) (InstanceSet, error) {
+ return df(config, id)
+}
diff --git a/lib/cmd/cmd.go b/lib/cmd/cmd.go
index 8c65cf7ac..9292ef7e5 100644
--- a/lib/cmd/cmd.go
+++ b/lib/cmd/cmd.go
@@ -36,8 +36,9 @@ func (v Version) RunCommand(prog string, args []string, stdin io.Reader, stdout,
return 0
}
-// Multi is a Handler that looks up its first argument in a map, and
-// invokes the resulting Handler with the remaining args.
+// Multi is a Handler that looks up its first argument in a map (after
+// stripping any "arvados-" or "crunch-" prefix), and invokes the
+// resulting Handler with the remaining args.
//
// Example:
//
diff --git a/lib/dispatchcloud/cmd.go b/lib/dispatchcloud/cmd.go
new file mode 100644
index 000000000..a5a11d2fa
--- /dev/null
+++ b/lib/dispatchcloud/cmd.go
@@ -0,0 +1,17 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+ "git.curoverse.com/arvados.git/lib/cmd"
+ "git.curoverse.com/arvados.git/lib/service"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+var Command cmd.Handler = service.Command(arvados.ServiceNameDispatchCloud, newHandler)
+
+func newHandler(cluster *arvados.Cluster, _ *arvados.NodeProfile) service.Handler {
+ return &dispatcher{Cluster: cluster}
+}
diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
new file mode 100644
index 000000000..17a38259d
--- /dev/null
+++ b/lib/dispatchcloud/container/queue.go
@@ -0,0 +1,323 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package container
+
+import (
+ "io"
+ "sync"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/Sirupsen/logrus"
+ "github.com/prometheus/client_golang/prometheus"
+)
+
+type typeChooser func(*arvados.Container) (arvados.InstanceType, error)
+
+// An APIClient performs Arvados API requests. It is typically an
+// *arvados.Client.
+type APIClient interface {
+ RequestAndDecode(dst interface{}, method, path string, body io.Reader, params interface{}) error
+}
+
+// A QueueEnt is an entry in the queue, consisting of a container
+// record and the instance type that should be used to run it.
+type QueueEnt struct {
+ // The container to run. Only the UUID, State, Priority, and
+ // RuntimeConstraints fields are populated.
+ Container arvados.Container
+ InstanceType arvados.InstanceType
+}
+
+// String implements fmt.Stringer by returning the queued container's
+// UUID.
+func (c *QueueEnt) String() string {
+ return c.Container.UUID
+}
+
+// A Queue is an interface to an Arvados cluster's container
+// database. It presents only the containers that are eligible to be
+// run by, are already being run by, or have recently been run by the
+// present dispatcher.
+//
+// The Entries, Get, and Forget methods do not block: they return
+// immediately, using cached data.
+//
+// The updating methods (Cancel, Lock, Unlock, Update) do block: they
+// return only after the operation has completed.
+//
+// A Queue's Update method should be called periodically to keep the
+// cache up to date.
+type Queue struct {
+ logger logrus.FieldLogger
+ reg *prometheus.Registry
+ chooseType typeChooser
+ client APIClient
+
+ auth *arvados.APIClientAuthorization
+ current map[string]QueueEnt
+ updated time.Time
+ mtx sync.Mutex
+ keeplocal map[string]struct{}
+}
+
+// NewQueue returns a new Queue. When a new container appears in the
+// Arvados cluster's queue during Update, chooseType will be called to
+// assign an appropriate arvados.InstanceType for the queue entry.
+func NewQueue(logger logrus.FieldLogger, reg *prometheus.Registry, chooseType typeChooser, client APIClient) *Queue {
+ return &Queue{
+ logger: logger,
+ reg: reg,
+ chooseType: chooseType,
+ client: client,
+ current: map[string]QueueEnt{},
+ }
+}
+
+// Forget drops the specified container from the cache. It should be
+// called on finalized containers to avoid leaking memory over
+// time. It is a no-op if the indicated container is not in a
+// finalized state.
+func (cq *Queue) Forget(uuid string) {
+ cq.mtx.Lock()
+ defer cq.mtx.Unlock()
+ ctr := cq.current[uuid].Container
+ if ctr.State == arvados.ContainerStateComplete || ctr.State == arvados.ContainerStateCancelled {
+ delete(cq.current, uuid)
+ }
+}
+
+// Get returns the (partial) Container record for the specified
+// container. Like a map lookup, its second return value is false if
+// the specified container is not in the Queue.
+func (cq *Queue) Get(uuid string) (arvados.Container, bool) {
+ cq.mtx.Lock()
+ defer cq.mtx.Unlock()
+ if ctr, ok := cq.current[uuid]; !ok {
+ return arvados.Container{}, false
+ } else {
+ return ctr.Container, true
+ }
+}
+
+// Entries returns all cache entries, keyed by container UUID.
+//
+// The returned threshold indicates the maximum age of any cached data
+// returned in the map. This makes it possible for a scheduler to
+// determine correctly the outcome of a remote process that updates
+// container state. It must first wait for the remote process to exit,
+// then wait for the Queue to start and finish its next Update --
+// i.e., it must wait until threshold > timeProcessExited.
+func (cq *Queue) Entries() (entries map[string]QueueEnt, threshold time.Time) {
+ cq.mtx.Lock()
+ defer cq.mtx.Unlock()
+ entries = make(map[string]QueueEnt, len(cq.current))
+ for uuid, ctr := range cq.current {
+ entries[uuid] = ctr
+ }
+ threshold = cq.updated
+ return
+}
+
+// Update refreshes the cache from the Arvados API. It adds newly
+// queued containers, and updates the state of previously queued
+// containers.
+func (cq *Queue) Update() error {
+ cq.mtx.Lock()
+ cq.keeplocal = map[string]struct{}{}
+ updateStarted := time.Now()
+ cq.mtx.Unlock()
+
+ next, err := cq.poll()
+ if err != nil {
+ return err
+ }
+
+ cq.mtx.Lock()
+ defer cq.mtx.Unlock()
+ for uuid, ctr := range next {
+ if _, keep := cq.keeplocal[uuid]; keep {
+ continue
+ }
+ if cur, ok := cq.current[uuid]; !ok {
+ cq.addEnt(uuid, *ctr)
+ } else {
+ cur.Container = *ctr
+ cq.current[uuid] = cur
+ }
+ }
+ for uuid := range cq.current {
+ if _, keep := cq.keeplocal[uuid]; keep {
+ continue
+ } else if _, keep = next[uuid]; keep {
+ continue
+ } else {
+ delete(cq.current, uuid)
+ }
+ }
+ cq.keeplocal = nil
+ cq.updated = updateStarted
+ return nil
+}
+
+func (cq *Queue) addEnt(uuid string, ctr arvados.Container) {
+ it, err := cq.chooseType(&ctr)
+ if err != nil {
+ // FIXME: throttle warnings, cancel after timeout
+ cq.logger.Warnf("cannot run %s", &ctr)
+ return
+ }
+ cq.current[uuid] = QueueEnt{Container: ctr, InstanceType: it}
+}
+
+// Lock acquires the dispatch lock for the given container.
+func (cq *Queue) Lock(uuid string) error {
+ return cq.apiUpdate(uuid, "lock")
+}
+
+// Unlock releases the dispatch lock for the given container.
+func (cq *Queue) Unlock(uuid string) error {
+ return cq.apiUpdate(uuid, "unlock")
+}
+
+// Cancel cancels the given container.
+func (cq *Queue) Cancel(uuid string) error {
+ return cq.client.RequestAndDecode(nil, "PUT", "arvados/v1/containers/"+uuid, nil, map[string]map[string]interface{}{
+ "container": {"state": arvados.ContainerStateCancelled},
+ })
+}
+
+func (cq *Queue) apiUpdate(uuid, action string) error {
+ var resp arvados.Container
+ err := cq.client.RequestAndDecode(&resp, "POST", "arvados/v1/containers/"+uuid+"/"+action, nil, nil)
+ if err != nil {
+ return err
+ }
+
+ cq.mtx.Lock()
+ defer cq.mtx.Unlock()
+ if cq.keeplocal != nil {
+ cq.keeplocal[uuid] = struct{}{}
+ }
+ if ent, ok := cq.current[uuid]; !ok {
+ cq.addEnt(uuid, resp)
+ } else {
+ ent.Container.State, ent.Container.Priority, ent.Container.LockedByUUID = resp.State, resp.Priority, resp.LockedByUUID
+ cq.current[uuid] = ent
+ }
+ return nil
+}
+
+func (cq *Queue) poll() (map[string]*arvados.Container, error) {
+ cq.mtx.Lock()
+ size := len(cq.current)
+ auth := cq.auth
+ cq.mtx.Unlock()
+
+ if auth == nil {
+ auth = &arvados.APIClientAuthorization{}
+ err := cq.client.RequestAndDecode(auth, "GET", "arvados/v1/api_client_authorizations/current", nil, nil)
+ if err != nil {
+ return nil, err
+ }
+ cq.mtx.Lock()
+ cq.auth = auth
+ cq.mtx.Unlock()
+ }
+
+ next := make(map[string]*arvados.Container, size)
+ apply := func(updates []arvados.Container) {
+ for _, upd := range updates {
+ if next[upd.UUID] == nil {
+ next[upd.UUID] = &arvados.Container{}
+ }
+ *next[upd.UUID] = upd
+ }
+ }
+ selectParam := []string{"uuid", "state", "priority", "runtime_constraints"}
+ limitParam := 1000
+
+ mine, err := cq.fetchAll(arvados.ResourceListParams{
+ Select: selectParam,
+ Order: "uuid",
+ Limit: &limitParam,
+ Count: "none",
+ Filters: []arvados.Filter{{"locked_by_uuid", "=", auth.UUID}},
+ })
+ if err != nil {
+ return nil, err
+ }
+ apply(mine)
+
+ avail, err := cq.fetchAll(arvados.ResourceListParams{
+ Select: selectParam,
+ Order: "uuid",
+ Limit: &limitParam,
+ Count: "none",
+ Filters: []arvados.Filter{{"state", "=", arvados.ContainerStateQueued}, {"priority", ">", "0"}},
+ })
+ if err != nil {
+ return nil, err
+ }
+ apply(avail)
+
+ var missing []string
+ cq.mtx.Lock()
+ for uuid, ent := range cq.current {
+ if next[uuid] == nil &&
+ ent.Container.State != arvados.ContainerStateCancelled &&
+ ent.Container.State != arvados.ContainerStateComplete {
+ missing = append(missing, uuid)
+ }
+ }
+ cq.mtx.Unlock()
+
+ for i, page := 0, 20; i < len(missing); i += page {
+ batch := missing[i:]
+ if len(batch) > page {
+ batch = batch[:page]
+ }
+ ended, err := cq.fetchAll(arvados.ResourceListParams{
+ Select: selectParam,
+ Order: "uuid",
+ Count: "none",
+ Filters: []arvados.Filter{{"uuid", "in", batch}},
+ })
+ if err != nil {
+ return nil, err
+ }
+ apply(ended)
+ }
+ return next, nil
+}
+
+func (cq *Queue) fetchAll(initialParams arvados.ResourceListParams) ([]arvados.Container, error) {
+ var results []arvados.Container
+ params := initialParams
+ params.Offset = 0
+ for {
+ // This list variable must be a new one declared
+ // inside the loop: otherwise, items in the API
+ // response would get deep-merged into the items
+ // loaded in previous iterations.
+ var list arvados.ContainerList
+
+ err := cq.client.RequestAndDecode(&list, "GET", "arvados/v1/containers", nil, params)
+ if err != nil {
+ return nil, err
+ }
+ if len(list.Items) == 0 {
+ break
+ }
+
+ results = append(results, list.Items...)
+ if len(params.Order) == 1 && params.Order == "uuid" {
+ params.Filters = append(initialParams.Filters, arvados.Filter{"uuid", ">", list.Items[len(list.Items)-1].UUID})
+ } else {
+ params.Offset += len(list.Items)
+ }
+ }
+ return results, nil
+}
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
new file mode 100644
index 000000000..e422b3963
--- /dev/null
+++ b/lib/dispatchcloud/dispatcher.go
@@ -0,0 +1,199 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+ "crypto/md5"
+ "encoding/json"
+ "fmt"
+ "net/http"
+ "strings"
+ "sync"
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/scheduler"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "git.curoverse.com/arvados.git/sdk/go/auth"
+ "git.curoverse.com/arvados.git/sdk/go/httpserver"
+ "github.com/Sirupsen/logrus"
+ "github.com/prometheus/client_golang/prometheus"
+ "github.com/prometheus/client_golang/prometheus/promhttp"
+ "golang.org/x/crypto/ssh"
+)
+
+const (
+ defaultPollInterval = time.Second
+)
+
+type containerQueue interface {
+ scheduler.ContainerQueue
+ Update() error
+}
+
+type pool interface {
+ scheduler.WorkerPool
+ View() []worker.View
+}
+
+type dispatcher struct {
+ Cluster *arvados.Cluster
+ InstanceSetID cloud.InstanceSetID
+
+ logger logrus.FieldLogger
+ reg *prometheus.Registry
+ instanceSet cloud.InstanceSet
+ pool pool
+ queue containerQueue
+ httpHandler http.Handler
+ pollInterval time.Duration
+ sshKey ssh.Signer
+
+ setupOnce sync.Once
+ stop chan struct{}
+}
+
+// ServeHTTP implements service.Handler.
+func (disp *dispatcher) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+ disp.setupOnce.Do(disp.setup)
+ disp.httpHandler.ServeHTTP(w, r)
+}
+
+// CheckHealth implements service.Handler.
+func (disp *dispatcher) CheckHealth() error {
+ disp.setupOnce.Do(disp.setup)
+ return nil
+}
+
+// Stop dispatching containers and release resources. Typically used
+// in tests.
+func (disp *dispatcher) Close() {
+ disp.setupOnce.Do(disp.setup)
+ select {
+ case disp.stop <- struct{}{}:
+ default:
+ }
+}
+
+// Make a worker.Executor for the given instance.
+func (disp *dispatcher) newExecutor(inst cloud.Instance) worker.Executor {
+ exr := ssh_executor.New(inst)
+ exr.SetSigners(disp.sshKey)
+ return exr
+}
+
+func (disp *dispatcher) typeChooser(ctr *arvados.Container) (arvados.InstanceType, error) {
+ return ChooseInstanceType(disp.Cluster, ctr)
+}
+
+func (disp *dispatcher) setup() {
+ disp.initialize()
+ go disp.run()
+}
+
+func (disp *dispatcher) initialize() {
+ arvClient := arvados.NewClientFromEnv()
+ if disp.InstanceSetID == "" {
+ if strings.HasPrefix(arvClient.AuthToken, "v2/") {
+ disp.InstanceSetID = cloud.InstanceSetID(strings.Split(arvClient.AuthToken, "/")[1])
+ } else {
+ // Use some other string unique to this token
+ // that doesn't reveal the token itself.
+ disp.InstanceSetID = cloud.InstanceSetID(fmt.Sprintf("%x", md5.Sum([]byte(arvClient.AuthToken))))
+ }
+ }
+ disp.stop = make(chan struct{}, 1)
+ disp.logger = logrus.StandardLogger()
+
+ if key, err := ssh.ParsePrivateKey(disp.Cluster.Dispatch.PrivateKey); err != nil {
+ disp.logger.Fatalf("error parsing configured Dispatch.PrivateKey: %s", err)
+ } else {
+ disp.sshKey = key
+ }
+
+ instanceSet, err := newInstanceSet(disp.Cluster, disp.InstanceSetID)
+ if err != nil {
+ disp.logger.Fatalf("error initializing driver: %s", err)
+ }
+ disp.instanceSet = &instanceSetProxy{instanceSet}
+ disp.reg = prometheus.NewRegistry()
+ disp.pool = worker.NewPool(disp.logger, disp.reg, disp.instanceSet, disp.newExecutor, disp.Cluster)
+ disp.queue = container.NewQueue(disp.logger, disp.reg, disp.typeChooser, arvClient)
+
+ mux := http.NewServeMux()
+ mux.HandleFunc("/arvados/v1/dispatch/containers", disp.apiContainers)
+ mux.HandleFunc("/arvados/v1/dispatch/instances", disp.apiInstances)
+ metricsH := promhttp.HandlerFor(disp.reg, promhttp.HandlerOpts{
+ ErrorLog: disp.logger,
+ })
+ mux.Handle("/metrics", metricsH)
+ mux.Handle("/metrics.json", metricsH)
+ disp.httpHandler = auth.RequireLiteralToken(disp.Cluster.ManagementToken, mux)
+
+ if d := disp.Cluster.Dispatch.PollInterval; d > 0 {
+ disp.pollInterval = time.Duration(d)
+ } else {
+ disp.pollInterval = defaultPollInterval
+ }
+}
+
+func (disp *dispatcher) run() {
+ defer disp.instanceSet.Stop()
+
+ t0 := time.Now()
+ disp.logger.Infof("FixStaleLocks starting.")
+ scheduler.FixStaleLocks(disp.logger, disp.queue, disp.pool, time.Duration(disp.Cluster.Dispatch.StaleLockTimeout))
+ disp.logger.Infof("FixStaleLocks finished (%s), starting scheduling.", time.Since(t0))
+
+ wp := disp.pool.Subscribe()
+ defer disp.pool.Unsubscribe(wp)
+ poll := time.NewTicker(disp.pollInterval)
+ for {
+ scheduler.Map(disp.logger, disp.queue, disp.pool)
+ scheduler.Sync(disp.logger, disp.queue, disp.pool)
+ select {
+ case <-disp.stop:
+ return
+ case <-wp:
+ case <-poll.C:
+ err := disp.queue.Update()
+ if err != nil {
+ disp.logger.Errorf("error updating queue: %s", err)
+ }
+ }
+ }
+}
+
+// Management API: all active and queued containers.
+func (disp *dispatcher) apiContainers(w http.ResponseWriter, r *http.Request) {
+ if r.Method != "GET" {
+ httpserver.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+ var resp struct {
+ Items []container.QueueEnt
+ }
+ qEntries, _ := disp.queue.Entries()
+ for _, ent := range qEntries {
+ resp.Items = append(resp.Items, ent)
+ }
+ json.NewEncoder(w).Encode(resp)
+}
+
+// Management API: all active instances (cloud VMs).
+func (disp *dispatcher) apiInstances(w http.ResponseWriter, r *http.Request) {
+ if r.Method != "GET" {
+ httpserver.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+ var resp struct {
+ Items []worker.View
+ }
+ resp.Items = disp.pool.View()
+ json.NewEncoder(w).Encode(resp)
+}
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
new file mode 100644
index 000000000..e6f536d24
--- /dev/null
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -0,0 +1,370 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "math/rand"
+ "net/http/httptest"
+ "os"
+ "regexp"
+ "strings"
+ "sync"
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/Sirupsen/logrus"
+ "golang.org/x/crypto/ssh"
+ check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&DispatcherSuite{})
+
+// fakeCloud provides an exec method that can be used as a
+// test.StubExecFunc. It calls the provided makeVM func when called
+// with a previously unseen instance ID. Calls to exec are passed on
+// to the *fakeVM for the appropriate instance ID.
+type fakeCloud struct {
+ queue *test.Queue
+ makeVM func(cloud.Instance) *fakeVM
+ onComplete func(string)
+ onCancel func(string)
+ vms map[cloud.InstanceID]*fakeVM
+ sync.Mutex
+}
+
+func (fc *fakeCloud) exec(inst cloud.Instance, command string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
+ fc.Lock()
+ fvm, ok := fc.vms[inst.ID()]
+ if !ok {
+ if fc.vms == nil {
+ fc.vms = make(map[cloud.InstanceID]*fakeVM)
+ }
+ fvm = fc.makeVM(inst)
+ fc.vms[inst.ID()] = fvm
+ }
+ fc.Unlock()
+ return fvm.exec(fc.queue, fc.onComplete, fc.onCancel, command, stdin, stdout, stderr)
+}
+
+// fakeVM is a fake VM with configurable delays and failure modes.
+type fakeVM struct {
+ boot time.Time
+ broken time.Time
+ crunchRunMissing bool
+ crunchRunCrashRate float64
+ crunchRunDetachDelay time.Duration
+ ctrExit int
+ running map[string]bool
+ completed []string
+ sync.Mutex
+}
+
+func (fvm *fakeVM) exec(queue *test.Queue, onComplete, onCancel func(uuid string), command string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
+ uuid := regexp.MustCompile(`.{5}-dz642-.{15}`).FindString(command)
+ if eta := fvm.boot.Sub(time.Now()); eta > 0 {
+ fmt.Fprintf(stderr, "stub is booting, ETA %s\n", eta)
+ return 1
+ }
+ if !fvm.broken.IsZero() && fvm.broken.Before(time.Now()) {
+ fmt.Fprintf(stderr, "cannot fork\n")
+ return 2
+ }
+ if fvm.crunchRunMissing && strings.Contains(command, "crunch-run") {
+ fmt.Fprint(stderr, "crunch-run: command not found\n")
+ return 1
+ }
+ if strings.HasPrefix(command, "crunch-run --detach ") {
+ fvm.Lock()
+ if fvm.running == nil {
+ fvm.running = map[string]bool{}
+ }
+ fvm.running[uuid] = true
+ fvm.Unlock()
+ time.Sleep(fvm.crunchRunDetachDelay)
+ fmt.Fprintf(stderr, "starting %s\n", uuid)
+ logger := logrus.WithField("ContainerUUID", uuid)
+ logger.Printf("[test] starting crunch-run stub")
+ go func() {
+ crashluck := rand.Float64()
+ ctr, ok := queue.Get(uuid)
+ if !ok {
+ logger.Print("[test] container not in queue")
+ return
+ }
+ if crashluck > fvm.crunchRunCrashRate/2 {
+ time.Sleep(time.Duration(rand.Float64()*20) * time.Millisecond)
+ ctr.State = arvados.ContainerStateRunning
+ queue.Notify(ctr)
+ }
+
+ time.Sleep(time.Duration(rand.Float64()*20) * time.Millisecond)
+ fvm.Lock()
+ _, running := fvm.running[uuid]
+ fvm.Unlock()
+ if !running {
+ logger.Print("[test] container was killed")
+ return
+ }
+ if crashluck < fvm.crunchRunCrashRate {
+ logger.Print("[test] crashing crunch-run stub")
+ if onCancel != nil && ctr.State == arvados.ContainerStateRunning {
+ onCancel(uuid)
+ }
+ } else {
+ ctr.State = arvados.ContainerStateComplete
+ ctr.ExitCode = fvm.ctrExit
+ queue.Notify(ctr)
+ if onComplete != nil {
+ onComplete(uuid)
+ }
+ }
+ logger.Print("[test] exiting crunch-run stub")
+ fvm.Lock()
+ defer fvm.Unlock()
+ delete(fvm.running, uuid)
+ }()
+ return 0
+ }
+ if command == "crunch-run --list" {
+ fvm.Lock()
+ defer fvm.Unlock()
+ for uuid := range fvm.running {
+ fmt.Fprintf(stdout, "%s\n", uuid)
+ }
+ return 0
+ }
+ if strings.HasPrefix(command, "crunch-run --kill ") {
+ fvm.Lock()
+ defer fvm.Unlock()
+ if fvm.running[uuid] {
+ delete(fvm.running, uuid)
+ } else {
+ fmt.Fprintf(stderr, "%s: container is not running\n", uuid)
+ }
+ return 0
+ }
+ if command == "true" {
+ return 0
+ }
+ fmt.Fprintf(stderr, "%q: command not found", command)
+ return 1
+}
+
+type DispatcherSuite struct {
+ cluster *arvados.Cluster
+ instanceSet *test.LameInstanceSet
+ stubDriver *test.StubDriver
+ disp *dispatcher
+}
+
+func (s *DispatcherSuite) SetUpSuite(c *check.C) {
+ if os.Getenv("ARVADOS_DEBUG") != "" {
+ logrus.StandardLogger().SetLevel(logrus.DebugLevel)
+ }
+}
+
+func (s *DispatcherSuite) SetUpTest(c *check.C) {
+ dispatchpub, _ := test.LoadTestKey(c, "test/sshkey_dispatch")
+ dispatchprivraw, err := ioutil.ReadFile("test/sshkey_dispatch")
+ c.Assert(err, check.IsNil)
+
+ _, hostpriv := test.LoadTestKey(c, "test/sshkey_vm")
+ s.stubDriver = &test.StubDriver{
+ Exec: func(inst cloud.Instance, command string, _ io.Reader, _, _ io.Writer) uint32 {
+ c.Logf("stubDriver SSHExecFunc(%s, %q, ...)", inst, command)
+ return 1
+ },
+ HostKey: hostpriv,
+ AuthorizedKeys: []ssh.PublicKey{dispatchpub},
+ }
+
+ s.cluster = &arvados.Cluster{
+ CloudVMs: arvados.CloudVMs{
+ Driver: "test",
+ SyncInterval: arvados.Duration(10 * time.Millisecond),
+ TimeoutIdle: arvados.Duration(30 * time.Millisecond),
+ TimeoutBooting: arvados.Duration(30 * time.Millisecond),
+ TimeoutProbe: arvados.Duration(15 * time.Millisecond),
+ TimeoutShutdown: arvados.Duration(5 * time.Millisecond),
+ },
+ Dispatch: arvados.Dispatch{
+ PrivateKey: dispatchprivraw,
+ PollInterval: arvados.Duration(5 * time.Millisecond),
+ ProbeInterval: arvados.Duration(5 * time.Millisecond),
+ MaxProbesPerSecond: 1000,
+ },
+ InstanceTypes: arvados.InstanceTypeMap{
+ test.InstanceType(1).Name: test.InstanceType(1),
+ test.InstanceType(2).Name: test.InstanceType(2),
+ test.InstanceType(3).Name: test.InstanceType(3),
+ test.InstanceType(4).Name: test.InstanceType(4),
+ test.InstanceType(6).Name: test.InstanceType(6),
+ test.InstanceType(8).Name: test.InstanceType(8),
+ test.InstanceType(16).Name: test.InstanceType(16),
+ },
+ NodeProfiles: map[string]arvados.NodeProfile{
+ "*": {
+ Controller: arvados.SystemServiceInstance{Listen: os.Getenv("ARVADOS_API_HOST")},
+ DispatchCloud: arvados.SystemServiceInstance{Listen: ":"},
+ },
+ },
+ }
+ s.disp = &dispatcher{Cluster: s.cluster}
+ // Test cases can modify s.cluster before calling
+ // initialize(), and then modify private state before calling
+ // go run().
+}
+
+func (s *DispatcherSuite) TearDownTest(c *check.C) {
+ s.disp.Close()
+}
+
+func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
+ drivers["test"] = s.stubDriver
+ s.disp.setupOnce.Do(s.disp.initialize)
+ queue := &test.Queue{
+ ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
+ return ChooseInstanceType(s.cluster, ctr)
+ },
+ }
+ for i := 0; i < 200; i++ {
+ queue.Containers = append(queue.Containers, arvados.Container{
+ UUID: test.ContainerUUID(i + 1),
+ State: arvados.ContainerStateQueued,
+ Priority: int64(i%20 + 1),
+ RuntimeConstraints: arvados.RuntimeConstraints{
+ RAM: int64(i%3+1) << 30,
+ VCPUs: i%8 + 1,
+ },
+ })
+ }
+ s.disp.queue = queue
+
+ var mtx sync.Mutex
+ done := make(chan struct{})
+ waiting := map[string]struct{}{}
+ for _, ctr := range queue.Containers {
+ waiting[ctr.UUID] = struct{}{}
+ }
+ onComplete := func(uuid string) {
+ mtx.Lock()
+ defer mtx.Unlock()
+ if _, ok := waiting[uuid]; !ok {
+ c.Errorf("container completed twice: %s", uuid)
+ }
+ delete(waiting, uuid)
+ if len(waiting) == 0 {
+ close(done)
+ }
+ }
+ n := 0
+ fc := &fakeCloud{
+ queue: queue,
+ makeVM: func(inst cloud.Instance) *fakeVM {
+ n++
+ fvm := &fakeVM{
+ boot: time.Now().Add(time.Duration(rand.Int63n(int64(5 * time.Millisecond)))),
+ crunchRunDetachDelay: time.Duration(rand.Int63n(int64(10 * time.Millisecond))),
+ ctrExit: int(rand.Uint32() & 0x3),
+ }
+ switch n % 7 {
+ case 0:
+ fvm.broken = time.Now().Add(time.Duration(rand.Int63n(90)) * time.Millisecond)
+ case 1:
+ fvm.crunchRunMissing = true
+ default:
+ fvm.crunchRunCrashRate = 0.1
+ }
+ return fvm
+ },
+ onComplete: onComplete,
+ onCancel: onComplete,
+ }
+ s.stubDriver.Exec = fc.exec
+
+ start := time.Now()
+ go s.disp.run()
+ err := s.disp.CheckHealth()
+ c.Check(err, check.IsNil)
+
+ select {
+ case <-done:
+ c.Logf("containers finished (%s), waiting for instances to shutdown and queue to clear", time.Since(start))
+ case <-time.After(10 * time.Second):
+ c.Fatalf("timed out; still waiting for %d containers: %q", len(waiting), waiting)
+ }
+
+ deadline := time.Now().Add(time.Second)
+ for range time.NewTicker(10 * time.Millisecond).C {
+ insts, err := s.stubDriver.InstanceSets()[0].Instances(nil)
+ c.Check(err, check.IsNil)
+ queue.Update()
+ ents, _ := queue.Entries()
+ if len(ents) == 0 && len(insts) == 0 {
+ break
+ }
+ if time.Now().After(deadline) {
+ c.Fatalf("timed out with %d containers (%v), %d instances (%+v)", len(ents), ents, len(insts), insts)
+ }
+ }
+}
+
+func (s *DispatcherSuite) TestInstancesAPI(c *check.C) {
+ var lameSet test.LameInstanceSet
+ drivers["test"] = cloud.DriverFunc(func(params map[string]interface{}, id cloud.InstanceSetID) (cloud.InstanceSet, error) {
+ return &lameSet, nil
+ })
+
+ type instance struct {
+ Instance string
+ WorkerState string
+ Price float64
+ LastContainerUUID string
+ ArvadosInstanceType string
+ ProviderInstanceType string
+ }
+ type instancesResponse struct {
+ Items []instance
+ }
+ getInstances := func() instancesResponse {
+ req := httptest.NewRequest("GET", "/arvados/v1/dispatch/instances", nil)
+ resp := httptest.NewRecorder()
+ s.disp.ServeHTTP(resp, req)
+ var sr instancesResponse
+ err := json.Unmarshal(resp.Body.Bytes(), &sr)
+ c.Check(err, check.IsNil)
+ return sr
+ }
+
+ sr := getInstances()
+ c.Check(len(sr.Items), check.Equals, 0)
+
+ ch := s.disp.pool.Subscribe()
+ defer s.disp.pool.Unsubscribe(ch)
+ err := s.disp.pool.Create(arvados.InstanceType{
+ Name: "a1.small-1",
+ ProviderType: "a1.small",
+ VCPUs: 1,
+ RAM: 1 << 30,
+ Price: 0.12,
+ })
+ c.Check(err, check.IsNil)
+ <-ch
+
+ sr = getInstances()
+ c.Assert(len(sr.Items), check.Equals, 1)
+ c.Check(sr.Items[0].Instance, check.Matches, "lame-.*")
+ c.Check(sr.Items[0].WorkerState, check.Equals, "booting")
+ c.Check(sr.Items[0].Price, check.Equals, 0.12)
+ c.Check(sr.Items[0].LastContainerUUID, check.Equals, "")
+ c.Check(sr.Items[0].ProviderInstanceType, check.Equals, "a1.small")
+ c.Check(sr.Items[0].ArvadosInstanceType, check.Equals, "a1.small-1")
+}
diff --git a/lib/dispatchcloud/driver.go b/lib/dispatchcloud/driver.go
new file mode 100644
index 000000000..295fd6105
--- /dev/null
+++ b/lib/dispatchcloud/driver.go
@@ -0,0 +1,22 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+ "fmt"
+
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+var drivers = map[string]cloud.Driver{}
+
+func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID) (cloud.InstanceSet, error) {
+ driver, ok := drivers[cluster.CloudVMs.Driver]
+ if !ok {
+ return nil, fmt.Errorf("unsupported cloud driver %q", cluster.CloudVMs.Driver)
+ }
+ return driver.InstanceSet(cluster.CloudVMs.DriverParameters, setID)
+}
diff --git a/lib/dispatchcloud/instance_set_proxy.go b/lib/dispatchcloud/instance_set_proxy.go
new file mode 100644
index 000000000..e728b67cd
--- /dev/null
+++ b/lib/dispatchcloud/instance_set_proxy.go
@@ -0,0 +1,25 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "golang.org/x/crypto/ssh"
+)
+
+type instanceSetProxy struct {
+ cloud.InstanceSet
+}
+
+func (is *instanceSetProxy) Create(it arvados.InstanceType, id cloud.ImageID, tags cloud.InstanceTags, pk ssh.PublicKey) (cloud.Instance, error) {
+ // TODO: return an error immediately if Create failed recently with a RateLimitError or QuotaError
+ return is.InstanceSet.Create(it, id, tags, pk)
+}
+
+func (is *instanceSetProxy) Instances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
+ // TODO: return an error immediately if Instances failed recently with a RateLimitError
+ return is.InstanceSet.Instances(tags)
+}
diff --git a/lib/dispatchcloud/logger.go b/lib/dispatchcloud/logger.go
new file mode 100644
index 000000000..90bb6ca68
--- /dev/null
+++ b/lib/dispatchcloud/logger.go
@@ -0,0 +1,29 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+ "sync"
+ "time"
+)
+
+type logger interface {
+ Printf(string, ...interface{})
+ Warnf(string, ...interface{})
+ Debugf(string, ...interface{})
+}
+
+var nextSpam = map[string]time.Time{}
+var nextSpamMtx sync.Mutex
+
+func unspam(msg string) bool {
+ nextSpamMtx.Lock()
+ defer nextSpamMtx.Unlock()
+ if nextSpam[msg].Before(time.Now()) {
+ nextSpam[msg] = time.Now().Add(time.Minute)
+ return true
+ }
+ return false
+}
diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index 1c36d6cf5..3bada3baf 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -15,10 +15,9 @@ import (
"git.curoverse.com/arvados.git/sdk/go/arvados"
)
-var (
- ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types")
- discountConfiguredRAMPercent = 5
-)
+var ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types")
+
+var discountConfiguredRAMPercent = 5
// ConstraintsNotSatisfiableError includes a list of available instance types
// to be reported back to the user.
diff --git a/lib/dispatchcloud/readme.go b/lib/dispatchcloud/readme.go
new file mode 100644
index 000000000..a4b005eb8
--- /dev/null
+++ b/lib/dispatchcloud/readme.go
@@ -0,0 +1,79 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+// A dispatcher comprises a container queue, a scheduler, a worker
+// pool, a cloud provider, a stale-lock fixer, and a syncer.
+// 1. Choose a provider.
+// 2. Start a worker pool.
+// 3. Start a container queue.
+// 4. Run a stale-lock fixer.
+// 5. Start a scheduler.
+// 6. Start a syncer.
+//
+//
+// A provider (cloud driver) creates new cloud VM instances and gets
+// the latest list of instances. The returned instances implement
+// proxies to the provider's metadata and control interfaces (get IP
+// address, update tags, shutdown).
+//
+//
+// A workerPool tracks workers' instance types and readiness states
+// (available to do work now, booting, suffering a temporary network
+// outage, shutting down). It loads internal state from the cloud
+// provider's list of instances at startup, and syncs periodically
+// after that.
+//
+//
+// A worker maintains a multiplexed SSH connection to a cloud
+// instance, retrying/reconnecting as needed, so the workerPool can
+// execute commands. It asks the provider's instance to verify its SSH
+// public key once when first connecting, and again later if the key
+// changes.
+//
+//
+// A container queue tracks the known state (according to
+// arvados-controller) of each container of interest -- i.e., queued,
+// or locked/running using our own dispatch token. It also proxies the
+// dispatcher's lock/unlock/cancel requests to the controller. It
+// handles concurrent refresh and update operations without exposing
+// out-of-order updates to its callers. (It drops any new information
+// that might have originated before its own most recent
+// lock/unlock/cancel operation.)
+//
+//
+// A stale-lock fixer waits for any already-locked containers (i.e.,
+// locked by a prior server process) to appear on workers as the
+// worker pool recovers its state. It unlocks/requeues any that still
+// remain when all workers are recovered or shutdown, or its timer
+// expires.
+//
+//
+// A scheduler chooses which containers to assign to which idle
+// workers, and decides what to do when there are not enough idle
+// workers (including shutting down some idle nodes).
+//
+//
+// A syncer updates state to Cancelled when a running container
+// process dies without finalizing its entry in the controller
+// database. It also calls the worker pool to kill containers that
+// have priority=0 while locked or running.
+//
+//
+// A provider proxy wraps a provider with rate-limiting logic. After
+// the wrapped provider receives a cloud.RateLimitError, the proxy
+// starts returning errors to callers immediately without calling
+// through to the wrapped provider.
+//
+//
+// TBD: Bootstrapping script via SSH, too? Future version.
+//
+// TBD: drain instance, keep instance alive
+// TBD: metrics, diagnostics
+// TBD: why dispatch token currently passed to worker?
+//
+// Metrics: queue size, time each job has spent queued, #idle/busy/booting nodes
+// Timing in each step, and end-to-end
+// Metrics: boot/idle/alloc time and cost
diff --git a/lib/dispatchcloud/scheduler/fix_stale_locks.go b/lib/dispatchcloud/scheduler/fix_stale_locks.go
new file mode 100644
index 000000000..e9644aed2
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/fix_stale_locks.go
@@ -0,0 +1,57 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/Sirupsen/logrus"
+)
+
+// FixStaleLocks waits for any already-locked containers (i.e., locked
+// by a prior dispatcher process) to appear on workers as the worker
+// pool recovers its state. It unlocks any that still remain when all
+// workers are recovered or shutdown, or its timer expires.
+func FixStaleLocks(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool, limit time.Duration) {
+ wp := pool.Subscribe()
+ defer pool.Unsubscribe(wp)
+ timeout := time.NewTimer(limit)
+waiting:
+ for {
+ unlock := false
+ select {
+ case <-wp:
+ // If all workers have been contacted, unlock
+ // containers that aren't claimed by any
+ // worker.
+ unlock = pool.Workers()[worker.StateUnknown] == 0
+ case <-timeout.C:
+ // Give up and unlock the containers, even
+ // though they might be working.
+ unlock = true
+ }
+
+ running := pool.Running()
+ qEntries, _ := queue.Entries()
+ for uuid, ent := range qEntries {
+ if ent.Container.State != arvados.ContainerStateLocked {
+ continue
+ }
+ if _, running := running[uuid]; running {
+ continue
+ }
+ if !unlock {
+ continue waiting
+ }
+ err := queue.Unlock(uuid)
+ if err != nil {
+ logger.Warnf("Unlock %s: %s", uuid, err)
+ }
+ }
+ return
+ }
+}
diff --git a/lib/dispatchcloud/scheduler/gocheck_test.go b/lib/dispatchcloud/scheduler/gocheck_test.go
new file mode 100644
index 000000000..558c60f73
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/gocheck_test.go
@@ -0,0 +1,16 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+ "testing"
+
+ check "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+ check.TestingT(t)
+}
diff --git a/lib/dispatchcloud/scheduler/interfaces.go b/lib/dispatchcloud/scheduler/interfaces.go
new file mode 100644
index 000000000..bdb8678e9
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/interfaces.go
@@ -0,0 +1,40 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// A ContainerQueue is a set of containers that need to be started or
+// stopped. Implemented by container.Queue and test stubs.
+type ContainerQueue interface {
+ Entries() (entries map[string]container.QueueEnt, updated time.Time)
+ Lock(uuid string) error
+ Unlock(uuid string) error
+ Cancel(uuid string) error
+ Forget(uuid string)
+ Get(uuid string) (arvados.Container, bool)
+}
+
+// A WorkerPool asynchronously starts and stops worker VMs, and starts
+// and stops containers on them. Implemented by worker.Pool and test
+// stubs.
+type WorkerPool interface {
+ Running() map[string]time.Time
+ Unallocated() map[arvados.InstanceType]int
+ Workers() map[worker.State]int
+ AtQuota() bool
+ Create(arvados.InstanceType) error
+ Shutdown(arvados.InstanceType) bool
+ StartContainer(arvados.InstanceType, arvados.Container) bool
+ KillContainer(uuid string)
+ Subscribe() <-chan struct{}
+ Unsubscribe(<-chan struct{})
+}
diff --git a/lib/dispatchcloud/scheduler/map.go b/lib/dispatchcloud/scheduler/map.go
new file mode 100644
index 000000000..5742c1f47
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/map.go
@@ -0,0 +1,144 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+// Package scheduler uses a resizable worker pool to execute
+// containers in priority order.
+//
+// Scheduler functions must not be called concurrently using the same
+// queue or pool.
+package scheduler
+
+import (
+ "sort"
+
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/Sirupsen/logrus"
+)
+
+// Map maps queued containers onto unallocated workers in priority
+// order, creating new workers if needed. It locks containers that can
+// be mapped onto existing/pending workers, and starts them if
+// possible.
+//
+// Map unlocks any containers that are locked but can't be
+// mapped. (For example, this happens when the cloud provider reaches
+// quota/capacity and a previously mappable container's priority is
+// surpassed by a newer container.)
+//
+// If it encounters errors while creating new workers, Map shuts down
+// idle workers, in case they are consuming quota.
+//
+// Map should not be called without first calling FixStaleLocks.
+func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
+ unsorted, _ := queue.Entries()
+ sorted := make([]container.QueueEnt, 0, len(unsorted))
+ for _, ent := range unsorted {
+ sorted = append(sorted, ent)
+ }
+ sort.Slice(sorted, func(i, j int) bool {
+ return sorted[i].Container.Priority > sorted[j].Container.Priority
+ })
+
+ running := pool.Running()
+ unalloc := pool.Unallocated()
+
+ logger.WithFields(logrus.Fields{
+ "Containers": len(sorted),
+ "Processes": len(running),
+ }).Debug("mapping")
+
+ dontstart := map[arvados.InstanceType]bool{}
+ var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+
+ for i, ctr := range sorted {
+ ctr, it := ctr.Container, ctr.InstanceType
+ logger := logger.WithFields(logrus.Fields{
+ "ContainerUUID": ctr.UUID,
+ "InstanceType": it.Name,
+ })
+ if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
+ continue
+ }
+ if ctr.State == arvados.ContainerStateQueued {
+ logger.Debugf("locking")
+ if unalloc[it] < 1 && pool.AtQuota() {
+ overquota = sorted[i:]
+ break
+ }
+ err := queue.Lock(ctr.UUID)
+ if err != nil {
+ logger.WithError(err).Warnf("lock error")
+ unalloc[it]++
+ continue
+ }
+ var ok bool
+ ctr, ok = queue.Get(ctr.UUID)
+ if !ok {
+ logger.Error("(BUG?) container disappeared from queue after Lock succeeded")
+ continue
+ }
+ if ctr.State != arvados.ContainerStateLocked {
+ logger.Debugf("(race?) container has state=%q after Lock succeeded", ctr.State)
+ }
+ }
+ if ctr.State != arvados.ContainerStateLocked {
+ continue
+ }
+ if unalloc[it] < 1 {
+ logger.Info("creating new instance")
+ err := pool.Create(it)
+ if err != nil {
+ if _, ok := err.(cloud.QuotaError); !ok {
+ logger.WithError(err).Warn("error creating worker")
+ }
+ queue.Unlock(ctr.UUID)
+ // Don't let lower-priority containers
+ // starve this one by keeping idle
+ // workers alive on different instance
+ // types. TODO: avoid getting starved
+ // here if instances of a specific
+ // type always fail.
+ overquota = sorted[i:]
+ break
+ }
+ unalloc[it]++
+ }
+ if dontstart[it] {
+ // We already tried & failed to start a
+ // higher-priority container on the same
+ // instance type. Don't let this one sneak in
+ // ahead of it.
+ } else if pool.StartContainer(it, ctr) {
+ unalloc[it]--
+ } else {
+ dontstart[it] = true
+ }
+ }
+
+ if len(overquota) > 0 {
+ // Unlock any containers that are unmappable while
+ // we're at quota.
+ for _, ctr := range overquota {
+ ctr := ctr.Container
+ if ctr.State == arvados.ContainerStateLocked {
+ logger := logger.WithField("ContainerUUID", ctr.UUID)
+ logger.Debug("unlock because pool capacity is used by higher priority containers")
+ err := queue.Unlock(ctr.UUID)
+ if err != nil {
+ logger.WithError(err).Warn("error unlocking")
+ }
+ }
+ }
+ // Shut down idle workers that didn't get any
+ // containers mapped onto them before we hit quota.
+ for it, n := range unalloc {
+ if n < 1 {
+ continue
+ }
+ pool.Shutdown(it)
+ }
+ }
+}
diff --git a/lib/dispatchcloud/scheduler/map_test.go b/lib/dispatchcloud/scheduler/map_test.go
new file mode 100644
index 000000000..c40b3041b
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/map_test.go
@@ -0,0 +1,259 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+ "errors"
+ "fmt"
+
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/Sirupsen/logrus"
+ check "gopkg.in/check.v1"
+)
+
+var (
+ logger = logrus.StandardLogger()
+
+ // arbitrary example instance types
+ types = func() (r []arvados.InstanceType) {
+ for i := 0; i < 16; i++ {
+ r = append(r, test.InstanceType(i))
+ }
+ return
+ }()
+
+ // arbitrary example container UUIDs
+ uuids = func() (r []string) {
+ for i := 0; i < 16; i++ {
+ r = append(r, test.ContainerUUID(i))
+ }
+ return
+ }()
+)
+
+type stubQueue struct {
+ ents map[string]container.QueueEnt
+}
+
+func (q *stubQueue) Entries() map[string]container.QueueEnt {
+ return q.ents
+}
+func (q *stubQueue) Lock(uuid string) error {
+ return q.setState(uuid, arvados.ContainerStateLocked)
+}
+func (q *stubQueue) Unlock(uuid string) error {
+ return q.setState(uuid, arvados.ContainerStateQueued)
+}
+func (q *stubQueue) Get(uuid string) (arvados.Container, bool) {
+ ent, ok := q.ents[uuid]
+ return ent.Container, ok
+}
+func (q *stubQueue) setState(uuid string, state arvados.ContainerState) error {
+ ent, ok := q.ents[uuid]
+ if !ok {
+ return fmt.Errorf("no such ent: %q", uuid)
+ }
+ ent.Container.State = state
+ q.ents[uuid] = ent
+ return nil
+}
+
+type stubQuotaError struct {
+ error
+}
+
+func (stubQuotaError) IsQuotaError() bool { return true }
+
+type stubPool struct {
+ notify <-chan struct{}
+ unalloc map[arvados.InstanceType]int // idle+booting+unknown
+ idle map[arvados.InstanceType]int
+ running map[string]bool
+ atQuota bool
+ canCreate int
+ creates []arvados.InstanceType
+ starts []string
+ shutdowns int
+}
+
+func (p *stubPool) AtQuota() bool { return p.atQuota }
+func (p *stubPool) Subscribe() <-chan struct{} { return p.notify }
+func (p *stubPool) Unsubscribe(<-chan struct{}) {}
+func (p *stubPool) Running() map[string]bool { return p.running }
+func (p *stubPool) Unallocated() map[arvados.InstanceType]int {
+ r := map[arvados.InstanceType]int{}
+ for it, n := range p.unalloc {
+ r[it] = n
+ }
+ return r
+}
+func (p *stubPool) Create(it arvados.InstanceType) error {
+ p.creates = append(p.creates, it)
+ if p.canCreate < 1 {
+ return stubQuotaError{errors.New("quota")}
+ }
+ p.canCreate--
+ p.unalloc[it]++
+ return nil
+}
+func (p *stubPool) Shutdown(arvados.InstanceType) bool {
+ p.shutdowns++
+ return false
+}
+func (p *stubPool) Workers() map[worker.State]int {
+ return map[worker.State]int{
+ worker.StateBooting: len(p.unalloc) - len(p.idle),
+ worker.StateRunning: len(p.idle) - len(p.running),
+ }
+}
+func (p *stubPool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
+ p.starts = append(p.starts, ctr.UUID)
+ if p.idle[it] == 0 {
+ return false
+ }
+ p.idle[it]--
+ p.unalloc[it]--
+ p.running[ctr.UUID] = true
+ return true
+}
+
+var _ = check.Suite(&SchedulerSuite{})
+
+type SchedulerSuite struct{}
+
+// Map priority=4 container to idle node. Create a new instance for
+// the priority=3 container. Don't try to start any priority<3
+// containers because priority=3 container didn't start
+// immediately. Don't try to create any other nodes after the failed
+// create.
+func (*SchedulerSuite) TestMapIdle(c *check.C) {
+ queue := stubQueue{
+ ents: map[string]container.QueueEnt{
+ uuids[1]: {
+ Container: arvados.Container{UUID: uuids[1], Priority: 1, State: arvados.ContainerStateQueued},
+ InstanceType: types[1],
+ },
+ uuids[2]: {
+ Container: arvados.Container{UUID: uuids[2], Priority: 2, State: arvados.ContainerStateQueued},
+ InstanceType: types[1],
+ },
+ uuids[3]: {
+ Container: arvados.Container{UUID: uuids[3], Priority: 3, State: arvados.ContainerStateQueued},
+ InstanceType: types[1],
+ },
+ uuids[4]: {
+ Container: arvados.Container{UUID: uuids[4], Priority: 4, State: arvados.ContainerStateQueued},
+ InstanceType: types[1],
+ },
+ },
+ }
+ pool := stubPool{
+ unalloc: map[arvados.InstanceType]int{
+ types[1]: 1,
+ types[2]: 2,
+ },
+ idle: map[arvados.InstanceType]int{
+ types[1]: 1,
+ types[2]: 2,
+ },
+ running: map[string]bool{},
+ canCreate: 1,
+ }
+ Map(logger, &queue, &pool)
+ c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{types[1]})
+ c.Check(pool.starts, check.DeepEquals, []string{uuids[4], uuids[3]})
+ c.Check(pool.running, check.DeepEquals, map[string]bool{uuids[4]: true})
+}
+
+// Shutdown some nodes if Create() fails -- and without even calling
+// Create(), if AtQuota() is true.
+func (*SchedulerSuite) TestMapShutdownAtQuota(c *check.C) {
+ for quota := 0; quota < 2; quota++ {
+ shouldCreate := types[1 : 1+quota]
+ queue := stubQueue{
+ ents: map[string]container.QueueEnt{
+ uuids[1]: {
+ Container: arvados.Container{UUID: uuids[1], Priority: 1, State: arvados.ContainerStateQueued},
+ InstanceType: types[1],
+ },
+ },
+ }
+ pool := stubPool{
+ atQuota: quota == 0,
+ unalloc: map[arvados.InstanceType]int{
+ types[2]: 2,
+ },
+ idle: map[arvados.InstanceType]int{
+ types[2]: 2,
+ },
+ running: map[string]bool{},
+ creates: []arvados.InstanceType{},
+ starts: []string{},
+ canCreate: 0,
+ }
+ Map(logger, &queue, &pool)
+ c.Check(pool.creates, check.DeepEquals, shouldCreate)
+ c.Check(pool.starts, check.DeepEquals, []string{})
+ c.Check(pool.shutdowns, check.Not(check.Equals), 0)
+ }
+}
+
+// Start lower-priority containers while waiting for new/existing
+// workers to come up for higher-priority containers.
+func (*SchedulerSuite) TestMapStartWhileCreating(c *check.C) {
+ pool := stubPool{
+ unalloc: map[arvados.InstanceType]int{
+ types[1]: 1,
+ types[2]: 1,
+ },
+ idle: map[arvados.InstanceType]int{
+ types[1]: 1,
+ types[2]: 1,
+ },
+ running: map[string]bool{},
+ canCreate: 2,
+ }
+ queue := stubQueue{
+ ents: map[string]container.QueueEnt{
+ uuids[1]: {
+ // create a new worker
+ Container: arvados.Container{UUID: uuids[1], Priority: 1, State: arvados.ContainerStateQueued},
+ InstanceType: types[1],
+ },
+ uuids[2]: {
+ // tentatively map to unalloc worker
+ Container: arvados.Container{UUID: uuids[2], Priority: 2, State: arvados.ContainerStateQueued},
+ InstanceType: types[1],
+ },
+ uuids[3]: {
+ // start now on idle worker
+ Container: arvados.Container{UUID: uuids[3], Priority: 3, State: arvados.ContainerStateQueued},
+ InstanceType: types[1],
+ },
+ uuids[4]: {
+ // create a new worker
+ Container: arvados.Container{UUID: uuids[4], Priority: 4, State: arvados.ContainerStateQueued},
+ InstanceType: types[2],
+ },
+ uuids[5]: {
+ // tentatively map to unalloc worker
+ Container: arvados.Container{UUID: uuids[5], Priority: 5, State: arvados.ContainerStateQueued},
+ InstanceType: types[2],
+ },
+ uuids[6]: {
+ // start now on idle worker
+ Container: arvados.Container{UUID: uuids[6], Priority: 6, State: arvados.ContainerStateQueued},
+ InstanceType: types[2],
+ },
+ },
+ }
+ Map(logger, &queue, &pool)
+ c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{types[2], types[1]})
+ c.Check(pool.starts, check.DeepEquals, []string{uuids[6], uuids[5], uuids[3], uuids[2]})
+ c.Check(pool.running, check.DeepEquals, map[string]bool{uuids[3]: true, uuids[6]: true})
+}
diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
new file mode 100644
index 000000000..00e2a30f7
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -0,0 +1,85 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/Sirupsen/logrus"
+)
+
+// Sync resolves discrepancies between the queue and the pool:
+//
+// Lingering crunch-run processes for finalized and unlocked/requeued
+// containers are killed.
+//
+// Locked containers whose crunch-run processes have exited are
+// requeued.
+//
+// Running containers whose crunch-run processes have exited are
+// cancelled.
+//
+// Sync must not be called concurrently with other calls to Map or
+// Sync using the same queue or pool.
+func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
+ running := pool.Running()
+ cancel := func(ent container.QueueEnt, reason string) {
+ uuid := ent.Container.UUID
+ logger := logger.WithField("ContainerUUID", uuid)
+ logger.Infof("cancelling container because %s", reason)
+ err := queue.Cancel(uuid)
+ if err != nil {
+ logger.WithError(err).Print("error cancelling container")
+ }
+ }
+ kill := func(ent container.QueueEnt) {
+ uuid := ent.Container.UUID
+ logger := logger.WithField("ContainerUUID", uuid)
+ logger.Debugf("killing crunch-run process because state=%q", ent.Container.State)
+ pool.KillContainer(uuid)
+ }
+ qEntries, qUpdated := queue.Entries()
+ for uuid, ent := range qEntries {
+ exited, running := running[uuid]
+ switch ent.Container.State {
+ case arvados.ContainerStateRunning:
+ if !running {
+ cancel(ent, "not running on any worker")
+ } else if !exited.IsZero() && qUpdated.After(exited) {
+ cancel(ent, "state=\"Running\" after crunch-run exited")
+ }
+ case arvados.ContainerStateComplete, arvados.ContainerStateCancelled:
+ if running {
+ kill(ent)
+ } else {
+ logger.WithFields(logrus.Fields{
+ "ContainerUUID": uuid,
+ "State": ent.Container.State,
+ }).Info("container finished")
+ queue.Forget(uuid)
+ }
+ case arvados.ContainerStateQueued:
+ if running {
+ kill(ent)
+ }
+ case arvados.ContainerStateLocked:
+ if running && !exited.IsZero() && qUpdated.After(exited) {
+ logger = logger.WithFields(logrus.Fields{
+ "ContainerUUID": uuid,
+ "Exited": time.Since(exited).Seconds(),
+ })
+ logger.Infof("requeueing container because state=%q after crunch-run exited", ent.Container.State)
+ err := queue.Unlock(uuid)
+ if err != nil {
+ logger.WithError(err).Info("error requeueing container")
+ }
+ }
+ default:
+ logger.WithField("ContainerUUID", uuid).Errorf("BUG: unexpected state %q", ent.Container.State)
+ }
+ }
+}
diff --git a/lib/dispatchcloud/ssh_executor/executor.go b/lib/dispatchcloud/ssh_executor/executor.go
new file mode 100644
index 000000000..804ae6f15
--- /dev/null
+++ b/lib/dispatchcloud/ssh_executor/executor.go
@@ -0,0 +1,172 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+// Package ssh_executor provides an implementation of pool.Executor
+// using a long-lived multiplexed SSH session.
+package ssh_executor
+
+import (
+ "bytes"
+ "errors"
+ "io"
+ "net"
+ "sync"
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "golang.org/x/crypto/ssh"
+)
+
+// New returns a new Executor, using the given target.
+func New(t cloud.ExecutorTarget) *Executor {
+	return &Executor{target: t}
+}
+
+// An Executor uses a multiplexed SSH connection to execute shell
+// commands on a remote target. It reconnects automatically after
+// errors.
+//
+// When setting up a connection, the Executor accepts whatever host
+// key is provided by the remote server, then passes the received key
+// and the SSH connection to the target's VerifyHostKey method before
+// executing commands on the connection.
+//
+// A zero Executor must not be used before calling SetTarget.
+//
+// An Executor must not be copied.
+type Executor struct {
+	target  cloud.ExecutorTarget
+	signers []ssh.Signer
+	mtx     sync.RWMutex // guards target and signers
+
+	client      *ssh.Client
+	clientErr   error
+	clientOnce  sync.Once     // initializes clientSetup and clientErr exactly once
+	clientSetup chan bool     // len>0 while client setup is in progress
+	hostKey     ssh.PublicKey // most recent host key that passed verification, if any
+}
+
+// SetSigners updates the set of private keys that will be offered to
+// the target next time the Executor sets up a new connection.
+func (exr *Executor) SetSigners(signers ...ssh.Signer) {
+	exr.mtx.Lock()
+	defer exr.mtx.Unlock()
+	exr.signers = signers
+}
+
+// SetTarget sets the current target. The new target will be used next
+// time a new connection is set up; until then, the Executor will
+// continue to use the existing target.
+//
+// The new target is assumed to represent the same host as the
+// previous target, although its address and host key might differ.
+func (exr *Executor) SetTarget(t cloud.ExecutorTarget) {
+	exr.mtx.Lock()
+	defer exr.mtx.Unlock()
+	exr.target = t
+}
+
+// Target returns the current target.
+func (exr *Executor) Target() cloud.ExecutorTarget {
+	exr.mtx.RLock()
+	defer exr.mtx.RUnlock()
+	return exr.target
+}
+
+// Execute runs cmd on the target. If an existing connection is not
+// usable, it sets up a new connection to the current target.
+//
+// It returns the command's stdout and stderr (both fully buffered in
+// memory), plus any error from session setup or from running the
+// command.
+func (exr *Executor) Execute(cmd string, stdin io.Reader) ([]byte, []byte, error) {
+	session, err := exr.newSession()
+	if err != nil {
+		return nil, nil, err
+	}
+	defer session.Close()
+	var stdout, stderr bytes.Buffer
+	session.Stdin = stdin
+	session.Stdout = &stdout
+	session.Stderr = &stderr
+	err = session.Run(cmd)
+	return stdout.Bytes(), stderr.Bytes(), err
+}
+
+// newSession creates a new SSH session. If session setup fails or the
+// SSH client hasn't been set up yet, it sets up a new SSH client and
+// tries again, so a stale cached connection costs one retry rather
+// than an error.
+func (exr *Executor) newSession() (*ssh.Session, error) {
+	try := func(create bool) (*ssh.Session, error) {
+		client, err := exr.sshClient(create)
+		if err != nil {
+			return nil, err
+		}
+		return client.NewSession()
+	}
+	// First attempt reuses the cached client; on any failure,
+	// force creation of a fresh client and retry once.
+	session, err := try(false)
+	if err != nil {
+		session, err = try(true)
+	}
+	return session, err
+}
+
+// sshClient returns the latest SSH client. If another goroutine is in
+// the process of setting one up, it waits for that attempt to finish
+// and returns its result (or the last successfully set up client, if
+// the attempt fails).
+//
+// If create is true and this goroutine wins the setup slot, a new
+// client is dialed. A setup error is only cached in clientErr when
+// there is no previously working client to fall back on.
+func (exr *Executor) sshClient(create bool) (*ssh.Client, error) {
+	exr.clientOnce.Do(func() {
+		exr.clientSetup = make(chan bool, 1)
+		exr.clientErr = errors.New("client not yet created")
+	})
+	// clientSetup acts as a 1-slot semaphore: whoever sends first
+	// owns setup; everyone releases one token on the way out.
+	defer func() { <-exr.clientSetup }()
+	select {
+	case exr.clientSetup <- true:
+		if create {
+			client, err := exr.setupSSHClient()
+			if err == nil || exr.client == nil {
+				exr.client, exr.clientErr = client, err
+			}
+			if err != nil {
+				return nil, err
+			}
+		}
+	default:
+		// Another goroutine is doing the above case. Wait
+		// for it to finish and return whatever it leaves in
+		// exr.client.
+		exr.clientSetup <- true
+	}
+	return exr.client, exr.clientErr
+}
+
+// setupSSHClient dials a new SSH connection to the current target.
+//
+// The remote host key is accepted during the handshake and captured;
+// it is then checked by calling the target's VerifyHostKey method,
+// unless it is identical to a key that already passed verification on
+// an earlier connection.
+//
+// NOTE(review): exr.signers is read here without holding exr.mtx,
+// while SetSigners writes it under the lock — confirm that setup
+// serialization via clientSetup makes this safe.
+func (exr *Executor) setupSSHClient() (*ssh.Client, error) {
+	target := exr.Target()
+	addr := target.Address()
+	if addr == "" {
+		return nil, errors.New("instance has no address")
+	}
+	var receivedKey ssh.PublicKey
+	client, err := ssh.Dial("tcp", addr, &ssh.ClientConfig{
+		User: "root",
+		Auth: []ssh.AuthMethod{
+			ssh.PublicKeys(exr.signers...),
+		},
+		HostKeyCallback: func(hostname string, remote net.Addr, key ssh.PublicKey) error {
+			// Accept the key for now; it is verified via
+			// target.VerifyHostKey below, after the
+			// handshake completes.
+			receivedKey = key
+			return nil
+		},
+		Timeout: time.Minute,
+	})
+	if err != nil {
+		return nil, err
+	} else if receivedKey == nil {
+		return nil, errors.New("BUG: key was never provided to HostKeyCallback")
+	}
+
+	// Skip re-verification if the host presented the same key that
+	// already passed verification earlier.
+	if exr.hostKey == nil || !bytes.Equal(exr.hostKey.Marshal(), receivedKey.Marshal()) {
+		err = target.VerifyHostKey(receivedKey, client)
+		if err != nil {
+			return nil, err
+		}
+		exr.hostKey = receivedKey
+	}
+	return client, nil
+}
diff --git a/lib/dispatchcloud/ssh_executor/executor_test.go b/lib/dispatchcloud/ssh_executor/executor_test.go
new file mode 100644
index 000000000..8dabfecad
--- /dev/null
+++ b/lib/dispatchcloud/ssh_executor/executor_test.go
@@ -0,0 +1,102 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package ssh_executor
+
+import (
+ "bytes"
+ "io"
+ "io/ioutil"
+ "sync"
+ "testing"
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+ "golang.org/x/crypto/ssh"
+ check "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate: hook gocheck suites into "go test".
+func Test(t *testing.T) {
+	check.TestingT(t)
+}
+
+var _ = check.Suite(&ExecutorSuite{})
+
+// testTarget wraps an SSHService so it can serve as an executor
+// target; host key verification always succeeds.
+type testTarget struct {
+	test.SSHService
+}
+
+func (*testTarget) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
+	return nil
+}
+
+type ExecutorSuite struct{}
+
+// TestExecute runs a command against a local stub SSH server and
+// checks that the command text, stdin, stdout, stderr, and the remote
+// exit status are all relayed faithfully.
+func (s *ExecutorSuite) TestExecute(c *check.C) {
+	command := `foo 'bar' "baz"`
+	stdinData := "foobar\nbaz\n"
+	_, hostpriv := test.LoadTestKey(c, "../test/sshkey_vm")
+	clientpub, clientpriv := test.LoadTestKey(c, "../test/sshkey_dispatch")
+	for _, exitcode := range []int{0, 1, 2} {
+		srv := &testTarget{
+			SSHService: test.SSHService{
+				Exec: func(cmd string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
+					c.Check(cmd, check.Equals, command)
+					// Write fixed stdout/stderr markers
+					// concurrently with draining stdin,
+					// then echo stdin back on stdout, so
+					// the expected output is
+					// deterministic.
+					var wg sync.WaitGroup
+					wg.Add(2)
+					go func() {
+						io.WriteString(stdout, "stdout\n")
+						wg.Done()
+					}()
+					go func() {
+						io.WriteString(stderr, "stderr\n")
+						wg.Done()
+					}()
+					buf, err := ioutil.ReadAll(stdin)
+					wg.Wait()
+					c.Check(err, check.IsNil)
+					if err != nil {
+						return 99
+					}
+					_, err = stdout.Write(buf)
+					c.Check(err, check.IsNil)
+					return uint32(exitcode)
+				},
+				HostKey:        hostpriv,
+				AuthorizedKeys: []ssh.PublicKey{clientpub},
+			},
+		}
+		err := srv.Start()
+		c.Check(err, check.IsNil)
+		c.Logf("srv address %q", srv.Address())
+		defer srv.Close()
+
+		exr := New(srv)
+		exr.SetSigners(clientpriv)
+
+		done := make(chan bool)
+		go func() {
+			stdout, stderr, err := exr.Execute(command, bytes.NewBufferString(stdinData))
+			if exitcode == 0 {
+				c.Check(err, check.IsNil)
+			} else {
+				// Nonzero exit must surface as *ssh.ExitError.
+				c.Check(err, check.NotNil)
+				err, ok := err.(*ssh.ExitError)
+				c.Assert(ok, check.Equals, true)
+				c.Check(err.ExitStatus(), check.Equals, exitcode)
+			}
+			c.Check(stdout, check.DeepEquals, []byte("stdout\n"+stdinData))
+			c.Check(stderr, check.DeepEquals, []byte("stderr\n"))
+			close(done)
+		}()
+
+		// Guard against Execute hanging.
+		timeout := time.NewTimer(time.Second)
+		select {
+		case <-done:
+		case <-timeout.C:
+			c.Fatal("timed out")
+		}
+	}
+}
diff --git a/lib/dispatchcloud/test/doc.go b/lib/dispatchcloud/test/doc.go
new file mode 100644
index 000000000..12f3b16b2
--- /dev/null
+++ b/lib/dispatchcloud/test/doc.go
@@ -0,0 +1,7 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+// Package test provides fakes and other tools for testing cloud
+// drivers and other dispatcher modules.
+package test
diff --git a/lib/dispatchcloud/test/fixtures.go b/lib/dispatchcloud/test/fixtures.go
new file mode 100644
index 000000000..7d65ca057
--- /dev/null
+++ b/lib/dispatchcloud/test/fixtures.go
@@ -0,0 +1,27 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+ "fmt"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// ContainerUUID returns a fake container UUID for index i, in the
+// canonical "zzzzz-dz642-" + 15-digit zero-padded form.
+func ContainerUUID(i int) string {
+	return fmt.Sprintf("zzzzz-dz642-%015d", i)
+}
+
+// InstanceType returns a fake arvados.InstanceType called "type{i}"
+// with i CPUs and i GiB of memory.
+func InstanceType(i int) arvados.InstanceType {
+	return arvados.InstanceType{
+		Name:         fmt.Sprintf("type%d", i),
+		ProviderType: fmt.Sprintf("providertype%d", i),
+		VCPUs:        i,
+		RAM:          arvados.ByteSize(i) << 30,
+	}
+}
diff --git a/lib/dispatchcloud/test/lame_instance_set.go b/lib/dispatchcloud/test/lame_instance_set.go
new file mode 100644
index 000000000..baab407a7
--- /dev/null
+++ b/lib/dispatchcloud/test/lame_instance_set.go
@@ -0,0 +1,118 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+ "fmt"
+ "math/rand"
+ "sync"
+
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "golang.org/x/crypto/ssh"
+)
+
+// LameInstanceSet creates instances that boot but can't run
+// containers.
+type LameInstanceSet struct {
+	Hold chan bool // set to make(chan bool) to hold operations until Release is called
+
+	mtx       sync.Mutex // guards instances (and each instance's tags)
+	instances map[*lameInstance]bool
+}
+
+// Create returns a new instance. If Hold is non-nil, Create blocks
+// until a corresponding Release call receives from the channel.
+func (p *LameInstanceSet) Create(instType arvados.InstanceType, imageID cloud.ImageID, tags cloud.InstanceTags, pubkey ssh.PublicKey) (cloud.Instance, error) {
+	inst := &lameInstance{
+		p:            p,
+		id:           cloud.InstanceID(fmt.Sprintf("lame-%x", rand.Uint64())),
+		providerType: instType.ProviderType,
+	}
+	inst.SetTags(tags)
+	if p.Hold != nil {
+		p.Hold <- true
+	}
+	p.mtx.Lock()
+	defer p.mtx.Unlock()
+	// Lazily initialize the map so the zero LameInstanceSet works.
+	if p.instances == nil {
+		p.instances = map[*lameInstance]bool{}
+	}
+	p.instances[inst] = true
+	return inst, nil
+}
+
+// Instances returns the instances that haven't been destroyed.
+func (p *LameInstanceSet) Instances(cloud.InstanceTags) ([]cloud.Instance, error) {
+	p.mtx.Lock()
+	defer p.mtx.Unlock()
+	var instances []cloud.Instance
+	for i := range p.instances {
+		instances = append(instances, i)
+	}
+	return instances, nil
+}
+
+// Stop is a no-op, but exists to satisfy cloud.InstanceSet.
+func (p *LameInstanceSet) Stop() {
+}
+
+// Release n held calls. Blocks if n calls aren't already
+// waiting. Blocks forever if Hold is nil.
+func (p *LameInstanceSet) Release(n int) {
+	for i := 0; i < n; i++ {
+		<-p.Hold
+	}
+}
+
+// lameInstance is the cloud.Instance implementation returned by
+// LameInstanceSet.Create.
+type lameInstance struct {
+	p            *LameInstanceSet
+	id           cloud.InstanceID
+	providerType string
+	tags         cloud.InstanceTags
+}
+
+func (inst *lameInstance) ID() cloud.InstanceID {
+	return inst.id
+}
+
+func (inst *lameInstance) String() string {
+	return fmt.Sprint(inst.id)
+}
+
+func (inst *lameInstance) ProviderType() string {
+	return inst.providerType
+}
+
+// Address returns a fixed, unusable address — lame instances boot but
+// can't actually be contacted.
+func (inst *lameInstance) Address() string {
+	return "0.0.0.0:1234"
+}
+
+// SetTags replaces the instance's tags with a copy of the given tags.
+func (inst *lameInstance) SetTags(tags cloud.InstanceTags) error {
+	inst.p.mtx.Lock()
+	defer inst.p.mtx.Unlock()
+	inst.tags = cloud.InstanceTags{}
+	for k, v := range tags {
+		inst.tags[k] = v
+	}
+	return nil
+}
+
+// Destroy removes the instance from its set. Like Create, it blocks
+// on Hold (if non-nil) until Release.
+func (inst *lameInstance) Destroy() error {
+	if inst.p.Hold != nil {
+		inst.p.Hold <- true
+	}
+	inst.p.mtx.Lock()
+	defer inst.p.mtx.Unlock()
+	delete(inst.p.instances, inst)
+	return nil
+}
+
+// Tags returns the current tags.
+//
+// NOTE(review): inst.tags is read here without inst.p.mtx, while
+// SetTags writes it under that lock — confirm this race is acceptable
+// for a test fake.
+func (inst *lameInstance) Tags() cloud.InstanceTags {
+	return inst.tags
+}
+
+// VerifyHostKey always succeeds.
+func (inst *lameInstance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
+	return nil
+}
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
new file mode 100644
index 000000000..909f56114
--- /dev/null
+++ b/lib/dispatchcloud/test/queue.go
@@ -0,0 +1,116 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+ "fmt"
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// Queue is a test stub for container.Queue. The caller specifies the
+// initial queue state.
+type Queue struct {
+	// Containers represent the API server database contents.
+	Containers []arvados.Container
+
+	// ChooseType will be called for each entry in Containers. It
+	// must not be nil.
+	ChooseType func(*arvados.Container) (arvados.InstanceType, error)
+
+	entries map[string]container.QueueEnt // cached state as of the last Update
+	updTime time.Time                     // time of the last Update
+}
+
+// Entries returns the containers that were queued when Update was
+// last called, along with the time of that update.
+func (q *Queue) Entries() (map[string]container.QueueEnt, time.Time) {
+	updTime := q.updTime
+	r := map[string]container.QueueEnt{}
+	for uuid, ent := range q.entries {
+		r[uuid] = ent
+	}
+	return r, updTime
+}
+
+// Get returns the container from the cached queue, i.e., as it was
+// when Update was last called -- just like a container.Queue does. If
+// the state has been changed (via Lock, Unlock, or Cancel) since the
+// last Update, the updated state is returned.
+func (q *Queue) Get(uuid string) (arvados.Container, bool) {
+	ent, ok := q.entries[uuid]
+	return ent.Container, ok
+}
+
+// Forget removes the container from the cached queue.
+func (q *Queue) Forget(uuid string) {
+	delete(q.entries, uuid)
+}
+
+// Lock moves a Queued container to Locked state.
+func (q *Queue) Lock(uuid string) error {
+	return q.changeState(uuid, arvados.ContainerStateQueued, arvados.ContainerStateLocked)
+}
+
+// Unlock moves a Locked container back to Queued state.
+func (q *Queue) Unlock(uuid string) error {
+	return q.changeState(uuid, arvados.ContainerStateLocked, arvados.ContainerStateQueued)
+}
+
+// Cancel moves a container to Cancelled state, whatever its current
+// state is.
+func (q *Queue) Cancel(uuid string) error {
+	return q.changeState(uuid, q.entries[uuid].Container.State, arvados.ContainerStateCancelled)
+}
+
+// changeState implements the state-transition check shared by Lock,
+// Unlock and Cancel: it verifies the entry is currently in the "from"
+// state, then records the new state both in the cached entries map
+// and in the Containers slice (the simulated database).
+func (q *Queue) changeState(uuid string, from, to arvados.ContainerState) error {
+	ent := q.entries[uuid]
+	if ent.Container.State != from {
+		// The previous message said "lock failed" even when
+		// reached via Unlock or Cancel; report the attempted
+		// transition instead.
+		return fmt.Errorf("changeState failed: state=%q, expected %q", ent.Container.State, from)
+	}
+	ent.Container.State = to
+	q.entries[uuid] = ent
+	// Keep the simulated database in sync with the cached entry.
+	for i, ctr := range q.Containers {
+		if ctr.UUID == uuid {
+			q.Containers[i].State = to
+			break
+		}
+	}
+	return nil
+}
+
+// Update rebuilds the current entries from the Containers slice,
+// simulating a poll of the real queue. Containers that are already
+// finished (Complete/Cancelled) and were not previously cached are
+// skipped.
+func (q *Queue) Update() error {
+	updTime := time.Now()
+	upd := map[string]container.QueueEnt{}
+	for _, ctr := range q.Containers {
+		_, exists := q.entries[ctr.UUID]
+		if !exists && (ctr.State == arvados.ContainerStateComplete || ctr.State == arvados.ContainerStateCancelled) {
+			continue
+		}
+		// NOTE(review): the ChooseType error is discarded; in
+		// this stub the entry simply gets whatever
+		// InstanceType ChooseType returned.
+		it, _ := q.ChooseType(&ctr)
+		upd[ctr.UUID] = container.QueueEnt{
+			Container:    ctr,
+			InstanceType: it,
+		}
+	}
+	q.entries = upd
+	q.updTime = updTime
+	return nil
+}
+
+// Notify adds/updates an entry in the Containers slice. This
+// simulates the effect of an API update from someone other than the
+// dispatcher -- e.g., crunch-run updating state to "Complete" when a
+// container exits.
+//
+// The resulting changes are not exposed through Get() or Entries()
+// until the next call to Update().
+func (q *Queue) Notify(upd arvados.Container) {
+	for i, ctr := range q.Containers {
+		if ctr.UUID == upd.UUID {
+			q.Containers[i] = upd
+			return
+		}
+	}
+	q.Containers = append(q.Containers, upd)
+}
diff --git a/lib/dispatchcloud/test/ssh_service.go b/lib/dispatchcloud/test/ssh_service.go
new file mode 100644
index 000000000..b1e4e03b1
--- /dev/null
+++ b/lib/dispatchcloud/test/ssh_service.go
@@ -0,0 +1,169 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "log"
+ "net"
+ "strings"
+ "sync"
+
+ "golang.org/x/crypto/ssh"
+ check "gopkg.in/check.v1"
+)
+
+// LoadTestKey loads the SSH key pair stored as fnm (private key) and
+// fnm+".pub" (authorized_keys format), failing the test immediately
+// on any error.
+func LoadTestKey(c *check.C, fnm string) (ssh.PublicKey, ssh.Signer) {
+	rawpubkey, err := ioutil.ReadFile(fnm + ".pub")
+	c.Assert(err, check.IsNil)
+	pubkey, _, _, _, err := ssh.ParseAuthorizedKey(rawpubkey)
+	c.Assert(err, check.IsNil)
+	rawprivkey, err := ioutil.ReadFile(fnm)
+	c.Assert(err, check.IsNil)
+	privkey, err := ssh.ParsePrivateKey(rawprivkey)
+	c.Assert(err, check.IsNil)
+	return pubkey, privkey
+}
+
+// An SSHExecFunc handles an "exec" session on a multiplexed SSH
+// connection, returning the exit status to report to the client.
+type SSHExecFunc func(command string, stdin io.Reader, stdout, stderr io.Writer) uint32
+
+// An SSHService accepts SSH connections on an available TCP port and
+// passes clients' "exec" sessions to the provided SSHExecFunc.
+type SSHService struct {
+	Exec           SSHExecFunc
+	HostKey        ssh.Signer
+	AuthorizedKeys []ssh.PublicKey
+
+	listener net.Listener
+	conn     *ssh.ServerConn
+	setup    sync.Once  // ensures start() runs at most once
+	mtx      sync.Mutex // guards listener and closed
+	started  chan bool  // closed when the listener is ready (or listen failed)
+	closed   bool
+	err      error // listen error, if any; final before started is closed
+}
+
+// Address returns the host:port where the SSH server is listening. It
+// returns "" if called before the server is ready to accept
+// connections. Calling Address starts the server if it has not been
+// started yet.
+func (ss *SSHService) Address() string {
+	ss.setup.Do(ss.start)
+	ss.mtx.Lock()
+	ln := ss.listener
+	ss.mtx.Unlock()
+	if ln == nil {
+		return ""
+	}
+	return ln.Addr().String()
+}
+
+// Close shuts down the server and releases resources. Established
+// connections are unaffected.
+func (ss *SSHService) Close() {
+	// Wait for startup to finish before tearing down; any Start
+	// error is deliberately ignored here.
+	ss.Start()
+	ss.mtx.Lock()
+	ln := ss.listener
+	ss.closed = true
+	ss.mtx.Unlock()
+	if ln != nil {
+		ln.Close()
+	}
+}
+
+// Start returns when the server is ready to accept connections,
+// returning the listen error, if any.
+func (ss *SSHService) Start() error {
+	ss.setup.Do(ss.start)
+	<-ss.started
+	return ss.err
+}
+
+// start launches the server goroutine. It is called at most once, via
+// ss.setup.
+func (ss *SSHService) start() {
+	ss.started = make(chan bool)
+	go ss.run()
+}
+
+// run configures the SSH server, starts listening on an OS-assigned
+// TCP port, and spawns the accept loop. It closes ss.started once
+// ss.listener and ss.err are final.
+func (ss *SSHService) run() {
+	defer close(ss.started)
+	config := &ssh.ServerConfig{
+		PublicKeyCallback: func(c ssh.ConnMetadata, pubKey ssh.PublicKey) (*ssh.Permissions, error) {
+			// Accept only keys listed in AuthorizedKeys.
+			for _, ak := range ss.AuthorizedKeys {
+				if bytes.Equal(ak.Marshal(), pubKey.Marshal()) {
+					return &ssh.Permissions{}, nil
+				}
+			}
+			return nil, fmt.Errorf("unknown public key for %q", c.User())
+		},
+	}
+	config.AddHostKey(ss.HostKey)
+
+	// ":" means an OS-assigned port on all interfaces.
+	listener, err := net.Listen("tcp", ":")
+	if err != nil {
+		ss.err = err
+		return
+	}
+
+	ss.mtx.Lock()
+	ss.listener = listener
+	ss.mtx.Unlock()
+
+	go func() {
+		for {
+			nConn, err := listener.Accept()
+			// Exit quietly when the error is caused by our
+			// own Close().
+			if err != nil && strings.Contains(err.Error(), "use of closed network connection") && ss.closed {
+				return
+			} else if err != nil {
+				log.Printf("accept: %s", err)
+				return
+			}
+			go ss.serveConn(nConn, config)
+		}
+	}()
+}
+
+// serveConn performs the SSH handshake on nConn and serves "session"
+// channels, handling the first "exec" request on each channel by
+// calling ss.Exec and reporting the resulting exit status back to the
+// client.
+func (ss *SSHService) serveConn(nConn net.Conn, config *ssh.ServerConfig) {
+	defer nConn.Close()
+	conn, newchans, reqs, err := ssh.NewServerConn(nConn, config)
+	if err != nil {
+		log.Printf("ssh.NewServerConn: %s", err)
+		return
+	}
+	defer conn.Close()
+	go ssh.DiscardRequests(reqs)
+	for newch := range newchans {
+		if newch.ChannelType() != "session" {
+			newch.Reject(ssh.UnknownChannelType, "unknown channel type")
+			continue
+		}
+		ch, reqs, err := newch.Accept()
+		if err != nil {
+			log.Printf("accept channel: %s", err)
+			return
+		}
+		var execReq struct {
+			Command string
+		}
+		go func() {
+			for req := range reqs {
+				// Only the first exec request per
+				// channel is honored.
+				if req.Type == "exec" && execReq.Command == "" {
+					req.Reply(true, nil)
+					ssh.Unmarshal(req.Payload, &execReq)
+					go func() {
+						var resp struct {
+							Status uint32
+						}
+						resp.Status = ss.Exec(execReq.Command, ch, ch, ch.Stderr())
+						// Report exit status, then close the channel.
+						ch.SendRequest("exit-status", false, ssh.Marshal(&resp))
+						ch.Close()
+					}()
+				}
+			}
+		}()
+	}
+}
diff --git a/lib/dispatchcloud/test/sshkey_dispatch b/lib/dispatchcloud/test/sshkey_dispatch
new file mode 100644
index 000000000..5584519c7
--- /dev/null
+++ b/lib/dispatchcloud/test/sshkey_dispatch
@@ -0,0 +1,27 @@
+-----BEGIN RSA PRIVATE KEY-----
+MIIEowIBAAKCAQEAqYm4XsQHm8sBSZFwUX5VeW1OkGsfoNzcGPG2nzzYRhNhClYZ
+0ABHhUk82HkaC/8l6d/jpYTf42HrK42nNQ0r0Yzs7qw8yZMQioK4Yk+kFyVLF78E
+GRG4pGAWXFs6pUchs/lm8fo9zcda4R3XeqgI+NO+nEERXmdRJa1FhI+Za3/S/+CV
+mg+6O00wZz2+vKmDPptGN4MCKmQOCKsMJts7wSZGyVcTtdNv7jjfr6yPAIOIL8X7
+LtarBCFaK/pD7uWll/Uj7h7D8K48nIZUrvBJJjXL8Sm4LxCNoz3Z83k8J5ZzuDRD
+gRiQe/C085mhO6VL+2fypDLwcKt1tOL8fI81MwIDAQABAoIBACR3tEnmHsDbNOav
+Oxq8cwRQh9K2yDHg8BMJgz/TZa4FIx2HEbxVIw0/iLADtJ+Z/XzGJQCIiWQuvtg6
+exoFQESt7JUWRWkSkj9JCQJUoTY9Vl7APtBpqG7rIEQzd3TvzQcagZNRQZQO6rR7
+p8sBdBSZ72lK8cJ9tM3G7Kor/VNK7KgRZFNhEWnmvEa3qMd4hzDcQ4faOn7C9NZK
+dwJAuJVVfwOLlOORYcyEkvksLaDOK2DsB/p0AaCpfSmThRbBKN5fPXYaKgUdfp3w
+70Hpp27WWymb1cgjyqSH3DY+V/kvid+5QxgxCBRq865jPLn3FFT9bWEVS/0wvJRj
+iMIRrjECgYEA4Ffv9rBJXqVXonNQbbstd2PaprJDXMUy9/UmfHL6pkq1xdBeuM7v
+yf2ocXheA8AahHtIOhtgKqwv/aRhVK0ErYtiSvIk+tXG+dAtj/1ZAKbKiFyxjkZV
+X72BH7cTlR6As5SRRfWM/HaBGEgED391gKsI5PyMdqWWdczT5KfxAksCgYEAwXYE
+ewPmV1GaR5fbh2RupoPnUJPMj36gJCnwls7sGaXDQIpdlq56zfKgrLocGXGgj+8f
+QH7FHTJQO15YCYebtsXWwB3++iG43gVlJlecPAydsap2CCshqNWC5JU5pan0QzsP
+exzNzWqfUPSbTkR2SRaN+MenZo2Y/WqScOAth7kCgYBgVoLujW9EXH5QfXJpXLq+
+jTvE38I7oVcs0bJwOLPYGzcJtlwmwn6IYAwohgbhV2pLv+EZSs42JPEK278MLKxY
+lgVkp60npgunFTWroqDIvdc1TZDVxvA8h9VeODEJlSqxczgbMcIUXBM9yRctTI+5
+7DiKlMUA4kTFW2sWwuOlFwKBgGXvrYS0FVbFJKm8lmvMu5D5x5RpjEu/yNnFT4Pn
+G/iXoz4Kqi2PWh3STl804UF24cd1k94D7hDoReZCW9kJnz67F+C67XMW+bXi2d1O
+JIBvlVfcHb1IHMA9YG7ZQjrMRmx2Xj3ce4RVPgUGHh8ra7gvLjd72/Tpf0doNClN
+ti/hAoGBAMW5D3LhU05LXWmOqpeT4VDgqk4MrTBcstVe7KdVjwzHrVHCAmI927vI
+pjpphWzpC9m3x4OsTNf8m+g6H7f3IiQS0aiFNtduXYlcuT5FHS2fSATTzg5PBon9
+1E6BudOve+WyFyBs7hFWAqWFBdWujAl4Qk5Ek09U2ilFEPE7RTgJ
+-----END RSA PRIVATE KEY-----
diff --git a/lib/dispatchcloud/test/sshkey_dispatch.pub b/lib/dispatchcloud/test/sshkey_dispatch.pub
new file mode 100644
index 000000000..1d5c1ea1b
--- /dev/null
+++ b/lib/dispatchcloud/test/sshkey_dispatch.pub
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCpibhexAebywFJkXBRflV5bU6Qax+g3NwY8bafPNhGE2EKVhnQAEeFSTzYeRoL/yXp3+OlhN/jYesrjac1DSvRjOzurDzJkxCKgrhiT6QXJUsXvwQZEbikYBZcWzqlRyGz+Wbx+j3Nx1rhHdd6qAj4076cQRFeZ1ElrUWEj5lrf9L/4JWaD7o7TTBnPb68qYM+m0Y3gwIqZA4Iqwwm2zvBJkbJVxO102/uON+vrI8Ag4gvxfsu1qsEIVor+kPu5aWX9SPuHsPwrjychlSu8EkmNcvxKbgvEI2jPdnzeTwnlnO4NEOBGJB78LTzmaE7pUv7Z/KkMvBwq3W04vx8jzUz tom at curve
diff --git a/lib/dispatchcloud/test/sshkey_vm b/lib/dispatchcloud/test/sshkey_vm
new file mode 100644
index 000000000..10b7ed1bc
--- /dev/null
+++ b/lib/dispatchcloud/test/sshkey_vm
@@ -0,0 +1,27 @@
+-----BEGIN RSA PRIVATE KEY-----
+MIIEpQIBAAKCAQEApIfWk2StZGDtmunumIeXLJ46AQrbHHvuxrSAkQf6+zUwjB2I
+rse7ezBRHWcge9U5EsigixmhUM4ozFLnUQNwC862jbmsjbyA97arG/REECNlUrEB
+HQPYHhai5yyJ89AfjWVxKyINfW0K2HX1R8nl4kdVraAgpohPLh0dGjfwzm/BcXDG
++TxW9zRz0KCs9ZRI6s2MNdv08ahKQ0azk8gRTqMADJmYNWIo3zPQ+fhlwyr6EZJ/
+HFbRtjpajEPMJPwoVPO+Wj6wztfHDYKkPIrIWbhMl6w+tEKdsmygd3Iq94ktLS3X
+AbRCfn4njS2QSlkKFEepkUJWCSSWZgFn6DLm2wIDAQABAoIBAQCb137LxcTnG1h0
+L7isCWKMBKN0cU/xvwIAfOB6f1CfuVXuodrhkpZmrPFoJFKEeQbCX/6RQwmlfGDw
+iGZKOjNbO8V2oLRs3GxcNk4FAG2ny58hoD8puIZwmYhb57gTlMMOL1PuQyb78tkf
+Bzv5b6ermV3yQ4Ypt1solrMGLo6NOZD0oDX9p0Zt9kueIhjzgP0v5//T1F4PGHZK
++sLSsMiu9u6F+PB+Oc6uv0Zee9Lnts/QiWH5f18oEculjwKWFx+JwJWiLffGg2Bl
+vbpmvHFRoRWkHTpgSiLwSUqs0ZUWU9R5h11ROg5L39MLsxQoBvHsPEnP5ssN8jGt
+aH86EZjBAoGBAM+A5B/UjhIn9m05EhDTDRzI92hGhM8f7uAwobbnjvIQyZbWlBwj
+2TmgbJdpTGVbD+iTBIwKQdcFBbWobTCZsNMpghqA/ir4YIAnZ5OX9VQ1Bc+bWE7V
+dPmMVpCgyg+ERAe+79FrYWcI3vhnBpHCsY/9p9pGQIKDzlGTWNF1HJGjAoGBAMr7
+2CTVnFImTgD3E+rH4AAAfkz+cyqfK6BUhli/NifFYZhWCs16r9QCGSORnp4gPhMY
+3mf7VBs9rk123zOMo89eJt3adTgbZ+QIxXeXilGXpbT3w1+CJMaZRrIy80E1tB5/
+KvDZcrZ78o8XWMNUa+9k55ukvgyC24ICAmOIWNlpAoGBALEFvphBF2r52MtZUsYz
+pw4VjKvS7V5eWcW891k4tsRf+frK2NQg6SK2b63EUT5ur2W0dr6ZyY2MZVCSfYRm
+uWmMEchWn389IeZyt3Q8wTize1+foXivtflm9jqwUXFnXzpUc/du6kuiT8YO7pXP
+SPgUZ+xY3pP5qjwBvlYC2PqNAoGAZ1CKMi1bdGC0wT8BLzXuqHGX136HhcEgRmnf
+O5qPaOzJAO2CcBWrGuC6hOUgc+F7VuMIiKpeo8LgTeNcNfO2iNymMbN4iEdCuMlS
+IM3MBD2IhTS6h4lJSKBJYHgYYi+AbylQ5Of4wDMUQYqjjkAQ8/dK/2h5pwqPyXtW
+VezXNEkCgYEAq4S0++y9tjlLn+w9BIkmx3bAVRDQZIzIEwxTh+jpqaUp1J0iyseJ
+71pwqQojGNF6x8GglVXa6bMrETae21WhEeHnWmzlpCWIODsYPUQ+erjDuAWi9eGk
+HLklqSEoLB8pzC6zDqjxDw+CnGERIDSaoaeoWiNKZ95IH1WiEwYjuxU=
+-----END RSA PRIVATE KEY-----
diff --git a/lib/dispatchcloud/test/sshkey_vm.pub b/lib/dispatchcloud/test/sshkey_vm.pub
new file mode 100644
index 000000000..b9d44c946
--- /dev/null
+++ b/lib/dispatchcloud/test/sshkey_vm.pub
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCkh9aTZK1kYO2a6e6Yh5csnjoBCtsce+7GtICRB/r7NTCMHYiux7t7MFEdZyB71TkSyKCLGaFQzijMUudRA3ALzraNuayNvID3tqsb9EQQI2VSsQEdA9geFqLnLInz0B+NZXErIg19bQrYdfVHyeXiR1WtoCCmiE8uHR0aN/DOb8FxcMb5PFb3NHPQoKz1lEjqzYw12/TxqEpDRrOTyBFOowAMmZg1YijfM9D5+GXDKvoRkn8cVtG2OlqMQ8wk/ChU875aPrDO18cNgqQ8ishZuEyXrD60Qp2ybKB3cir3iS0tLdcBtEJ+fieNLZBKWQoUR6mRQlYJJJZmAWfoMubb tom at curve
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
new file mode 100644
index 000000000..53e312ea2
--- /dev/null
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -0,0 +1,195 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+ "crypto/rand"
+ "errors"
+ "fmt"
+ "io"
+ math_rand "math/rand"
+ "sync"
+
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/mitchellh/mapstructure"
+ "golang.org/x/crypto/ssh"
+)
+
+// A StubExecFunc handles an "exec" request received by a stub
+// instance's SSH server: it is called with the target instance, the
+// command string, and the session's stdio, and returns the exit
+// status to report back to the client.
+type StubExecFunc func(instance cloud.Instance, command string, stdin io.Reader, stdout, stderr io.Writer) uint32
+
+// A StubDriver implements cloud.Driver by setting up local SSH
+// servers that pass their command execution requests to the provided
+// StubExecFunc.
+type StubDriver struct {
+	Exec           StubExecFunc
+	HostKey        ssh.Signer
+	AuthorizedKeys []ssh.PublicKey
+	instanceSets   []*StubInstanceSet
+}
+
+// InstanceSet returns a new *StubInstanceSet.
+//
+// NOTE(review): params is decoded into sis via mapstructure, but
+// StubInstanceSet has no exported fields — presumably this exists
+// only to surface decode errors; confirm intent.
+func (sd *StubDriver) InstanceSet(params map[string]interface{}, id cloud.InstanceSetID) (cloud.InstanceSet, error) {
+	sis := StubInstanceSet{
+		driver:  sd,
+		servers: map[cloud.InstanceID]*stubServer{},
+	}
+	sd.instanceSets = append(sd.instanceSets, &sis)
+	return &sis, mapstructure.Decode(params, &sis)
+}
+
+// InstanceSets returns all instance sets that have been created by
+// the driver. This can be used to test a component that uses the
+// driver but doesn't expose the InstanceSets it has created.
+func (sd *StubDriver) InstanceSets() []*StubInstanceSet {
+	return sd.instanceSets
+}
+
+// StubInstanceSet manages a set of stub cloud instances, each backed
+// by an in-process SSH server.
+type StubInstanceSet struct {
+	driver  *StubDriver
+	servers map[cloud.InstanceID]*stubServer
+	mtx     sync.RWMutex // guards servers and stopped
+	stopped bool
+}
+
+// Create starts a new stub instance with its own SSH server. The
+// given authKey (if any) is authorized in addition to the driver's
+// AuthorizedKeys.
+func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID, tags cloud.InstanceTags, authKey ssh.PublicKey) (cloud.Instance, error) {
+	sis.mtx.Lock()
+	defer sis.mtx.Unlock()
+	if sis.stopped {
+		return nil, errors.New("StubInstanceSet: Create called after Stop")
+	}
+	ak := sis.driver.AuthorizedKeys
+	if authKey != nil {
+		ak = append([]ssh.PublicKey{authKey}, ak...)
+	}
+	// ss is declared before assignment so the Exec closure below
+	// can refer to it.
+	var ss *stubServer
+	ss = &stubServer{
+		sis:          sis,
+		id:           cloud.InstanceID(fmt.Sprintf("stub-%s-%x", it.ProviderType, math_rand.Int63())),
+		tags:         copyTags(tags),
+		providerType: it.ProviderType,
+		SSHService: SSHService{
+			HostKey:        sis.driver.HostKey,
+			AuthorizedKeys: ak,
+			Exec: func(command string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
+				return sis.driver.Exec(ss.Instance(), command, stdin, stdout, stderr)
+			},
+		},
+	}
+
+	sis.servers[ss.id] = ss
+	return ss.Instance(), nil
+}
+
+// Instances returns an Instance view for each undestroyed stub
+// server.
+func (sis *StubInstanceSet) Instances(cloud.InstanceTags) ([]cloud.Instance, error) {
+	sis.mtx.RLock()
+	defer sis.mtx.RUnlock()
+	var r []cloud.Instance
+	for _, ss := range sis.servers {
+		r = append(r, ss.Instance())
+	}
+	return r, nil
+}
+
+// Stop marks the set as stopped, making subsequent Create calls fail.
+// It panics if called twice.
+func (sis *StubInstanceSet) Stop() {
+	sis.mtx.Lock()
+	defer sis.mtx.Unlock()
+	if sis.stopped {
+		panic("Stop called twice")
+	}
+	sis.stopped = true
+}
+
+// stubServer is the in-process SSH server behind one stub instance.
+// The embedded Mutex guards tags.
+type stubServer struct {
+	sis          *StubInstanceSet
+	id           cloud.InstanceID
+	tags         cloud.InstanceTags
+	providerType string
+	SSHService   SSHService
+	sync.Mutex
+}
+
+// Instance returns a snapshot-style cloud.Instance view of the
+// server.
+func (ss *stubServer) Instance() stubInstance {
+	ss.Lock()
+	defer ss.Unlock()
+	return stubInstance{
+		ss:   ss,
+		addr: ss.SSHService.Address(),
+		// We deliberately return a cached/stale copy of the
+		// real tags here, so that (Instance)Tags() sometimes
+		// returns old data after a call to
+		// (Instance)SetTags(). This is permitted by the
+		// driver interface, and this might help remind
+		// callers that they need to tolerate it.
+		tags: copyTags(ss.tags),
+	}
+}
+
+// stubInstance implements cloud.Instance using data captured from a
+// stubServer at the time Instance() was called.
+type stubInstance struct {
+	ss   *stubServer
+	addr string
+	tags cloud.InstanceTags
+}
+
+func (si stubInstance) ID() cloud.InstanceID {
+	return si.ss.id
+}
+
+func (si stubInstance) Address() string {
+	return si.addr
+}
+
+// Destroy shuts down the instance's SSH server and removes it from
+// the set.
+func (si stubInstance) Destroy() error {
+	si.ss.SSHService.Close()
+	sis := si.ss.sis
+	sis.mtx.Lock()
+	defer sis.mtx.Unlock()
+	delete(sis.servers, si.ss.id)
+	return nil
+}
+
+func (si stubInstance) ProviderType() string {
+	return si.ss.providerType
+}
+
+// SetTags updates the server's tags asynchronously, so readers may
+// observe stale tags for a while.
+func (si stubInstance) SetTags(tags cloud.InstanceTags) error {
+	tags = copyTags(tags)
+	ss := si.ss
+	go func() {
+		ss.Lock()
+		defer ss.Unlock()
+		ss.tags = tags
+	}()
+	return nil
+}
+
+// Tags returns the (possibly stale) tags captured when this
+// stubInstance was created.
+func (si stubInstance) Tags() cloud.InstanceTags {
+	return si.tags
+}
+
+func (si stubInstance) String() string {
+	return string(si.ss.id)
+}
+
+// VerifyHostKey succeeds only if key can verify a signature made with
+// the driver's HostKey, i.e., the instance presented the expected
+// host key.
+func (si stubInstance) VerifyHostKey(key ssh.PublicKey, client *ssh.Client) error {
+	buf := make([]byte, 512)
+	_, err := io.ReadFull(rand.Reader, buf)
+	if err != nil {
+		return err
+	}
+	sig, err := si.ss.sis.driver.HostKey.Sign(rand.Reader, buf)
+	if err != nil {
+		return err
+	}
+	return key.Verify(buf, sig)
+}
+
+// copyTags returns a shallow copy of src.
+func copyTags(src cloud.InstanceTags) cloud.InstanceTags {
+	dst := cloud.InstanceTags{}
+	for k, v := range src {
+		dst[k] = v
+	}
+	return dst
+}
diff --git a/lib/dispatchcloud/worker/gocheck_test.go b/lib/dispatchcloud/worker/gocheck_test.go
new file mode 100644
index 000000000..b4ca66c97
--- /dev/null
+++ b/lib/dispatchcloud/worker/gocheck_test.go
@@ -0,0 +1,16 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+ "testing"
+
+ check "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate: hook gopkg.in/check.v1 suites into "go test".
+func Test(t *testing.T) {
+	check.TestingT(t)
+}
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
new file mode 100644
index 000000000..cf8fac380
--- /dev/null
+++ b/lib/dispatchcloud/worker/pool.go
@@ -0,0 +1,852 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+ "bytes"
+ "io"
+ "sort"
+ "strings"
+ "sync"
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/Sirupsen/logrus"
+ "github.com/prometheus/client_golang/prometheus"
+)
+
+// A View shows a worker's current state and recent activity.
+type View struct {
+ Instance string // instance.String() (the instance ID in the stub driver)
+ Price float64
+ ArvadosInstanceType string
+ ProviderInstanceType string
+ LastContainerUUID string
+ Unallocated time.Time // when the worker was created, or last went idle
+ WorkerState string // State.String()
+}
+
+// An Executor executes shell commands on a remote host.
+type Executor interface {
+ // Run cmd on the current target. Stdout and stderr are
+ // returned even when err is non-nil, so callers can log them
+ // (see Pool.probeRunning, Pool.StartContainer).
+ Execute(cmd string, stdin io.Reader) (stdout, stderr []byte, err error)
+
+ // Use the given target for subsequent operations. The new
+ // target is the same host as the previous target, but it
+ // might return a different address and verify a different
+ // host key.
+ //
+ // SetTarget is called frequently, and in most cases the new
+ // target will behave exactly the same as the old one. An
+ // implementation should optimize accordingly.
+ //
+ // SetTarget must not block on concurrent Execute calls.
+ SetTarget(cloud.ExecutorTarget)
+}
+
+// Fallback values used when the corresponding Dispatch/CloudVMs
+// durations are unset in the cluster config (see duration()).
+const (
+ defaultSyncInterval = time.Minute
+ defaultProbeInterval = time.Second * 10
+ defaultMaxProbesPerSecond = 10
+ defaultTimeoutIdle = time.Minute
+ defaultTimeoutBooting = time.Minute * 10
+ defaultTimeoutProbe = time.Minute * 10
+)
+
+// duration converts conf to a time.Duration if it is positive;
+// otherwise it returns the fallback def. (Idiom fix: no "else" after
+// a terminating return.)
+func duration(conf arvados.Duration, def time.Duration) time.Duration {
+ if conf > 0 {
+  return time.Duration(conf)
+ }
+ return def
+}
+
+// NewPool creates a Pool of workers backed by instanceSet.
+//
+// New instances are configured and set up according to the given
+// cluster configuration. Missing interval/timeout settings fall back
+// to the default* constants via duration().
+func NewPool(logger logrus.FieldLogger, reg *prometheus.Registry, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, cluster *arvados.Cluster) *Pool {
+ wp := &Pool{
+ logger: logger,
+ instanceSet: instanceSet,
+ newExecutor: newExecutor,
+ bootProbeCommand: cluster.CloudVMs.BootProbeCommand,
+ imageID: cloud.ImageID(cluster.CloudVMs.ImageID),
+ instanceTypes: cluster.InstanceTypes,
+ maxProbesPerSecond: cluster.Dispatch.MaxProbesPerSecond,
+ probeInterval: duration(cluster.Dispatch.ProbeInterval, defaultProbeInterval),
+ syncInterval: duration(cluster.CloudVMs.SyncInterval, defaultSyncInterval),
+ timeoutIdle: duration(cluster.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
+ timeoutBooting: duration(cluster.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
+ timeoutProbe: duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
+ }
+ wp.registerMetrics(reg)
+ // run() syncs with the cloud provider and probes workers until
+ // Stop() is called.
+ go wp.run()
+ return wp
+}
+
+// Pool is a resizable worker pool backed by a cloud.InstanceSet. A
+// zero Pool should not be used. Call NewPool to create a new Pool.
+type Pool struct {
+ // configuration
+ logger logrus.FieldLogger
+ instanceSet cloud.InstanceSet
+ newExecutor func(cloud.Instance) Executor
+ bootProbeCommand string
+ imageID cloud.ImageID
+ instanceTypes map[string]arvados.InstanceType
+ syncInterval time.Duration
+ probeInterval time.Duration
+ maxProbesPerSecond int
+ timeoutIdle time.Duration
+ timeoutBooting time.Duration
+ timeoutProbe time.Duration
+
+ // private state
+ subscribers map[<-chan struct{}]chan<- struct{} // notification channels, see Subscribe
+ creating map[arvados.InstanceType]int // goroutines waiting for (InstanceSet)Create to return
+ workers map[cloud.InstanceID]*worker
+ loaded bool // loaded list of instances from InstanceSet at least once
+ exited map[string]time.Time // containers whose crunch-run proc has exited, but KillContainer has not been called
+ atQuotaUntil time.Time // AtQuota() reports true until this time
+ stop chan bool
+ mtx sync.RWMutex
+ setupOnce sync.Once
+
+ mInstances prometheus.Gauge
+ mContainersRunning prometheus.Gauge
+ mVCPUs prometheus.Gauge
+ mVCPUsInuse prometheus.Gauge
+ mMemory prometheus.Gauge
+ mMemoryInuse prometheus.Gauge
+}
+
+// worker tracks the state of one cloud instance managed by a Pool.
+// All fields are protected by the Pool's mtx.
+type worker struct {
+ state State
+ instance cloud.Instance
+ executor Executor
+ instType arvados.InstanceType
+ vcpus int64
+ memory int64
+ booted bool // last boot probe succeeded
+ probed time.Time // last successful probe (or creation time)
+ updated time.Time // last state change; used to detect stale probe/sync results
+ busy time.Time // last time a container was seen running here
+ unallocated time.Time // creation time, or when running last dropped to zero
+ lastUUID string // most recent container started or observed here
+ running map[string]struct{} // UUIDs of containers with a crunch-run process
+ starting map[string]struct{} // UUIDs of containers being started right now
+ probing chan struct{} // 1-buffered token: non-empty while a probe is in flight
+}
+
+// Subscribe returns a 1-buffered channel that becomes ready whenever
+// a worker's state changes.
+//
+// Example:
+//
+//	ch := wp.Subscribe()
+//	defer wp.Unsubscribe(ch)
+//	for range ch {
+//		// ...try scheduling some work...
+//		if done {
+//			break
+//		}
+//	}
+func (wp *Pool) Subscribe() <-chan struct{} {
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ sub := make(chan struct{}, 1)
+ wp.subscribers[sub] = sub
+ return sub
+}
+
+// Unsubscribe stops sending updates to the given channel, which must
+// have been returned by Subscribe.
+func (wp *Pool) Unsubscribe(ch <-chan struct{}) {
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.Lock()
+ delete(wp.subscribers, ch)
+ wp.mtx.Unlock()
+}
+
+// Unallocated returns the number of unallocated (creating + booting +
+// idle + unknown) workers for each instance type.
+func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.RLock()
+ defer wp.mtx.RUnlock()
+ u := map[arvados.InstanceType]int{}
+ // Pending Create calls count as unallocated workers.
+ for it, c := range wp.creating {
+ u[it] = c
+ }
+ // A worker counts only if it has no running/starting containers
+ // and is in a state where it could accept (or become able to
+ // accept) work.
+ for _, wkr := range wp.workers {
+ if len(wkr.running)+len(wkr.starting) == 0 && (wkr.state == StateRunning || wkr.state == StateBooting || wkr.state == StateUnknown) {
+ u[wkr.instType]++
+ }
+ }
+ return u
+}
+
+// Create a new instance with the given type, and add it to the worker
+// pool. The worker is added immediately; instance creation runs in
+// the background.
+func (wp *Pool) Create(it arvados.InstanceType) error {
+ logger := wp.logger.WithField("InstanceType", it.Name)
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ tags := cloud.InstanceTags{"InstanceType": it.Name}
+ // Count the pending Create so Unallocated() includes it until
+ // the goroutine below finishes.
+ wp.creating[it]++
+ go func() {
+ inst, err := wp.instanceSet.Create(it, wp.imageID, tags, nil)
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ wp.creating[it]--
+ // On a quota error, make AtQuota() true for a minute to
+ // discourage further Create attempts.
+ if err, ok := err.(cloud.QuotaError); ok && err.IsQuotaError() {
+ wp.atQuotaUntil = time.Now().Add(time.Minute)
+ }
+ if err != nil {
+ logger.WithError(err).Error("create failed")
+ // Wake subscribers so schedulers can react to the
+ // reduced Unallocated count.
+ go wp.notify()
+ return
+ }
+ wp.updateWorker(inst, it, StateBooting)
+ }()
+ return nil
+}
+
+// AtQuota reports whether Create is not expected to work at the
+// moment (a recent Create attempt returned a quota error).
+func (wp *Pool) AtQuota() bool {
+ return wp.atQuotaUntil.After(time.Now())
+}
+
+// Add or update worker attached to the given instance. Use
+// initialState if a new worker is created. Caller must have lock.
+func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initialState State) {
+ id := inst.ID()
+ if wp.workers[id] != nil {
+ // Known worker: refresh the instance handle/executor target
+ // (address or host key may have changed).
+ wp.workers[id].executor.SetTarget(inst)
+ wp.workers[id].instance = inst
+ wp.workers[id].updated = time.Now()
+ // A worker created by Create() may have been seen by sync()
+ // (StateUnknown) before the Create goroutine got here.
+ if initialState == StateBooting && wp.workers[id].state == StateUnknown {
+ wp.workers[id].state = StateBooting
+ }
+ return
+ }
+ // An instance tagged "hold" starts in StateHold instead of
+ // StateUnknown.
+ if initialState == StateUnknown && inst.Tags()["hold"] != "" {
+ initialState = StateHold
+ }
+ wp.logger.WithFields(logrus.Fields{
+ "InstanceType": it.Name,
+ "Instance": inst,
+ "State": initialState,
+ }).Infof("instance appeared in cloud")
+ wp.workers[id] = &worker{
+ executor: wp.newExecutor(inst),
+ state: initialState,
+ instance: inst,
+ instType: it,
+ probed: time.Now(),
+ busy: time.Now(),
+ updated: time.Now(),
+ unallocated: time.Now(),
+ running: make(map[string]struct{}),
+ starting: make(map[string]struct{}),
+ probing: make(chan struct{}, 1),
+ }
+ go wp.notify()
+}
+
+// Shutdown shuts down a worker with the given type, or returns false
+// if all workers with the given type are busy.
+//
+// Booting workers are preferred over running ones.
+func (wp *Pool) Shutdown(it arvados.InstanceType) bool {
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ logger := wp.logger.WithField("InstanceType", it.Name)
+ logger.Info("shutdown requested")
+ for _, tryState := range []State{StateBooting, StateRunning} {
+ // TODO: shutdown the worker with the longest idle
+ // time (Running) or the earliest create time
+ // (Booting)
+ for _, wkr := range wp.workers {
+ if wkr.state != tryState || len(wkr.running)+len(wkr.starting) > 0 {
+ continue
+ }
+ if wkr.instType != it {
+ continue
+ }
+ logger = logger.WithField("Instance", wkr.instance)
+ logger.Info("shutting down")
+ wp.shutdown(wkr, logger)
+ return true
+ }
+ }
+ return false
+}
+
+// shutdown marks the worker StateShutdown immediately and destroys
+// its cloud instance in the background. caller must have lock.
+func (wp *Pool) shutdown(wkr *worker, logger logrus.FieldLogger) {
+ wkr.updated = time.Now()
+ wkr.state = StateShutdown
+ go func() {
+ err := wkr.instance.Destroy()
+ if err != nil {
+ logger.WithError(err).Warn("shutdown failed")
+ return
+ }
+ wp.mtx.Lock()
+ // Destroying an instance frees quota: setting atQuotaUntil
+ // to now makes AtQuota() return false immediately.
+ wp.atQuotaUntil = time.Now()
+ wp.mtx.Unlock()
+ wp.notify()
+ }()
+}
+
+// Workers returns the current number of workers in each state.
+//
+// This is a read-only snapshot, so it takes only the read lock and
+// does not block concurrent readers (Unallocated, updateMetrics).
+func (wp *Pool) Workers() map[State]int {
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.RLock()
+ defer wp.mtx.RUnlock()
+ r := map[State]int{}
+ for _, w := range wp.workers {
+  r[w.state]++
+ }
+ return r
+}
+
+// Running returns the container UUIDs being prepared/run on workers.
+// UUIDs of exited-but-not-yet-killed containers map to their exit
+// time; all others map to the zero time.
+//
+// Read-only, so it takes only the read lock.
+func (wp *Pool) Running() map[string]time.Time {
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.RLock()
+ defer wp.mtx.RUnlock()
+ r := map[string]time.Time{}
+ for _, wkr := range wp.workers {
+  for uuid := range wkr.running {
+   r[uuid] = time.Time{}
+  }
+  for uuid := range wkr.starting {
+   r[uuid] = time.Time{}
+  }
+ }
+ for uuid, exited := range wp.exited {
+  r[uuid] = exited
+ }
+ return r
+}
+
+// StartContainer starts a container on an idle worker immediately if
+// possible, otherwise returns false.
+func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
+ logger := wp.logger.WithFields(logrus.Fields{
+ "InstanceType": it.Name,
+ "ContainerUUID": ctr.UUID,
+ "Priority": ctr.Priority,
+ })
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ // Choose the idle worker of the right type that was busy most
+ // recently (w.busy.After picks the later busy time).
+ var wkr *worker
+ for _, w := range wp.workers {
+ if w.instType == it && w.state == StateRunning && len(w.running)+len(w.starting) == 0 {
+ if wkr == nil || w.busy.After(wkr.busy) {
+ wkr = w
+ }
+ }
+ }
+ if wkr == nil {
+ return false
+ }
+ logger = logger.WithField("Instance", wkr.instance)
+ logger.Debug("starting container")
+ wkr.starting[ctr.UUID] = struct{}{}
+ go func() {
+ stdout, stderr, err := wkr.executor.Execute("crunch-run --detach '"+ctr.UUID+"'", nil)
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ wkr.updated = time.Now()
+ // Move the container from starting to running regardless of
+ // err -- see comment below.
+ delete(wkr.starting, ctr.UUID)
+ wkr.running[ctr.UUID] = struct{}{}
+ if err != nil {
+ logger.WithField("stdout", string(stdout)).
+ WithField("stderr", string(stderr)).
+ WithError(err).
+ Error("error starting crunch-run process")
+ // Leave uuid in wkr.running, though: it's
+ // possible the error was just a communication
+ // failure and the process was in fact
+ // started. Wait for next probe to find out.
+ return
+ }
+ logger.Info("crunch-run process started")
+ wkr.lastUUID = ctr.UUID
+ }()
+ return true
+}
+
+// KillContainer kills the crunch-run process for the given container
+// UUID, if it's running on any worker. If the process has already
+// exited, it just clears the exit-time placeholder.
+//
+// Fix: like every other exported method, run one-time setup first,
+// so a Pool literal (as used in tests) is safe even if this is the
+// first method called.
+func (wp *Pool) KillContainer(uuid string) {
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ if _, ok := wp.exited[uuid]; ok {
+  wp.logger.WithField("ContainerUUID", uuid).Debug("clearing placeholder for exited crunch-run process")
+  delete(wp.exited, uuid)
+  return
+ }
+ for _, wkr := range wp.workers {
+  if _, ok := wkr.running[uuid]; ok {
+   // kill() talks to the remote host, so do it outside the lock.
+   go wp.kill(wkr, uuid)
+   return
+  }
+ }
+ wp.logger.WithField("ContainerUUID", uuid).Debug("cannot kill: already disappeared")
+}
+
+// kill runs "crunch-run --kill" on the worker and, on success,
+// removes the container from the worker's running set. Called in its
+// own goroutine (it blocks on remote execution); takes the lock only
+// after the remote command returns.
+func (wp *Pool) kill(wkr *worker, uuid string) {
+ logger := wp.logger.WithFields(logrus.Fields{
+ "ContainerUUID": uuid,
+ "Instance": wkr.instance,
+ })
+ logger.Debug("killing process")
+ stdout, stderr, err := wkr.executor.Execute("crunch-run --kill "+uuid, nil)
+ if err != nil {
+ logger.WithFields(logrus.Fields{
+ "stderr": string(stderr),
+ "stdout": string(stdout),
+ "error": err,
+ }).Warn("kill failed")
+ return
+ }
+ logger.Debug("killing process succeeded")
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ if _, ok := wkr.running[uuid]; ok {
+ delete(wkr.running, uuid)
+ wkr.updated = time.Now()
+ go wp.notify()
+ }
+}
+
+// registerMetrics creates the pool's prometheus gauges and registers
+// them with reg. A nil reg gets a throwaway private registry so the
+// gauges are still usable.
+func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
+ if reg == nil {
+  reg = prometheus.NewRegistry()
+ }
+ // All gauges share the same namespace/subsystem; build and
+ // register each one through a single helper.
+ gauge := func(name, help string) prometheus.Gauge {
+  g := prometheus.NewGauge(prometheus.GaugeOpts{
+   Namespace: "arvados",
+   Subsystem: "dispatchcloud",
+   Name:      name,
+   Help:      help,
+  })
+  reg.MustRegister(g)
+  return g
+ }
+ wp.mInstances = gauge("instances_total", "Number of cloud VMs including pending, booting, running, held, and shutting down.")
+ wp.mContainersRunning = gauge("containers_running", "Number of containers reported running by cloud VMs.")
+ wp.mVCPUs = gauge("vcpus_total", "Total VCPUs on all cloud VMs.")
+ wp.mVCPUsInuse = gauge("vcpus_inuse", "VCPUs on cloud VMs that are running containers.")
+ wp.mMemory = gauge("memory_bytes_total", "Total memory on all cloud VMs.")
+ wp.mMemoryInuse = gauge("memory_bytes_inuse", "Memory on cloud VMs that are running containers.")
+}
+
+// updateMetrics recomputes all gauges from the current worker list.
+// Called from run()'s subscriber goroutine whenever state changes.
+func (wp *Pool) updateMetrics() {
+ wp.mtx.RLock()
+ defer wp.mtx.RUnlock()
+
+ var alloc, cpu, cpuInuse, mem, memInuse int64
+ for _, wkr := range wp.workers {
+ cpu += int64(wkr.instType.VCPUs)
+ mem += int64(wkr.instType.RAM)
+ if len(wkr.running)+len(wkr.starting) == 0 {
+ continue
+ }
+ // Only workers with at least one container count as "inuse".
+ alloc += int64(len(wkr.running) + len(wkr.starting))
+ cpuInuse += int64(wkr.instType.VCPUs)
+ memInuse += int64(wkr.instType.RAM)
+ }
+ wp.mInstances.Set(float64(len(wp.workers)))
+ wp.mContainersRunning.Set(float64(alloc))
+ wp.mVCPUs.Set(float64(cpu))
+ wp.mMemory.Set(float64(mem))
+ wp.mVCPUsInuse.Set(float64(cpuInuse))
+ wp.mMemoryInuse.Set(float64(memInuse))
+}
+
+// run is the Pool's main loop, started by NewPool. It launches two
+// helper goroutines (metrics updates and rate-limited worker probes)
+// and then syncs the worker list with the cloud provider every
+// syncInterval until Stop() is called.
+func (wp *Pool) run() {
+ wp.setupOnce.Do(wp.setup)
+
+ // Recompute metrics whenever any worker state changes.
+ go func() {
+ ch := wp.Subscribe()
+ defer wp.Unsubscribe(ch)
+ for range ch {
+ wp.updateMetrics()
+ }
+ }()
+
+ // Probe loop: every probeInterval, snapshot the worker list and
+ // probe each worker, pacing probes with limitticker so the total
+ // rate stays under maxPPS.
+ go func() {
+ maxPPS := wp.maxProbesPerSecond
+ if maxPPS < 1 {
+ maxPPS = defaultMaxProbesPerSecond
+ }
+ limitticker := time.NewTicker(time.Second / time.Duration(maxPPS))
+ defer limitticker.Stop()
+
+ probeticker := time.NewTicker(wp.probeInterval)
+ defer probeticker.Stop()
+
+ workers := []cloud.InstanceID{}
+ for range probeticker.C {
+ workers = workers[:0]
+ wp.mtx.Lock()
+ for id, wkr := range wp.workers {
+ // autoShutdown may start shutting down an idle
+ // worker here; either way it is skipped this round.
+ if wkr.state == StateShutdown || wp.autoShutdown(wkr) {
+ continue
+ }
+ workers = append(workers, id)
+ }
+ wp.mtx.Unlock()
+
+ for _, id := range workers {
+ wp.mtx.Lock()
+ wkr, ok := wp.workers[id]
+ wp.mtx.Unlock()
+ if !ok || wkr.state == StateShutdown {
+ // Deleted/shutdown while we
+ // were probing others
+ continue
+ }
+ // wkr.probing is a 1-buffered token channel: if a
+ // probe is already in flight, skip this worker.
+ select {
+ case wkr.probing <- struct{}{}:
+ go func() {
+ wp.probeAndUpdate(wkr)
+ <-wkr.probing
+ }()
+ default:
+ wp.logger.WithField("Instance", wkr.instance).Debug("still waiting for last probe to finish")
+ }
+ select {
+ case <-wp.stop:
+ return
+ case <-limitticker.C:
+ }
+ }
+ }
+ }()
+
+ // Main loop: sync with the cloud provider's instance list.
+ timer := time.NewTimer(time.Nanosecond)
+ for {
+ err := wp.getInstancesAndSync()
+ if err != nil {
+ wp.logger.WithError(err).Warn("sync failed")
+ }
+
+ // Reset timer to desired interval, and ignore the
+ // tick that might have already arrived.
+ timer.Stop()
+ select {
+ case <-timer.C:
+ default:
+ }
+ timer.Reset(wp.syncInterval)
+
+ select {
+ case <-timer.C:
+ case <-wp.stop:
+ wp.logger.Debug("worker.Pool stopped")
+ return
+ }
+ }
+}
+
+// autoShutdown shuts down wkr and returns true if it is an idle
+// running worker that has been unallocated longer than timeoutIdle.
+// caller must have lock.
+func (wp *Pool) autoShutdown(wkr *worker) bool {
+ if len(wkr.running)+len(wkr.starting) > 0 || wkr.state != StateRunning {
+ return false
+ }
+ age := time.Since(wkr.unallocated)
+ if age < wp.timeoutIdle {
+ return false
+ }
+ logger := wp.logger.WithFields(logrus.Fields{
+ "Age": age,
+ "Instance": wkr.instance,
+ })
+ logger.Info("shutdown idle worker")
+ wp.shutdown(wkr, logger)
+ return true
+}
+
+// Stop synchronizing with the InstanceSet.
+//
+// NOTE(review): neither NewPool nor setup() initializes wp.stop in
+// the visible code, so this close() would panic on a nil channel --
+// verify wp.stop is created somewhere before Stop is called.
+func (wp *Pool) Stop() {
+ wp.setupOnce.Do(wp.setup)
+ close(wp.stop)
+}
+
+// View reports status information for every worker in the pool,
+// sorted by instance name.
+//
+// Read-only, so it takes only the read lock; the sort runs outside
+// the critical section.
+func (wp *Pool) View() []View {
+ var r []View
+ wp.setupOnce.Do(wp.setup)
+ wp.mtx.RLock()
+ for _, w := range wp.workers {
+  r = append(r, View{
+   Instance: w.instance.String(),
+   Price: w.instType.Price,
+   ArvadosInstanceType: w.instType.Name,
+   ProviderInstanceType: w.instType.ProviderType,
+   LastContainerUUID: w.lastUUID,
+   Unallocated: w.unallocated,
+   WorkerState: w.state.String(),
+  })
+ }
+ wp.mtx.RUnlock()
+ sort.Slice(r, func(i, j int) bool {
+  // Direct string comparison; strings.Compare is unnecessary here.
+  return r[i].Instance < r[j].Instance
+ })
+ return r
+}
+
+// setup initializes the Pool's maps and channels. It runs exactly
+// once, via wp.setupOnce, from every entry point -- so a Pool literal
+// (as used in tests) works without calling NewPool.
+func (wp *Pool) setup() {
+ wp.creating = map[arvados.InstanceType]int{}
+ wp.exited = map[string]time.Time{}
+ wp.workers = map[cloud.InstanceID]*worker{}
+ wp.subscribers = map[<-chan struct{}]chan<- struct{}{}
+ // Fix: wp.stop was never initialized anywhere, so Stop() would
+ // close a nil channel and panic. Create it here.
+ wp.stop = make(chan bool)
+}
+
+// notify wakes all subscribers with a non-blocking send; a subscriber
+// that already has a pending notification is skipped (its channel is
+// 1-buffered, so it will see the state change anyway).
+func (wp *Pool) notify() {
+ wp.mtx.RLock()
+ defer wp.mtx.RUnlock()
+ for _, send := range wp.subscribers {
+ select {
+ case send <- struct{}{}:
+ default:
+ }
+ }
+}
+
+// getInstancesAndSync fetches the instance list from the InstanceSet
+// and reconciles the worker list with it via sync(). The threshold
+// timestamp is taken before the (possibly slow) Instances call so
+// sync can avoid clobbering newer local updates.
+func (wp *Pool) getInstancesAndSync() error {
+ wp.setupOnce.Do(wp.setup)
+ wp.logger.Debug("getting instance list")
+ threshold := time.Now()
+ instances, err := wp.instanceSet.Instances(cloud.InstanceTags{})
+ if err != nil {
+ return err
+ }
+ wp.sync(threshold, instances)
+ wp.logger.Debug("sync done")
+ return nil
+}
+
+// Add/remove/update workers based on instances, which was obtained
+// from the instanceSet. However, don't clobber any other updates that
+// already happened after threshold.
+func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ wp.logger.WithField("Instances", len(instances)).Debug("sync instances")
+
+ for _, inst := range instances {
+ // The InstanceType tag (set by Create) maps the instance back
+ // to its configured Arvados instance type.
+ itTag := inst.Tags()["InstanceType"]
+ it, ok := wp.instanceTypes[itTag]
+ if !ok {
+ wp.logger.WithField("Instance", inst).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
+ continue
+ }
+ wp.updateWorker(inst, it, StateUnknown)
+ }
+
+ // Any worker not touched since threshold was not in the instance
+ // list: its instance is gone from the cloud.
+ for id, wkr := range wp.workers {
+ if wkr.updated.After(threshold) {
+ continue
+ }
+ logger := wp.logger.WithFields(logrus.Fields{
+ "Instance": wkr.instance,
+ "WorkerState": wkr.state,
+ })
+ logger.Info("instance disappeared in cloud")
+ delete(wp.workers, id)
+ go wp.notify()
+ }
+
+ if !wp.loaded {
+ wp.loaded = true
+ wp.logger.WithField("N", len(wp.workers)).Info("loaded initial instance list")
+ }
+}
+
+// probeAndUpdate probes the worker (boot probe first if it has not
+// booted yet, then "crunch-run --list") and applies the results to
+// the worker's state. Unresponsive workers are shut down after
+// timeoutProbe (timeoutBooting if still booting).
+// should be called in a new goroutine
+func (wp *Pool) probeAndUpdate(wkr *worker) {
+ logger := wp.logger.WithField("Instance", wkr.instance)
+ // Snapshot updated/booted so we can detect concurrent changes
+ // made while the (slow, unlocked) probes run.
+ wp.mtx.Lock()
+ updated := wkr.updated
+ booted := wkr.booted
+ wp.mtx.Unlock()
+
+ var (
+ ctrUUIDs []string
+ ok bool
+ stderr []byte
+ )
+ if !booted {
+ booted, stderr = wp.probeBooted(wkr)
+ wp.mtx.Lock()
+ if booted && !wkr.booted {
+ wkr.booted = booted
+ logger.Info("instance booted")
+ } else {
+ // Another probe may have set wkr.booted meanwhile.
+ booted = wkr.booted
+ }
+ wp.mtx.Unlock()
+ }
+ if booted {
+ ctrUUIDs, ok, stderr = wp.probeRunning(wkr)
+ }
+ logger = logger.WithField("stderr", string(stderr))
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ if !ok {
+ // Probe failed. Decide whether the worker has been
+ // unresponsive long enough to shut it down.
+ if wkr.state == StateShutdown {
+ return
+ }
+ dur := time.Since(wkr.probed)
+ logger := logger.WithFields(logrus.Fields{
+ "Duration": dur,
+ "State": wkr.state,
+ })
+ if wkr.state == StateBooting {
+ logger.Debug("new instance not responding")
+ } else {
+ logger.Info("instance not responding")
+ }
+
+ // Held workers are never shut down automatically.
+ if wkr.state == StateHold {
+ return
+ }
+
+ label, threshold := "", wp.timeoutProbe
+ if wkr.state == StateBooting {
+ label, threshold = "new ", wp.timeoutBooting
+ }
+ if dur > threshold {
+ logger.WithField("Since", wkr.probed).Warnf("%sinstance unresponsive, shutting down", label)
+ wp.shutdown(wkr, logger)
+ }
+ return
+ }
+
+ // Probe succeeded: record contact, and busy/lastUUID if any
+ // containers were reported.
+ updateTime := time.Now()
+ wkr.probed = updateTime
+ if len(ctrUUIDs) > 0 {
+ wkr.busy = updateTime
+ wkr.lastUUID = ctrUUIDs[0]
+ }
+ // Shutdown/Hold states are sticky; otherwise a booted worker
+ // becomes Running.
+ if wkr.state == StateShutdown || wkr.state == StateHold {
+ } else if booted {
+ if wkr.state != StateRunning {
+ wkr.state = StateRunning
+ go wp.notify()
+ }
+ } else {
+ wkr.state = StateBooting
+ }
+
+ if updated != wkr.updated {
+ // Worker was updated (e.g., by starting a new
+ // container) after the probe began. Avoid clobbering
+ // those changes with the probe results.
+ return
+ }
+
+ // Worker just went from busy to idle: restart the idle clock.
+ if len(ctrUUIDs) == 0 && len(wkr.running) > 0 {
+ wkr.unallocated = updateTime
+ }
+ // Reconcile the reported container list with wkr.running;
+ // containers that vanished get an entry in wp.exited until
+ // KillContainer clears it.
+ running := map[string]struct{}{}
+ changed := false
+ for _, uuid := range ctrUUIDs {
+ running[uuid] = struct{}{}
+ if _, ok := wkr.running[uuid]; !ok {
+ changed = true
+ }
+ }
+ for uuid := range wkr.running {
+ if _, ok := running[uuid]; !ok {
+ logger.WithField("ContainerUUID", uuid).Info("crunch-run process ended")
+ wp.exited[uuid] = updateTime
+ changed = true
+ }
+ }
+ if changed {
+ wkr.running = running
+ wkr.updated = updateTime
+ go wp.notify()
+ }
+}
+
+// probeRunning runs "crunch-run --list" on the worker and returns the
+// container UUIDs it reports (one per stdout line), ok=false if the
+// command failed, and the command's stderr for logging.
+func (wp *Pool) probeRunning(wkr *worker) (running []string, ok bool, stderr []byte) {
+ cmd := "crunch-run --list"
+ stdout, stderr, err := wkr.executor.Execute(cmd, nil)
+ if err != nil {
+ wp.logger.WithFields(logrus.Fields{
+ "Instance": wkr.instance,
+ "Command": cmd,
+ "stdout": string(stdout),
+ "stderr": string(stderr),
+ }).WithError(err).Warn("probe failed")
+ return nil, false, stderr
+ }
+ // Trim the trailing newline so an empty listing yields zero
+ // UUIDs rather than one empty string.
+ stdout = bytes.TrimRight(stdout, "\n")
+ if len(stdout) == 0 {
+ return nil, true, stderr
+ }
+ return strings.Split(string(stdout), "\n"), true, stderr
+}
+
+// probeBooted runs the configured boot probe command (or "true" if
+// none is configured) on the worker. ok is true iff the command
+// succeeded; stderr is returned for logging either way.
+func (wp *Pool) probeBooted(wkr *worker) (ok bool, stderr []byte) {
+ cmd := wp.bootProbeCommand
+ if cmd == "" {
+ cmd = "true"
+ }
+ stdout, stderr, err := wkr.executor.Execute(cmd, nil)
+ logger := wp.logger.WithFields(logrus.Fields{
+ "Instance": wkr.instance,
+ "Command": cmd,
+ "stdout": string(stdout),
+ "stderr": string(stderr),
+ })
+ if err != nil {
+ logger.WithError(err).Debug("boot probe failed")
+ return false, stderr
+ }
+ logger.Info("boot probe succeeded")
+ return true, stderr
+}
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
new file mode 100644
index 000000000..cf4bff12d
--- /dev/null
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -0,0 +1,124 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+ "io"
+ "time"
+
+ "git.curoverse.com/arvados.git/lib/cloud"
+ "git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "github.com/Sirupsen/logrus"
+ check "gopkg.in/check.v1"
+)
+
+// GiB is a convenience constant for RAM sizes in test fixtures.
+const GiB arvados.ByteSize = 1 << 30
+
+var _ = check.Suite(&PoolSuite{})
+
+// PoolSuite holds gocheck tests for Pool.
+type PoolSuite struct{}
+
+// SetUpSuite enables debug logging for all tests in the suite.
+func (suite *PoolSuite) SetUpSuite(c *check.C) {
+ logrus.StandardLogger().SetLevel(logrus.DebugLevel)
+}
+
+// TestStartContainer is an unimplemented placeholder; it fails
+// deliberately until the stub SSH-backed instanceSet exists.
+func (suite *PoolSuite) TestStartContainer(c *check.C) {
+ // TODO: use an instanceSet stub with an SSH server
+ c.Fail()
+}
+
+// TestVerifyHostKey is an unimplemented placeholder; it fails
+// deliberately until the stub SSH-backed instanceSet exists.
+func (suite *PoolSuite) TestVerifyHostKey(c *check.C) {
+ // TODO: use an instanceSet stub with an SSH server
+ c.Fail()
+}
+
+// TestCreateUnallocShutdown exercises Create/Unallocated/Shutdown
+// bookkeeping against a LameInstanceSet whose Create/Destroy calls
+// block until Release is called.
+func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
+ lameInstanceSet := &test.LameInstanceSet{Hold: make(chan bool)}
+ type1 := arvados.InstanceType{Name: "a1s", ProviderType: "a1.small", VCPUs: 1, RAM: 1 * GiB, Price: .01}
+ type2 := arvados.InstanceType{Name: "a2m", ProviderType: "a2.medium", VCPUs: 2, RAM: 2 * GiB, Price: .02}
+ pool := &Pool{
+ logger: logrus.StandardLogger(),
+ newExecutor: func(cloud.Instance) Executor { return &stubExecutor{} },
+ instanceSet: lameInstanceSet,
+ instanceTypes: arvados.InstanceTypeMap{
+ type1.Name: type1,
+ type2.Name: type2,
+ },
+ }
+ notify := pool.Subscribe()
+ defer pool.Unsubscribe(notify)
+ notify2 := pool.Subscribe()
+ defer pool.Unsubscribe(notify2)
+
+ c.Check(pool.Unallocated()[type1], check.Equals, 0)
+ c.Check(pool.Unallocated()[type2], check.Equals, 0)
+ // Pending Creates count as unallocated immediately, even though
+ // the instanceSet is still holding the Create calls.
+ pool.Create(type2)
+ pool.Create(type1)
+ pool.Create(type2)
+ c.Check(pool.Unallocated()[type1], check.Equals, 1)
+ c.Check(pool.Unallocated()[type2], check.Equals, 2)
+ // Unblock the pending Create calls and (before calling Sync!)
+ // wait for the pool to process the returned instances.
+ go lameInstanceSet.Release(3)
+ suite.wait(c, pool, notify, func() bool {
+ list, err := lameInstanceSet.Instances(nil)
+ return err == nil && len(list) == 3
+ })
+
+ c.Check(pool.Unallocated()[type1], check.Equals, 1)
+ c.Check(pool.Unallocated()[type2], check.Equals, 2)
+ // Syncing with the provider must not change the counts.
+ pool.getInstancesAndSync()
+ c.Check(pool.Unallocated()[type1], check.Equals, 1)
+ c.Check(pool.Unallocated()[type2], check.Equals, 2)
+
+ c.Check(pool.Shutdown(type2), check.Equals, true)
+ suite.wait(c, pool, notify, func() bool {
+ return pool.Unallocated()[type1] == 1 && pool.Unallocated()[type2] == 1
+ })
+ c.Check(pool.Shutdown(type2), check.Equals, true)
+ suite.wait(c, pool, notify, func() bool {
+ return pool.Unallocated()[type1] == 1 && pool.Unallocated()[type2] == 0
+ })
+ c.Check(pool.Shutdown(type2), check.Equals, false)
+ for {
+ // Consume any waiting notifications to ensure the
+ // next one we get is from Shutdown.
+ select {
+ case <-notify:
+ continue
+ default:
+ }
+ break
+ }
+ c.Check(pool.Shutdown(type1), check.Equals, true)
+ suite.wait(c, pool, notify, func() bool {
+ return pool.Unallocated()[type1] == 0 && pool.Unallocated()[type2] == 0
+ })
+ // The second subscriber must have been notified too.
+ select {
+ case <-notify2:
+ case <-time.After(time.Second):
+ c.Error("notify did not receive")
+ }
+ go lameInstanceSet.Release(3) // unblock Destroy calls
+}
+
+// wait waits up to one second for ready() to become true, consuming
+// pool notifications from notify while waiting, then asserts ready().
+//
+// Fix: take pool as *Pool. The original `pool Pool` copied a struct
+// containing sync.RWMutex and sync.Once (go vet copylocks) and did
+// not match the *Pool value the callers pass.
+func (suite *PoolSuite) wait(c *check.C, pool *Pool, notify <-chan struct{}, ready func() bool) {
+ timeout := time.NewTimer(time.Second).C
+ for !ready() {
+  select {
+  case <-notify:
+   continue
+  case <-timeout:
+  }
+  break
+ }
+ c.Check(ready(), check.Equals, true)
+}
+
+// stubExecutor is a no-op Executor: every command "succeeds" with
+// empty stdout/stderr.
+//
+// Fix: the original did not implement the Executor interface -- it
+// declared SetInstance instead of SetTarget(cloud.ExecutorTarget),
+// and Execute took stdin []byte instead of io.Reader -- so the
+// newExecutor closure in the test could not compile.
+type stubExecutor struct{}
+
+func (*stubExecutor) SetTarget(cloud.ExecutorTarget) {}
+
+func (*stubExecutor) Execute(cmd string, stdin io.Reader) ([]byte, []byte, error) {
+ return nil, nil, nil
+}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
new file mode 100644
index 000000000..7828d4f69
--- /dev/null
+++ b/lib/dispatchcloud/worker/worker.go
@@ -0,0 +1,45 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+ "time"
+)
+
+// State indicates whether a worker is available to do work, and (if
+// not) whether/when it is expected to become ready.
+type State int
+
+const (
+ StateUnknown State = iota // might be running a container already
+ StateBooting // instance is booting
+ StateRunning // instance is running
+ StateShutdown // worker has stopped monitoring the instance
+ StateHold // running, but not available to run new containers
+)
+
+const (
+ // TODO: configurable
+ maxPingFailTime = 10 * time.Minute
+)
+
+// stateString maps each State to its human-readable name; used by
+// String and MarshalText.
+var stateString = map[State]string{
+ StateUnknown: "unknown",
+ StateBooting: "booting",
+ StateRunning: "running",
+ StateShutdown: "shutdown",
+ StateHold: "hold",
+}
+
+// String implements fmt.Stringer. An out-of-range State yields "".
+func (s State) String() string {
+ return stateString[s]
+}
+
+// MarshalText implements encoding.TextMarshaler so a JSON encoding of
+// map[State]anything uses the state's string representation.
+func (s State) MarshalText() ([]byte, error) {
+ return []byte(stateString[s]), nil
+}
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index e2e9907d5..bfa86abf6 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -60,6 +60,8 @@ type Cluster struct {
ManagementToken string
NodeProfiles map[string]NodeProfile
InstanceTypes InstanceTypeMap
+ CloudVMs CloudVMs
+ Dispatch Dispatch
HTTPRequestTimeout Duration
RemoteClusters map[string]RemoteCluster
PostgreSQL PostgreSQL
@@ -95,6 +97,50 @@ type InstanceType struct {
Preemptible bool
}
+// Dispatch is the crunch-dispatch-cloud section of the cluster
+// configuration: credentials and timing for queue polls and worker
+// probes.
+type Dispatch struct {
+ // PEM encoded SSH key (RSA, DSA, or ECDSA) able to log in to
+ // cloud VMs.
+ PrivateKey []byte
+
+ // Max time for workers to come up before abandoning stale
+ // locks from previous run
+ StaleLockTimeout Duration
+
+ // Interval between queue polls
+ PollInterval Duration
+
+ // Interval between probes to each worker
+ ProbeInterval Duration
+
+ // Maximum total worker probes per second
+ MaxProbesPerSecond int
+}
+
+// CloudVMs is the cloud-provider section of the cluster
+// configuration: how to create, probe, and retire VM instances.
+type CloudVMs struct {
+ // Shell command that exits zero IFF the VM is fully booted
+ // and ready to run containers, e.g., "mount | grep
+ // /encrypted-tmp"
+ BootProbeCommand string
+ // Interval between polls of the cloud provider's instance list
+ SyncInterval Duration
+
+ // Maximum idle time before automatic shutdown
+ TimeoutIdle Duration
+
+ // Maximum booting time before automatic shutdown
+ TimeoutBooting Duration
+
+ // Maximum time with no successful probes before automatic shutdown
+ TimeoutProbe Duration
+
+ // Time after shutdown to retry shutdown
+ TimeoutShutdown Duration
+
+ // Image to use when creating new instances
+ ImageID string
+
+ // Name of the cloud driver to use, and driver-specific
+ // configuration (schema depends on the driver)
+ Driver string
+ DriverParameters map[string]interface{}
+}
+
type InstanceTypeMap map[string]InstanceType
var errDuplicateInstanceTypeName = errors.New("duplicate instance type name")
@@ -159,45 +205,48 @@ func (cc *Cluster) GetNodeProfile(node string) (*NodeProfile, error) {
}
type NodeProfile struct {
- Controller SystemServiceInstance `json:"arvados-controller"`
- Health SystemServiceInstance `json:"arvados-health"`
- Keepbalance SystemServiceInstance `json:"keep-balance"`
- Keepproxy SystemServiceInstance `json:"keepproxy"`
- Keepstore SystemServiceInstance `json:"keepstore"`
- Keepweb SystemServiceInstance `json:"keep-web"`
- Nodemanager SystemServiceInstance `json:"arvados-node-manager"`
- RailsAPI SystemServiceInstance `json:"arvados-api-server"`
- Websocket SystemServiceInstance `json:"arvados-ws"`
- Workbench SystemServiceInstance `json:"arvados-workbench"`
+ Controller SystemServiceInstance `json:"arvados-controller"`
+ Health SystemServiceInstance `json:"arvados-health"`
+ Keepbalance SystemServiceInstance `json:"keep-balance"`
+ Keepproxy SystemServiceInstance `json:"keepproxy"`
+ Keepstore SystemServiceInstance `json:"keepstore"`
+ Keepweb SystemServiceInstance `json:"keep-web"`
+ Nodemanager SystemServiceInstance `json:"arvados-node-manager"`
+ DispatchCloud SystemServiceInstance `json:"arvados-dispatch-cloud"`
+ RailsAPI SystemServiceInstance `json:"arvados-api-server"`
+ Websocket SystemServiceInstance `json:"arvados-ws"`
+ Workbench SystemServiceInstance `json:"arvados-workbench"`
}
type ServiceName string
const (
- ServiceNameRailsAPI ServiceName = "arvados-api-server"
- ServiceNameController ServiceName = "arvados-controller"
- ServiceNameNodemanager ServiceName = "arvados-node-manager"
- ServiceNameWorkbench ServiceName = "arvados-workbench"
- ServiceNameWebsocket ServiceName = "arvados-ws"
- ServiceNameKeepbalance ServiceName = "keep-balance"
- ServiceNameKeepweb ServiceName = "keep-web"
- ServiceNameKeepproxy ServiceName = "keepproxy"
- ServiceNameKeepstore ServiceName = "keepstore"
+ ServiceNameRailsAPI ServiceName = "arvados-api-server"
+ ServiceNameController ServiceName = "arvados-controller"
+ ServiceNameDispatchCloud ServiceName = "arvados-dispatch-cloud"
+ ServiceNameNodemanager ServiceName = "arvados-node-manager"
+ ServiceNameWorkbench ServiceName = "arvados-workbench"
+ ServiceNameWebsocket ServiceName = "arvados-ws"
+ ServiceNameKeepbalance ServiceName = "keep-balance"
+ ServiceNameKeepweb ServiceName = "keep-web"
+ ServiceNameKeepproxy ServiceName = "keepproxy"
+ ServiceNameKeepstore ServiceName = "keepstore"
)
// ServicePorts returns the configured listening address (or "" if
// disabled) for each service on the node.
func (np *NodeProfile) ServicePorts() map[ServiceName]string {
return map[ServiceName]string{
- ServiceNameRailsAPI: np.RailsAPI.Listen,
- ServiceNameController: np.Controller.Listen,
- ServiceNameNodemanager: np.Nodemanager.Listen,
- ServiceNameWorkbench: np.Workbench.Listen,
- ServiceNameWebsocket: np.Websocket.Listen,
- ServiceNameKeepbalance: np.Keepbalance.Listen,
- ServiceNameKeepweb: np.Keepweb.Listen,
- ServiceNameKeepproxy: np.Keepproxy.Listen,
- ServiceNameKeepstore: np.Keepstore.Listen,
+ ServiceNameRailsAPI: np.RailsAPI.Listen,
+ ServiceNameController: np.Controller.Listen,
+ ServiceNameDispatchCloud: np.DispatchCloud.Listen,
+ ServiceNameNodemanager: np.Nodemanager.Listen,
+ ServiceNameWorkbench: np.Workbench.Listen,
+ ServiceNameWebsocket: np.Websocket.Listen,
+ ServiceNameKeepbalance: np.Keepbalance.Listen,
+ ServiceNameKeepweb: np.Keepweb.Listen,
+ ServiceNameKeepproxy: np.Keepproxy.Listen,
+ ServiceNameKeepstore: np.Keepstore.Listen,
}
}
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 2622c1370..def4e33cb 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -18,10 +18,11 @@ type Container struct {
Mounts map[string]Mount `json:"mounts"`
Output string `json:"output"`
OutputPath string `json:"output_path"`
- Priority int `json:"priority"`
+ Priority int64 `json:"priority"`
RuntimeConstraints RuntimeConstraints `json:"runtime_constraints"`
State ContainerState `json:"state"`
SchedulingParameters SchedulingParameters `json:"scheduling_parameters"`
+ ExitCode int `json:"exit_code"`
}
// Container is an arvados#container resource.
diff --git a/services/crunch-run/background.go b/services/crunch-run/background.go
new file mode 100644
index 000000000..3dbfcfcde
--- /dev/null
+++ b/services/crunch-run/background.go
@@ -0,0 +1,192 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strings"
+ "syscall"
+ "time"
+)
+
+// Lockfile layout: one file per container, named
+// {lockdir}/{lockprefix}{uuid}{locksuffix}, held flock()ed by the
+// running crunch-run process. Vars (not consts) so tests can override.
+var (
+	lockdir    = "/var/run"
+	lockprefix = "crunch-run-"
+	locksuffix = ".lock"
+)
+
+// procinfo is saved in each process's lockfile.
+type procinfo struct {
+	UUID   string
+	PID    int
+	Stdout string
+	Stderr string
+}
+
+// Detach acquires a lock for the given uuid, and starts the current
+// program as a child process (with a marker flag prepended to the
+// given arguments so the child knows not to detach again). The lock
+// is passed along to the child process. Returns a process exit code
+// (0 on success, 1 on error).
+//
+// NOTE(review): detach() below prepends "-nodetach", but main()
+// checks os.Args[1] == "-detached" — one of the two spellings must
+// be wrong; confirm and make them agree.
+func Detach(uuid string, args []string, stdout, stderr io.Writer) int {
+	return exitcode(stderr, detach(uuid, args, stdout, stderr))
+}
+// detach does the work of Detach: it takes an exclusive flock on the
+// uuid's lockfile, redirects the child's stdout/stderr to temp files,
+// starts the child with the lockfile passed along as an inherited fd,
+// and records the child's procinfo in the lockfile (and on stdout).
+func detach(uuid string, args []string, stdout, stderr io.Writer) error {
+	lockfile, err := os.OpenFile(filepath.Join(lockdir, lockprefix+uuid+locksuffix), os.O_CREATE|os.O_RDWR, 0700)
+	if err != nil {
+		return err
+	}
+	defer lockfile.Close()
+	err = syscall.Flock(int(lockfile.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
+	if err != nil {
+		return err
+	}
+	// Discard any stale procinfo from a previous holder of this
+	// lockfile before writing our own.
+	lockfile.Truncate(0)
+
+	outfile, err := ioutil.TempFile("", "crunch-run-"+uuid+"-stdout-")
+	if err != nil {
+		return err
+	}
+	defer outfile.Close()
+	errfile, err := ioutil.TempFile("", "crunch-run-"+uuid+"-stderr-")
+	if err != nil {
+		os.Remove(outfile.Name())
+		return err
+	}
+	defer errfile.Close()
+
+	// Prepend "-detached" so the child's main() (which checks
+	// os.Args[1] == "-detached") strips it and does not detach
+	// again. (Was "-nodetach", which main() does not recognize.)
+	cmd := exec.Command(args[0], append([]string{"-detached"}, args[1:]...)...)
+	cmd.Stdout = outfile
+	cmd.Stderr = errfile
+	// The child inherits the lockfile fd, keeping the flock held
+	// for the child's lifetime.
+	cmd.ExtraFiles = []*os.File{lockfile}
+	err = cmd.Start()
+	if err != nil {
+		os.Remove(outfile.Name())
+		os.Remove(errfile.Name())
+		return err
+	}
+
+	// Write procinfo to both the caller's stdout and the lockfile.
+	// UUID must be included: kill() rejects a procinfo whose UUID
+	// does not match as "bogus".
+	w := io.MultiWriter(stdout, lockfile)
+	err = json.NewEncoder(w).Encode(procinfo{
+		UUID:   uuid,
+		PID:    cmd.Process.Pid,
+		Stdout: outfile.Name(),
+		Stderr: errfile.Name(),
+	})
+	if err != nil {
+		os.Remove(outfile.Name())
+		os.Remove(errfile.Name())
+		return err
+	}
+	return nil
+}
+
+// Kill finds the crunch-run process corresponding to the given uuid,
+// and sends the given signal to it. It then waits up to 1 second for
+// the process to die. It returns 0 if the process is successfully
+// killed or didn't exist in the first place (kill() returns nil in
+// both cases), 1 otherwise.
+func Kill(uuid string, signal syscall.Signal, stdout, stderr io.Writer) int {
+	return exitcode(stderr, kill(uuid, signal, stdout, stderr))
+}
+
+// kill does the work of Kill: it reads the procinfo from the uuid's
+// lockfile, signals the recorded PID, and polls (up to ~1s) until the
+// process is gone. A missing lockfile means "nothing to kill" and is
+// not an error.
+func kill(uuid string, signal syscall.Signal, stdout, stderr io.Writer) error {
+	path := filepath.Join(lockdir, lockprefix+uuid+locksuffix)
+	f, err := os.Open(path)
+	if os.IsNotExist(err) {
+		return nil
+	} else if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	var pi procinfo
+	err = json.NewDecoder(f).Decode(&pi)
+	if err != nil {
+		// (error strings should not end in "\n")
+		return fmt.Errorf("%s: %s", path, err)
+	}
+
+	// Sanity-check the procinfo written by detach().
+	if pi.UUID != uuid || pi.PID == 0 {
+		return fmt.Errorf("%s: bogus procinfo: %+v", path, pi)
+	}
+
+	// On Unix, FindProcess always succeeds; existence is probed by
+	// the Signal calls below.
+	proc, err := os.FindProcess(pi.PID)
+	if err != nil {
+		return err
+	}
+
+	err = proc.Signal(signal)
+	// Signal 0 probes for existence; loop until the process dies
+	// (Signal returns an error) or the 1s deadline passes.
+	for deadline := time.Now().Add(time.Second); err == nil && time.Now().Before(deadline); time.Sleep(time.Second / 100) {
+		err = proc.Signal(syscall.Signal(0))
+	}
+	if err == nil {
+		return fmt.Errorf("pid %d: sent signal %d (%s) but process is still alive", pi.PID, signal, signal)
+	}
+	// Fprintf, not Fprintln: the original printed the format verbs
+	// literally ("pid %d: %s").
+	fmt.Fprintf(stderr, "pid %d: %s\n", pi.PID, err)
+	return nil
+}
+
+// List prints the UUIDs of active crunch-run processes (one per line,
+// on stdout), removing any stale lockfiles it finds along the way.
+// Returns a process exit code (0 on success, 1 on error).
+func List(stdout, stderr io.Writer) int {
+	return exitcode(stderr, filepath.Walk(lockdir, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			// info may be nil here; don't touch it.
+			return err
+		}
+		if info.IsDir() {
+			// Walk visits lockdir itself first; SkipDir on the
+			// root would abort the whole walk. Only skip
+			// subdirectories.
+			if path == lockdir {
+				return nil
+			}
+			return filepath.SkipDir
+		}
+		if name := info.Name(); !strings.HasPrefix(name, lockprefix) || !strings.HasSuffix(name, locksuffix) {
+			return nil
+		}
+		if info.Size() == 0 {
+			// race: process has opened/locked but hasn't yet written pid/uuid
+			return nil
+		}
+
+		f, err := os.Open(path)
+		if err != nil {
+			return nil
+		}
+		defer f.Close()
+
+		// TODO: Do this check without risk of disrupting lock
+		// acquisition during races, e.g., by connecting to a
+		// unix socket or checking /proc/$pid/fd/$n ->
+		// lockfile.
+		//
+		// LOCK_NB is essential: a plain LOCK_SH would block
+		// here for as long as the owning process lives.
+		err = syscall.Flock(int(f.Fd()), syscall.LOCK_SH|syscall.LOCK_NB)
+		if err == nil {
+			// We got the lock, so the owner is gone:
+			// lockfile is stale.
+			err := os.Remove(path)
+			if err != nil {
+				fmt.Fprintln(stderr, err)
+			}
+			return nil
+		}
+
+		var pi procinfo
+		err = json.NewDecoder(f).Decode(&pi)
+		if err != nil {
+			fmt.Fprintf(stderr, "%s: %s\n", path, err)
+			return nil
+		}
+		if pi.UUID == "" || pi.PID == 0 {
+			fmt.Fprintf(stderr, "%s: bogus procinfo: %+v", path, pi)
+			return nil
+		}
+
+		fmt.Fprintln(stdout, pi.UUID)
+		return nil
+	}))
+}
+
+// exitcode maps an error to a process exit status: nil yields 0
+// ("success"); anything else is printed to stderr and yields 1.
+func exitcode(stderr io.Writer, err error) int {
+	if err == nil {
+		return 0
+	}
+	fmt.Fprintln(stderr, err)
+	return 1
+}
diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go
index 44560b80a..74ab77ab0 100644
--- a/services/crunch-run/crunchrun.go
+++ b/services/crunch-run/crunchrun.go
@@ -1716,6 +1716,10 @@ func main() {
cgroupParent := flag.String("cgroup-parent", "docker", "name of container's parent cgroup (ignored if -cgroup-parent-subsystem is used)")
cgroupParentSubsystem := flag.String("cgroup-parent-subsystem", "", "use current cgroup for given subsystem as parent cgroup for container")
caCertsPath := flag.String("ca-certs", "", "Path to TLS root certificates")
+ detach := flag.Bool("detach", false, "Detach from parent process and run in the background")
+ sleep := flag.Duration("sleep", 0, "Delay before starting (testing use only)")
+ kill := flag.Int("kill", -1, "Send signal to an existing crunch-run process for given UUID")
+ list := flag.Bool("list", false, "List UUIDs of existing crunch-run processes")
enableNetwork := flag.String("container-enable-networking", "default",
`Specify if networking should be enabled for container. One of 'default', 'always':
default: only enable networking if container requests it.
@@ -1727,8 +1731,29 @@ func main() {
memprofile := flag.String("memprofile", "", "write memory profile to `file` after running container")
getVersion := flag.Bool("version", false, "Print version information and exit.")
checkContainerd := flag.Duration("check-containerd", 60*time.Second, "Periodic check if (docker-)containerd is running (use 0s to disable).")
+
+ detached := false
+ if len(os.Args) > 1 && os.Args[1] == "-detached" {
+ // This process was invoked by a parent process, which
+ // has passed along its own arguments, including
+ // -detach, after the leading -detached flag. Strip
+ // the leading -detached flag (it's not recognized by
+ // flag.Parse()) ... and remember not to detach all
+ // over again in this process.
+ os.Args = append([]string{os.Args[0]}, os.Args[2:]...)
+ detached = true
+ }
flag.Parse()
+ switch {
+ case *detach && !detached:
+ os.Exit(Detach(flag.Arg(0), os.Args, os.Stdout, os.Stderr))
+ case *kill >= 0:
+ os.Exit(Kill(flag.Arg(0), syscall.Signal(*kill), os.Stdout, os.Stderr))
+ case *list:
+ os.Exit(List(os.Stdout, os.Stderr))
+ }
+
// Print version information if requested
if *getVersion {
fmt.Printf("crunch-run %s\n", version)
@@ -1736,6 +1761,7 @@ func main() {
}
log.Printf("crunch-run %s started", version)
+ time.Sleep(*sleep)
containerId := flag.Arg(0)
diff --git a/vendor/vendor.json b/vendor/vendor.json
index aa6b2d773..aee25beab 100644
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@@ -349,6 +349,12 @@
"revisionTime": "2016-12-03T19:45:07Z"
},
{
+ "checksumSHA1": "ewGq4nGalpCQOHcmBTdAEQx1wW0=",
+ "path": "github.com/mitchellh/mapstructure",
+ "revision": "bb74f1db0675b241733089d5a1faa5dd8b0ef57b",
+ "revisionTime": "2018-05-11T14:21:26Z"
+ },
+ {
"checksumSHA1": "OFNit1Qx2DdWhotfREKodDNUwCM=",
"path": "github.com/opencontainers/go-digest",
"revision": "279bed98673dd5bef374d3b6e4b09e2af76183bf",
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list