[ARVADOS] updated: 1.2.0-106-gc6e22488e

Git user git at public.curoverse.com
Sat Sep 29 02:41:25 EDT 2018


Summary of changes:
 .licenseignore                                     |    1 +
 apps/workbench/app/helpers/application_helper.rb   |   12 +-
 apps/workbench/app/models/container_work_unit.rb   |   25 +-
 apps/workbench/app/models/proxy_work_unit.rb       |    6 +-
 apps/workbench/app/models/work_unit.rb             |    4 +
 .../application/_show_text_with_locators.html.erb  |   49 +-
 .../app/views/projects/_show_dashboard.html.erb    |   15 +-
 .../app/views/work_units/_show_component.html.erb  |   48 +
 .../container_requests_controller_test.rb          |    2 +
 apps/workbench/test/integration/work_units_test.rb |    2 +-
 build/build.list                                   |   40 +-
 build/libcloud-pin.sh                              |    2 +-
 build/package-build-dockerfiles/Makefile           |   16 +-
 build/package-build-dockerfiles/centos7/Dockerfile |    5 +-
 build/package-build-dockerfiles/debian8/Dockerfile |    5 +-
 .../package-build-dockerfiles/debian9/D39DC0E3.asc |  692 ------------
 build/package-build-dockerfiles/debian9/Dockerfile |    6 +-
 .../ubuntu1404/Dockerfile                          |    5 +-
 .../ubuntu1604/Dockerfile                          |    5 +-
 .../{ubuntu1404 => ubuntu1804}/Dockerfile          |   11 +-
 build/package-test-dockerfiles/centos7/Dockerfile  |    2 +-
 build/package-test-dockerfiles/debian8/Dockerfile  |    2 +-
 .../package-test-dockerfiles/ubuntu1404/Dockerfile |    2 +-
 .../package-test-dockerfiles/ubuntu1604/Dockerfile |    2 +-
 .../{ubuntu1604 => ubuntu1804}/Dockerfile          |   10 +-
 .../etc-apt-preferences.d-arvados                  |    0
 build/package-testing/deb-common-test-packages.sh  |    2 +-
 ...ages-debian8.sh => test-packages-ubuntu1804.sh} |    0
 build/run-build-packages-sso.sh                    |   15 +-
 build/run-build-packages.sh                        |   32 +-
 build/run-library.sh                               |   18 +-
 build/run-tests.sh                                 |   24 +-
 ...cloud.service => crunch-dispatch-cloud.service} |    2 +-
 doc/_config.yml                                    |    1 +
 doc/_includes/_install_compute_docker.liquid       |    3 +-
 doc/admin/upgrading.html.textile.liquid            |    4 -
 doc/api/methods/containers.html.textile.liquid     |   18 +
 doc/api/methods/groups.html.textile.liquid         |   21 +
 doc/architecture/Arvados_federation.odg            |  Bin 0 -> 16162 bytes
 doc/architecture/federation.html.textile.liquid    |  111 ++
 doc/images/arvados_federation.svg                  | 1133 ++++++++++++++++++++
 doc/install/install-api-server.html.textile.liquid |   39 +-
 .../install-arv-git-httpd.html.textile.liquid      |    8 +-
 doc/install/install-keepstore.html.textile.liquid  |    4 +-
 lib/cloud/interfaces.go                            |   70 +-
 lib/cmd/cmd.go                                     |    5 +-
 lib/controller/federation.go                       |  441 +++++++-
 lib/controller/federation_test.go                  |  314 ++++++
 lib/controller/handler.go                          |    4 +-
 lib/controller/proxy.go                            |   46 +-
 lib/dispatchcloud/cmd.go                           |    4 +-
 lib/dispatchcloud/container/queue.go               |  270 +++++
 lib/dispatchcloud/container_queue.go               |  233 ----
 lib/dispatchcloud/dispatcher.go                    |  147 ++-
 lib/dispatchcloud/dispatcher_test.go               |  217 ++++
 lib/dispatchcloud/driver.go                        |   22 +
 lib/dispatchcloud/instance_set_proxy.go            |   25 +
 lib/dispatchcloud/scheduler.go                     |  113 --
 lib/dispatchcloud/scheduler/fix_stale_locks.go     |   53 +
 .../{worker => scheduler}/gocheck_test.go          |    2 +-
 lib/dispatchcloud/scheduler/interfaces.go          |   38 +
 lib/dispatchcloud/scheduler/map.go                 |  127 +++
 lib/dispatchcloud/scheduler/map_test.go            |  259 +++++
 lib/dispatchcloud/scheduler/sync.go                |   51 +
 lib/dispatchcloud/ssh_executor/executor.go         |  148 +++
 lib/dispatchcloud/ssh_executor/executor_test.go    |  102 ++
 lib/dispatchcloud/test/fixtures.go                 |   24 +
 .../{lame_provider.go => lame_instance_set.go}     |    0
 lib/dispatchcloud/test/queue.go                    |   72 ++
 lib/dispatchcloud/test/ssh_service.go              |  169 +++
 lib/dispatchcloud/test/sshkey_dispatch             |   27 +
 lib/dispatchcloud/test/sshkey_dispatch.pub         |    1 +
 lib/dispatchcloud/test/sshkey_vm                   |   27 +
 lib/dispatchcloud/test/sshkey_vm.pub               |    1 +
 lib/dispatchcloud/test/stub_driver.go              |  190 ++++
 lib/dispatchcloud/worker/pool.go                   |  475 ++++++--
 lib/dispatchcloud/worker/pool_test.go              |   76 +-
 lib/dispatchcloud/worker/worker.go                 |  150 +--
 sdk/cli/arvados-cli.gemspec                        |    6 +-
 sdk/cli/bin/arv                                    |   20 +-
 sdk/cli/bin/arv-run-pipeline-instance              |    8 +-
 sdk/cli/bin/arv-tag                                |   10 +-
 sdk/cwl/arvados_cwl/__init__.py                    |  134 ++-
 sdk/cwl/arvados_cwl/arvcontainer.py                |   20 +-
 sdk/cwl/arvados_cwl/context.py                     |    1 +
 sdk/cwl/arvados_cwl/done.py                        |    6 +-
 sdk/cwl/setup.py                                   |    4 +-
 sdk/cwl/tests/test_container.py                    |  103 +-
 sdk/cwl/tests/test_job.py                          |   18 +-
 sdk/cwl/tests/test_submit.py                       |   20 +-
 sdk/go/arvados/config.go                           |   76 +-
 sdk/go/arvados/container.go                        |   43 +
 sdk/go/arvados/integration_test_cluster.go         |   25 +
 sdk/go/arvadosclient/arvadosclient.go              |    7 +-
 sdk/go/arvadostest/fixtures.go                     |    4 +-
 sdk/go/dispatch/dispatch.go                        |   55 +-
 sdk/go/httpserver/error.go                         |    7 +
 sdk/go/keepclient/discover.go                      |    8 +-
 sdk/go/keepclient/perms.go                         |   10 +-
 sdk/go/keepclient/support.go                       |    2 +-
 sdk/python/arvados/api.py                          |    8 +-
 sdk/python/arvados/commands/run.py                 |   70 +-
 sdk/python/setup.py                                |    6 +-
 sdk/python/tests/run_test_server.py                |    1 +
 sdk/ruby/arvados.gemspec                           |    2 +-
 .../api/app/controllers/application_controller.rb  |   24 +-
 .../arvados/v1/containers_controller.rb            |    1 +
 .../controllers/arvados/v1/groups_controller.rb    |   56 +
 services/api/app/middlewares/arvados_api_token.rb  |    3 +
 .../api/app/models/api_client_authorization.rb     |   12 +
 services/api/app/models/arvados_model.rb           |   61 +-
 services/api/app/models/collection.rb              |    9 +-
 services/api/app/models/container.rb               |   45 +-
 services/api/app/models/container_request.rb       |   42 +-
 services/api/config/application.default.yml        |   15 +-
 .../api/config/initializers/legacy_jobs_api.rb     |   16 +
 services/api/config/routes.rb                      |    1 +
 .../migrate/20180806133039_index_all_filenames.rb  |    6 -
 ...30357_add_pdh_and_trash_index_to_collections.rb |    9 +
 .../20180820132617_add_lock_index_to_containers.rb |   13 +
 ...80820135808_drop_pdh_index_from_collections.rb} |    4 +-
 .../20180824152014_add_md5_index_to_containers.rb  |   12 +
 ...20180824155207_add_queue_index_to_containers.rb |   12 +
 ...80904110712_add_runtime_status_to_containers.rb |   10 +
 .../20180917205609_recompute_file_names_index.rb   |   55 +
 services/api/db/structure.sql                      |   56 +-
 services/api/lib/enable_jobs_api.rb                |   39 +
 .../test/fixtures/api_client_authorizations.yml    |    7 +
 services/api/test/fixtures/container_requests.yml  |    4 +-
 services/api/test/fixtures/containers.yml          |    2 +-
 services/api/test/fixtures/groups.yml              |    8 +
 .../arvados/v1/collections_controller_test.rb      |   39 +-
 .../v1/container_requests_controller_test.rb       |    6 +-
 .../arvados/v1/groups_controller_test.rb           |   95 +-
 services/api/test/test_helper.rb                   |    3 +
 .../api/test/unit/collection_performance_test.rb   |    9 +-
 services/api/test/unit/container_request_test.rb   |   32 +-
 services/api/test/unit/container_test.rb           |  121 ++-
 services/api/test/unit/job_test.rb                 |   28 +
 .../crunch-dispatch-local/crunch-dispatch-local.go |   39 +-
 .../crunch-dispatch-local_test.go                  |   20 +-
 .../crunch-dispatch-slurm/crunch-dispatch-slurm.go |   39 +-
 .../crunch-dispatch-slurm_test.go                  |   14 +-
 services/crunch-dispatch-slurm/slurm.go            |   12 +-
 services/crunch-dispatch-slurm/squeue.go           |   12 +-
 services/crunch-dispatch-slurm/squeue_test.go      |    5 +
 services/crunch-dispatch-slurm/usage.go            |    1 +
 services/crunch-run/copier.go                      |   10 +-
 services/crunch-run/crunchrun.go                   |   14 +-
 services/fuse/arvados_fuse/fusedir.py              |   74 +-
 services/keep-web/handler.go                       |    7 +-
 services/keep-web/handler_test.go                  |    2 +-
 services/keepstore/handler_test.go                 |   10 +-
 services/keepstore/handlers.go                     |   41 +-
 services/keepstore/keepstore.go                    |   19 +-
 services/keepstore/mounts_test.go                  |    2 +-
 services/keepstore/proxy_remote.go                 |  113 ++
 services/keepstore/proxy_remote_test.go            |  149 +++
 services/keepstore/s3_volume.go                    |    4 +
 services/login-sync/arvados-login-sync.gemspec     |    2 +-
 vendor/vendor.json                                 |    6 +
 161 files changed, 6532 insertions(+), 1899 deletions(-)
 delete mode 100644 build/package-build-dockerfiles/debian9/D39DC0E3.asc
 copy build/package-build-dockerfiles/{ubuntu1404 => ubuntu1804}/Dockerfile (77%)
 copy build/package-test-dockerfiles/{ubuntu1604 => ubuntu1804}/Dockerfile (73%)
 copy build/package-test-dockerfiles/{ubuntu1604 => ubuntu1804}/etc-apt-preferences.d-arvados (100%)
 copy build/package-testing/{test-packages-debian8.sh => test-packages-ubuntu1804.sh} (100%)
 rename cmd/arvados-server/{arvados-dispatch-cloud.service => crunch-dispatch-cloud.service} (94%)
 create mode 100644 doc/architecture/Arvados_federation.odg
 create mode 100644 doc/architecture/federation.html.textile.liquid
 create mode 100644 doc/images/arvados_federation.svg
 create mode 100644 lib/dispatchcloud/container/queue.go
 delete mode 100644 lib/dispatchcloud/container_queue.go
 create mode 100644 lib/dispatchcloud/dispatcher_test.go
 create mode 100644 lib/dispatchcloud/driver.go
 create mode 100644 lib/dispatchcloud/instance_set_proxy.go
 delete mode 100644 lib/dispatchcloud/scheduler.go
 create mode 100644 lib/dispatchcloud/scheduler/fix_stale_locks.go
 copy lib/dispatchcloud/{worker => scheduler}/gocheck_test.go (92%)
 create mode 100644 lib/dispatchcloud/scheduler/interfaces.go
 create mode 100644 lib/dispatchcloud/scheduler/map.go
 create mode 100644 lib/dispatchcloud/scheduler/map_test.go
 create mode 100644 lib/dispatchcloud/scheduler/sync.go
 create mode 100644 lib/dispatchcloud/ssh_executor/executor.go
 create mode 100644 lib/dispatchcloud/ssh_executor/executor_test.go
 create mode 100644 lib/dispatchcloud/test/fixtures.go
 rename lib/dispatchcloud/test/{lame_provider.go => lame_instance_set.go} (100%)
 create mode 100644 lib/dispatchcloud/test/queue.go
 create mode 100644 lib/dispatchcloud/test/ssh_service.go
 create mode 100644 lib/dispatchcloud/test/sshkey_dispatch
 create mode 100644 lib/dispatchcloud/test/sshkey_dispatch.pub
 create mode 100644 lib/dispatchcloud/test/sshkey_vm
 create mode 100644 lib/dispatchcloud/test/sshkey_vm.pub
 create mode 100644 lib/dispatchcloud/test/stub_driver.go
 create mode 100644 sdk/go/arvados/integration_test_cluster.go
 create mode 100644 services/api/config/initializers/legacy_jobs_api.rb
 create mode 100644 services/api/db/migrate/20180820130357_add_pdh_and_trash_index_to_collections.rb
 create mode 100644 services/api/db/migrate/20180820132617_add_lock_index_to_containers.rb
 copy services/api/db/migrate/{20130107181109_add_uuid_to_collections.rb => 20180820135808_drop_pdh_index_from_collections.rb} (50%)
 create mode 100644 services/api/db/migrate/20180824152014_add_md5_index_to_containers.rb
 create mode 100644 services/api/db/migrate/20180824155207_add_queue_index_to_containers.rb
 create mode 100644 services/api/db/migrate/20180904110712_add_runtime_status_to_containers.rb
 create mode 100644 services/api/db/migrate/20180917205609_recompute_file_names_index.rb
 create mode 100644 services/api/lib/enable_jobs_api.rb
 create mode 100644 services/keepstore/proxy_remote.go
 create mode 100644 services/keepstore/proxy_remote_test.go

  discards  88fd77aa7d9ce38a291c09eb459228f60edfb2fb (commit)
       via  c6e22488e438656c515c00b11a418a9e14009d21 (commit)
       via  9c710442cc3d18afcc7d4a37f3c3c71b96a89adc (commit)
       via  4058b87cfdc07fe25da31f59577f835656e97816 (commit)
       via  959fc42ba11fbe4f803511ffcb8945bae537ff21 (commit)
       via  4edb33f16f8f111fa478dda908c37ea2a5723dd9 (commit)
       via  1dd4997ebfe9e1bce4073aed9abb37f89ae90845 (commit)
       via  b23972dec86c459c2e89fbb0c021153db54e7efa (commit)
       via  a2d7b1241424ea22e5ee81818d6562f92ee2731d (commit)
       via  a629657158ed8690e9c4d44acea47bf0d8ba3794 (commit)
       via  ab92b051d6f09f1b3e670ecd39e8c96ed34570db (commit)
       via  a0789339c1fa67ab8b3f71b4fa6a0087165c078d (commit)
       via  49e18f8c16bda6b6bf56c301b440c9d56d9bcb72 (commit)
       via  a78dce7bd57c1a6822e1949be4a5f13cf2447909 (commit)
       via  ba54c26be4078cb32d78947837f630a4404a93ae (commit)
       via  494cc8312cff496bab3653d5b2ab1f00ac43a40c (commit)
       via  720e76bb1d82d5a5448ce395df634310ceee473e (commit)
       via  b5f589dce8110139073388f11f1ed2259e272a96 (commit)
       via  985bfe119f34bbf39a7007bee1fc1c03db9ab8bf (commit)
       via  9b1e23489c659655134a7e208a012d5d8d05bd07 (commit)
       via  898cd195f8b5916ed9c3a83dec212e858bfa610f (commit)
       via  5d8eb6ead9e2ca32f424a8385979485a902cf09f (commit)
       via  05729d26f0d71620bb7151b443f628a92d3ec78d (commit)
       via  ce4285dd9a6310a799b861237918273329390316 (commit)
       via  8c30c649c667f7ccd205cdfcf1864563ac503585 (commit)
       via  18b51f2620505d4efbc9de322e24d5218a5ca19b (commit)
       via  af1125bd1bc10f6ac2f9129261176c4510aadd54 (commit)
       via  591fd5d18644037426b58abc0d21bb2ccbcae888 (commit)
       via  d9224c40587a8d3617e7be01f3bb7f801c4b52e4 (commit)
       via  ddc180fa5200d9d8fac59cc5041d7d452b68e6a2 (commit)
       via  e0ecd284c0ca1b860a8cccd3414d765dbe6164e2 (commit)
       via  364dafeaac5e6d02583d4329738050842d2550b4 (commit)
       via  909ce946a5bdb7fc6f6003d8b088705fb7b7e2ff (commit)
       via  7d8906675774ce1c176b3f17c47f64ab43eea5cb (commit)
       via  8e7c30852f1cf244ae3c58e93acea705739e8625 (commit)
       via  945dd1588b84a3d19248aff4b9bd32c2ca8766eb (commit)
       via  6ab0abd1ba40262a6bca12dfdd58e3701b274028 (commit)
       via  cf1f57e51578de2ad9121e300f5b816b74938684 (commit)
       via  1010c9e519409f0eb36971df6c382fc7cc0e2732 (commit)
       via  30d37841e979eacd15f11cbaf608d507af379a86 (commit)
       via  6451f2e4aa5464fa23fd8fe801177621702b8ac8 (commit)
       via  21fe33427ce1e726e527c37716b30a519e1ebb94 (commit)
       via  58f3f7ab5c8c09f179a7106ae1ca127f128cdd2a (commit)
       via  3c08b21fd2d0b07094554f6761eac799cfc31ea0 (commit)
       via  55f6178b9a9a0d165e952eeec9a04d0234299397 (commit)
       via  d6993a413a1290f110c52ed7339b09caf8b87b15 (commit)
       via  b66b029bef5b4b0f54d204318a8928b7a6977219 (commit)
       via  8674cfd50ad24171de7d157813819f3383c3353b (commit)
       via  21810ed18611adbb902bc7dc35407fc4e4adc828 (commit)
       via  2b0b06579199967eca3d44d955ad64195d2db3c3 (commit)
       via  3539e6e39515a5e08bb2f10dc71be5841827b294 (commit)
       via  1a11ab632c333474fa8c46105245bace4028733c (commit)
       via  ce5e69f4b9ef71c9e7ee688827861eeeabaf200f (commit)
       via  473029212de9517dc2c486122cee7c9ca4c47ad5 (commit)
       via  365260291d7d6aa5434c8f2b600aee079e69b611 (commit)
       via  9d2e1158e961801db714236c6942bd3596b867e7 (commit)
       via  c25c456973ebd634256e8c5eff56397d3381fb0e (commit)
       via  c5f1bd5d798400858a0314a30230fa8d860d4e01 (commit)
       via  fe235611ed2a624783067e65b02b319fa687f372 (commit)
       via  8858ba320e28c7c7cee294257f9e2232dfd41230 (commit)
       via  d69693c92dbd82cfe5e797f2dbffc6f32e0e2ff0 (commit)
       via  d23b72550d31b105dfed14b8dc1dacaa1a71dd14 (commit)
       via  01693774dffd5ad5dd5313f24a5933f44b0e069d (commit)
       via  9c1f240500cca97fe986b529fcfc9c7f9fe1a283 (commit)
       via  77001923c90a319a1923f56e624c1048ede41542 (commit)
       via  0091c45c908f1056904865f37e136812fb769351 (commit)
       via  c976171138862577a90b34a5d9d21daac52aefec (commit)
       via  4a466ccc5f447e7284c2d479ea3ee427876896e0 (commit)
       via  0e6fc197eff8a85c0a56d2f14124c155bfb92da8 (commit)
       via  59330d17dc80a1d7dc738569b04d0436329e34f6 (commit)
       via  1416b0952adc0bfee85e15d9c86a51c32fcfd003 (commit)
       via  490f5378e5e902fcf9735fc5520a515c989f350a (commit)
       via  64e905e02fae8d63112be8a83b8f9f5f158fb2a2 (commit)
       via  406c193dfd61ceb39cfb695ccb72f20ba21f267a (commit)
       via  fffdb47c07df61fe3b1b5a698d2a745e49f0fa68 (commit)
       via  cb2a05a424c0fc4bb05d7ad171cd220a0507150e (commit)
       via  10c1e7359286edd6562c52304e9706449a9ee53f (commit)
       via  f159fab8f9d6bc4254192ce43432defd5bd400aa (commit)
       via  e780e337d170036a69879debeb347a9c26a81518 (commit)
       via  39c28fac9320fbee1cc8fc4e92c5e7cca03cccd6 (commit)
       via  25bea50304d8de6c340ea6f0bda9305948cc36cd (commit)
       via  ec2d38f6129879183ce4e940e44afa7d3e6dca27 (commit)
       via  77f0b1bfa00972e13d10233c8c5f08d7ac7e0c1b (commit)
       via  e0b527579fe18fa5ed01739e2cc5528e0777584e (commit)
       via  6eb2a147f40a3ea01b45266b1281244098d024e8 (commit)
       via  3e5ac41e4e38e6c8fa72883b803d85fcde53e25b (commit)
       via  50e758a596802a957b889814904eda9b7e7a5267 (commit)
       via  9d413ff4e3868761422f79b4f81177cec05cbae5 (commit)
       via  05709dc092dccede4206b7cfe04f1574bc2299c4 (commit)
       via  43bb88d0e15c7dc257cc8b98d5862a1fa3681549 (commit)
       via  40447d7fe7a73c6babd90fbb6d10f72f8de3a3e4 (commit)
       via  70eb5479023517624f1966791ccfaeb6d273017d (commit)
       via  fda792680a8fd21b1c80ea2a79b267381521935a (commit)
       via  5ea613b9dfb402666adb88b3d5e531bce86f5401 (commit)
       via  5e0a13ebe8f0a25bf09de76024687481c81a19fe (commit)
       via  df6941fb0efb83b0c030fb284b9397dd2bd09167 (commit)
       via  3c85ce36c1188474b6b94ad534daf51cb1497274 (commit)
       via  ef664b6f94418aa8d11848a4c938602f227bbe99 (commit)
       via  b84947c1681c49ef1c1e9c29d1d19825c8868500 (commit)
       via  bc311951f626e014fea21fce010bbefbea545b75 (commit)
       via  fad3f1b73088d66cae689dd2712f52af020ba357 (commit)
       via  5fc7f61811373868b4740de6ce002b6bbdfccd7e (commit)
       via  0a9f43066d71e9e7ee04ce121083336ee7333a9d (commit)
       via  de65e841d1de01f0b1b356d9d47acd7fd3b49a2b (commit)
       via  d3693bea252723f0cfd6dc6ea5c06347cfcb6cb3 (commit)
       via  0504474989f03a0f1f18fd678abf936abafb296c (commit)
       via  10dc1ca759592b7281265ac1378bda126c979208 (commit)
       via  5b8dfb5d5f4b34fa730e69c45c517f0cfb3fdc9f (commit)
       via  95e5ccacf6c1193b313fa90a6d39baafa2ba67d8 (commit)
       via  10176dd693120301ccc9695310c6ecbd7714cf79 (commit)
       via  c71619f7d3ec01de2c5a9a517701ecf88381830e (commit)
       via  c076dcc368300f5e767b6cb946551fd3c9c84a3b (commit)
       via  794cecd80b3fc3c9812687fa9e4e269fb1e79c60 (commit)
       via  516d126f71ecdb05f3ebac781d2518634d16209f (commit)
       via  b43cf1d3398b7004ada50e053ae235b814c9aa70 (commit)
       via  572601ef7712342e599e2b2abf22268f8f378b07 (commit)
       via  6525b509825dbbf1cbe8b30b34080aafc4e5bde3 (commit)
       via  1d36bef0f6e0a64d4d7660f5db8e9625d99302c6 (commit)
       via  e73008e45f0c67d26bd4e05c932de4c9d4431575 (commit)
       via  27f2739825c3d543301a82871ab6f07d69785998 (commit)
       via  43227b3a265d1529efe3e5b12964fe3a622f139b (commit)
       via  9af9cecdc25dc15243e6ca54895620ab472780df (commit)
       via  499208bc547b8151fcf3a4230fab28720f94b13c (commit)
       via  c11f9cac7535f27aff9575002bc31ba0149ac111 (commit)
       via  ac2cc876733c6137d525d12780275f2c02d84383 (commit)
       via  bc97d806fa1fb15e49cefe819f00089ebc33d376 (commit)
       via  bf742fa9565561e8862bbc86a38a411accbcb8c3 (commit)
       via  301e4dc1069fd7f97d977d9cbeb682e697660e49 (commit)
       via  e4fca76d5db9bd844530454894d07ddc729b4a9a (commit)
       via  3df02aa618ee1f9c125d9f2253d95a474d3d8e57 (commit)
       via  3ae5b174a710fca21895a1ad7273b6f61bf9ac1e (commit)
       via  35bc168c5a98dbe3b97c4b3179b4834b78a5b72c (commit)
       via  067a7263a97e1b4d96ee6572e79884bdcd937bbf (commit)
       via  0d93140ca76f38f05f1b689bdf31168efbc6984f (commit)
       via  5c1d46ab958097eb0da1ca692af23345e1faf66c (commit)
       via  22488d2c44b03bd255a9d223b8f8498d56534bdf (commit)
       via  b7d199af0a1844a4b6db38d315c26365617bfe41 (commit)
       via  2df6f028ca8b1ec3abb6f15eaf949174ccbb8586 (commit)
       via  a0446e23b6795c8c90c5e66438a3a1bb82f9368a (commit)
       via  662a70957f0652acd6579efc635eed8f708ab48c (commit)
       via  1a373b5f2c37cead0fe41482805fdb93ca871e37 (commit)
       via  6d6a0b0f7d5780c92c865ba2e6979195510c27b2 (commit)
       via  c18142134ed78216dbddcc40c9954a2701d6086e (commit)
       via  489aed58cb0d8bd816e07128cfcb9f5a06224083 (commit)
       via  24972a99eb2cb4d914b687ccd4050f2da0333214 (commit)
       via  f739d736bbb60a8463f04f5d56c18d09157d820e (commit)
       via  d7b63b6901f31b8b9de89762cdf76aa9ae5b00b7 (commit)
       via  58044098495d066effa7fd4742b6635d9a10fdfb (commit)
       via  99263be28ab2ec28db721ced52357c05d11947d7 (commit)
       via  70e5c7a3c6a5860d702d5e5c219dc0f3a3696d35 (commit)
       via  a96cca47cc60c482316d3b1a25cfe1cb4b838f41 (commit)
       via  fd4d26e448b1f9f45f84c0ff9ec10db54471d880 (commit)
       via  a41c0f6aa41b658c8f2947c46cb90778894f5cf3 (commit)
       via  63bae17d784c2c1522a087d71a0fcb2a9b6eddcd (commit)
       via  5d9908601b8c16c556d0153640f67aa3b26c2f57 (commit)
       via  89698c15b13dcb151ebb673a2d73b1df7339c607 (commit)
       via  21e13deb6b38f6bea48923306755f648acd2d794 (commit)
       via  eb1c9afa9a92c1506e5d4d1161b6e74d919e8f00 (commit)
       via  34d989929d8ecf3620640b5b7a5e89b89da70a89 (commit)
       via  37e385b7aaf52ac391c6964557e30f662cb1de36 (commit)
       via  f0d67ab6f71cef3e7ce0fa850f7309a94e21671b (commit)
       via  9c93e1f79c385269435299d90d00e1b36c09cf3c (commit)
       via  0ba8a53ba145475153b01dc498f85dbd2f03228b (commit)
       via  bb31e272ab27d14a5d7dce7e4270c10b99035f45 (commit)
       via  61ad64acbca1b8cc990ecf0e9ca2c54ad9eb8fe6 (commit)
       via  4d00ed0a34cae5a5da3992ca9adac46d3682fee1 (commit)
       via  bc49a7967325c523a7c25d48df30ae718a66aa63 (commit)
       via  420a8e7fb7b159452da834062cc3e040dd1b411b (commit)
       via  0befd8da0101dbc7845c8216637c8b621519bc68 (commit)
       via  f8f3944e6ba1f470eedf48aa609a50c780fe4b6d (commit)
       via  ce3f2d624d8600a27030994990b8893fcb7df24f (commit)
       via  71df6eb49c3c3ef4a9ef463f9193764149cfd5d2 (commit)
       via  809a07ce8704dcb579fc653664bd365f1a6e9ef7 (commit)
       via  19f787c5b5833623eee4d6cb88ddb909a5add7fc (commit)
       via  5de415291109d623382b364177f707e4b33e0ee1 (commit)
       via  b78c39a9e242e44357fe7c5f0f70eb534ec2a9e3 (commit)
       via  7828c8a729955de05135c2a53172b29c515a03ec (commit)
       via  c9af3e8511747cc3f530cdfe62f59955defe2121 (commit)
       via  0e435239a486e09a889d4f09a30f42053b9f687f (commit)
       via  71e0f70a3095f669e4b136c9d7624c26634db0d3 (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (88fd77aa7d9ce38a291c09eb459228f60edfb2fb)
            \
             N -- N -- N (c6e22488e438656c515c00b11a418a9e14009d21)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit c6e22488e438656c515c00b11a418a9e14009d21
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Sat Sep 29 02:41:21 2018 -0400

    13964: Initial version of dispatch-cloud.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/.licenseignore b/.licenseignore
index 51a1e7cbd..a0127cfa3 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -69,3 +69,4 @@ sdk/R/NAMESPACE
 sdk/R/.Rbuildignore
 sdk/R/ArvadosR.Rproj
 *.Rd
+lib/dispatchcloud/test/sshkey_*
diff --git a/build/run-build-packages.sh b/build/run-build-packages.sh
index 88466bd37..a93a2d553 100755
--- a/build/run-build-packages.sh
+++ b/build/run-build-packages.sh
@@ -294,6 +294,8 @@ package_go_binary cmd/arvados-server arvados-server \
     "Arvados server daemons"
 package_go_binary cmd/arvados-server arvados-controller \
     "Arvados cluster controller daemon"
+package_go_binary cmd/arvados-server crunch-dispatch-cloud \
+    "Arvados cluster cloud dispatch"
 package_go_binary sdk/go/crunchrunner crunchrunner \
     "Crunchrunner executes a command inside a container and uploads the output"
 package_go_binary services/arv-git-httpd arvados-git-httpd \
diff --git a/build/run-tests.sh b/build/run-tests.sh
index 4ddbf89c1..31c232b92 100755
--- a/build/run-tests.sh
+++ b/build/run-tests.sh
@@ -77,6 +77,10 @@ lib/cmd
 lib/controller
 lib/crunchstat
 lib/dispatchcloud
+lib/dispatchcloud/container
+lib/dispatchcloud/scheduler
+lib/dispatchcloud/ssh_executor
+lib/dispatchcloud/worker
 services/api
 services/arv-git-httpd
 services/crunchstat
@@ -923,6 +927,10 @@ gostuff=(
     lib/controller
     lib/crunchstat
     lib/dispatchcloud
+    lib/dispatchcloud/container
+    lib/dispatchcloud/scheduler
+    lib/dispatchcloud/ssh_executor
+    lib/dispatchcloud/worker
     sdk/go/arvados
     sdk/go/arvadosclient
     sdk/go/blockdigest
diff --git a/cmd/arvados-server/cmd.go b/cmd/arvados-server/cmd.go
index 1af3745df..cd15d25dd 100644
--- a/cmd/arvados-server/cmd.go
+++ b/cmd/arvados-server/cmd.go
@@ -9,6 +9,7 @@ import (
 
 	"git.curoverse.com/arvados.git/lib/cmd"
 	"git.curoverse.com/arvados.git/lib/controller"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud"
 )
 
 var (
@@ -18,7 +19,8 @@ var (
 		"-version":  cmd.Version(version),
 		"--version": cmd.Version(version),
 
-		"controller": controller.Command,
+		"controller":     controller.Command,
+		"dispatch-cloud": dispatchcloud.Command,
 	})
 )
 
diff --git a/cmd/arvados-server/crunch-dispatch-cloud.service b/cmd/arvados-server/crunch-dispatch-cloud.service
new file mode 100644
index 000000000..f8d71c975
--- /dev/null
+++ b/cmd/arvados-server/crunch-dispatch-cloud.service
@@ -0,0 +1,28 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+[Unit]
+Description=Arvados cloud dispatch
+Documentation=https://doc.arvados.org/
+After=network.target
+AssertPathExists=/etc/arvados/config.yml
+
+# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
+StartLimitInterval=0
+
+# systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
+StartLimitIntervalSec=0
+
+[Service]
+Type=notify
+EnvironmentFile=-/etc/arvados/environment
+ExecStart=/usr/bin/crunch-dispatch-cloud
+Restart=always
+RestartSec=1
+
+# systemd<=219 (centos:7, debian:8, ubuntu:trusty) obeys StartLimitInterval in the [Service] section
+StartLimitInterval=0
+
+[Install]
+WantedBy=multi-user.target
diff --git a/lib/cloud/interfaces.go b/lib/cloud/interfaces.go
new file mode 100644
index 000000000..ff7577ac8
--- /dev/null
+++ b/lib/cloud/interfaces.go
@@ -0,0 +1,160 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package cloud
+
+import (
+	"time"
+
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"golang.org/x/crypto/ssh"
+)
+
+// A RateLimitError should be returned by an InstanceSet when the
+// cloud service indicates it is rejecting all API calls for some time
+// interval.
+type RateLimitError interface {
+	// Time before which the caller should expect requests to
+	// fail.
+	EarliestRetry() time.Time
+	error
+}
+
+// A QuotaError should be returned by an InstanceSet when the cloud
+// service indicates the account cannot create more VMs than already
+// exist.
+type QuotaError interface {
+	// If true, don't create more instances until some existing
+	// instances are destroyed. If false, don't handle the error
+	// as a quota error.
+	IsQuotaError() bool
+	error
+}
+
+type InstanceSetID string
+type InstanceTags map[string]string
+type InstanceID string
+type ImageID string
+
+type ExecutorTarget interface {
+	// SSH server hostname or IP address, or empty string if
+	// unknown while instance is booting.
+	Address() string
+
+	// Return nil if the given public key matches the instance's
+	// SSH server key. If the provided Dialer is not nil,
+	// VerifyPublicKey can use it to make outgoing network
+	// connections from the instance -- e.g., to use the cloud's
+	// "this instance's metadata" API.
+	VerifyPublicKey(ssh.PublicKey, *ssh.Client) error
+}
+
+// Instance is implemented by the provider-specific instance types.
+type Instance interface {
+	ExecutorTarget
+
+	// ID returns the provider's instance ID. It must be stable
+	// for the life of the instance.
+	ID() InstanceID
+
+	// String typically returns the cloud-provided instance ID.
+	String() string
+
+	// Cloud provider's "instance type" ID. Matches a ProviderType
+	// in the cluster's InstanceTypes configuration.
+	ProviderType() string
+
+	// Get current tags
+	Tags() InstanceTags
+
+	// Replace tags with the given tags
+	SetTags(InstanceTags) error
+
+	// Shut down the node
+	Destroy() error
+}
+
+// An InstanceSet manages a set of VM instances created by an elastic
+// cloud provider like AWS, GCE, or Azure.
+//
+// All public methods of an InstanceSet, and all public methods of the
+// instances it returns, are goroutine safe.
+type InstanceSet interface {
+	// Create a new instance. If supported by the driver, add the
+	// provided public key to /root/.ssh/authorized_keys.
+	//
+	// The returned error should implement RateLimitError and
+	// QuotaError where applicable.
+	Create(arvados.InstanceType, ImageID, InstanceTags, ssh.PublicKey) (Instance, error)
+
+	// Return all instances, including ones that are booting or
+	// shutting down. Optionally, filter out nodes that don't have
+	// all of the given InstanceTags (the caller will ignore these
+	// anyway).
+	//
+	// An instance returned by successive calls to Instances() may
+	// -- but does not need to -- be represented by the same
+	// Instance object each time. Thus, the caller is responsible
+	// for de-duplicating the returned instances by comparing the
+	// InstanceIDs returned by the instances' ID() methods.
+	Instances(InstanceTags) ([]Instance, error)
+
+	// Stop any background tasks and release other resources.
+	Stop()
+}
+
+// A Driver returns an InstanceSet that uses the given InstanceSetID
+// and driver-dependent configuration parameters.
+//
+// The supplied id will be of the form "zzzzz-zzzzz-zzzzzzzzzzzzzzz"
+// where each z can be any alphanum. The returned InstanceSet must use
+// this id to tag long-lived cloud resources that it creates, and must
+// assume control of any existing resources that are tagged with the
+// same id. Tagging can be accomplished by including the ID in
+// resource names, using the cloud provider's tagging feature, or any
+// other mechanism. The tags must be visible to another instance of
+// the same driver running on a different host.
+//
+// The returned InstanceSet must ignore existing resources that are
+// visible but not tagged with the given id, except that it should log
+// a summary of such resources -- only once -- when it starts
+// up. Thus, two identically configured InstanceSets running on
+// different hosts with different ids should log about the existence
+// of each other's resources at startup, but will not interfere with
+// each other.
+//
+// Example:
+//
+//	type exampleInstanceSet struct {
+//		ownID     string
+//		AccessKey string
+//	}
+//
+//	type exampleDriver struct {}
+//
+//	func (*exampleDriver) InstanceSet(config map[string]interface{}, id InstanceSetID) (InstanceSet, error) {
+//		var is exampleInstanceSet
+//		if err := mapstructure.Decode(config, &is); err != nil {
+//			return nil, err
+//		}
+//		is.ownID = id
+//		return &is, nil
+//	}
+//
+//	var _ = registerCloudDriver("example", &exampleDriver{})
+type Driver interface {
+	InstanceSet(config map[string]interface{}, id InstanceSetID) (InstanceSet, error)
+}
+
+// DriverFunc makes a Driver using the provided function as its
+// InstanceSet method. This is similar to http.HandlerFunc.
+func DriverFunc(fn func(config map[string]interface{}, id InstanceSetID) (InstanceSet, error)) Driver {
+	return driverFunc(fn)
+}
+
+type driverFunc func(config map[string]interface{}, id InstanceSetID) (InstanceSet, error)
+
+func (df driverFunc) InstanceSet(config map[string]interface{}, id InstanceSetID) (InstanceSet, error) {
+	return df(config, id)
+}
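
The RateLimitError and QuotaError interfaces above are satisfied structurally, so a driver would typically wrap its cloud SDK's throttling/quota errors in a small type of its own. Below is a minimal sketch of such a type; the exampledriver package name and its field are placeholders, not part of this commit.

    // Sketch only: a driver-specific error satisfying cloud.RateLimitError,
    // letting the dispatcher back off until the indicated time.
    package exampledriver

    import (
            "fmt"
            "time"

            "git.curoverse.com/arvados.git/lib/cloud"
    )

    type rateLimitError struct {
            retryAfter time.Time // earliest time the cloud API should be retried
    }

    func (e rateLimitError) Error() string {
            return fmt.Sprintf("cloud API rate limited until %s", e.retryAfter)
    }

    // EarliestRetry implements cloud.RateLimitError.
    func (e rateLimitError) EarliestRetry() time.Time { return e.retryAfter }

    // Compile-time check that the interface is satisfied.
    var _ cloud.RateLimitError = rateLimitError{}

A QuotaError implementation would look the same, with an IsQuotaError() bool method returning true.
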
diff --git a/lib/cmd/cmd.go b/lib/cmd/cmd.go
index 8c65cf7ac..9292ef7e5 100644
--- a/lib/cmd/cmd.go
+++ b/lib/cmd/cmd.go
@@ -36,8 +36,9 @@ func (v Version) RunCommand(prog string, args []string, stdin io.Reader, stdout,
 	return 0
 }
 
-// Multi is a Handler that looks up its first argument in a map, and
-// invokes the resulting Handler with the remaining args.
+// Multi is a Handler that looks up its first argument in a map (after
+// stripping any "arvados-" or "crunch-" prefix), and invokes the
+// resulting Handler with the remaining args.
 //
 // Example:
 //
diff --git a/lib/dispatchcloud/cmd.go b/lib/dispatchcloud/cmd.go
new file mode 100644
index 000000000..a5a11d2fa
--- /dev/null
+++ b/lib/dispatchcloud/cmd.go
@@ -0,0 +1,17 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+	"git.curoverse.com/arvados.git/lib/cmd"
+	"git.curoverse.com/arvados.git/lib/service"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+var Command cmd.Handler = service.Command(arvados.ServiceNameDispatchCloud, newHandler)
+
+func newHandler(cluster *arvados.Cluster, _ *arvados.NodeProfile) service.Handler {
+	return &dispatcher{Cluster: cluster}
+}
diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
new file mode 100644
index 000000000..7859849b1
--- /dev/null
+++ b/lib/dispatchcloud/container/queue.go
@@ -0,0 +1,270 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package container
+
+import (
+	"io"
+	"sync"
+
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
+)
+
+type typeChooser func(*arvados.Container) (arvados.InstanceType, error)
+
+type APIClient interface {
+	RequestAndDecode(dst interface{}, method, path string, body io.Reader, params interface{}) error
+}
+
+type QueueEnt struct {
+	Container    arvados.Container
+	InstanceType arvados.InstanceType
+}
+
+func (c *QueueEnt) String() string {
+	return c.Container.UUID
+}
+
+type Queue struct {
+	logger     logrus.FieldLogger
+	chooseType typeChooser
+	client     APIClient
+
+	auth      *arvados.APIClientAuthorization
+	current   map[string]QueueEnt
+	mtx       sync.Mutex
+	keeplocal map[string]struct{}
+}
+
+func NewQueue(logger logrus.FieldLogger, chooseType typeChooser, client APIClient) *Queue {
+	return &Queue{
+		logger:     logger,
+		chooseType: chooseType,
+		client:     client,
+		current:    map[string]QueueEnt{},
+	}
+}
+
+func (cq *Queue) Forget(uuid string) {
+	cq.mtx.Lock()
+	defer cq.mtx.Unlock()
+	ctr := cq.current[uuid].Container
+	if ctr.State == arvados.ContainerStateComplete || ctr.State == arvados.ContainerStateCancelled {
+		delete(cq.current, uuid)
+	}
+}
+
+func (cq *Queue) Get(uuid string) (arvados.Container, bool) {
+	cq.mtx.Lock()
+	defer cq.mtx.Unlock()
+	if ctr, ok := cq.current[uuid]; !ok {
+		return arvados.Container{}, false
+	} else {
+		return ctr.Container, true
+	}
+}
+
+func (cq *Queue) Entries() map[string]QueueEnt {
+	cq.mtx.Lock()
+	defer cq.mtx.Unlock()
+	ret := make(map[string]QueueEnt, len(cq.current))
+	for uuid, ctr := range cq.current {
+		ret[uuid] = ctr
+	}
+	return ret
+}
+
+func (cq *Queue) Update() error {
+	cq.mtx.Lock()
+	cq.keeplocal = map[string]struct{}{}
+	cq.mtx.Unlock()
+
+	next, err := cq.poll()
+	if err != nil {
+		return err
+	}
+
+	cq.mtx.Lock()
+	defer cq.mtx.Unlock()
+	for uuid, ctr := range next {
+		if _, keep := cq.keeplocal[uuid]; keep {
+			continue
+		}
+		if cur, ok := cq.current[uuid]; !ok {
+			cq.addEnt(uuid, *ctr)
+		} else {
+			cur.Container = *ctr
+			cq.current[uuid] = cur
+		}
+	}
+	for uuid := range cq.current {
+		if _, keep := cq.keeplocal[uuid]; keep {
+			continue
+		} else if _, keep = next[uuid]; keep {
+			continue
+		} else {
+			delete(cq.current, uuid)
+		}
+	}
+	cq.keeplocal = nil
+	return nil
+}
+
+func (cq *Queue) addEnt(uuid string, ctr arvados.Container) {
+	it, err := cq.chooseType(&ctr)
+	if err != nil {
+		// FIXME: throttle warnings, cancel after timeout
+		cq.logger.Warnf("cannot run %s", &ctr)
+		return
+	}
+	cq.current[uuid] = QueueEnt{Container: ctr, InstanceType: it}
+}
+
+func (cq *Queue) Lock(uuid string) error {
+	return cq.apiUpdate(uuid, "lock")
+}
+
+func (cq *Queue) Unlock(uuid string) error {
+	return cq.apiUpdate(uuid, "unlock")
+}
+
+func (cq *Queue) Cancel(uuid string) error {
+	return cq.client.RequestAndDecode(nil, "PUT", "arvados/v1/containers/"+uuid, nil, map[string]map[string]interface{}{
+		"container": {"state": arvados.ContainerStateCancelled},
+	})
+}
+
+func (cq *Queue) apiUpdate(uuid, action string) error {
+	var resp arvados.Container
+	err := cq.client.RequestAndDecode(&resp, "POST", "arvados/v1/containers/"+uuid+"/"+action, nil, nil)
+	if err != nil {
+		return err
+	}
+
+	cq.mtx.Lock()
+	defer cq.mtx.Unlock()
+	if cq.keeplocal != nil {
+		cq.keeplocal[uuid] = struct{}{}
+	}
+	if ent, ok := cq.current[uuid]; !ok {
+		cq.addEnt(uuid, resp)
+	} else {
+		ent.Container.State, ent.Container.Priority, ent.Container.LockedByUUID = resp.State, resp.Priority, resp.LockedByUUID
+		cq.current[uuid] = ent
+	}
+	return nil
+}
+
+func (cq *Queue) poll() (map[string]*arvados.Container, error) {
+	cq.mtx.Lock()
+	size := len(cq.current)
+	auth := cq.auth
+	cq.mtx.Unlock()
+
+	if auth == nil {
+		auth = &arvados.APIClientAuthorization{}
+		err := cq.client.RequestAndDecode(auth, "GET", "arvados/v1/api_client_authorizations/current", nil, nil)
+		if err != nil {
+			return nil, err
+		}
+		cq.mtx.Lock()
+		cq.auth = auth
+		cq.mtx.Unlock()
+	}
+
+	next := make(map[string]*arvados.Container, size)
+	apply := func(updates []arvados.Container) {
+		for _, upd := range updates {
+			if next[upd.UUID] == nil {
+				next[upd.UUID] = &arvados.Container{}
+			}
+			*next[upd.UUID] = upd
+		}
+	}
+	selectParam := []string{"uuid", "state", "priority"}
+	limitParam := 1000
+
+	mine, err := cq.fetchAll(arvados.ResourceListParams{
+		Select:  selectParam,
+		Order:   "uuid",
+		Limit:   &limitParam,
+		Count:   "none",
+		Filters: []arvados.Filter{{"locked_by_uuid", "=", auth.UUID}},
+	})
+	if err != nil {
+		return nil, err
+	}
+	apply(mine)
+
+	avail, err := cq.fetchAll(arvados.ResourceListParams{
+		Select:  selectParam,
+		Order:   "uuid",
+		Limit:   &limitParam,
+		Count:   "none",
+		Filters: []arvados.Filter{{"state", "=", arvados.ContainerStateQueued}, {"priority", ">", "0"}},
+	})
+	if err != nil {
+		return nil, err
+	}
+	apply(avail)
+
+	var missing []string
+	cq.mtx.Lock()
+	for uuid, ent := range cq.current {
+		if next[uuid] == nil &&
+			ent.Container.State != arvados.ContainerStateCancelled &&
+			ent.Container.State != arvados.ContainerStateComplete {
+			missing = append(missing, uuid)
+		}
+	}
+	cq.mtx.Unlock()
+
+	for i, page := 0, 20; i < len(missing); i += page {
+		batch := missing[i:]
+		if len(batch) > page {
+			batch = batch[:page]
+		}
+		ended, err := cq.fetchAll(arvados.ResourceListParams{
+			Select:  selectParam,
+			Order:   "uuid",
+			Count:   "none",
+			Filters: []arvados.Filter{{"uuid", "in", batch}},
+		})
+		if err != nil {
+			return nil, err
+		}
+		apply(ended)
+	}
+	return next, nil
+}
+
+func (cq *Queue) fetchAll(initialParams arvados.ResourceListParams) ([]arvados.Container, error) {
+	var results []arvados.Container
+	params := initialParams
+	params.Offset = 0
+	for {
+		// This list variable must be a new one declared
+		// inside the loop: otherwise, items in the API
+		// response would get deep-merged into the items
+		// loaded in previous iterations.
+		var list arvados.ContainerList
+
+		err := cq.client.RequestAndDecode(&list, "GET", "arvados/v1/containers", nil, params)
+		if err != nil {
+			return nil, err
+		}
+		if len(list.Items) == 0 {
+			break
+		}
+
+		results = append(results, list.Items...)
+		if params.Order == "uuid" {
+			params.Filters = append(initialParams.Filters, arvados.Filter{"uuid", ">", list.Items[len(list.Items)-1].UUID})
+		} else {
+			params.Offset += len(list.Items)
+		}
+	}
+	return results, nil
+}
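
For orientation, here is a minimal usage sketch of the Queue API above: Update refreshes state from the API server, Entries returns a point-in-time snapshot, and Lock claims a container with the dispatcher's token. The pollOnce helper and its wiring are illustrative only, not part of this commit.

    package example

    import (
            "git.curoverse.com/arvados.git/lib/dispatchcloud/container"
            "git.curoverse.com/arvados.git/sdk/go/arvados"
            "github.com/Sirupsen/logrus"
    )

    // pollOnce refreshes the queue and tries to lock every queued container.
    func pollOnce(logger logrus.FieldLogger, cq *container.Queue) error {
            if err := cq.Update(); err != nil { // poll the API server
                    return err
            }
            for uuid, ent := range cq.Entries() { // snapshot; safe to iterate
                    if ent.Container.State != arvados.ContainerStateQueued {
                            continue
                    }
                    if err := cq.Lock(uuid); err != nil {
                            logger.Warnf("cannot lock %s: %s", uuid, err)
                            continue
                    }
                    logger.Infof("locked %s (instance type %s)", uuid, ent.InstanceType.Name)
            }
            return nil
    }

The dispatcher in the next file drives the same methods from its run loop, calling Update on each poll tick and handing the queue to the scheduler.
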
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
new file mode 100644
index 000000000..e47286330
--- /dev/null
+++ b/lib/dispatchcloud/dispatcher.go
@@ -0,0 +1,156 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+	"crypto/md5"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/scheduler"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
+	"golang.org/x/crypto/ssh"
+)
+
+const (
+	defaultPollInterval = time.Second
+)
+
+type containerQueue interface {
+	scheduler.ContainerQueue
+	Update() error
+}
+
+type dispatcher struct {
+	Cluster       *arvados.Cluster
+	InstanceSetID cloud.InstanceSetID
+
+	logger       logrus.FieldLogger
+	instanceSet  cloud.InstanceSet
+	pool         worker.Pool
+	queue        containerQueue
+	httpHandler  http.Handler
+	pollInterval time.Duration
+	sshKey       ssh.Signer
+
+	setupOnce sync.Once
+	stop      chan struct{}
+}
+
+// ServeHTTP implements service.Handler.
+func (disp *dispatcher) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	disp.setupOnce.Do(disp.setup)
+	disp.httpHandler.ServeHTTP(w, r)
+}
+
+// CheckHealth implements service.Handler.
+func (disp *dispatcher) CheckHealth() error {
+	disp.setupOnce.Do(disp.setup)
+	return nil
+}
+
+func (disp *dispatcher) Close() {
+	disp.setupOnce.Do(disp.setup)
+	select {
+	case disp.stop <- struct{}{}:
+	default:
+	}
+}
+
+func (disp *dispatcher) newExecutor(inst cloud.Instance) worker.Executor {
+	exr := ssh_executor.New(inst)
+	exr.SetSigners(disp.sshKey)
+	return exr
+}
+
+func (disp *dispatcher) typeChooser(ctr *arvados.Container) (arvados.InstanceType, error) {
+	return ChooseInstanceType(disp.Cluster, ctr)
+}
+
+func (disp *dispatcher) setup() {
+	disp.initialize()
+	go disp.run()
+}
+
+func (disp *dispatcher) initialize() {
+	arvClient := arvados.NewClientFromEnv()
+	if disp.InstanceSetID == "" {
+		if strings.HasPrefix(arvClient.AuthToken, "v2/") {
+			disp.InstanceSetID = cloud.InstanceSetID(strings.Split(arvClient.AuthToken, "/")[1])
+		} else {
+			// Use some other string unique to this token
+			// that doesn't reveal the token itself.
+			disp.InstanceSetID = cloud.InstanceSetID(fmt.Sprintf("%x", md5.Sum([]byte(arvClient.AuthToken))))
+		}
+	}
+	disp.stop = make(chan struct{}, 1)
+	disp.logger = logrus.StandardLogger()
+
+	if key, err := ssh.ParsePrivateKey(disp.Cluster.Dispatch.PrivateKey); err != nil {
+		disp.logger.Fatalf("error parsing configured Dispatch.PrivateKey: %s", err)
+	} else {
+		disp.sshKey = key
+	}
+
+	instanceSet, err := newInstanceSet(disp.Cluster, disp.InstanceSetID)
+	if err != nil {
+		disp.logger.Fatalf("error initializing driver: %s", err)
+	}
+	disp.instanceSet = &instanceSetProxy{instanceSet}
+	disp.pool = worker.NewPool(disp.logger, disp.instanceSet, disp.newExecutor, disp.Cluster)
+	disp.queue = container.NewQueue(disp.logger, disp.typeChooser, arvClient)
+
+	mux := http.NewServeMux()
+	mux.HandleFunc("/status.json", disp.serveStatusJSON)
+	disp.httpHandler = mux
+
+	if d := disp.Cluster.Dispatch.PollInterval; d > 0 {
+		disp.pollInterval = time.Duration(d)
+	} else {
+		disp.pollInterval = defaultPollInterval
+	}
+}
+
+func (disp *dispatcher) run() {
+	defer disp.instanceSet.Stop()
+
+	t0 := time.Now()
+	disp.logger.Infof("FixStaleLocks starting.")
+	scheduler.FixStaleLocks(disp.logger, disp.queue, disp.pool, time.Duration(disp.Cluster.Dispatch.StaleLockTimeout))
+	disp.logger.Infof("FixStaleLocks finished (%s), starting scheduling.", time.Since(t0))
+
+	wp := disp.pool.Subscribe()
+	defer disp.pool.Unsubscribe(wp)
+	poll := time.NewTicker(disp.pollInterval)
+	for {
+		scheduler.Map(disp.logger, disp.queue, disp.pool)
+		scheduler.Sync(disp.logger, disp.queue, disp.pool)
+		select {
+		case <-disp.stop:
+			return
+		case <-wp:
+		case <-poll.C:
+			err := disp.queue.Update()
+			if err != nil {
+				disp.logger.Errorf("error updating queue: %s", err)
+			}
+		}
+	}
+}
+
+func (disp *dispatcher) serveStatusJSON(w http.ResponseWriter, r *http.Request) {
+	json.NewEncoder(w).Encode(map[string]interface{}{
+		"Instances": append([]worker.WorkerView(nil), disp.pool.View()...),
+	})
+}
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
new file mode 100644
index 000000000..68168d97a
--- /dev/null
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -0,0 +1,217 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"math/rand"
+	"net/http/httptest"
+	"os"
+	"sync"
+	"time"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
+	"golang.org/x/crypto/ssh"
+	check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&DispatcherSuite{})
+
+type DispatcherSuite struct {
+	cluster     *arvados.Cluster
+	instanceSet *test.LameInstanceSet
+	stubDriver  *test.StubDriver
+	disp        *dispatcher
+}
+
+func (s *DispatcherSuite) SetUpSuite(c *check.C) {
+	logrus.StandardLogger().SetLevel(logrus.DebugLevel)
+}
+
+func (s *DispatcherSuite) SetUpTest(c *check.C) {
+	dispatchpub, _ := test.LoadTestKey(c, "test/sshkey_dispatch")
+	dispatchprivraw, err := ioutil.ReadFile("test/sshkey_dispatch")
+	c.Assert(err, check.IsNil)
+
+	_, hostpriv := test.LoadTestKey(c, "test/sshkey_vm")
+	s.stubDriver = &test.StubDriver{
+		Exec: func(inst cloud.Instance, command string, _ io.Reader, _, _ io.Writer) uint32 {
+			c.Logf("stubDriver SSHExecFunc(%s, %q, ...)", inst, command)
+			return 1
+		},
+		HostKey:        hostpriv,
+		AuthorizedKeys: []ssh.PublicKey{dispatchpub},
+	}
+
+	s.cluster = &arvados.Cluster{
+		CloudVMs: arvados.CloudVMs{
+			Driver:       "test",
+			SyncInterval: arvados.Duration(10 * time.Millisecond),
+		},
+		Dispatch: arvados.Dispatch{
+			PrivateKey:         dispatchprivraw,
+			PollInterval:       arvados.Duration(10 * time.Millisecond),
+			ProbeInterval:      arvados.Duration(10 * time.Millisecond),
+			MaxProbesPerSecond: 1000,
+		},
+		InstanceTypes: arvados.InstanceTypeMap{
+			test.InstanceType(1).Name: test.InstanceType(1),
+			test.InstanceType(2).Name: test.InstanceType(2),
+			test.InstanceType(3).Name: test.InstanceType(3),
+			test.InstanceType(4).Name: test.InstanceType(4),
+			test.InstanceType(5).Name: test.InstanceType(5),
+		},
+		NodeProfiles: map[string]arvados.NodeProfile{
+			"*": {
+				Controller:    arvados.SystemServiceInstance{Listen: os.Getenv("ARVADOS_API_HOST")},
+				DispatchCloud: arvados.SystemServiceInstance{Listen: ":"},
+			},
+		},
+	}
+	s.disp = &dispatcher{Cluster: s.cluster}
+	// Test cases can modify s.cluster before calling
+	// initialize(), and then modify private state before calling
+	// go run().
+}
+
+func (s *DispatcherSuite) TearDownTest(c *check.C) {
+	s.disp.Close()
+}
+
+func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
+	drivers["test"] = s.stubDriver
+
+	readyTime := map[cloud.InstanceID]time.Time{}
+	mu := sync.Mutex{}
+
+	gotRunCommand := make(chan bool, 1)
+
+	var stubGotCommands []string
+	s.stubDriver.Exec = func(inst cloud.Instance, command string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
+		stubGotCommands = append(stubGotCommands, command)
+		mu.Lock()
+		t, ok := readyTime[inst.ID()]
+		if !ok {
+			t = time.Now().Add(time.Duration(rand.Uint32()%40) * time.Millisecond)
+			readyTime[inst.ID()] = t
+		}
+		mu.Unlock()
+		if time.Now().Before(t) {
+			fmt.Fprintf(stderr, "stub is booting, ETA %s\n", t.Sub(time.Now()))
+			return 1
+		}
+		if command == "crunch-run --detach '"+test.ContainerUUID(1)+"'" {
+			gotRunCommand <- true
+		}
+		return 0
+	}
+
+	s.disp.setupOnce.Do(s.disp.initialize)
+	s.disp.queue = &test.Queue{
+		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
+			return test.InstanceType(ctr.RuntimeConstraints.VCPUs), nil
+		},
+		Containers: []arvados.Container{
+			{
+				UUID:     test.ContainerUUID(1),
+				State:    arvados.ContainerStateQueued,
+				Priority: 1,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					RAM:   1 << 30,
+					VCPUs: 1,
+				},
+			},
+			{
+				UUID:     test.ContainerUUID(2),
+				State:    arvados.ContainerStateQueued,
+				Priority: 2,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					RAM:   2 << 30,
+					VCPUs: 2,
+				},
+			},
+			{
+				UUID:     test.ContainerUUID(3),
+				State:    arvados.ContainerStateQueued,
+				Priority: 3,
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					RAM:   3 << 30,
+					VCPUs: 3,
+				},
+			},
+		},
+	}
+	go s.disp.run()
+	err := s.disp.CheckHealth()
+	c.Check(err, check.IsNil)
+
+	select {
+	case <-gotRunCommand:
+	case <-time.After(time.Second):
+		c.Errorf("timed out; got %q", stubGotCommands)
+	}
+
+	insts, err := s.stubDriver.InstanceSets()[0].Instances(nil)
+	c.Check(err, check.IsNil)
+	c.Check(len(insts), check.Equals, 3)
+}
+
+func (s *DispatcherSuite) TestStatus(c *check.C) {
+	var lameSet test.LameInstanceSet
+	drivers["test"] = cloud.DriverFunc(func(params map[string]interface{}, id cloud.InstanceSetID) (cloud.InstanceSet, error) {
+		return &lameSet, nil
+	})
+
+	type statusInstance struct {
+		Instance             string
+		WorkerState          string
+		Price                float64
+		LastContainerUUID    string
+		ArvadosInstanceType  string
+		ProviderInstanceType string
+	}
+	type statusResponse struct {
+		Instances []statusInstance
+	}
+	checkStatus := func() statusResponse {
+		req := httptest.NewRequest("GET", "/status.json", nil)
+		resp := httptest.NewRecorder()
+		s.disp.ServeHTTP(resp, req)
+		var sr statusResponse
+		err := json.Unmarshal(resp.Body.Bytes(), &sr)
+		c.Check(err, check.IsNil)
+		return sr
+	}
+
+	sr := checkStatus()
+	c.Check(len(sr.Instances), check.Equals, 0)
+
+	ch := s.disp.pool.Subscribe()
+	defer s.disp.pool.Unsubscribe(ch)
+	err := s.disp.pool.Create(arvados.InstanceType{
+		Name:         "a1.small-1",
+		ProviderType: "a1.small",
+		VCPUs:        1,
+		RAM:          1 << 30,
+		Price:        0.12,
+	})
+	c.Check(err, check.IsNil)
+	<-ch
+
+	sr = checkStatus()
+	c.Assert(len(sr.Instances), check.Equals, 1)
+	c.Check(sr.Instances[0].Instance, check.Matches, "lame-.*")
+	c.Check(sr.Instances[0].WorkerState, check.Equals, "booting")
+	c.Check(sr.Instances[0].Price, check.Equals, 0.12)
+	c.Check(sr.Instances[0].LastContainerUUID, check.Equals, "")
+	c.Check(sr.Instances[0].ProviderInstanceType, check.Equals, "a1.small")
+	c.Check(sr.Instances[0].ArvadosInstanceType, check.Equals, "a1.small-1")
+}
diff --git a/lib/dispatchcloud/driver.go b/lib/dispatchcloud/driver.go
new file mode 100644
index 000000000..295fd6105
--- /dev/null
+++ b/lib/dispatchcloud/driver.go
@@ -0,0 +1,22 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+	"fmt"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+var drivers = map[string]cloud.Driver{}
+
+func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID) (cloud.InstanceSet, error) {
+	driver, ok := drivers[cluster.CloudVMs.Driver]
+	if !ok {
+		return nil, fmt.Errorf("unsupported cloud driver %q", cluster.CloudVMs.Driver)
+	}
+	return driver.InstanceSet(cluster.CloudVMs.DriverParameters, setID)
+}
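
The drivers map above is package-private, so concrete drivers are registered from inside the dispatchcloud package; the test suite does exactly this with drivers["test"] (see dispatcher_test.go above). A hedged sketch of such a registration using the cloud.DriverFunc helper follows -- the "examplecloud" name and the stub body are placeholders, not part of this commit.

    package dispatchcloud

    import (
            "errors"

            "git.curoverse.com/arvados.git/lib/cloud"
    )

    func init() {
            // cluster.CloudVMs.Driver in the config selects which
            // registered entry newInstanceSet will use.
            drivers["examplecloud"] = cloud.DriverFunc(
                    func(config map[string]interface{}, id cloud.InstanceSetID) (cloud.InstanceSet, error) {
                            // A real driver would decode config (credentials,
                            // region, etc.) and return its own InstanceSet.
                            return nil, errors.New("examplecloud driver not implemented")
                    })
    }
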
diff --git a/lib/dispatchcloud/instance_set_proxy.go b/lib/dispatchcloud/instance_set_proxy.go
new file mode 100644
index 000000000..e728b67cd
--- /dev/null
+++ b/lib/dispatchcloud/instance_set_proxy.go
@@ -0,0 +1,25 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"golang.org/x/crypto/ssh"
+)
+
+type instanceSetProxy struct {
+	cloud.InstanceSet
+}
+
+func (is *instanceSetProxy) Create(it arvados.InstanceType, id cloud.ImageID, tags cloud.InstanceTags, pk ssh.PublicKey) (cloud.Instance, error) {
+	// TODO: return if Create failed recently with a RateLimitError or QuotaError
+	return is.InstanceSet.Create(it, id, tags, pk)
+}
+
+func (is *instanceSetProxy) Instances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
+	// TODO: return if Instances failed recently with a RateLimitError
+	return is.InstanceSet.Instances(tags)
+}
diff --git a/lib/dispatchcloud/logger.go b/lib/dispatchcloud/logger.go
new file mode 100644
index 000000000..90bb6ca68
--- /dev/null
+++ b/lib/dispatchcloud/logger.go
@@ -0,0 +1,29 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+	"sync"
+	"time"
+)
+
+type logger interface {
+	Printf(string, ...interface{})
+	Warnf(string, ...interface{})
+	Debugf(string, ...interface{})
+}
+
+var nextSpam = map[string]time.Time{}
+var nextSpamMtx sync.Mutex
+
+func unspam(msg string) bool {
+	nextSpamMtx.Lock()
+	defer nextSpamMtx.Unlock()
+	if nextSpam[msg].Before(time.Now()) {
+		nextSpam[msg] = time.Now().Add(time.Minute)
+		return true
+	}
+	return false
+}
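
unspam is not called anywhere in this commit yet; judging from the one-minute window per message, it is apparently meant to throttle repetitive log output. A small hypothetical illustration (warnThrottled is not part of this commit):

    package dispatchcloud

    import "github.com/Sirupsen/logrus"

    // warnThrottled logs msg at most once per minute per distinct message.
    func warnThrottled(logger logrus.FieldLogger, msg string) {
            if unspam(msg) {
                    logger.Warn(msg)
            }
    }
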
diff --git a/lib/dispatchcloud/readme.go b/lib/dispatchcloud/readme.go
new file mode 100644
index 000000000..a4b005eb8
--- /dev/null
+++ b/lib/dispatchcloud/readme.go
@@ -0,0 +1,79 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+// A dispatcher comprises a container queue, a scheduler, a worker
+// pool, a cloud provider, a stale-lock fixer, and a syncer.
+// 1. Choose a provider.
+// 2. Start a worker pool.
+// 3. Start a container queue.
+// 4. Run a stale-lock fixer.
+// 5. Start a scheduler.
+// 6. Start a syncer.
+//
+//
+// A provider (cloud driver) creates new cloud VM instances and gets
+// the latest list of instances. The returned instances implement
+// proxies to the provider's metadata and control interfaces (get IP
+// address, update tags, shutdown).
+//
+//
+// A workerPool tracks workers' instance types and readiness states
+// (available to do work now, booting, suffering a temporary network
+// outage, shutting down). It loads internal state from the cloud
+// provider's list of instances at startup, and syncs periodically
+// after that.
+//
+//
+// A worker maintains a multiplexed SSH connection to a cloud
+// instance, retrying/reconnecting as needed, so the workerPool can
+// execute commands. It asks the provider's instance to verify its SSH
+// public key once when first connecting, and again later if the key
+// changes.
+//
+//
+// A container queue tracks the known state (according to
+// arvados-controller) of each container of interest -- i.e., queued,
+// or locked/running using our own dispatch token. It also proxies the
+// dispatcher's lock/unlock/cancel requests to the controller. It
+// handles concurrent refresh and update operations without exposing
+// out-of-order updates to its callers. (It drops any new information
+// that might have originated before its own most recent
+// lock/unlock/cancel operation.)
+//
+//
+// A stale-lock fixer waits for any already-locked containers (i.e.,
+// locked by a prior server process) to appear on workers as the
+// worker pool recovers its state. It unlocks/requeues any that still
+// remain once all workers have been recovered or shut down, or its
+// timer expires.
+//
+//
+// A scheduler chooses which containers to assign to which idle
+// workers, and decides what to do when there are not enough idle
+// workers (including shutting down some idle nodes).
+//
+//
+// A syncer updates state to Cancelled when a running container
+// process dies without finalizing its entry in the controller
+// database. It also calls the worker pool to kill containers that
+// have priority=0 while locked or running.
+//
+//
+// A provider proxy wraps a provider with rate-limiting logic. After
+// the wrapped provider receives a cloud.RateLimitError, the proxy
+// starts returning errors to callers immediately without calling
+// through to the wrapped provider.
+//
+//
+// TBD: Bootstrapping script via SSH, too? Future version.
+//
+// TBD: drain instance, keep instance alive
+// TBD: metrics, diagnostics
+// TBD: why dispatch token currently passed to worker?
+//
+// Metrics: queue size, time each job has spent queued, #idle/busy/booting nodes
+// Timing in each step, and end-to-end
+// Metrics: boot/idle/alloc time and cost
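
To make the prose above concrete, here is a sketch of how the pieces added in this commit fit together. It is illustrative only: the real wiring lives in the dispatcher itself, the queue value is assumed to satisfy scheduler.ContainerQueue (e.g. a container.Queue), and all config/setup details are omitted.

    // Illustrative sketch only; runDispatcher is not part of this commit.
    func runDispatcher(logger logrus.FieldLogger, queue scheduler.ContainerQueue, pool scheduler.WorkerPool) {
            // Step 4: requeue containers locked by a previous process.
            scheduler.FixStaleLocks(logger, queue, pool, time.Minute)

            notify := pool.Subscribe()
            defer pool.Unsubscribe(notify)
            for range notify {
                    // Step 5: map queued/locked containers onto workers.
                    scheduler.Map(logger, queue, pool)
                    // Step 6: reconcile container states. Sync must not
                    // run concurrently with Map, so both run in this loop.
                    scheduler.Sync(logger, queue, pool)
            }
    }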
diff --git a/lib/dispatchcloud/scheduler/fix_stale_locks.go b/lib/dispatchcloud/scheduler/fix_stale_locks.go
new file mode 100644
index 000000000..525947771
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/fix_stale_locks.go
@@ -0,0 +1,53 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+	"time"
+
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
+)
+
+// FixStaleLocks waits for any already-locked containers (i.e., locked
+// by a prior server process) to appear on workers as the worker pool
+// recovers its state. It unlocks any that still remain once all
+// workers have been recovered or shut down, or its timer expires.
+func FixStaleLocks(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool, limit time.Duration) {
+	wp := pool.Subscribe()
+	defer pool.Unsubscribe(wp)
+	timeout := time.NewTimer(limit)
+waiting:
+	for {
+		unlock := false
+		select {
+		case <-wp:
+			// If all workers have been contacted, unlock
+			// containers that aren't claimed by any
+			// worker.
+			unlock = pool.Workers()[worker.StateUnknown] == 0
+		case <-timeout.C:
+			// Give up and unlock the containers, even
+			// though they might be working.
+			unlock = true
+		}
+
+		running := pool.Running()
+		for uuid, ent := range queue.Entries() {
+			if ent.Container.State == arvados.ContainerStateLocked && !running[uuid] {
+				if unlock {
+					err := queue.Unlock(uuid)
+					if err != nil {
+						logger.Warnf("Unlock %s: %s", uuid, err)
+					}
+				} else {
+					continue waiting
+				}
+			}
+		}
+		return
+	}
+}
diff --git a/lib/dispatchcloud/scheduler/gocheck_test.go b/lib/dispatchcloud/scheduler/gocheck_test.go
new file mode 100644
index 000000000..558c60f73
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/gocheck_test.go
@@ -0,0 +1,16 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+	"testing"
+
+	check "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+	check.TestingT(t)
+}
diff --git a/lib/dispatchcloud/scheduler/interfaces.go b/lib/dispatchcloud/scheduler/interfaces.go
new file mode 100644
index 000000000..c57fb1f31
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/interfaces.go
@@ -0,0 +1,38 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// A ContainerQueue is a set of containers that need to be started or
+// stopped. Implemented by container.Queue and test stubs.
+type ContainerQueue interface {
+	Entries() map[string]container.QueueEnt
+	Lock(uuid string) error
+	Unlock(uuid string) error
+	Cancel(uuid string) error
+	Forget(uuid string)
+	Get(uuid string) (arvados.Container, bool)
+}
+
+// A WorkerPool asynchronously starts and stops worker VMs, and starts
+// and stops containers on them. Implemented by worker.Pool and test
+// stubs.
+type WorkerPool interface {
+	Running() map[string]bool
+	Unallocated() map[arvados.InstanceType]int
+	Workers() map[worker.State]int
+	AtQuota() bool
+	Create(arvados.InstanceType) error
+	Shutdown(arvados.InstanceType) bool
+	StartContainer(arvados.InstanceType, arvados.Container) bool
+	KillContainer(uuid string)
+	Subscribe() <-chan struct{}
+	Unsubscribe(<-chan struct{})
+}
diff --git a/lib/dispatchcloud/scheduler/map.go b/lib/dispatchcloud/scheduler/map.go
new file mode 100644
index 000000000..57372eaab
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/map.go
@@ -0,0 +1,127 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+	"sort"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
+)
+
+// Map queued containers onto unallocated workers in priority order,
+// creating new workers if needed. Lock containers that can be mapped
+// onto existing/pending workers, and start them if possible.
+//
+// Unlock any containers that are locked but can't be mapped.
+//
+// If errors are encountered creating new workers, shut down idle
+// workers in case they are consuming quota.
+func Map(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
+	unsorted := queue.Entries()
+	sorted := make([]container.QueueEnt, 0, len(unsorted))
+	for _, ent := range unsorted {
+		sorted = append(sorted, ent)
+	}
+	sort.Slice(sorted, func(i, j int) bool {
+		return sorted[i].Container.Priority > sorted[j].Container.Priority
+	})
+
+	running := pool.Running()
+	unalloc := pool.Unallocated()
+
+	logger.Debugf("Map: Q=%d, R=%d", len(sorted), len(running))
+
+	dontstart := map[arvados.InstanceType]bool{}
+	var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+
+	for i, ctr := range sorted {
+		ctr, it := ctr.Container, ctr.InstanceType
+		if running[ctr.UUID] || ctr.Priority < 1 {
+			continue
+		}
+		if ctr.State == arvados.ContainerStateQueued {
+			logger.Debugf("Map: lock ctr %+v", ctr)
+			if unalloc[it] < 1 && pool.AtQuota() {
+				overquota = sorted[i:]
+				break
+			}
+			err := queue.Lock(ctr.UUID)
+			if err != nil {
+				logger.Warnf("Map: Lock(%s): %s", ctr.UUID, err)
+				unalloc[it]++
+				continue
+			}
+			var ok bool
+			ctr, ok = queue.Get(ctr.UUID)
+			if !ok {
+				logger.Errorf("Map: (BUG?) container %s disappeared from queue after Lock succeeded", ctr.UUID)
+				continue
+			}
+			if ctr.State != arvados.ContainerStateLocked {
+				logger.Debugf("Map: (race?) container %s has state=%q after Lock succeeded", ctr.UUID, ctr.State)
+			}
+		}
+		if ctr.State != arvados.ContainerStateLocked {
+			continue
+		}
+		logger.Debugf("Map: try ctr %s", ctr.UUID)
+		if unalloc[it] < 1 {
+			logger.Debugf("Map: create worker with type %s", it.Name)
+			err := pool.Create(it)
+			if err != nil {
+				if _, ok := err.(cloud.QuotaError); !ok {
+					logger.Warnf("Map: pool.Create: %s", err)
+				}
+				queue.Unlock(ctr.UUID)
+				// Don't let lower-priority containers
+				// starve this one by keeping idle
+				// workers alive on different
+				// instance types.  TODO: avoid
+				// getting starved here if instances
+				// of a specific type always fail.
+				overquota = sorted[i:]
+				break
+			}
+			unalloc[it]++
+		}
+		if dontstart[it] {
+			// We already tried & failed to start a
+			// higher-priority container on the same
+			// instance type. Don't let this one sneak in
+			// ahead of it.
+		} else if pool.StartContainer(it, ctr) {
+			logger.Debugf("Map: started %s", ctr.UUID)
+			unalloc[it]--
+		} else {
+			dontstart[it] = true
+		}
+	}
+
+	if len(overquota) > 0 {
+		// Unlock any containers that are unmappable while
+		// we're at quota.
+		for _, ctr := range overquota {
+			ctr := ctr.Container
+			if ctr.State == arvados.ContainerStateLocked {
+				logger.Debugf("Map: Unlock(%s) because pool capacity is used by higher priority containers", ctr.UUID)
+				err := queue.Unlock(ctr.UUID)
+				if err != nil {
+					logger.Warnf("Map: error unlocking %s: %s", ctr.UUID, err)
+				}
+			}
+		}
+		// Shut down idle workers that didn't get any
+		// containers mapped onto them before we hit quota.
+		for it, n := range unalloc {
+			if n < 1 {
+				continue
+			}
+			pool.Shutdown(it)
+		}
+	}
+}
diff --git a/lib/dispatchcloud/scheduler/map_test.go b/lib/dispatchcloud/scheduler/map_test.go
new file mode 100644
index 000000000..c40b3041b
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/map_test.go
@@ -0,0 +1,259 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+	"errors"
+	"fmt"
+
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
+	check "gopkg.in/check.v1"
+)
+
+var (
+	logger = logrus.StandardLogger()
+
+	// arbitrary example instance types
+	types = func() (r []arvados.InstanceType) {
+		for i := 0; i < 16; i++ {
+			r = append(r, test.InstanceType(i))
+		}
+		return
+	}()
+
+	// arbitrary example container UUIDs
+	uuids = func() (r []string) {
+		for i := 0; i < 16; i++ {
+			r = append(r, test.ContainerUUID(i))
+		}
+		return
+	}()
+)
+
+type stubQueue struct {
+	ents map[string]container.QueueEnt
+}
+
+func (q *stubQueue) Entries() map[string]container.QueueEnt {
+	return q.ents
+}
+func (q *stubQueue) Lock(uuid string) error {
+	return q.setState(uuid, arvados.ContainerStateLocked)
+}
+func (q *stubQueue) Unlock(uuid string) error {
+	return q.setState(uuid, arvados.ContainerStateQueued)
+}
+func (q *stubQueue) Get(uuid string) (arvados.Container, bool) {
+	ent, ok := q.ents[uuid]
+	return ent.Container, ok
+}
+func (q *stubQueue) setState(uuid string, state arvados.ContainerState) error {
+	ent, ok := q.ents[uuid]
+	if !ok {
+		return fmt.Errorf("no such ent: %q", uuid)
+	}
+	ent.Container.State = state
+	q.ents[uuid] = ent
+	return nil
+}
+
+type stubQuotaError struct {
+	error
+}
+
+func (stubQuotaError) IsQuotaError() bool { return true }
+
+type stubPool struct {
+	notify    <-chan struct{}
+	unalloc   map[arvados.InstanceType]int // idle+booting+unknown
+	idle      map[arvados.InstanceType]int
+	running   map[string]bool
+	atQuota   bool
+	canCreate int
+	creates   []arvados.InstanceType
+	starts    []string
+	shutdowns int
+}
+
+func (p *stubPool) AtQuota() bool               { return p.atQuota }
+func (p *stubPool) Subscribe() <-chan struct{}  { return p.notify }
+func (p *stubPool) Unsubscribe(<-chan struct{}) {}
+func (p *stubPool) Running() map[string]bool    { return p.running }
+func (p *stubPool) Unallocated() map[arvados.InstanceType]int {
+	r := map[arvados.InstanceType]int{}
+	for it, n := range p.unalloc {
+		r[it] = n
+	}
+	return r
+}
+func (p *stubPool) Create(it arvados.InstanceType) error {
+	p.creates = append(p.creates, it)
+	if p.canCreate < 1 {
+		return stubQuotaError{errors.New("quota")}
+	}
+	p.canCreate--
+	p.unalloc[it]++
+	return nil
+}
+func (p *stubPool) Shutdown(arvados.InstanceType) bool {
+	p.shutdowns++
+	return false
+}
+func (p *stubPool) Workers() map[worker.State]int {
+	return map[worker.State]int{
+		worker.StateBooting: len(p.unalloc) - len(p.idle),
+		worker.StateRunning: len(p.idle) - len(p.running),
+	}
+}
+func (p *stubPool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
+	p.starts = append(p.starts, ctr.UUID)
+	if p.idle[it] == 0 {
+		return false
+	}
+	p.idle[it]--
+	p.unalloc[it]--
+	p.running[ctr.UUID] = true
+	return true
+}
+
+var _ = check.Suite(&SchedulerSuite{})
+
+type SchedulerSuite struct{}
+
+// Map the priority=4 container to an idle node. Create a new
+// instance for the priority=3 container. Don't try to start any
+// priority<3 containers because the priority=3 container didn't
+// start immediately. Don't try to create any other nodes after the
+// failed create.
+func (*SchedulerSuite) TestMapIdle(c *check.C) {
+	queue := stubQueue{
+		ents: map[string]container.QueueEnt{
+			uuids[1]: {
+				Container:    arvados.Container{UUID: uuids[1], Priority: 1, State: arvados.ContainerStateQueued},
+				InstanceType: types[1],
+			},
+			uuids[2]: {
+				Container:    arvados.Container{UUID: uuids[2], Priority: 2, State: arvados.ContainerStateQueued},
+				InstanceType: types[1],
+			},
+			uuids[3]: {
+				Container:    arvados.Container{UUID: uuids[3], Priority: 3, State: arvados.ContainerStateQueued},
+				InstanceType: types[1],
+			},
+			uuids[4]: {
+				Container:    arvados.Container{UUID: uuids[4], Priority: 4, State: arvados.ContainerStateQueued},
+				InstanceType: types[1],
+			},
+		},
+	}
+	pool := stubPool{
+		unalloc: map[arvados.InstanceType]int{
+			types[1]: 1,
+			types[2]: 2,
+		},
+		idle: map[arvados.InstanceType]int{
+			types[1]: 1,
+			types[2]: 2,
+		},
+		running:   map[string]bool{},
+		canCreate: 1,
+	}
+	Map(logger, &queue, &pool)
+	c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{types[1]})
+	c.Check(pool.starts, check.DeepEquals, []string{uuids[4], uuids[3]})
+	c.Check(pool.running, check.DeepEquals, map[string]bool{uuids[4]: true})
+}
+
+// Shutdown some nodes if Create() fails -- and without even calling
+// Create(), if AtQuota() is true.
+func (*SchedulerSuite) TestMapShutdownAtQuota(c *check.C) {
+	for quota := 0; quota < 2; quota++ {
+		shouldCreate := types[1 : 1+quota]
+		queue := stubQueue{
+			ents: map[string]container.QueueEnt{
+				uuids[1]: {
+					Container:    arvados.Container{UUID: uuids[1], Priority: 1, State: arvados.ContainerStateQueued},
+					InstanceType: types[1],
+				},
+			},
+		}
+		pool := stubPool{
+			atQuota: quota == 0,
+			unalloc: map[arvados.InstanceType]int{
+				types[2]: 2,
+			},
+			idle: map[arvados.InstanceType]int{
+				types[2]: 2,
+			},
+			running:   map[string]bool{},
+			creates:   []arvados.InstanceType{},
+			starts:    []string{},
+			canCreate: 0,
+		}
+		Map(logger, &queue, &pool)
+		c.Check(pool.creates, check.DeepEquals, shouldCreate)
+		c.Check(pool.starts, check.DeepEquals, []string{})
+		c.Check(pool.shutdowns, check.Not(check.Equals), 0)
+	}
+}
+
+// Start lower-priority containers while waiting for new/existing
+// workers to come up for higher-priority containers.
+func (*SchedulerSuite) TestMapStartWhileCreating(c *check.C) {
+	pool := stubPool{
+		unalloc: map[arvados.InstanceType]int{
+			types[1]: 1,
+			types[2]: 1,
+		},
+		idle: map[arvados.InstanceType]int{
+			types[1]: 1,
+			types[2]: 1,
+		},
+		running:   map[string]bool{},
+		canCreate: 2,
+	}
+	queue := stubQueue{
+		ents: map[string]container.QueueEnt{
+			uuids[1]: {
+				// create a new worker
+				Container:    arvados.Container{UUID: uuids[1], Priority: 1, State: arvados.ContainerStateQueued},
+				InstanceType: types[1],
+			},
+			uuids[2]: {
+				// tentatively map to unalloc worker
+				Container:    arvados.Container{UUID: uuids[2], Priority: 2, State: arvados.ContainerStateQueued},
+				InstanceType: types[1],
+			},
+			uuids[3]: {
+				// start now on idle worker
+				Container:    arvados.Container{UUID: uuids[3], Priority: 3, State: arvados.ContainerStateQueued},
+				InstanceType: types[1],
+			},
+			uuids[4]: {
+				// create a new worker
+				Container:    arvados.Container{UUID: uuids[4], Priority: 4, State: arvados.ContainerStateQueued},
+				InstanceType: types[2],
+			},
+			uuids[5]: {
+				// tentatively map to unalloc worker
+				Container:    arvados.Container{UUID: uuids[5], Priority: 5, State: arvados.ContainerStateQueued},
+				InstanceType: types[2],
+			},
+			uuids[6]: {
+				// start now on idle worker
+				Container:    arvados.Container{UUID: uuids[6], Priority: 6, State: arvados.ContainerStateQueued},
+				InstanceType: types[2],
+			},
+		},
+	}
+	Map(logger, &queue, &pool)
+	c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{types[2], types[1]})
+	c.Check(pool.starts, check.DeepEquals, []string{uuids[6], uuids[5], uuids[3], uuids[2]})
+	c.Check(pool.running, check.DeepEquals, map[string]bool{uuids[3]: true, uuids[6]: true})
+}
diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
new file mode 100644
index 000000000..6a175127e
--- /dev/null
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -0,0 +1,51 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package scheduler
+
+import (
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
+)
+
+// Sync resolves discrepancies between the queue and the pool. It must
+// not be called concurrently with Map.
+func Sync(logger logrus.FieldLogger, queue ContainerQueue, pool WorkerPool) {
+	running := pool.Running()
+	cancel := func(ent container.QueueEnt, reason string) {
+		uuid := ent.Container.UUID
+		logger.Printf("cancelling container %s because %s", uuid, reason)
+		err := queue.Cancel(uuid)
+		if err != nil {
+			logger.Printf("error cancelling container %s: %s", uuid, err)
+		}
+	}
+	kill := func(ent container.QueueEnt) {
+		uuid := ent.Container.UUID
+		logger.Printf("killing running container %s because state is %s", uuid, ent.Container.State)
+		pool.KillContainer(uuid)
+	}
+	for uuid, ent := range queue.Entries() {
+		switch ent.Container.State {
+		case arvados.ContainerStateRunning:
+			if !running[uuid] {
+				cancel(ent, "not running on any worker")
+			}
+		case arvados.ContainerStateComplete, arvados.ContainerStateCancelled:
+			if running[uuid] {
+				kill(ent)
+			} else {
+				queue.Forget(uuid)
+			}
+		case arvados.ContainerStateQueued:
+			if running[uuid] {
+				kill(ent)
+			}
+		case arvados.ContainerStateLocked:
+		default:
+			logger.Errorf("BUG: container %s has unexpected state %q", uuid, ent.Container.State)
+		}
+	}
+}
diff --git a/lib/dispatchcloud/ssh_executor/executor.go b/lib/dispatchcloud/ssh_executor/executor.go
new file mode 100644
index 000000000..6a6ff22ab
--- /dev/null
+++ b/lib/dispatchcloud/ssh_executor/executor.go
@@ -0,0 +1,148 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package ssh_executor
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"net"
+	"sync"
+	"time"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"golang.org/x/crypto/ssh"
+)
+
+// New returns an executor (which satisfies worker.Executor) that
+// executes commands over a multiplexed SSH connection to the target.
+func New(t cloud.ExecutorTarget) *executor {
+	return &executor{target: t}
+}
+
+type executor struct {
+	target  cloud.ExecutorTarget
+	signers []ssh.Signer
+	mtx     sync.RWMutex // controls access to instance after creation
+
+	client      *ssh.Client
+	clientErr   error
+	clientOnce  sync.Once
+	clientSetup chan bool
+	publicKey   ssh.PublicKey
+}
+
+func (exr *executor) SetSigners(signers ...ssh.Signer) {
+	exr.mtx.Lock()
+	defer exr.mtx.Unlock()
+	exr.signers = signers
+}
+
+func (exr *executor) SetTarget(t cloud.ExecutorTarget) {
+	exr.mtx.Lock()
+	defer exr.mtx.Unlock()
+	exr.target = t
+}
+
+func (exr *executor) Target() cloud.ExecutorTarget {
+	exr.mtx.RLock()
+	defer exr.mtx.RUnlock()
+	return exr.target
+}
+
+func (exr *executor) Execute(cmd string, stdin io.Reader) ([]byte, []byte, error) {
+	session, err := exr.newSession()
+	if err != nil {
+		return nil, nil, err
+	}
+	defer session.Close()
+	var stdout, stderr bytes.Buffer
+	session.Stdin = stdin
+	session.Stdout = &stdout
+	session.Stderr = &stderr
+	err = session.Run(cmd)
+	return stdout.Bytes(), stderr.Bytes(), err
+}
+
+// Create a new SSH session. If session setup fails or the SSH client
+// hasn't been set up yet, set up a new SSH client and try again.
+func (exr *executor) newSession() (*ssh.Session, error) {
+	try := func(create bool) (*ssh.Session, error) {
+		client, err := exr.sshClient(create)
+		if err != nil {
+			return nil, err
+		}
+		return client.NewSession()
+	}
+	session, err := try(false)
+	if err != nil {
+		session, err = try(true)
+	}
+	return session, err
+}
+
+// Get the latest SSH client. If another goroutine is in the process
+// of setting one up, wait for it to finish and return its result (or
+// the last successfully set-up client, if it fails).
+func (exr *executor) sshClient(create bool) (*ssh.Client, error) {
+	exr.clientOnce.Do(func() {
+		exr.clientSetup = make(chan bool, 1)
+		exr.clientErr = errors.New("client not yet created")
+	})
+	defer func() { <-exr.clientSetup }()
+	select {
+	case exr.clientSetup <- true:
+		if create {
+			client, err := exr.setupSSHClient()
+			if err == nil || exr.client == nil {
+				exr.client, exr.clientErr = client, err
+			}
+			if err != nil {
+				return nil, err
+			}
+		}
+	default:
+		// Another goroutine is doing the above case.  Wait
+		// for it to finish and return whatever it leaves in
+		// exr.client.
+		exr.clientSetup <- true
+	}
+	return exr.client, exr.clientErr
+}
+
+// Create a new SSH client.
+func (exr *executor) setupSSHClient() (*ssh.Client, error) {
+	target := exr.Target()
+	addr := target.Address()
+	if addr == "" {
+		return nil, errors.New("instance has no address")
+	}
+	var receivedKey ssh.PublicKey
+	client, err := ssh.Dial("tcp", addr, &ssh.ClientConfig{
+		User: "root",
+		Auth: []ssh.AuthMethod{
+			ssh.PublicKeys(exr.signers...),
+		},
+		HostKeyCallback: func(hostname string, remote net.Addr, key ssh.PublicKey) error {
+			receivedKey = key
+			return nil
+		},
+		Timeout: time.Minute,
+	})
+	if err != nil {
+		return nil, err
+	} else if receivedKey == nil {
+		return nil, errors.New("BUG: key was never provided to HostKeyCallback")
+	}
+
+	if exr.publicKey == nil || !bytes.Equal(exr.publicKey.Marshal(), receivedKey.Marshal()) {
+		err = target.VerifyPublicKey(receivedKey, client)
+		if err != nil {
+			return nil, err
+		}
+		exr.publicKey = receivedKey
+	}
+	return client, nil
+}
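
Typical use of the executor, as exercised by the test below: create it with a target, install the dispatcher's SSH key, then run commands. The following is a sketch only (probeWorker and the "echo ok" command are illustrative, not part of this commit):

    // Sketch: run a trivial command on a worker and report failure.
    func probeWorker(target cloud.ExecutorTarget, signer ssh.Signer) error {
            exr := ssh_executor.New(target)
            exr.SetSigners(signer)
            _, stderr, err := exr.Execute("echo ok", nil)
            if err != nil {
                    return fmt.Errorf("probe failed: %s (stderr %q)", err, stderr)
            }
            return nil
    }

Subsequent calls reuse the multiplexed SSH connection, and SetTarget can point the executor at a new address later without blocking concurrent Execute calls.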
diff --git a/lib/dispatchcloud/ssh_executor/executor_test.go b/lib/dispatchcloud/ssh_executor/executor_test.go
new file mode 100644
index 000000000..75f057071
--- /dev/null
+++ b/lib/dispatchcloud/ssh_executor/executor_test.go
@@ -0,0 +1,102 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package ssh_executor
+
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+	"sync"
+	"testing"
+	"time"
+
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+	"golang.org/x/crypto/ssh"
+	check "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+	check.TestingT(t)
+}
+
+var _ = check.Suite(&ExecutorSuite{})
+
+type testTarget struct {
+	test.SSHService
+}
+
+func (*testTarget) VerifyPublicKey(ssh.PublicKey, *ssh.Client) error {
+	return nil
+}
+
+type ExecutorSuite struct{}
+
+func (s *ExecutorSuite) TestExecute(c *check.C) {
+	command := `foo 'bar' "baz"`
+	stdinData := "foobar\nbaz\n"
+	_, hostpriv := test.LoadTestKey(c, "../test/sshkey_vm")
+	clientpub, clientpriv := test.LoadTestKey(c, "../test/sshkey_dispatch")
+	for _, exitcode := range []int{0, 1, 2} {
+		srv := &testTarget{
+			SSHService: test.SSHService{
+				Exec: func(cmd string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
+					c.Check(cmd, check.Equals, command)
+					var wg sync.WaitGroup
+					wg.Add(2)
+					go func() {
+						io.WriteString(stdout, "stdout\n")
+						wg.Done()
+					}()
+					go func() {
+						io.WriteString(stderr, "stderr\n")
+						wg.Done()
+					}()
+					buf, err := ioutil.ReadAll(stdin)
+					wg.Wait()
+					c.Check(err, check.IsNil)
+					if err != nil {
+						return 99
+					}
+					_, err = stdout.Write(buf)
+					c.Check(err, check.IsNil)
+					return uint32(exitcode)
+				},
+				HostKey:        hostpriv,
+				AuthorizedKeys: []ssh.PublicKey{clientpub},
+			},
+		}
+		err := srv.Start()
+		c.Check(err, check.IsNil)
+		c.Logf("srv address %q", srv.Address())
+		defer srv.Close()
+
+		exr := New(srv)
+		exr.SetSigners(clientpriv)
+
+		done := make(chan bool)
+		go func() {
+			stdout, stderr, err := exr.Execute(command, bytes.NewBufferString(stdinData))
+			if exitcode == 0 {
+				c.Check(err, check.IsNil)
+			} else {
+				c.Check(err, check.NotNil)
+				err, ok := err.(*ssh.ExitError)
+				c.Assert(ok, check.Equals, true)
+				c.Check(err.ExitStatus(), check.Equals, exitcode)
+			}
+			c.Check(stdout, check.DeepEquals, []byte("stdout\n"+stdinData))
+			c.Check(stderr, check.DeepEquals, []byte("stderr\n"))
+			close(done)
+		}()
+
+		timeout := time.NewTimer(time.Second)
+		select {
+		case <-done:
+		case <-timeout.C:
+			c.Fatal("timed out")
+		}
+	}
+}
diff --git a/lib/dispatchcloud/test/fixtures.go b/lib/dispatchcloud/test/fixtures.go
new file mode 100644
index 000000000..5c14e3131
--- /dev/null
+++ b/lib/dispatchcloud/test/fixtures.go
@@ -0,0 +1,24 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+	"fmt"
+
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+func ContainerUUID(i int) string {
+	return fmt.Sprintf("zzzzz-dz642-%015d", i)
+}
+
+func InstanceType(i int) arvados.InstanceType {
+	return arvados.InstanceType{
+		Name:         fmt.Sprintf("type%d", i),
+		ProviderType: fmt.Sprintf("providertype%d", i),
+		VCPUs:        i,
+		RAM:          arvados.ByteSize(i) << 30,
+	}
+}
diff --git a/lib/dispatchcloud/test/lame_instance_set.go b/lib/dispatchcloud/test/lame_instance_set.go
new file mode 100644
index 000000000..996a63821
--- /dev/null
+++ b/lib/dispatchcloud/test/lame_instance_set.go
@@ -0,0 +1,118 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+	"fmt"
+	"math/rand"
+	"sync"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"golang.org/x/crypto/ssh"
+)
+
+// LameInstanceSet creates instances that boot but can't run
+// containers.
+type LameInstanceSet struct {
+	Hold chan bool // set to make(chan bool) to hold operations until Release is called
+
+	mtx       sync.Mutex
+	instances map[*lameInstance]bool
+}
+
+// Create returns a new instance.
+func (p *LameInstanceSet) Create(instType arvados.InstanceType, imageID cloud.ImageID, tags cloud.InstanceTags, pubkey ssh.PublicKey) (cloud.Instance, error) {
+	inst := &lameInstance{
+		p:            p,
+		id:           cloud.InstanceID(fmt.Sprintf("lame-%x", rand.Uint64())),
+		providerType: instType.ProviderType,
+	}
+	inst.SetTags(tags)
+	if p.Hold != nil {
+		p.Hold <- true
+	}
+	p.mtx.Lock()
+	defer p.mtx.Unlock()
+	if p.instances == nil {
+		p.instances = map[*lameInstance]bool{}
+	}
+	p.instances[inst] = true
+	return inst, nil
+}
+
+// Instances returns the instances that haven't been destroyed.
+func (p *LameInstanceSet) Instances(cloud.InstanceTags) ([]cloud.Instance, error) {
+	p.mtx.Lock()
+	defer p.mtx.Unlock()
+	var instances []cloud.Instance
+	for i := range p.instances {
+		instances = append(instances, i)
+	}
+	return instances, nil
+}
+
+// Stop is a no-op, but exists to satisfy cloud.InstanceSet.
+func (p *LameInstanceSet) Stop() {
+}
+
+// Release n held calls. Blocks if n calls aren't already
+// waiting. Blocks forever if Hold is nil.
+func (p *LameInstanceSet) Release(n int) {
+	for i := 0; i < n; i++ {
+		<-p.Hold
+	}
+}
+
+type lameInstance struct {
+	p            *LameInstanceSet
+	id           cloud.InstanceID
+	providerType string
+	tags         cloud.InstanceTags
+}
+
+func (inst *lameInstance) ID() cloud.InstanceID {
+	return inst.id
+}
+
+func (inst *lameInstance) String() string {
+	return fmt.Sprint(inst.id)
+}
+
+func (inst *lameInstance) ProviderType() string {
+	return inst.providerType
+}
+
+func (inst *lameInstance) Address() string {
+	return "0.0.0.0:1234"
+}
+
+func (inst *lameInstance) SetTags(tags cloud.InstanceTags) error {
+	inst.p.mtx.Lock()
+	defer inst.p.mtx.Unlock()
+	inst.tags = cloud.InstanceTags{}
+	for k, v := range tags {
+		inst.tags[k] = v
+	}
+	return nil
+}
+
+func (inst *lameInstance) Destroy() error {
+	if inst.p.Hold != nil {
+		inst.p.Hold <- true
+	}
+	inst.p.mtx.Lock()
+	defer inst.p.mtx.Unlock()
+	delete(inst.p.instances, inst)
+	return nil
+}
+
+func (inst *lameInstance) Tags() cloud.InstanceTags {
+	return inst.tags
+}
+
+func (inst *lameInstance) VerifyPublicKey(ssh.PublicKey, *ssh.Client) error {
+	return nil
+}
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
new file mode 100644
index 000000000..ff204064c
--- /dev/null
+++ b/lib/dispatchcloud/test/queue.go
@@ -0,0 +1,72 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+	"fmt"
+
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/container"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+type Queue struct {
+	Containers []arvados.Container
+	ChooseType func(*arvados.Container) (arvados.InstanceType, error)
+
+	entries map[string]container.QueueEnt
+}
+
+func (q *Queue) Entries() map[string]container.QueueEnt {
+	r := map[string]container.QueueEnt{}
+	for uuid, ent := range q.entries {
+		r[uuid] = ent
+	}
+	return r
+}
+
+func (q *Queue) Get(uuid string) (arvados.Container, bool) {
+	ent, ok := q.entries[uuid]
+	return ent.Container, ok
+}
+
+func (q *Queue) Forget(uuid string) {
+	delete(q.entries, uuid)
+}
+
+func (q *Queue) Lock(uuid string) error {
+	ent := q.entries[uuid]
+	if ent.Container.State != arvados.ContainerStateQueued {
+		return fmt.Errorf("lock failed: state=%q", ent.Container.State)
+	}
+	ent.Container.State = arvados.ContainerStateLocked
+	q.entries[uuid] = ent
+	for i, ctr := range q.Containers {
+		if ctr.UUID == uuid {
+			q.Containers[i].State = arvados.ContainerStateLocked
+		}
+	}
+	return nil
+}
+
+func (q *Queue) Unlock(uuid string) error {
+	return nil
+}
+
+func (q *Queue) Cancel(uuid string) error {
+	return nil
+}
+
+func (q *Queue) Update() error {
+	upd := map[string]container.QueueEnt{}
+	for _, ctr := range q.Containers {
+		it, _ := q.ChooseType(&ctr)
+		upd[ctr.UUID] = container.QueueEnt{
+			Container:    ctr,
+			InstanceType: it,
+		}
+	}
+	q.entries = upd
+	return nil
+}
diff --git a/lib/dispatchcloud/test/ssh_service.go b/lib/dispatchcloud/test/ssh_service.go
new file mode 100644
index 000000000..b1e4e03b1
--- /dev/null
+++ b/lib/dispatchcloud/test/ssh_service.go
@@ -0,0 +1,169 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net"
+	"strings"
+	"sync"
+
+	"golang.org/x/crypto/ssh"
+	check "gopkg.in/check.v1"
+)
+
+func LoadTestKey(c *check.C, fnm string) (ssh.PublicKey, ssh.Signer) {
+	rawpubkey, err := ioutil.ReadFile(fnm + ".pub")
+	c.Assert(err, check.IsNil)
+	pubkey, _, _, _, err := ssh.ParseAuthorizedKey(rawpubkey)
+	c.Assert(err, check.IsNil)
+	rawprivkey, err := ioutil.ReadFile(fnm)
+	c.Assert(err, check.IsNil)
+	privkey, err := ssh.ParsePrivateKey(rawprivkey)
+	c.Assert(err, check.IsNil)
+	return pubkey, privkey
+}
+
+// An SSHExecFunc handles an "exec" session on a multiplexed SSH
+// connection.
+type SSHExecFunc func(command string, stdin io.Reader, stdout, stderr io.Writer) uint32
+
+// An SSHService accepts SSH connections on an available TCP port and
+// passes clients' "exec" sessions to the provided SSHExecFunc.
+type SSHService struct {
+	Exec           SSHExecFunc
+	HostKey        ssh.Signer
+	AuthorizedKeys []ssh.PublicKey
+
+	listener net.Listener
+	conn     *ssh.ServerConn
+	setup    sync.Once
+	mtx      sync.Mutex
+	started  chan bool
+	closed   bool
+	err      error
+}
+
+// Address returns the host:port where the SSH server is listening. It
+// returns "" if called before the server is ready to accept
+// connections.
+func (ss *SSHService) Address() string {
+	ss.setup.Do(ss.start)
+	ss.mtx.Lock()
+	ln := ss.listener
+	ss.mtx.Unlock()
+	if ln == nil {
+		return ""
+	}
+	return ln.Addr().String()
+}
+
+// Close shuts down the server and releases resources. Established
+// connections are unaffected.
+func (ss *SSHService) Close() {
+	ss.Start()
+	ss.mtx.Lock()
+	ln := ss.listener
+	ss.closed = true
+	ss.mtx.Unlock()
+	if ln != nil {
+		ln.Close()
+	}
+}
+
+// Start returns when the server is ready to accept connections.
+func (ss *SSHService) Start() error {
+	ss.setup.Do(ss.start)
+	<-ss.started
+	return ss.err
+}
+
+func (ss *SSHService) start() {
+	ss.started = make(chan bool)
+	go ss.run()
+}
+
+func (ss *SSHService) run() {
+	defer close(ss.started)
+	config := &ssh.ServerConfig{
+		PublicKeyCallback: func(c ssh.ConnMetadata, pubKey ssh.PublicKey) (*ssh.Permissions, error) {
+			for _, ak := range ss.AuthorizedKeys {
+				if bytes.Equal(ak.Marshal(), pubKey.Marshal()) {
+					return &ssh.Permissions{}, nil
+				}
+			}
+			return nil, fmt.Errorf("unknown public key for %q", c.User())
+		},
+	}
+	config.AddHostKey(ss.HostKey)
+
+	listener, err := net.Listen("tcp", ":")
+	if err != nil {
+		ss.err = err
+		return
+	}
+
+	ss.mtx.Lock()
+	ss.listener = listener
+	ss.mtx.Unlock()
+
+	go func() {
+		for {
+			nConn, err := listener.Accept()
+			if err != nil && strings.Contains(err.Error(), "use of closed network connection") && ss.closed {
+				return
+			} else if err != nil {
+				log.Printf("accept: %s", err)
+				return
+			}
+			go ss.serveConn(nConn, config)
+		}
+	}()
+}
+
+func (ss *SSHService) serveConn(nConn net.Conn, config *ssh.ServerConfig) {
+	defer nConn.Close()
+	conn, newchans, reqs, err := ssh.NewServerConn(nConn, config)
+	if err != nil {
+		log.Printf("ssh.NewServerConn: %s", err)
+		return
+	}
+	defer conn.Close()
+	go ssh.DiscardRequests(reqs)
+	for newch := range newchans {
+		if newch.ChannelType() != "session" {
+			newch.Reject(ssh.UnknownChannelType, "unknown channel type")
+			continue
+		}
+		ch, reqs, err := newch.Accept()
+		if err != nil {
+			log.Printf("accept channel: %s", err)
+			return
+		}
+		var execReq struct {
+			Command string
+		}
+		go func() {
+			for req := range reqs {
+				if req.Type == "exec" && execReq.Command == "" {
+					req.Reply(true, nil)
+					ssh.Unmarshal(req.Payload, &execReq)
+					go func() {
+						var resp struct {
+							Status uint32
+						}
+						resp.Status = ss.Exec(execReq.Command, ch, ch, ch.Stderr())
+						ch.SendRequest("exit-status", false, ssh.Marshal(&resp))
+						ch.Close()
+					}()
+				}
+			}
+		}()
+	}
+}
diff --git a/lib/dispatchcloud/test/sshkey_dispatch b/lib/dispatchcloud/test/sshkey_dispatch
new file mode 100644
index 000000000..5584519c7
--- /dev/null
+++ b/lib/dispatchcloud/test/sshkey_dispatch
@@ -0,0 +1,27 @@
+-----BEGIN RSA PRIVATE KEY-----
+MIIEowIBAAKCAQEAqYm4XsQHm8sBSZFwUX5VeW1OkGsfoNzcGPG2nzzYRhNhClYZ
+0ABHhUk82HkaC/8l6d/jpYTf42HrK42nNQ0r0Yzs7qw8yZMQioK4Yk+kFyVLF78E
+GRG4pGAWXFs6pUchs/lm8fo9zcda4R3XeqgI+NO+nEERXmdRJa1FhI+Za3/S/+CV
+mg+6O00wZz2+vKmDPptGN4MCKmQOCKsMJts7wSZGyVcTtdNv7jjfr6yPAIOIL8X7
+LtarBCFaK/pD7uWll/Uj7h7D8K48nIZUrvBJJjXL8Sm4LxCNoz3Z83k8J5ZzuDRD
+gRiQe/C085mhO6VL+2fypDLwcKt1tOL8fI81MwIDAQABAoIBACR3tEnmHsDbNOav
+Oxq8cwRQh9K2yDHg8BMJgz/TZa4FIx2HEbxVIw0/iLADtJ+Z/XzGJQCIiWQuvtg6
+exoFQESt7JUWRWkSkj9JCQJUoTY9Vl7APtBpqG7rIEQzd3TvzQcagZNRQZQO6rR7
+p8sBdBSZ72lK8cJ9tM3G7Kor/VNK7KgRZFNhEWnmvEa3qMd4hzDcQ4faOn7C9NZK
+dwJAuJVVfwOLlOORYcyEkvksLaDOK2DsB/p0AaCpfSmThRbBKN5fPXYaKgUdfp3w
+70Hpp27WWymb1cgjyqSH3DY+V/kvid+5QxgxCBRq865jPLn3FFT9bWEVS/0wvJRj
+iMIRrjECgYEA4Ffv9rBJXqVXonNQbbstd2PaprJDXMUy9/UmfHL6pkq1xdBeuM7v
+yf2ocXheA8AahHtIOhtgKqwv/aRhVK0ErYtiSvIk+tXG+dAtj/1ZAKbKiFyxjkZV
+X72BH7cTlR6As5SRRfWM/HaBGEgED391gKsI5PyMdqWWdczT5KfxAksCgYEAwXYE
+ewPmV1GaR5fbh2RupoPnUJPMj36gJCnwls7sGaXDQIpdlq56zfKgrLocGXGgj+8f
+QH7FHTJQO15YCYebtsXWwB3++iG43gVlJlecPAydsap2CCshqNWC5JU5pan0QzsP
+exzNzWqfUPSbTkR2SRaN+MenZo2Y/WqScOAth7kCgYBgVoLujW9EXH5QfXJpXLq+
+jTvE38I7oVcs0bJwOLPYGzcJtlwmwn6IYAwohgbhV2pLv+EZSs42JPEK278MLKxY
+lgVkp60npgunFTWroqDIvdc1TZDVxvA8h9VeODEJlSqxczgbMcIUXBM9yRctTI+5
+7DiKlMUA4kTFW2sWwuOlFwKBgGXvrYS0FVbFJKm8lmvMu5D5x5RpjEu/yNnFT4Pn
+G/iXoz4Kqi2PWh3STl804UF24cd1k94D7hDoReZCW9kJnz67F+C67XMW+bXi2d1O
+JIBvlVfcHb1IHMA9YG7ZQjrMRmx2Xj3ce4RVPgUGHh8ra7gvLjd72/Tpf0doNClN
+ti/hAoGBAMW5D3LhU05LXWmOqpeT4VDgqk4MrTBcstVe7KdVjwzHrVHCAmI927vI
+pjpphWzpC9m3x4OsTNf8m+g6H7f3IiQS0aiFNtduXYlcuT5FHS2fSATTzg5PBon9
+1E6BudOve+WyFyBs7hFWAqWFBdWujAl4Qk5Ek09U2ilFEPE7RTgJ
+-----END RSA PRIVATE KEY-----
diff --git a/lib/dispatchcloud/test/sshkey_dispatch.pub b/lib/dispatchcloud/test/sshkey_dispatch.pub
new file mode 100644
index 000000000..1d5c1ea1b
--- /dev/null
+++ b/lib/dispatchcloud/test/sshkey_dispatch.pub
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCpibhexAebywFJkXBRflV5bU6Qax+g3NwY8bafPNhGE2EKVhnQAEeFSTzYeRoL/yXp3+OlhN/jYesrjac1DSvRjOzurDzJkxCKgrhiT6QXJUsXvwQZEbikYBZcWzqlRyGz+Wbx+j3Nx1rhHdd6qAj4076cQRFeZ1ElrUWEj5lrf9L/4JWaD7o7TTBnPb68qYM+m0Y3gwIqZA4Iqwwm2zvBJkbJVxO102/uON+vrI8Ag4gvxfsu1qsEIVor+kPu5aWX9SPuHsPwrjychlSu8EkmNcvxKbgvEI2jPdnzeTwnlnO4NEOBGJB78LTzmaE7pUv7Z/KkMvBwq3W04vx8jzUz tom at curve
diff --git a/lib/dispatchcloud/test/sshkey_vm b/lib/dispatchcloud/test/sshkey_vm
new file mode 100644
index 000000000..10b7ed1bc
--- /dev/null
+++ b/lib/dispatchcloud/test/sshkey_vm
@@ -0,0 +1,27 @@
+-----BEGIN RSA PRIVATE KEY-----
+MIIEpQIBAAKCAQEApIfWk2StZGDtmunumIeXLJ46AQrbHHvuxrSAkQf6+zUwjB2I
+rse7ezBRHWcge9U5EsigixmhUM4ozFLnUQNwC862jbmsjbyA97arG/REECNlUrEB
+HQPYHhai5yyJ89AfjWVxKyINfW0K2HX1R8nl4kdVraAgpohPLh0dGjfwzm/BcXDG
++TxW9zRz0KCs9ZRI6s2MNdv08ahKQ0azk8gRTqMADJmYNWIo3zPQ+fhlwyr6EZJ/
+HFbRtjpajEPMJPwoVPO+Wj6wztfHDYKkPIrIWbhMl6w+tEKdsmygd3Iq94ktLS3X
+AbRCfn4njS2QSlkKFEepkUJWCSSWZgFn6DLm2wIDAQABAoIBAQCb137LxcTnG1h0
+L7isCWKMBKN0cU/xvwIAfOB6f1CfuVXuodrhkpZmrPFoJFKEeQbCX/6RQwmlfGDw
+iGZKOjNbO8V2oLRs3GxcNk4FAG2ny58hoD8puIZwmYhb57gTlMMOL1PuQyb78tkf
+Bzv5b6ermV3yQ4Ypt1solrMGLo6NOZD0oDX9p0Zt9kueIhjzgP0v5//T1F4PGHZK
++sLSsMiu9u6F+PB+Oc6uv0Zee9Lnts/QiWH5f18oEculjwKWFx+JwJWiLffGg2Bl
+vbpmvHFRoRWkHTpgSiLwSUqs0ZUWU9R5h11ROg5L39MLsxQoBvHsPEnP5ssN8jGt
+aH86EZjBAoGBAM+A5B/UjhIn9m05EhDTDRzI92hGhM8f7uAwobbnjvIQyZbWlBwj
+2TmgbJdpTGVbD+iTBIwKQdcFBbWobTCZsNMpghqA/ir4YIAnZ5OX9VQ1Bc+bWE7V
+dPmMVpCgyg+ERAe+79FrYWcI3vhnBpHCsY/9p9pGQIKDzlGTWNF1HJGjAoGBAMr7
+2CTVnFImTgD3E+rH4AAAfkz+cyqfK6BUhli/NifFYZhWCs16r9QCGSORnp4gPhMY
+3mf7VBs9rk123zOMo89eJt3adTgbZ+QIxXeXilGXpbT3w1+CJMaZRrIy80E1tB5/
+KvDZcrZ78o8XWMNUa+9k55ukvgyC24ICAmOIWNlpAoGBALEFvphBF2r52MtZUsYz
+pw4VjKvS7V5eWcW891k4tsRf+frK2NQg6SK2b63EUT5ur2W0dr6ZyY2MZVCSfYRm
+uWmMEchWn389IeZyt3Q8wTize1+foXivtflm9jqwUXFnXzpUc/du6kuiT8YO7pXP
+SPgUZ+xY3pP5qjwBvlYC2PqNAoGAZ1CKMi1bdGC0wT8BLzXuqHGX136HhcEgRmnf
+O5qPaOzJAO2CcBWrGuC6hOUgc+F7VuMIiKpeo8LgTeNcNfO2iNymMbN4iEdCuMlS
+IM3MBD2IhTS6h4lJSKBJYHgYYi+AbylQ5Of4wDMUQYqjjkAQ8/dK/2h5pwqPyXtW
+VezXNEkCgYEAq4S0++y9tjlLn+w9BIkmx3bAVRDQZIzIEwxTh+jpqaUp1J0iyseJ
+71pwqQojGNF6x8GglVXa6bMrETae21WhEeHnWmzlpCWIODsYPUQ+erjDuAWi9eGk
+HLklqSEoLB8pzC6zDqjxDw+CnGERIDSaoaeoWiNKZ95IH1WiEwYjuxU=
+-----END RSA PRIVATE KEY-----
diff --git a/lib/dispatchcloud/test/sshkey_vm.pub b/lib/dispatchcloud/test/sshkey_vm.pub
new file mode 100644
index 000000000..b9d44c946
--- /dev/null
+++ b/lib/dispatchcloud/test/sshkey_vm.pub
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCkh9aTZK1kYO2a6e6Yh5csnjoBCtsce+7GtICRB/r7NTCMHYiux7t7MFEdZyB71TkSyKCLGaFQzijMUudRA3ALzraNuayNvID3tqsb9EQQI2VSsQEdA9geFqLnLInz0B+NZXErIg19bQrYdfVHyeXiR1WtoCCmiE8uHR0aN/DOb8FxcMb5PFb3NHPQoKz1lEjqzYw12/TxqEpDRrOTyBFOowAMmZg1YijfM9D5+GXDKvoRkn8cVtG2OlqMQ8wk/ChU875aPrDO18cNgqQ8ishZuEyXrD60Qp2ybKB3cir3iS0tLdcBtEJ+fieNLZBKWQoUR6mRQlYJJJZmAWfoMubb tom at curve
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
new file mode 100644
index 000000000..1033ae178
--- /dev/null
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -0,0 +1,190 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package test
+
+import (
+	"crypto/rand"
+	"errors"
+	"fmt"
+	"io"
+	math_rand "math/rand"
+	"sync"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/mitchellh/mapstructure"
+	"golang.org/x/crypto/ssh"
+)
+
+type StubExecFunc func(instance cloud.Instance, command string, stdin io.Reader, stdout, stderr io.Writer) uint32
+
+// A StubDriver implements cloud.Driver by setting up local SSH
+// servers that pass their command execution requests to the provided
+// StubExecFunc.
+type StubDriver struct {
+	Exec           StubExecFunc
+	HostKey        ssh.Signer
+	AuthorizedKeys []ssh.PublicKey
+	instanceSets   []*StubInstanceSet
+}
+
+// InstanceSet returns a new *StubInstanceSet.
+func (sd *StubDriver) InstanceSet(params map[string]interface{}, id cloud.InstanceSetID) (cloud.InstanceSet, error) {
+	sis := StubInstanceSet{
+		driver:  sd,
+		servers: map[cloud.InstanceID]*stubServer{},
+	}
+	sd.instanceSets = append(sd.instanceSets, &sis)
+	return &sis, mapstructure.Decode(params, &sis)
+}
+
+// InstanceSets returns all instance sets that have been created by
+// the driver. This can be used to test a component that uses the
+// driver but doesn't expose the InstanceSets it has created.
+func (sd *StubDriver) InstanceSets() []*StubInstanceSet {
+	return sd.instanceSets
+}
+
+type StubInstanceSet struct {
+	driver  *StubDriver
+	servers map[cloud.InstanceID]*stubServer
+	mtx     sync.RWMutex
+	stopped bool
+}
+
+func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID, tags cloud.InstanceTags, authKey ssh.PublicKey) (cloud.Instance, error) {
+	sis.mtx.Lock()
+	defer sis.mtx.Unlock()
+	if sis.stopped {
+		return nil, errors.New("StubInstanceSet: Create called after Stop")
+	}
+	ak := sis.driver.AuthorizedKeys
+	if authKey != nil {
+		ak = append([]ssh.PublicKey{authKey}, ak...)
+	}
+	var ss *stubServer
+	ss = &stubServer{
+		sis:          sis,
+		id:           cloud.InstanceID(fmt.Sprintf("stub-%s-%x", it.ProviderType, math_rand.Int63())),
+		tags:         copyTags(tags),
+		providerType: it.ProviderType,
+		SSHService: SSHService{
+			HostKey:        sis.driver.HostKey,
+			AuthorizedKeys: ak,
+			Exec: func(command string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
+				return sis.driver.Exec(ss.Instance(), command, stdin, stdout, stderr)
+			},
+		},
+	}
+
+	sis.servers[ss.id] = ss
+	return ss.Instance(), nil
+}
+
+func (sis *StubInstanceSet) Instances(cloud.InstanceTags) ([]cloud.Instance, error) {
+	sis.mtx.RLock()
+	defer sis.mtx.RUnlock()
+	var r []cloud.Instance
+	for _, ss := range sis.servers {
+		r = append(r, ss.Instance())
+	}
+	return r, nil
+}
+
+func (sis *StubInstanceSet) Stop() {
+	sis.mtx.Lock()
+	defer sis.mtx.Unlock()
+	if sis.stopped {
+		panic("Stop called twice")
+	}
+	sis.stopped = true
+}
+
+type stubServer struct {
+	sis          *StubInstanceSet
+	id           cloud.InstanceID
+	tags         cloud.InstanceTags
+	providerType string
+	SSHService   SSHService
+	sync.Mutex
+}
+
+func (ss *stubServer) Instance() stubInstance {
+	ss.Lock()
+	defer ss.Unlock()
+	return stubInstance{
+		ss:   ss,
+		addr: ss.SSHService.Address(),
+		// We deliberately return a cached/stale copy of the
+		// real tags here, so that (Instance)Tags() sometimes
+		// returns old data after a call to
+		// (Instance)SetTags().  This is permitted by the
+		// driver interface, and this might help remind
+		// callers that they need to tolerate it.
+		tags: copyTags(ss.tags),
+	}
+}
+
+type stubInstance struct {
+	ss   *stubServer
+	addr string
+	tags cloud.InstanceTags
+}
+
+func (si stubInstance) ID() cloud.InstanceID {
+	return si.ss.id
+}
+
+func (si stubInstance) Address() string {
+	return si.addr
+}
+
+func (si stubInstance) Destroy() error {
+	return errors.New("unimplemented")
+}
+
+func (si stubInstance) ProviderType() string {
+	return si.ss.providerType
+}
+
+func (si stubInstance) SetTags(tags cloud.InstanceTags) error {
+	tags = copyTags(tags)
+	ss := si.ss
+	go func() {
+		ss.Lock()
+		defer ss.Unlock()
+		ss.tags = tags
+	}()
+	return nil
+}
+
+func (si stubInstance) Tags() cloud.InstanceTags {
+	return si.tags
+}
+
+func (si stubInstance) String() string {
+	return string(si.ss.id)
+}
+
+func (si stubInstance) VerifyPublicKey(key ssh.PublicKey, client *ssh.Client) error {
+	buf := make([]byte, 512)
+	_, err := io.ReadFull(rand.Reader, buf)
+	if err != nil {
+		return err
+	}
+	sig, err := si.ss.sis.driver.HostKey.Sign(rand.Reader, buf)
+	if err != nil {
+		return err
+	}
+	return key.Verify(buf, sig)
+}
+
+func copyTags(src cloud.InstanceTags) cloud.InstanceTags {
+	dst := cloud.InstanceTags{}
+	for k, v := range src {
+		dst[k] = v
+	}
+	return dst
+}
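
In a test, StubDriver gives the dispatcher a cloud.InstanceSet whose "VMs" are in-process SSH servers. A sketch of the setup (inside a gocheck test where c is the *check.C; key paths depend on the test's location; not a literal excerpt from this commit):

    _, hostpriv := test.LoadTestKey(c, "../test/sshkey_vm")
    clientpub, _ := test.LoadTestKey(c, "../test/sshkey_dispatch")
    driver := &test.StubDriver{
            Exec: func(inst cloud.Instance, cmd string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
                    io.WriteString(stdout, "ok\n")
                    return 0
            },
            HostKey:        hostpriv,
            AuthorizedKeys: []ssh.PublicKey{clientpub},
    }
    is, err := driver.InstanceSet(map[string]interface{}{}, "test-instance-set")
    c.Assert(err, check.IsNil)
    inst, err := is.Create(test.InstanceType(1), "", nil, clientpub)
    c.Assert(err, check.IsNil)
    c.Logf("created %s at %s", inst, inst.Address())

Each Create call starts another stub SSH server, and the driver's Exec func sees every command the dispatcher runs on that instance.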
diff --git a/lib/dispatchcloud/worker/gocheck_test.go b/lib/dispatchcloud/worker/gocheck_test.go
new file mode 100644
index 000000000..b4ca66c97
--- /dev/null
+++ b/lib/dispatchcloud/worker/gocheck_test.go
@@ -0,0 +1,16 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+	"testing"
+
+	check "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+	check.TestingT(t)
+}
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
new file mode 100644
index 000000000..de5069e1f
--- /dev/null
+++ b/lib/dispatchcloud/worker/pool.go
@@ -0,0 +1,620 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
+	"golang.org/x/crypto/ssh"
+)
+
+type WorkerView struct {
+	Instance             string
+	Price                float64
+	ArvadosInstanceType  string
+	ProviderInstanceType string
+	LastContainerUUID    string
+	Unallocated          time.Time
+	WorkerState          string
+}
+
+type Executor interface {
+	Execute(cmd string, stdin io.Reader) (stdout, stderr []byte, err error)
+
+	// Use the given target for subsequent operations. The new
+	// target is the same host as the previous target, but it
+	// might return a different address and verify a different
+	// host key.
+	//
+	// SetTarget is called frequently, and in most cases the new
+	// target will behave exactly the same as the old one. An
+	// implementation should optimize accordingly.
+	//
+	// SetTarget must not block on concurrent Execute calls.
+	SetTarget(cloud.ExecutorTarget)
+}
+
+type Pool interface {
+	Subscribe() <-chan struct{}
+	Unsubscribe(ch <-chan struct{})
+	Unallocated() map[arvados.InstanceType]int
+	Create(it arvados.InstanceType) error
+	Shutdown(it arvados.InstanceType) bool
+	AtQuota() bool
+	Workers() map[State]int
+	Running() map[string]bool
+	StartContainer(it arvados.InstanceType, ctr arvados.Container) bool
+	KillContainer(uuid string)
+	Stop()
+	View() []WorkerView
+}
+
+const (
+	defaultSyncInterval       = time.Minute
+	defaultProbeInterval      = 10 * time.Second
+	defaultMaxProbesPerSecond = 10
+)
+
+// NewPool creates a Pool of workers backed by instanceSet.
+//
+// New instances are configured and set up according to the given
+// cluster configuration.
+func NewPool(logger logrus.FieldLogger, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, cluster *arvados.Cluster) Pool {
+	wp := &workerPool{
+		logger:             logger,
+		instanceSet:        instanceSet,
+		newExecutor:        newExecutor,
+		bootProbeCommand:   cluster.CloudVMs.BootProbeCommand,
+		imageID:            cloud.ImageID(cluster.CloudVMs.ImageID),
+		instanceTypes:      cluster.InstanceTypes,
+		maxProbesPerSecond: cluster.Dispatch.MaxProbesPerSecond,
+		probeInterval:      time.Duration(cluster.Dispatch.ProbeInterval),
+		syncInterval:       time.Duration(cluster.CloudVMs.SyncInterval),
+	}
+	go wp.run()
+	return wp
+}
+
+type workerPool struct {
+	// configuration
+	logger             logrus.FieldLogger
+	instanceSet        cloud.InstanceSet
+	newExecutor        func(cloud.Instance) Executor
+	bootProbeCommand   string
+	imageID            cloud.ImageID
+	instanceTypes      map[string]arvados.InstanceType
+	syncInterval       time.Duration
+	probeInterval      time.Duration
+	maxProbesPerSecond int
+
+	// private state
+	subscribers  map[<-chan struct{}]chan<- struct{}
+	creating     map[arvados.InstanceType]int // goroutines waiting for (InstanceSet)Create to return
+	workers      map[cloud.InstanceID]*worker
+	loaded       bool // loaded list of instances from InstanceSet at least once
+	atQuotaUntil time.Time
+	stop         chan bool
+	mtx          sync.RWMutex
+	setupOnce    sync.Once
+}
+
+type worker struct {
+	state       State
+	instance    cloud.Instance
+	executor    Executor
+	instType    arvados.InstanceType
+	booted      bool
+	probed      time.Time
+	updated     time.Time
+	busy        time.Time
+	unallocated time.Time
+	lastUUID    string
+	running     []string
+}
+
+// Subscribe returns a channel that becomes ready whenever a worker's
+// state changes.
+//
+// Example:
+//
+//	ch := wp.Subscribe()
+//	for range ch {
+//		// some worker has become available; try scheduling some work
+//		if wantStop {
+//			wp.Unsubscribe(ch)
+//			break
+//		}
+//	}
+func (wp *workerPool) Subscribe() <-chan struct{} {
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	ch := make(chan struct{}, 1)
+	wp.subscribers[ch] = ch
+	return ch
+}
+
+// Unsubscribe stops sending updates to the given channel.
+func (wp *workerPool) Unsubscribe(ch <-chan struct{}) {
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	delete(wp.subscribers, ch)
+}
+
+// Unallocated returns the number of unallocated (booting + idle +
+// unknown) workers for each instance type.
+func (wp *workerPool) Unallocated() map[arvados.InstanceType]int {
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.RLock()
+	defer wp.mtx.RUnlock()
+	u := map[arvados.InstanceType]int{}
+	for it, c := range wp.creating {
+		u[it] = c
+	}
+	for _, wkr := range wp.workers {
+		if len(wkr.running) == 0 && (wkr.state == StateRunning || wkr.state == StateBooting || wkr.state == StateUnknown) {
+			u[wkr.instType]++
+		}
+	}
+	return u
+}
+
+// Create a new instance with the given type, and add it to the worker
+// pool. The pool's Unallocated count reflects the new worker
+// immediately; the cloud instance is created in the background.
+func (wp *workerPool) Create(it arvados.InstanceType) error {
+	wp.logger.Debugf("workerPool: Create(%v)", it)
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	tags := cloud.InstanceTags{"InstanceType": it.Name}
+	wp.creating[it]++
+	go func() {
+		inst, err := wp.instanceSet.Create(it, wp.imageID, tags, nil)
+		wp.mtx.Lock()
+		defer wp.mtx.Unlock()
+		wp.creating[it]--
+		if err, ok := err.(cloud.QuotaError); ok && err.IsQuotaError() {
+			wp.atQuotaUntil = time.Now().Add(time.Minute)
+		}
+		if err != nil {
+			wp.logger.Errorf("workerPool: create instance: %s", err)
+			go wp.notify()
+			return
+		}
+		wp.updateWorker(inst, it, StateBooting)
+	}()
+	return nil
+}
+
+// AtQuota returns true if Create is not expected to work at the
+// moment.
+func (wp *workerPool) AtQuota() bool {
+	return time.Now().Before(wp.atQuotaUntil)
+}
+
+// Add or update worker attached to the given instance. Use
+// initialState if a new worker is created. Caller must have lock.
+func (wp *workerPool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initialState State) {
+	id := inst.ID()
+	if wp.workers[id] != nil {
+		wp.workers[id].executor.SetTarget(inst)
+		wp.workers[id].instance = inst
+		wp.workers[id].updated = time.Now()
+		if initialState == StateBooting && wp.workers[id].state == StateUnknown {
+			wp.workers[id].state = StateBooting
+		}
+		return
+	}
+	wp.logger.Debugf("workerPool: instance %q appeared with InstanceType %q -- adding with state %q", inst, it.Name, initialState)
+	wp.workers[id] = &worker{
+		executor:    wp.newExecutor(inst),
+		state:       initialState,
+		instance:    inst,
+		instType:    it,
+		probed:      time.Now(),
+		busy:        time.Now(),
+		updated:     time.Now(),
+		unallocated: time.Now(),
+	}
+	go wp.notify()
+}
+
+// Shutdown shuts down a worker with the given type, or returns false
+// if all workers with the given type are busy.
+func (wp *workerPool) Shutdown(it arvados.InstanceType) bool {
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	wp.logger.Debugf("workerPool: Shutdown(%s)", it.Name)
+	for _, tryState := range []State{StateBooting, StateRunning} {
+		// TODO: shutdown the worker with the longest idle
+		// time (Running) or the earliest create time
+		// (Booting)
+		for _, wkr := range wp.workers {
+			if wkr.state != tryState || len(wkr.running) > 0 {
+				continue
+			}
+			if wkr.instType != it {
+				continue
+			}
+			go func() {
+				err := wkr.instance.Destroy()
+				if err != nil {
+					wp.logger.Warnf("workerPool: error destroying instance %s: %s", wkr.instance, err)
+				} else {
+					wp.atQuotaUntil = time.Now()
+				}
+				wp.notify()
+			}()
+			wkr.state = StateShutdown
+			wkr.updated = time.Now()
+			return true
+		}
+	}
+	return false
+}
+
+func (wp *workerPool) Workers() map[State]int {
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	r := map[State]int{}
+	for _, w := range wp.workers {
+		r[w.state]++
+	}
+	return r
+}
+
+func (wp *workerPool) Running() map[string]bool {
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	r := map[string]bool{}
+	for _, w := range wp.workers {
+		for _, uuid := range w.running {
+			r[uuid] = true
+		}
+	}
+	return r
+}
+
+// StartContainer starts a container on an idle worker immediately if
+// possible, otherwise returns false.
+func (wp *workerPool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	var wkr *worker
+	for _, w := range wp.workers {
+		if w.instType == it && w.state == StateRunning && len(w.running) == 0 {
+			if wkr == nil || w.busy.After(wkr.busy) {
+				wkr = w
+			}
+		}
+	}
+	if wkr == nil {
+		return false
+	}
+	wp.logger.Debugf("workerPool: starting container %s on instance %s", ctr.UUID, wkr.instance)
+	wkr.running = append(wkr.running, ctr.UUID)
+	wkr.updated = time.Now()
+	go func() {
+		_, stderr, err := wkr.executor.Execute("crunch-run --detach '"+ctr.UUID+"'", nil)
+		wp.mtx.Lock()
+		defer wp.mtx.Unlock()
+		if err != nil {
+			wp.logger.Errorf("workerPool: error starting container %s on instance %s: %s (stderr %q)", ctr.UUID, wkr.instance, err, stderr)
+			// Remove the failed container from the
+			// worker's running list so the worker can be
+			// allocated again.
+			for i, uuid := range wkr.running {
+				if ctr.UUID == uuid {
+					wkr.running = append(wkr.running[:i], wkr.running[i+1:]...)
+					break
+				}
+			}
+			return
+		}
+		wp.logger.Debugf("workerPool: starting container %s on instance %s succeeded", ctr.UUID, wkr.instance)
+		wkr.lastUUID = ctr.UUID
+		wkr.updated = time.Now()
+	}()
+	return true
+}
+
+// KillContainer kills the crunch-run process for the given container
+// UUID, if it's running on any worker.
+func (wp *workerPool) KillContainer(uuid string) {
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	for _, wkr := range wp.workers {
+		for _, u := range wkr.running {
+			if u == uuid {
+				go wp.kill(wkr, uuid)
+				return
+			}
+		}
+	}
+	wp.logger.Debugf("KillContainer: %s already disappeared", uuid)
+}
+
+func (wp *workerPool) kill(wkr *worker, uuid string) {
+	_, stderr, err := wkr.executor.Execute("crunch-run --kill "+uuid, nil)
+	if err != nil {
+		wp.logger.Infof("error killing container %s on instance %s: %s (stderr: %q)", uuid, wkr.instance, err, stderr)
+	}
+}
+
+func (wp *workerPool) run() {
+	wp.setupOnce.Do(wp.setup)
+
+	go func() {
+		maxPPS := wp.maxProbesPerSecond
+		if maxPPS < 1 {
+			maxPPS = defaultMaxProbesPerSecond
+		}
+		limitticker := time.NewTicker(time.Second / time.Duration(maxPPS))
+		defer limitticker.Stop()
+
+		probeInterval := wp.probeInterval
+		if probeInterval < 1 {
+			probeInterval = defaultProbeInterval
+		}
+		probeticker := time.NewTicker(probeInterval)
+		defer probeticker.Stop()
+
+		workers := []cloud.InstanceID{}
+		for range probeticker.C {
+			workers = workers[:0]
+			wp.mtx.Lock()
+			for id := range wp.workers {
+				workers = append(workers, id)
+			}
+			wp.mtx.Unlock()
+
+			for _, id := range workers {
+				wp.mtx.Lock()
+				wkr, ok := wp.workers[id]
+				wp.mtx.Unlock()
+				if !ok {
+					continue
+				}
+				go wp.probeAndUpdate(wkr)
+				select {
+				case <-wp.stop:
+					return
+				case <-limitticker.C:
+				}
+			}
+		}
+	}()
+
+	timer := time.NewTimer(time.Nanosecond)
+	for {
+		err := wp.getInstancesAndSync()
+		if err != nil {
+			wp.logger.Warnf("workerPool: sync error: %s", err)
+		}
+
+		// Reset timer to desired interval, and ignore the
+		// tick that might have already arrived.
+		timer.Stop()
+		select {
+		case <-timer.C:
+		default:
+		}
+		timer.Reset(wp.syncInterval)
+
+		select {
+		case <-timer.C:
+		case <-wp.stop:
+			wp.logger.Debugf("workerPool: stopped")
+			return
+		}
+	}
+}
+
+// Stop synchronizing with the InstanceSet.
+func (wp *workerPool) Stop() {
+	wp.setupOnce.Do(wp.setup)
+	close(wp.stop)
+}
+
+// View reports status information for every worker in the pool.
+func (wp *workerPool) View() []WorkerView {
+	var r []WorkerView
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.Lock()
+	for _, w := range wp.workers {
+		r = append(r, WorkerView{
+			Instance:             w.instance.String(),
+			Price:                w.instType.Price,
+			ArvadosInstanceType:  w.instType.Name,
+			ProviderInstanceType: w.instType.ProviderType,
+			LastContainerUUID:    w.lastUUID,
+			Unallocated:          w.unallocated,
+			WorkerState:          w.state.String(),
+		})
+	}
+	wp.mtx.Unlock()
+	sort.Slice(r, func(i, j int) bool {
+		return r[i].Instance < r[j].Instance
+	})
+	return r
+}
+
+func (wp *workerPool) setup() {
+	wp.creating = map[arvados.InstanceType]int{}
+	wp.workers = map[cloud.InstanceID]*worker{}
+	wp.subscribers = map[<-chan struct{}]chan<- struct{}{}
+	if wp.syncInterval == 0 {
+		wp.syncInterval = defaultSyncInterval
+	}
+}
+
+func (wp *workerPool) notify() {
+	wp.mtx.RLock()
+	defer wp.mtx.RUnlock()
+	for _, send := range wp.subscribers {
+		select {
+		case send <- struct{}{}:
+		default:
+		}
+	}
+}
+
+func (wp *workerPool) getInstancesAndSync() error {
+	wp.setupOnce.Do(wp.setup)
+	wp.logger.Debugf("workerPool: getInstancesAndSync")
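+	// Note the current time before listing instances, so sync()
+	// can leave alone any workers that were updated after this
+	// point.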
+	threshold := time.Now()
+	instances, err := wp.instanceSet.Instances(cloud.InstanceTags{})
+	if err != nil {
+		return err
+	}
+	wp.sync(threshold, instances)
+	return nil
+}
+
+// sync adds/removes/updates workers based on the instance list
+// obtained from the instanceSet, without clobbering any other updates
+// that already happened after threshold.
+func (wp *workerPool) sync(threshold time.Time, instances []cloud.Instance) {
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	wp.logger.Debugf("workerPool: sync with %d instances", len(instances))
+
+	for _, inst := range instances {
+		itTag := inst.Tags()["InstanceType"]
+		it, ok := wp.instanceTypes[itTag]
+		if !ok {
+			wp.logger.Errorf("workerPool: instance %q has unknown InstanceType tag %q --- ignoring", inst, itTag)
+			continue
+		}
+		wp.updateWorker(inst, it, StateUnknown)
+	}
+
+	for id, wkr := range wp.workers {
+		if wkr.updated.After(threshold) {
+			continue
+		}
+		wp.logger.Infof("workerPool: instance %q disappeared, shutting down worker with state %q", wkr.instance, wkr.state)
+		delete(wp.workers, id)
+		wp.shutdown(wkr)
+	}
+
+	if !wp.loaded {
+		wp.loaded = true
+		wp.logger.Infof("workerPool: loaded initial set of instances (%d) from InstanceSet", len(wp.workers))
+	}
+}
+
+// caller must have lock
+func (wp *workerPool) shutdown(wkr *worker) {
+	wkr.state = StateShutdown
+	go func() {
+		if err := wkr.instance.Destroy(); err != nil {
+			wp.logger.Warnf("workerPool: error destroying instance %s: %s", wkr.instance, err)
+		}
+		wp.notify()
+	}()
+}
+
+// should be called in a new goroutine
+func (wp *workerPool) probeAndUpdate(wkr *worker) {
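+	// Snapshot the fields we need while holding the lock; the
+	// probes below run without it.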
+	wp.mtx.Lock()
+	updated := wkr.updated
+	booted := wkr.booted
+	wp.mtx.Unlock()
+
+	var err error
+	if !booted {
+		err = wp.probeBooted(wkr)
+		if err == nil {
+			booted = true
+			wp.logger.Infof("workerPool: instance %q booted", wkr.instance)
+		}
+	}
+	var running []string
+	if err == nil && booted {
+		running, err = wp.probeRunning(wkr)
+	}
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	defer func() { wkr.updated = time.Now() }()
+	wkr.booted = booted
+	if err != nil {
+		if wkr.state != StateShutdown {
+			elapsed := time.Since(wkr.probed)
+			wp.logger.Infof("workerPool: instance %q not responding for %s: %s", wkr.instance, elapsed, err)
+
+			label, threshold := "", maxPingFailTime
+			if wkr.state == StateBooting {
+				label, threshold = "new ", maxBootTime
+			}
+			if elapsed > threshold {
+				wp.logger.Warnf("workerPool: %sinstance %q unresponsive since %s; shutting down", label, wkr.instance, wkr.probed)
+				wp.shutdown(wkr)
+			}
+		}
+		return
+	}
+	wkr.probed = time.Now()
+	if len(running) > 0 {
+		wkr.busy = time.Now()
+		wkr.lastUUID = running[0]
+	}
+	if wkr.state == StateShutdown {
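+		// Already shutting down: leave the state alone.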
+	} else if booted {
+		wkr.state = StateRunning
+	} else {
+		wkr.state = StateBooting
+	}
+	if updated == wkr.updated {
+		// We haven't started any new work since starting the
+		// probe, so this is the latest available information.
+		if len(running) == 0 && len(wkr.running) > 0 {
+			wkr.unallocated = time.Now()
+		}
+		wkr.running = running
+	}
+	go wp.notify()
+}
+
+func (wp *workerPool) probeRunning(wkr *worker) ([]string, error) {
+	stdout, _, err := wkr.executor.Execute("crunch-run --list", nil)
+	if err != nil {
+		return nil, err
+	}
+	stdout = bytes.TrimRight(stdout, "\n")
+	if len(stdout) == 0 {
+		return nil, nil
+	}
+	return strings.Split(string(stdout), "\n"), nil
+}
+
+func (wp *workerPool) probeBooted(wkr *worker) error {
+	cmd := wp.bootProbeCommand
+	if cmd == "" {
+		cmd = "true"
+	}
+	stdout, stderr, err := wkr.executor.Execute(cmd, nil)
+	switch err.(type) {
+	case nil:
+		wp.logger.Infof("boot probe succeeded on instance %s (command %q, stdout %q, stderr %q)", wkr.instance, cmd, stdout, stderr)
+		return nil
+	case *ssh.ExitError:
+		return fmt.Errorf("boot probe failed on instance %s: %s (command %q, stdout %q, stderr %q)", wkr.instance, err, cmd, stdout, stderr)
+	default:
+		return fmt.Errorf("boot probe failed on instance %s: %s", wkr.instance, err)
+	}
+}
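
Not part of the commit, but for readers of this patch: a rough sketch of
how a dispatcher loop might drive the pool added above -- subscribe for
wake-ups, create capacity when nothing of the right type is unallocated,
and retry StartContainer until an idle worker picks the container up.
The helper name runOne is hypothetical; the real scheduling logic lives
elsewhere in lib/dispatchcloud.

    package worker

    import (
    	"git.curoverse.com/arvados.git/sdk/go/arvados"
    )

    // runOne is a hypothetical helper illustrating the pool API; it is
    // not part of this commit.
    func runOne(pool *workerPool, it arvados.InstanceType, ctr arvados.Container) {
    	notify := pool.Subscribe()
    	defer pool.Unsubscribe(notify)

    	if pool.Unallocated()[it] == 0 {
    		// Asynchronously create a new instance of this type.
    		pool.Create(it)
    	}
    	for !pool.StartContainer(it, ctr) {
    		// No idle worker of this type yet; wait for the pool
    		// to change (instance booted, container finished, etc.).
    		<-notify
    	}
    }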
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go
new file mode 100644
index 000000000..e8c90a45b
--- /dev/null
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -0,0 +1,124 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+	"time"
+
+	"git.curoverse.com/arvados.git/lib/cloud"
+	"git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/Sirupsen/logrus"
+	check "gopkg.in/check.v1"
+)
+
+const GiB arvados.ByteSize = 1 << 30
+
+var _ = check.Suite(&PoolSuite{})
+
+type PoolSuite struct{}
+
+func (suite *PoolSuite) SetUpSuite(c *check.C) {
+	logrus.StandardLogger().SetLevel(logrus.DebugLevel)
+}
+
+func (suite *PoolSuite) TestStartContainer(c *check.C) {
+	// TODO: use an instanceSet stub with an SSH server
+	c.Fail()
+}
+
+func (suite *PoolSuite) TestVerifyHostKey(c *check.C) {
+	// TODO: use an instanceSet stub with an SSH server
+	c.Fail()
+}
+
+func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
+	lameInstanceSet := &test.LameInstanceSet{Hold: make(chan bool)}
+	type1 := arvados.InstanceType{Name: "a1s", ProviderType: "a1.small", VCPUs: 1, RAM: 1 * GiB, Price: .01}
+	type2 := arvados.InstanceType{Name: "a2m", ProviderType: "a2.medium", VCPUs: 2, RAM: 2 * GiB, Price: .02}
+	pool := &workerPool{
+		logger:      logrus.StandardLogger(),
+		newExecutor: func(cloud.Instance) Executor { return &stubExecutor{} },
+		instanceSet: lameInstanceSet,
+		instanceTypes: arvados.InstanceTypeMap{
+			type1.Name: type1,
+			type2.Name: type2,
+		},
+	}
+	notify := pool.Subscribe()
+	defer pool.Unsubscribe(notify)
+	notify2 := pool.Subscribe()
+	defer pool.Unsubscribe(notify2)
+
+	c.Check(pool.Unallocated()[type1], check.Equals, 0)
+	c.Check(pool.Unallocated()[type2], check.Equals, 0)
+	pool.Create(type2)
+	pool.Create(type1)
+	pool.Create(type2)
+	c.Check(pool.Unallocated()[type1], check.Equals, 1)
+	c.Check(pool.Unallocated()[type2], check.Equals, 2)
+	// Unblock the pending Create calls and (before calling Sync!)
+	// wait for the pool to process the returned instances.
+	go lameInstanceSet.Release(3)
+	suite.wait(c, pool, notify, func() bool {
+		list, err := lameInstanceSet.Instances(nil)
+		return err == nil && len(list) == 3
+	})
+
+	c.Check(pool.Unallocated()[type1], check.Equals, 1)
+	c.Check(pool.Unallocated()[type2], check.Equals, 2)
+	pool.getInstancesAndSync()
+	c.Check(pool.Unallocated()[type1], check.Equals, 1)
+	c.Check(pool.Unallocated()[type2], check.Equals, 2)
+
+	c.Check(pool.Shutdown(type2), check.Equals, true)
+	suite.wait(c, pool, notify, func() bool {
+		return pool.Unallocated()[type1] == 1 && pool.Unallocated()[type2] == 1
+	})
+	c.Check(pool.Shutdown(type2), check.Equals, true)
+	suite.wait(c, pool, notify, func() bool {
+		return pool.Unallocated()[type1] == 1 && pool.Unallocated()[type2] == 0
+	})
+	c.Check(pool.Shutdown(type2), check.Equals, false)
+	for {
+		// Consume any waiting notifications to ensure the
+		// next one we get is from Shutdown.
+		select {
+		case <-notify:
+			continue
+		default:
+		}
+		break
+	}
+	c.Check(pool.Shutdown(type1), check.Equals, true)
+	suite.wait(c, pool, notify, func() bool {
+		return pool.Unallocated()[type1] == 0 && pool.Unallocated()[type2] == 0
+	})
+	select {
+	case <-notify2:
+	case <-time.After(time.Second):
+		c.Error("timed out waiting for notification on notify2")
+	}
+	go lameInstanceSet.Release(3) // unblock Destroy calls
+}
+
+func (suite *PoolSuite) wait(c *check.C, pool Pool, notify <-chan struct{}, ready func() bool) {
+	timeout := time.NewTimer(time.Second).C
+	for !ready() {
+		select {
+		case <-notify:
+			continue
+		case <-timeout:
+		}
+		break
+	}
+	c.Check(ready(), check.Equals, true)
+}
+
+type stubExecutor struct{}
+
+func (*stubExecutor) SetInstance(cloud.Instance) {}
+
+func (*stubExecutor) Execute(cmd string, stdin []byte) ([]byte, []byte, error) { return nil, nil, nil }
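
Not part of the commit: one way the TODO'd TestStartContainer above
could observe crunch-run invocations without a real SSH server is an
Executor stub that records every command it is asked to execute
(recordingExecutor is a hypothetical name, shown only as a sketch):

    package worker

    import (
    	"sync"

    	"git.curoverse.com/arvados.git/lib/cloud"
    )

    // recordingExecutor is a hypothetical test double: it satisfies the
    // same methods as stubExecutor but keeps a log of executed commands.
    type recordingExecutor struct {
    	mtx  sync.Mutex
    	cmds []string
    }

    func (*recordingExecutor) SetInstance(cloud.Instance) {}

    func (e *recordingExecutor) Execute(cmd string, stdin []byte) ([]byte, []byte, error) {
    	e.mtx.Lock()
    	defer e.mtx.Unlock()
    	e.cmds = append(e.cmds, cmd)
    	return nil, nil, nil
    }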
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
new file mode 100644
index 000000000..3e000c24d
--- /dev/null
+++ b/lib/dispatchcloud/worker/worker.go
@@ -0,0 +1,44 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+	"time"
+)
+
+// State indicates whether a worker is available to do work, and (if
+// not) whether/when it is expected to become ready.
+type State int
+
+const (
+	StateUnknown  State = iota // might be running a container already
+	StateBooting               // instance is booting
+	StateRunning               // instance is running
+	StateShutdown              // worker has stopped monitoring the instance
+)
+
+const (
+	// TODO: configurable
+	maxPingFailTime = 10 * time.Minute
+	maxBootTime     = 20 * time.Minute
+)
+
+var stateString = map[State]string{
+	StateUnknown:  "unknown",
+	StateBooting:  "booting",
+	StateRunning:  "running",
+	StateShutdown: "shutdown",
+}
+
+// String implements fmt.Stringer.
+func (s State) String() string {
+	return stateString[s]
+}
+
+// MarshalText implements encoding.TextMarshaler so a JSON encoding of
+// map[State]anything uses the state's string representation.
+func (s State) MarshalText() ([]byte, error) {
+	return []byte(stateString[s]), nil
+}
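
Not part of the commit: a minimal demonstration of the MarshalText
behavior described above -- JSON-encoding a map keyed by State yields
human-readable keys instead of bare integers.

    package main

    import (
    	"encoding/json"
    	"fmt"

    	"git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
    )

    func main() {
    	counts := map[worker.State]int{
    		worker.StateBooting: 1,
    		worker.StateRunning: 3,
    	}
    	buf, err := json.Marshal(counts)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println(string(buf)) // {"booting":1,"running":3}
    }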
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 6edd18418..daf927b5f 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -55,6 +55,8 @@ type Cluster struct {
 	ManagementToken    string
 	NodeProfiles       map[string]NodeProfile
 	InstanceTypes      InstanceTypeMap
+	CloudVMs           CloudVMs
+	Dispatch           Dispatch
 	HTTPRequestTimeout Duration
 	RemoteClusters     map[string]RemoteCluster
 	PostgreSQL         PostgreSQL
@@ -89,6 +91,38 @@ type InstanceType struct {
 	Preemptible  bool
 }
 
+type Dispatch struct {
+	// PEM encoded SSH key (RSA, DSA, or ECDSA) able to log in to
+	// cloud VMs.
+	PrivateKey []byte
+
+	// Max time for workers to come up before abandoning stale
+	// locks from previous run
+	StaleLockTimeout Duration
+
+	// Interval between queue polls
+	PollInterval Duration
+
+	// Interval between probes to each worker
+	ProbeInterval Duration
+
+	// Maximum total worker probes per second
+	MaxProbesPerSecond int
+}
+
+type CloudVMs struct {
+	// Shell command that exits zero IFF the VM is fully booted
+	// and ready to run containers, e.g., "mount | grep
+	// /encrypted-tmp"
+	BootProbeCommand string
+	SyncInterval     Duration
+
+	ImageID string
+
+	Driver           string
+	DriverParameters map[string]interface{}
+}
+
 type InstanceTypeMap map[string]InstanceType
 
 var errDuplicateInstanceTypeName = errors.New("duplicate instance type name")
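
Not part of the commit: a rough sketch of how the new Dispatch and
CloudVMs sections of a Cluster might be populated from Go code,
assuming arvados.Duration converts directly from time.Duration. The
probe command is taken from the field's doc comment; the image ID,
driver name, and interval values are placeholders, not defaults.

    package main

    import (
    	"fmt"
    	"time"

    	"git.curoverse.com/arvados.git/sdk/go/arvados"
    )

    func main() {
    	cluster := arvados.Cluster{
    		CloudVMs: arvados.CloudVMs{
    			BootProbeCommand: "mount | grep /encrypted-tmp",
    			SyncInterval:     arvados.Duration(time.Minute),
    			ImageID:          "dummy-image-id",    // placeholder
    			Driver:           "dummy-driver-name", // placeholder
    		},
    		Dispatch: arvados.Dispatch{
    			StaleLockTimeout:   arvados.Duration(time.Minute),
    			PollInterval:       arvados.Duration(10 * time.Second),
    			ProbeInterval:      arvados.Duration(10 * time.Second),
    			MaxProbesPerSecond: 20,
    		},
    	}
    	fmt.Println(cluster.CloudVMs.BootProbeCommand)
    }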
@@ -153,42 +187,45 @@ func (cc *Cluster) GetNodeProfile(node string) (*NodeProfile, error) {
 }
 
 type NodeProfile struct {
-	Controller  SystemServiceInstance `json:"arvados-controller"`
-	Health      SystemServiceInstance `json:"arvados-health"`
-	Keepproxy   SystemServiceInstance `json:"keepproxy"`
-	Keepstore   SystemServiceInstance `json:"keepstore"`
-	Keepweb     SystemServiceInstance `json:"keep-web"`
-	Nodemanager SystemServiceInstance `json:"arvados-node-manager"`
-	RailsAPI    SystemServiceInstance `json:"arvados-api-server"`
-	Websocket   SystemServiceInstance `json:"arvados-ws"`
-	Workbench   SystemServiceInstance `json:"arvados-workbench"`
+	Controller    SystemServiceInstance `json:"arvados-controller"`
+	Health        SystemServiceInstance `json:"arvados-health"`
+	Keepproxy     SystemServiceInstance `json:"keepproxy"`
+	Keepstore     SystemServiceInstance `json:"keepstore"`
+	Keepweb       SystemServiceInstance `json:"keep-web"`
+	Nodemanager   SystemServiceInstance `json:"arvados-node-manager"`
+	DispatchCloud SystemServiceInstance `json:"arvados-dispatch-cloud"`
+	RailsAPI      SystemServiceInstance `json:"arvados-api-server"`
+	Websocket     SystemServiceInstance `json:"arvados-ws"`
+	Workbench     SystemServiceInstance `json:"arvados-workbench"`
 }
 
 type ServiceName string
 
 const (
-	ServiceNameRailsAPI    ServiceName = "arvados-api-server"
-	ServiceNameController  ServiceName = "arvados-controller"
-	ServiceNameNodemanager ServiceName = "arvados-node-manager"
-	ServiceNameWorkbench   ServiceName = "arvados-workbench"
-	ServiceNameWebsocket   ServiceName = "arvados-ws"
-	ServiceNameKeepweb     ServiceName = "keep-web"
-	ServiceNameKeepproxy   ServiceName = "keepproxy"
-	ServiceNameKeepstore   ServiceName = "keepstore"
+	ServiceNameRailsAPI      ServiceName = "arvados-api-server"
+	ServiceNameController    ServiceName = "arvados-controller"
+	ServiceNameDispatchCloud ServiceName = "arvados-dispatch-cloud"
+	ServiceNameNodemanager   ServiceName = "arvados-node-manager"
+	ServiceNameWorkbench     ServiceName = "arvados-workbench"
+	ServiceNameWebsocket     ServiceName = "arvados-ws"
+	ServiceNameKeepweb       ServiceName = "keep-web"
+	ServiceNameKeepproxy     ServiceName = "keepproxy"
+	ServiceNameKeepstore     ServiceName = "keepstore"
 )
 
 // ServicePorts returns the configured listening address (or "" if
 // disabled) for each service on the node.
 func (np *NodeProfile) ServicePorts() map[ServiceName]string {
 	return map[ServiceName]string{
-		ServiceNameRailsAPI:    np.RailsAPI.Listen,
-		ServiceNameController:  np.Controller.Listen,
-		ServiceNameNodemanager: np.Nodemanager.Listen,
-		ServiceNameWorkbench:   np.Workbench.Listen,
-		ServiceNameWebsocket:   np.Websocket.Listen,
-		ServiceNameKeepweb:     np.Keepweb.Listen,
-		ServiceNameKeepproxy:   np.Keepproxy.Listen,
-		ServiceNameKeepstore:   np.Keepstore.Listen,
+		ServiceNameRailsAPI:      np.RailsAPI.Listen,
+		ServiceNameController:    np.Controller.Listen,
+		ServiceNameDispatchCloud: np.DispatchCloud.Listen,
+		ServiceNameNodemanager:   np.Nodemanager.Listen,
+		ServiceNameWorkbench:     np.Workbench.Listen,
+		ServiceNameWebsocket:     np.Websocket.Listen,
+		ServiceNameKeepweb:       np.Keepweb.Listen,
+		ServiceNameKeepproxy:     np.Keepproxy.Listen,
+		ServiceNameKeepstore:     np.Keepstore.Listen,
 	}
 }
 
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 2622c1370..d0f14284c 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -18,7 +18,7 @@ type Container struct {
 	Mounts               map[string]Mount     `json:"mounts"`
 	Output               string               `json:"output"`
 	OutputPath           string               `json:"output_path"`
-	Priority             int                  `json:"priority"`
+	Priority             int64                `json:"priority"`
 	RuntimeConstraints   RuntimeConstraints   `json:"runtime_constraints"`
 	State                ContainerState       `json:"state"`
 	SchedulingParameters SchedulingParameters `json:"scheduling_parameters"`
diff --git a/vendor/vendor.json b/vendor/vendor.json
index aa6b2d773..aee25beab 100644
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@@ -349,6 +349,12 @@
 			"revisionTime": "2016-12-03T19:45:07Z"
 		},
 		{
+			"checksumSHA1": "ewGq4nGalpCQOHcmBTdAEQx1wW0=",
+			"path": "github.com/mitchellh/mapstructure",
+			"revision": "bb74f1db0675b241733089d5a1faa5dd8b0ef57b",
+			"revisionTime": "2018-05-11T14:21:26Z"
+		},
+		{
 			"checksumSHA1": "OFNit1Qx2DdWhotfREKodDNUwCM=",
 			"path": "github.com/opencontainers/go-digest",
 			"revision": "279bed98673dd5bef374d3b6e4b09e2af76183bf",

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list