[ARVADOS] created: 1.1.2-131-g10f08c3

Git user git@public.curoverse.com
Thu Jan 25 19:53:05 EST 2018


        at  10f08c358c12468119dc2621c48b68d6d33417da (commit)


commit 10f08c358c12468119dc2621c48b68d6d33417da
Author: Tom Clegg <tclegg@veritasgenetics.com>
Date:   Thu Jan 25 19:13:18 2018 -0500

    12199: Pass node type to sbatch --constraint argument if configured.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

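For illustration, using values from the test cases added below: a container requesting 250000000 bytes of RAM and 2 VCPUs, on a cluster whose InstanceTypes include a1.small (256000000 RAM, 2 VCPUs), now gets the chosen type's name passed to SLURM as a node feature constraint, so the sbatch arguments come out roughly as

    --job-name=123 --mem=239 --cpus-per-task=2 --tmp=0 --nice=9990 --constraint=a1.small
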
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
index bcc8197..879cb78 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
@@ -131,6 +131,8 @@ func (cmd *command) Run(prog string, args []string) error {
 		log.Fatalf("error loading config: %s", err)
 	} else if cmd.cluster, err = siteConfig.GetCluster(""); err != nil {
 		log.Fatalf("config error: %s", err)
+	} else if len(cmd.cluster.InstanceTypes) > 0 {
+		go dispatchcloud.SlurmNodeTypeFeatureKludge(cmd.cluster)
 	}
 
 	if cmd.slurm == nil {
@@ -211,6 +213,16 @@ func (cmd *command) sbatchArgs(container arvados.Container) ([]string, error) {
 		sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ",")))
 	}
 
+	if cmd.cluster == nil {
+		// no instance types configured
+	} else if it, err := dispatchcloud.ChooseInstanceType(cmd.cluster, &container); err == dispatchcloud.ErrInstanceTypesNotConfigured {
+		// ditto
+	} else if err != nil {
+		return nil, err
+	} else {
+		sbatchArgs = append(sbatchArgs, "--constraint="+it.Name)
+	}
+
 	return sbatchArgs, nil
 }
 
@@ -243,7 +255,13 @@ func (cmd *command) run(_ *dispatch.Dispatcher, ctr arvados.Container, status <-
 	if ctr.State == dispatch.Locked && !cmd.sqCheck.HasUUID(ctr.UUID) {
 		log.Printf("Submitting container %s to slurm", ctr.UUID)
 		if err := cmd.submit(ctr, cmd.CrunchRunCommand); err != nil {
-			text := fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
+			var text string
+			if err == dispatchcloud.ErrConstraintsNotSatisfiable {
+				text = fmt.Sprintf("cannot run container %s: %s", ctr.UUID, err)
+				cmd.dispatcher.UpdateState(ctr.UUID, dispatch.Cancelled)
+			} else {
+				text = fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
+			}
 			log.Print(text)
 
 			lr := arvadosclient.Dict{"log": arvadosclient.Dict{
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 63f56f3..459b7c6 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -24,6 +24,7 @@ import (
 	"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
 	"git.curoverse.com/arvados.git/sdk/go/arvadostest"
 	"git.curoverse.com/arvados.git/sdk/go/dispatch"
+	"git.curoverse.com/arvados.git/services/dispatchcloud"
 	. "gopkg.in/check.v1"
 )
 
@@ -32,32 +33,19 @@ func Test(t *testing.T) {
 	TestingT(t)
 }
 
-var _ = Suite(&TestSuite{})
-var _ = Suite(&MockArvadosServerSuite{})
+var _ = Suite(&IntegrationSuite{})
+var _ = Suite(&StubbedSuite{})
 
-type TestSuite struct {
+type IntegrationSuite struct {
 	cmd command
 }
 
-var initialArgs []string
-
-func (s *TestSuite) SetUpSuite(c *C) {
-	initialArgs = os.Args
-}
-
-func (s *TestSuite) TearDownSuite(c *C) {
-}
-
-func (s *TestSuite) SetUpTest(c *C) {
-	args := []string{"crunch-dispatch-slurm"}
-	os.Args = args
-
+func (s *IntegrationSuite) SetUpTest(c *C) {
 	arvadostest.StartAPI()
 	os.Setenv("ARVADOS_API_TOKEN", arvadostest.Dispatch1Token)
 }
 
-func (s *TestSuite) TearDownTest(c *C) {
-	os.Args = initialArgs
+func (s *IntegrationSuite) TearDownTest(c *C) {
 	arvadostest.ResetEnv()
 	arvadostest.StopAPI()
 }
@@ -99,7 +87,7 @@ func (sf *slurmFake) Cancel(name string) error {
 	return nil
 }
 
-func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
+func (s *IntegrationSuite) integrationTest(c *C, slurm *slurmFake,
 	expectBatch [][]string,
 	runContainer func(*dispatch.Dispatcher, arvados.Container)) arvados.Container {
 	arvadostest.ResetEnv()
@@ -159,7 +147,7 @@ func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
 	return container
 }
 
-func (s *TestSuite) TestIntegrationNormal(c *C) {
+func (s *IntegrationSuite) TestNormal(c *C) {
 	container := s.integrationTest(c,
 		&slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"},
 		nil,
@@ -171,7 +159,7 @@ func (s *TestSuite) TestIntegrationNormal(c *C) {
 	c.Check(container.State, Equals, arvados.ContainerStateComplete)
 }
 
-func (s *TestSuite) TestIntegrationCancel(c *C) {
+func (s *IntegrationSuite) TestCancel(c *C) {
 	slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
 	readyToCancel := make(chan bool)
 	slurm.onCancel = func() { <-readyToCancel }
@@ -193,7 +181,7 @@ func (s *TestSuite) TestIntegrationCancel(c *C) {
 	c.Check(slurm.didCancel[:2], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "zzzzz-dz642-queuedcontainer"})
 }
 
-func (s *TestSuite) TestIntegrationMissingFromSqueue(c *C) {
+func (s *IntegrationSuite) TestMissingFromSqueue(c *C) {
 	container := s.integrationTest(c, &slurmFake{},
 		[][]string{{
 			fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
@@ -209,7 +197,7 @@ func (s *TestSuite) TestIntegrationMissingFromSqueue(c *C) {
 	c.Check(container.State, Equals, arvados.ContainerStateCancelled)
 }
 
-func (s *TestSuite) TestSbatchFail(c *C) {
+func (s *IntegrationSuite) TestSbatchFail(c *C) {
 	container := s.integrationTest(c,
 		&slurmFake{errBatch: errors.New("something terrible happened")},
 		[][]string{{"--job-name=zzzzz-dz642-queuedcontainer", "--mem=11445", "--cpus-per-task=4", "--tmp=45777", "--nice=9990"}},
@@ -230,7 +218,7 @@ func (s *TestSuite) TestSbatchFail(c *C) {
 	c.Assert(len(ll.Items), Equals, 1)
 }
 
-func (s *TestSuite) TestIntegrationChangePriority(c *C) {
+func (s *IntegrationSuite) TestChangePriority(c *C) {
 	slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
 	container := s.integrationTest(c, slurm, nil,
 		func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
@@ -248,15 +236,15 @@ func (s *TestSuite) TestIntegrationChangePriority(c *C) {
 	c.Check(slurm.didRenice[len(slurm.didRenice)-1], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "4000"})
 }
 
-type MockArvadosServerSuite struct {
+type StubbedSuite struct {
 	cmd command
 }
 
-func (s *MockArvadosServerSuite) TearDownTest(c *C) {
-	arvadostest.ResetEnv()
+func (s *StubbedSuite) SetUpTest(c *C) {
+	s.cmd = command{}
 }
 
-func (s *MockArvadosServerSuite) TestAPIErrorGettingContainers(c *C) {
+func (s *StubbedSuite) TestAPIErrorGettingContainers(c *C) {
 	apiStubResponses := make(map[string]arvadostest.StubResponse)
 	apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
 	apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
@@ -264,7 +252,7 @@ func (s *MockArvadosServerSuite) TestAPIErrorGettingContainers(c *C) {
 	s.testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
 }
 
-func (s *MockArvadosServerSuite) testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
+func (s *StubbedSuite) testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
 	apiStub := arvadostest.ServerStub{apiStubResponses}
 
 	api := httptest.NewServer(&apiStub)
@@ -312,12 +300,12 @@ func (s *MockArvadosServerSuite) testWithServerStub(c *C, apiStubResponses map[s
 	c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
 }
 
-func (s *MockArvadosServerSuite) TestNoSuchConfigFile(c *C) {
+func (s *StubbedSuite) TestNoSuchConfigFile(c *C) {
 	err := s.cmd.readConfig("/nosuchdir89j7879/8hjwr7ojgyy7")
 	c.Assert(err, NotNil)
 }
 
-func (s *MockArvadosServerSuite) TestBadSbatchArgsConfig(c *C) {
+func (s *StubbedSuite) TestBadSbatchArgsConfig(c *C) {
 	tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
 	c.Check(err, IsNil)
 	defer os.Remove(tmpfile.Name())
@@ -329,7 +317,7 @@ func (s *MockArvadosServerSuite) TestBadSbatchArgsConfig(c *C) {
 	c.Assert(err, NotNil)
 }
 
-func (s *MockArvadosServerSuite) TestNoSuchArgInConfigIgnored(c *C) {
+func (s *StubbedSuite) TestNoSuchArgInConfigIgnored(c *C) {
 	tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
 	c.Check(err, IsNil)
 	defer os.Remove(tmpfile.Name())
@@ -342,7 +330,7 @@ func (s *MockArvadosServerSuite) TestNoSuchArgInConfigIgnored(c *C) {
 	c.Check(0, Equals, len(s.cmd.SbatchArguments))
 }
 
-func (s *MockArvadosServerSuite) TestReadConfig(c *C) {
+func (s *StubbedSuite) TestReadConfig(c *C) {
 	tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
 	c.Check(err, IsNil)
 	defer os.Remove(tmpfile.Name())
@@ -357,40 +345,79 @@ func (s *MockArvadosServerSuite) TestReadConfig(c *C) {
 	c.Check(args, DeepEquals, s.cmd.SbatchArguments)
 }
 
-func (s *MockArvadosServerSuite) TestSbatchFuncWithNoConfigArgs(c *C) {
-	s.testSbatchFuncWithArgs(c, nil)
-}
-
-func (s *MockArvadosServerSuite) TestSbatchFuncWithEmptyConfigArgs(c *C) {
-	s.testSbatchFuncWithArgs(c, []string{})
-}
+func (s *StubbedSuite) TestSbatchArgs(c *C) {
+	container := arvados.Container{
+		UUID:               "123",
+		RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
+		Priority:           1,
+	}
 
-func (s *MockArvadosServerSuite) TestSbatchFuncWithConfigArgs(c *C) {
-	s.testSbatchFuncWithArgs(c, []string{"--arg1=v1", "--arg2"})
+	for _, defaults := range [][]string{
+		nil,
+		{},
+		{"--arg1=v1", "--arg2"},
+	} {
+		c.Logf("%#v", defaults)
+		s.cmd.SbatchArguments = defaults
+
+		args, err := s.cmd.sbatchArgs(container)
+		c.Check(args, DeepEquals, append(defaults, "--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990"))
+		c.Check(err, IsNil)
+	}
 }
 
-func (s *MockArvadosServerSuite) testSbatchFuncWithArgs(c *C, args []string) {
-	s.cmd.SbatchArguments = append([]string(nil), args...)
-
+func (s *StubbedSuite) TestSbatchInstanceTypeConstraint(c *C) {
 	container := arvados.Container{
 		UUID:               "123",
 		RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
-		Priority:           1}
+		Priority:           1,
+	}
 
-	var expected []string
-	expected = append(expected, s.cmd.SbatchArguments...)
-	expected = append(expected, "--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990")
-	args, err := s.cmd.sbatchArgs(container)
-	c.Check(args, DeepEquals, expected)
-	c.Check(err, IsNil)
+	for _, trial := range []struct {
+		types      []arvados.InstanceType
+		sbatchArgs []string
+		err        error
+	}{
+		// Choose node type => use --constraint arg
+		{
+			types: []arvados.InstanceType{
+				{Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
+				{Name: "a1.small", Price: 0.04, RAM: 256000000, VCPUs: 2},
+				{Name: "a1.medium", Price: 0.08, RAM: 512000000, VCPUs: 4},
+			},
+			sbatchArgs: []string{"--constraint=a1.small"},
+		},
+		// No node types configured => no slurm constraint
+		{
+			types:      nil,
+			sbatchArgs: nil,
+		},
+		// No node type is big enough => error
+		{
+			types: []arvados.InstanceType{
+				{Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
+			},
+			err: dispatchcloud.ErrConstraintsNotSatisfiable,
+		},
+	} {
+		c.Logf("%#v", trial)
+		s.cmd.cluster = &arvados.Cluster{InstanceTypes: trial.types}
+
+		args, err := s.cmd.sbatchArgs(container)
+		c.Check(err, Equals, trial.err)
+		if trial.err == nil {
+			c.Check(args, DeepEquals, append([]string{"--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990"}, trial.sbatchArgs...))
+		}
+	}
 }
 
-func (s *MockArvadosServerSuite) TestSbatchPartition(c *C) {
+func (s *StubbedSuite) TestSbatchPartition(c *C) {
 	container := arvados.Container{
 		UUID:                 "123",
 		RuntimeConstraints:   arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 1},
 		SchedulingParameters: arvados.SchedulingParameters{Partitions: []string{"blurb", "b2"}},
-		Priority:             1}
+		Priority:             1,
+	}
 
 	args, err := s.cmd.sbatchArgs(container)
 	c.Check(args, DeepEquals, []string{

commit 0c9586ca048805b404dd762f5cd7cabe5d1ed227
Author: Tom Clegg <tclegg@veritasgenetics.com>
Date:   Thu Jan 25 19:12:49 2018 -0500

    12199: Refactor to eliminate evil globals.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
index ae2ca58..bcc8197 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
@@ -21,13 +21,18 @@ import (
 	"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
 	"git.curoverse.com/arvados.git/sdk/go/config"
 	"git.curoverse.com/arvados.git/sdk/go/dispatch"
+	"git.curoverse.com/arvados.git/services/dispatchcloud"
 	"github.com/coreos/go-systemd/daemon"
 )
 
 var version = "dev"
 
-// Config used by crunch-dispatch-slurm
-type Config struct {
+type command struct {
+	dispatcher *dispatch.Dispatcher
+	cluster    *arvados.Cluster
+	sqCheck    *SqueueChecker
+	slurm      Slurm
+
 	Client arvados.Client
 
 	SbatchArguments []string
@@ -41,27 +46,19 @@ type Config struct {
 
 	// Minimum time between two attempts to run the same container
 	MinRetryPeriod arvados.Duration
-
-	slurm Slurm
 }
 
 func main() {
-	theConfig.slurm = &slurmCLI{}
-	err := doMain()
+	err := (&command{}).Run(os.Args[0], os.Args[1:])
 	if err != nil {
 		log.Fatal(err)
 	}
 }
 
-var (
-	theConfig Config
-	sqCheck   = &SqueueChecker{}
-)
-
 const defaultConfigPath = "/etc/arvados/crunch-dispatch-slurm/crunch-dispatch-slurm.yml"
 
-func doMain() error {
-	flags := flag.NewFlagSet("crunch-dispatch-slurm", flag.ExitOnError)
+func (cmd *command) Run(prog string, args []string) error {
+	flags := flag.NewFlagSet(prog, flag.ExitOnError)
 	flags.Usage = func() { usage(flags) }
 
 	configPath := flags.String(
@@ -77,7 +74,7 @@ func doMain() error {
 		false,
 		"Print version information and exit.")
 	// Parse args; omit the first arg which is the command name
-	flags.Parse(os.Args[1:])
+	flags.Parse(args)
 
 	// Print version information if requested
 	if *getVersion {
@@ -87,37 +84,37 @@ func doMain() error {
 
 	log.Printf("crunch-dispatch-slurm %s started", version)
 
-	err := readConfig(&theConfig, *configPath)
+	err := cmd.readConfig(*configPath)
 	if err != nil {
 		return err
 	}
 
-	if theConfig.CrunchRunCommand == nil {
-		theConfig.CrunchRunCommand = []string{"crunch-run"}
+	if cmd.CrunchRunCommand == nil {
+		cmd.CrunchRunCommand = []string{"crunch-run"}
 	}
 
-	if theConfig.PollPeriod == 0 {
-		theConfig.PollPeriod = arvados.Duration(10 * time.Second)
+	if cmd.PollPeriod == 0 {
+		cmd.PollPeriod = arvados.Duration(10 * time.Second)
 	}
 
-	if theConfig.Client.APIHost != "" || theConfig.Client.AuthToken != "" {
+	if cmd.Client.APIHost != "" || cmd.Client.AuthToken != "" {
 		// Copy real configs into env vars so [a]
 		// MakeArvadosClient() uses them, and [b] they get
 		// propagated to crunch-run via SLURM.
-		os.Setenv("ARVADOS_API_HOST", theConfig.Client.APIHost)
-		os.Setenv("ARVADOS_API_TOKEN", theConfig.Client.AuthToken)
+		os.Setenv("ARVADOS_API_HOST", cmd.Client.APIHost)
+		os.Setenv("ARVADOS_API_TOKEN", cmd.Client.AuthToken)
 		os.Setenv("ARVADOS_API_HOST_INSECURE", "")
-		if theConfig.Client.Insecure {
+		if cmd.Client.Insecure {
 			os.Setenv("ARVADOS_API_HOST_INSECURE", "1")
 		}
-		os.Setenv("ARVADOS_KEEP_SERVICES", strings.Join(theConfig.Client.KeepServiceURIs, " "))
+		os.Setenv("ARVADOS_KEEP_SERVICES", strings.Join(cmd.Client.KeepServiceURIs, " "))
 		os.Setenv("ARVADOS_EXTERNAL_CLIENT", "")
 	} else {
 		log.Printf("warning: Client credentials missing from config, so falling back on environment variables (deprecated).")
 	}
 
 	if *dumpConfig {
-		log.Fatal(config.DumpAndExit(theConfig))
+		log.Fatal(config.DumpAndExit(cmd))
 	}
 
 	arv, err := arvadosclient.MakeArvadosClient()
@@ -127,23 +124,39 @@ func doMain() error {
 	}
 	arv.Retries = 25
 
-	sqCheck = &SqueueChecker{Period: time.Duration(theConfig.PollPeriod)}
-	defer sqCheck.Stop()
+	siteConfig, err := arvados.GetConfig(arvados.DefaultConfigFile)
+	if os.IsNotExist(err) {
+		log.Printf("warning: no cluster config file %q (%s), proceeding with no node types defined", arvados.DefaultConfigFile, err)
+	} else if err != nil {
+		log.Fatalf("error loading config: %s", err)
+	} else if cmd.cluster, err = siteConfig.GetCluster(""); err != nil {
+		log.Fatalf("config error: %s", err)
+	}
+
+	if cmd.slurm == nil {
+		cmd.slurm = &slurmCLI{}
+	}
+
+	cmd.sqCheck = &SqueueChecker{
+		Period: time.Duration(cmd.PollPeriod),
+		Slurm:  cmd.slurm,
+	}
+	defer cmd.sqCheck.Stop()
 
-	dispatcher := &dispatch.Dispatcher{
+	cmd.dispatcher = &dispatch.Dispatcher{
 		Arv:            arv,
-		RunContainer:   run,
-		PollPeriod:     time.Duration(theConfig.PollPeriod),
-		MinRetryPeriod: time.Duration(theConfig.MinRetryPeriod),
+		RunContainer:   cmd.run,
+		PollPeriod:     time.Duration(cmd.PollPeriod),
+		MinRetryPeriod: time.Duration(cmd.MinRetryPeriod),
 	}
 
 	if _, err := daemon.SdNotify(false, "READY=1"); err != nil {
 		log.Printf("Error notifying init daemon: %v", err)
 	}
 
-	go checkSqueueForOrphans(dispatcher, sqCheck)
+	go cmd.checkSqueueForOrphans()
 
-	return dispatcher.Run(context.Background())
+	return cmd.dispatcher.Run(context.Background())
 }
 
 var containerUuidPattern = regexp.MustCompile(`^[a-z0-9]{5}-dz642-[a-z0-9]{15}$`)
@@ -153,19 +166,19 @@ var containerUuidPattern = regexp.MustCompile(`^[a-z0-9]{5}-dz642-[a-z0-9]{15}$`
 // jobs started by a previous dispatch process that never released
 // their slurm allocations even though their container states are
 // Cancelled or Complete. See https://dev.arvados.org/issues/10979
-func checkSqueueForOrphans(dispatcher *dispatch.Dispatcher, sqCheck *SqueueChecker) {
-	for _, uuid := range sqCheck.All() {
+func (cmd *command) checkSqueueForOrphans() {
+	for _, uuid := range cmd.sqCheck.All() {
 		if !containerUuidPattern.MatchString(uuid) {
 			continue
 		}
-		err := dispatcher.TrackContainer(uuid)
+		err := cmd.dispatcher.TrackContainer(uuid)
 		if err != nil {
 			log.Printf("checkSqueueForOrphans: TrackContainer(%s): %s", uuid, err)
 		}
 	}
 }
 
-func niceness(priority int) int {
+func (cmd *command) niceness(priority int) int {
 	if priority > 1000 {
 		priority = 1000
 	}
@@ -176,7 +189,7 @@ func niceness(priority int) int {
 	return (1000 - priority) * 10
 }
 
-func sbatchArgs(container arvados.Container) []string {
+func (cmd *command) sbatchArgs(container arvados.Container) ([]string, error) {
 	mem := int64(math.Ceil(float64(container.RuntimeConstraints.RAM+container.RuntimeConstraints.KeepCacheRAM) / float64(1048576)))
 
 	var disk int64
@@ -188,45 +201,48 @@ func sbatchArgs(container arvados.Container) []string {
 	disk = int64(math.Ceil(float64(disk) / float64(1048576)))
 
 	var sbatchArgs []string
-	sbatchArgs = append(sbatchArgs, theConfig.SbatchArguments...)
+	sbatchArgs = append(sbatchArgs, cmd.SbatchArguments...)
 	sbatchArgs = append(sbatchArgs, fmt.Sprintf("--job-name=%s", container.UUID))
 	sbatchArgs = append(sbatchArgs, fmt.Sprintf("--mem=%d", mem))
 	sbatchArgs = append(sbatchArgs, fmt.Sprintf("--cpus-per-task=%d", container.RuntimeConstraints.VCPUs))
 	sbatchArgs = append(sbatchArgs, fmt.Sprintf("--tmp=%d", disk))
-	sbatchArgs = append(sbatchArgs, fmt.Sprintf("--nice=%d", niceness(container.Priority)))
+	sbatchArgs = append(sbatchArgs, fmt.Sprintf("--nice=%d", cmd.niceness(container.Priority)))
 	if len(container.SchedulingParameters.Partitions) > 0 {
 		sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ",")))
 	}
 
-	return sbatchArgs
+	return sbatchArgs, nil
 }
 
-func submit(dispatcher *dispatch.Dispatcher, container arvados.Container, crunchRunCommand []string) error {
+func (cmd *command) submit(container arvados.Container, crunchRunCommand []string) error {
 	// append() here avoids modifying crunchRunCommand's
 	// underlying array, which is shared with other goroutines.
 	crArgs := append([]string(nil), crunchRunCommand...)
 	crArgs = append(crArgs, container.UUID)
 	crScript := strings.NewReader(execScript(crArgs))
 
-	sqCheck.L.Lock()
-	defer sqCheck.L.Unlock()
+	cmd.sqCheck.L.Lock()
+	defer cmd.sqCheck.L.Unlock()
 
-	sbArgs := sbatchArgs(container)
+	sbArgs, err := cmd.sbatchArgs(container)
+	if err != nil {
+		return err
+	}
 	log.Printf("running sbatch %+q", sbArgs)
-	return theConfig.slurm.Batch(crScript, sbArgs)
+	return cmd.slurm.Batch(crScript, sbArgs)
 }
 
 // Submit a container to the slurm queue (or resume monitoring if it's
 // already in the queue).  Cancel the slurm job if the container's
 // priority changes to zero or its state indicates it's no longer
 // running.
-func run(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+func (cmd *command) run(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 
-	if ctr.State == dispatch.Locked && !sqCheck.HasUUID(ctr.UUID) {
+	if ctr.State == dispatch.Locked && !cmd.sqCheck.HasUUID(ctr.UUID) {
 		log.Printf("Submitting container %s to slurm", ctr.UUID)
-		if err := submit(disp, ctr, theConfig.CrunchRunCommand); err != nil {
+		if err := cmd.submit(ctr, cmd.CrunchRunCommand); err != nil {
 			text := fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
 			log.Print(text)
 
@@ -234,9 +250,9 @@ func run(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados
 				"object_uuid": ctr.UUID,
 				"event_type":  "dispatch",
 				"properties":  map[string]string{"text": text}}}
-			disp.Arv.Create("logs", lr, nil)
+			cmd.dispatcher.Arv.Create("logs", lr, nil)
 
-			disp.Unlock(ctr.UUID)
+			cmd.dispatcher.Unlock(ctr.UUID)
 			return
 		}
 	}
@@ -248,7 +264,7 @@ func run(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados
 	// no point in waiting for further dispatch updates: just
 	// clean up and return.
 	go func(uuid string) {
-		for ctx.Err() == nil && sqCheck.HasUUID(uuid) {
+		for ctx.Err() == nil && cmd.sqCheck.HasUUID(uuid) {
 		}
 		cancel()
 	}(ctr.UUID)
@@ -257,68 +273,68 @@ func run(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados
 		select {
 		case <-ctx.Done():
 			// Disappeared from squeue
-			if err := disp.Arv.Get("containers", ctr.UUID, nil, &ctr); err != nil {
+			if err := cmd.dispatcher.Arv.Get("containers", ctr.UUID, nil, &ctr); err != nil {
 				log.Printf("Error getting final container state for %s: %s", ctr.UUID, err)
 			}
 			switch ctr.State {
 			case dispatch.Running:
-				disp.UpdateState(ctr.UUID, dispatch.Cancelled)
+				cmd.dispatcher.UpdateState(ctr.UUID, dispatch.Cancelled)
 			case dispatch.Locked:
-				disp.Unlock(ctr.UUID)
+				cmd.dispatcher.Unlock(ctr.UUID)
 			}
 			return
 		case updated, ok := <-status:
 			if !ok {
 				log.Printf("Dispatcher says container %s is done: cancel slurm job", ctr.UUID)
-				scancel(ctr)
+				cmd.scancel(ctr)
 			} else if updated.Priority == 0 {
 				log.Printf("Container %s has state %q, priority %d: cancel slurm job", ctr.UUID, updated.State, updated.Priority)
-				scancel(ctr)
+				cmd.scancel(ctr)
 			} else {
-				renice(updated)
+				cmd.renice(updated)
 			}
 		}
 	}
 }
 
-func scancel(ctr arvados.Container) {
-	sqCheck.L.Lock()
-	err := theConfig.slurm.Cancel(ctr.UUID)
-	sqCheck.L.Unlock()
+func (cmd *command) scancel(ctr arvados.Container) {
+	cmd.sqCheck.L.Lock()
+	err := cmd.slurm.Cancel(ctr.UUID)
+	cmd.sqCheck.L.Unlock()
 
 	if err != nil {
 		log.Printf("scancel: %s", err)
 		time.Sleep(time.Second)
-	} else if sqCheck.HasUUID(ctr.UUID) {
+	} else if cmd.sqCheck.HasUUID(ctr.UUID) {
 		log.Printf("container %s is still in squeue after scancel", ctr.UUID)
 		time.Sleep(time.Second)
 	}
 }
 
-func renice(ctr arvados.Container) {
-	nice := niceness(ctr.Priority)
-	oldnice := sqCheck.GetNiceness(ctr.UUID)
+func (cmd *command) renice(ctr arvados.Container) {
+	nice := cmd.niceness(ctr.Priority)
+	oldnice := cmd.sqCheck.GetNiceness(ctr.UUID)
 	if nice == oldnice || oldnice == -1 {
 		return
 	}
 	log.Printf("updating slurm nice value to %d (was %d)", nice, oldnice)
-	sqCheck.L.Lock()
-	err := theConfig.slurm.Renice(ctr.UUID, nice)
-	sqCheck.L.Unlock()
+	cmd.sqCheck.L.Lock()
+	err := cmd.slurm.Renice(ctr.UUID, nice)
+	cmd.sqCheck.L.Unlock()
 
 	if err != nil {
 		log.Printf("renice: %s", err)
 		time.Sleep(time.Second)
 		return
 	}
-	if sqCheck.HasUUID(ctr.UUID) {
+	if cmd.sqCheck.HasUUID(ctr.UUID) {
 		log.Printf("container %s has arvados priority %d, slurm nice %d",
-			ctr.UUID, ctr.Priority, sqCheck.GetNiceness(ctr.UUID))
+			ctr.UUID, ctr.Priority, cmd.sqCheck.GetNiceness(ctr.UUID))
 	}
 }
 
-func readConfig(dst interface{}, path string) error {
-	err := config.LoadFile(dst, path)
+func (cmd *command) readConfig(path string) error {
+	err := config.LoadFile(cmd, path)
 	if err != nil && os.IsNotExist(err) && path == defaultConfigPath {
 		log.Printf("Config not specified. Continue with default configuration.")
 		err = nil
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 830976d..63f56f3 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -35,8 +35,9 @@ func Test(t *testing.T) {
 var _ = Suite(&TestSuite{})
 var _ = Suite(&MockArvadosServerSuite{})
 
-type TestSuite struct{}
-type MockArvadosServerSuite struct{}
+type TestSuite struct {
+	cmd command
+}
 
 var initialArgs []string
 
@@ -61,10 +62,6 @@ func (s *TestSuite) TearDownTest(c *C) {
 	arvadostest.StopAPI()
 }
 
-func (s *MockArvadosServerSuite) TearDownTest(c *C) {
-	arvadostest.ResetEnv()
-}
-
 type slurmFake struct {
 	didBatch  [][]string
 	didCancel []string
@@ -110,10 +107,7 @@ func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
 	arv, err := arvadosclient.MakeArvadosClient()
 	c.Assert(err, IsNil)
 
-	defer func(orig Slurm) {
-		theConfig.slurm = orig
-	}(theConfig.slurm)
-	theConfig.slurm = slurm
+	s.cmd.slurm = slurm
 
 	// There should be one queued container
 	params := arvadosclient.Dict{
@@ -124,12 +118,12 @@ func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
 	c.Check(err, IsNil)
 	c.Check(len(containers.Items), Equals, 1)
 
-	theConfig.CrunchRunCommand = []string{"echo"}
+	s.cmd.CrunchRunCommand = []string{"echo"}
 
 	ctx, cancel := context.WithCancel(context.Background())
 	doneRun := make(chan struct{})
 
-	dispatcher := dispatch.Dispatcher{
+	s.cmd.dispatcher = &dispatch.Dispatcher{
 		Arv:        arv,
 		PollPeriod: time.Duration(1) * time.Second,
 		RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
@@ -138,18 +132,18 @@ func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
 				slurm.queue = ""
 				doneRun <- struct{}{}
 			}()
-			run(disp, ctr, status)
+			s.cmd.run(disp, ctr, status)
 			cancel()
 		},
 	}
 
-	sqCheck = &SqueueChecker{Period: 500 * time.Millisecond}
+	s.cmd.sqCheck = &SqueueChecker{Period: 500 * time.Millisecond, Slurm: slurm}
 
-	err = dispatcher.Run(ctx)
+	err = s.cmd.dispatcher.Run(ctx)
 	<-doneRun
 	c.Assert(err, Equals, context.Canceled)
 
-	sqCheck.Stop()
+	s.cmd.sqCheck.Stop()
 
 	c.Check(slurm.didBatch, DeepEquals, expectBatch)
 
@@ -236,15 +230,41 @@ func (s *TestSuite) TestSbatchFail(c *C) {
 	c.Assert(len(ll.Items), Equals, 1)
 }
 
+func (s *TestSuite) TestIntegrationChangePriority(c *C) {
+	slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
+	container := s.integrationTest(c, slurm, nil,
+		func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
+			dispatcher.UpdateState(container.UUID, dispatch.Running)
+			time.Sleep(time.Second)
+			dispatcher.Arv.Update("containers", container.UUID,
+				arvadosclient.Dict{
+					"container": arvadosclient.Dict{"priority": 600}},
+				nil)
+			time.Sleep(time.Second)
+			dispatcher.UpdateState(container.UUID, dispatch.Complete)
+		})
+	c.Check(container.State, Equals, arvados.ContainerStateComplete)
+	c.Assert(len(slurm.didRenice), Not(Equals), 0)
+	c.Check(slurm.didRenice[len(slurm.didRenice)-1], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "4000"})
+}
+
+type MockArvadosServerSuite struct {
+	cmd command
+}
+
+func (s *MockArvadosServerSuite) TearDownTest(c *C) {
+	arvadostest.ResetEnv()
+}
+
 func (s *MockArvadosServerSuite) TestAPIErrorGettingContainers(c *C) {
 	apiStubResponses := make(map[string]arvadostest.StubResponse)
 	apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
 	apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
 
-	testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
+	s.testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
 }
 
-func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
+func (s *MockArvadosServerSuite) testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
 	apiStub := arvadostest.ServerStub{apiStubResponses}
 
 	api := httptest.NewServer(&apiStub)
@@ -262,7 +282,7 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
 	log.SetOutput(io.MultiWriter(buf, os.Stderr))
 	defer log.SetOutput(os.Stderr)
 
-	theConfig.CrunchRunCommand = []string{crunchCmd}
+	s.cmd.CrunchRunCommand = []string{crunchCmd}
 
 	ctx, cancel := context.WithCancel(context.Background())
 	dispatcher := dispatch.Dispatcher{
@@ -274,7 +294,7 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
 				disp.UpdateState(ctr.UUID, dispatch.Running)
 				disp.UpdateState(ctr.UUID, dispatch.Complete)
 			}()
-			run(disp, ctr, status)
+			s.cmd.run(disp, ctr, status)
 			cancel()
 		},
 	}
@@ -293,14 +313,11 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
 }
 
 func (s *MockArvadosServerSuite) TestNoSuchConfigFile(c *C) {
-	var config Config
-	err := readConfig(&config, "/nosuchdir89j7879/8hjwr7ojgyy7")
+	err := s.cmd.readConfig("/nosuchdir89j7879/8hjwr7ojgyy7")
 	c.Assert(err, NotNil)
 }
 
 func (s *MockArvadosServerSuite) TestBadSbatchArgsConfig(c *C) {
-	var config Config
-
 	tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
 	c.Check(err, IsNil)
 	defer os.Remove(tmpfile.Name())
@@ -308,13 +325,11 @@ func (s *MockArvadosServerSuite) TestBadSbatchArgsConfig(c *C) {
 	_, err = tmpfile.Write([]byte(`{"SbatchArguments": "oops this is not a string array"}`))
 	c.Check(err, IsNil)
 
-	err = readConfig(&config, tmpfile.Name())
+	err = s.cmd.readConfig(tmpfile.Name())
 	c.Assert(err, NotNil)
 }
 
 func (s *MockArvadosServerSuite) TestNoSuchArgInConfigIgnored(c *C) {
-	var config Config
-
 	tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
 	c.Check(err, IsNil)
 	defer os.Remove(tmpfile.Name())
@@ -322,14 +337,12 @@ func (s *MockArvadosServerSuite) TestNoSuchArgInConfigIgnored(c *C) {
 	_, err = tmpfile.Write([]byte(`{"NoSuchArg": "Nobody loves me, not one tiny hunk."}`))
 	c.Check(err, IsNil)
 
-	err = readConfig(&config, tmpfile.Name())
+	err = s.cmd.readConfig(tmpfile.Name())
 	c.Assert(err, IsNil)
-	c.Check(0, Equals, len(config.SbatchArguments))
+	c.Check(0, Equals, len(s.cmd.SbatchArguments))
 }
 
 func (s *MockArvadosServerSuite) TestReadConfig(c *C) {
-	var config Config
-
 	tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
 	c.Check(err, IsNil)
 	defer os.Remove(tmpfile.Name())
@@ -339,27 +352,25 @@ func (s *MockArvadosServerSuite) TestReadConfig(c *C) {
 	_, err = tmpfile.Write([]byte(argsS))
 	c.Check(err, IsNil)
 
-	err = readConfig(&config, tmpfile.Name())
+	err = s.cmd.readConfig(tmpfile.Name())
 	c.Assert(err, IsNil)
-	c.Check(3, Equals, len(config.SbatchArguments))
-	c.Check(args, DeepEquals, config.SbatchArguments)
+	c.Check(args, DeepEquals, s.cmd.SbatchArguments)
 }
 
 func (s *MockArvadosServerSuite) TestSbatchFuncWithNoConfigArgs(c *C) {
-	testSbatchFuncWithArgs(c, nil)
+	s.testSbatchFuncWithArgs(c, nil)
 }
 
 func (s *MockArvadosServerSuite) TestSbatchFuncWithEmptyConfigArgs(c *C) {
-	testSbatchFuncWithArgs(c, []string{})
+	s.testSbatchFuncWithArgs(c, []string{})
 }
 
 func (s *MockArvadosServerSuite) TestSbatchFuncWithConfigArgs(c *C) {
-	testSbatchFuncWithArgs(c, []string{"--arg1=v1", "--arg2"})
+	s.testSbatchFuncWithArgs(c, []string{"--arg1=v1", "--arg2"})
 }
 
-func testSbatchFuncWithArgs(c *C, args []string) {
-	defer func() { theConfig.SbatchArguments = nil }()
-	theConfig.SbatchArguments = append(theConfig.SbatchArguments, args...)
+func (s *MockArvadosServerSuite) testSbatchFuncWithArgs(c *C, args []string) {
+	s.cmd.SbatchArguments = append([]string(nil), args...)
 
 	container := arvados.Container{
 		UUID:               "123",
@@ -367,9 +378,11 @@ func testSbatchFuncWithArgs(c *C, args []string) {
 		Priority:           1}
 
 	var expected []string
-	expected = append(expected, theConfig.SbatchArguments...)
+	expected = append(expected, s.cmd.SbatchArguments...)
 	expected = append(expected, "--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990")
-	c.Check(sbatchArgs(container), DeepEquals, expected)
+	args, err := s.cmd.sbatchArgs(container)
+	c.Check(args, DeepEquals, expected)
+	c.Check(err, IsNil)
 }
 
 func (s *MockArvadosServerSuite) TestSbatchPartition(c *C) {
@@ -379,26 +392,10 @@ func (s *MockArvadosServerSuite) TestSbatchPartition(c *C) {
 		SchedulingParameters: arvados.SchedulingParameters{Partitions: []string{"blurb", "b2"}},
 		Priority:             1}
 
-	c.Check(sbatchArgs(container), DeepEquals, []string{
+	args, err := s.cmd.sbatchArgs(container)
+	c.Check(args, DeepEquals, []string{
 		"--job-name=123", "--mem=239", "--cpus-per-task=1", "--tmp=0", "--nice=9990",
 		"--partition=blurb,b2",
 	})
-}
-
-func (s *TestSuite) TestIntegrationChangePriority(c *C) {
-	slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
-	container := s.integrationTest(c, slurm, nil,
-		func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
-			dispatcher.UpdateState(container.UUID, dispatch.Running)
-			time.Sleep(time.Second)
-			dispatcher.Arv.Update("containers", container.UUID,
-				arvadosclient.Dict{
-					"container": arvadosclient.Dict{"priority": 600}},
-				nil)
-			time.Sleep(time.Second)
-			dispatcher.UpdateState(container.UUID, dispatch.Complete)
-		})
-	c.Check(container.State, Equals, arvados.ContainerStateComplete)
-	c.Assert(len(slurm.didRenice), Not(Equals), 0)
-	c.Check(slurm.didRenice[len(slurm.didRenice)-1], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "4000"})
+	c.Check(err, IsNil)
 }
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 5ecfe8f..adb620e 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -22,6 +22,7 @@ type jobPriority struct {
 // command 'squeue'.
 type SqueueChecker struct {
 	Period    time.Duration
+	Slurm     Slurm
 	uuids     map[string]jobPriority
 	startOnce sync.Once
 	done      chan struct{}
@@ -77,7 +78,7 @@ func (sqc *SqueueChecker) check() {
 	sqc.L.Lock()
 	defer sqc.L.Unlock()
 
-	cmd := theConfig.slurm.QueueCommand([]string{"--all", "--format=%j %y %Q"})
+	cmd := sqc.Slurm.QueueCommand([]string{"--all", "--format=%j %y %Q"})
 	stdout, stderr := &bytes.Buffer{}, &bytes.Buffer{}
 	cmd.Stdout, cmd.Stderr = stdout, stderr
 	if err := cmd.Run(); err != nil {

commit bfa436c97990c1a0cf39907acc0235a8535b6a43
Author: Tom Clegg <tclegg@veritasgenetics.com>
Date:   Thu Jan 25 19:11:32 2018 -0500

    12199: Add SlurmNodeTypeFeatureKludge.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

diff --git a/services/dispatchcloud/node_size.go b/services/dispatchcloud/node_size.go
index 1abccc5..9cb6d3f 100644
--- a/services/dispatchcloud/node_size.go
+++ b/services/dispatchcloud/node_size.go
@@ -5,7 +5,12 @@
 package dispatchcloud
 
 import (
+	"bytes"
 	"errors"
+	"log"
+	"os/exec"
+	"strings"
+	"time"
 
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
 )
@@ -42,3 +47,67 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
 	}
 	return
 }
+
+// SlurmNodeTypeFeatureKludge ensures SLURM accepts every instance
+// type name as a valid feature name, even if no instances of that
+// type have appeared yet.
+//
+// It takes advantage of some SLURM peculiarities:
+//
+// (1) A feature is valid after it has been offered by a node, even if
+// it is no longer offered by any node. So, to make a feature name
+// valid, we can add it to a dummy node ("compute0"), then remove it.
+//
+// (2) When srun is given an invalid --gres argument and an invalid
+// --constraint argument, the error message mentions "Invalid feature
+// specification". So, to test whether a feature name is valid without
+// actually submitting a job, we can call srun with the feature name
+// and an invalid --gres argument.
+//
+// SlurmNodeTypeFeatureKludge does a test-and-fix operation
+// immediately, and then periodically, in case slurm restarts and
+// forgets the list of valid features. It never returns, so it should
+// generally be invoked with "go".
+func SlurmNodeTypeFeatureKludge(cc *arvados.Cluster) {
+	var types []string
+	for _, it := range cc.InstanceTypes {
+		types = append(types, it.Name)
+	}
+	for {
+		slurmKludge(types)
+		time.Sleep(time.Minute)
+	}
+}
+
+var (
+	slurmDummyNode     = "compute0"
+	slurmErrBadFeature = "Invalid feature"
+	slurmErrBadGres    = "Invalid generic resource"
+)
+
+func slurmKludge(types []string) {
+	cmd := exec.Command("srun", "--gres=invalid-gres-specification", "--constraint="+strings.Join(types, "&"), "true")
+	out, err := cmd.CombinedOutput()
+	switch {
+	case err == nil:
+		log.Printf("warning: guaranteed-to-fail srun command did not fail: %q %q", cmd.Path, cmd.Args)
+		log.Printf("output was: %q", out)
+
+	case bytes.Contains(out, []byte(slurmErrBadFeature)):
+		log.Printf("temporarily configuring node %q with all node type features", slurmDummyNode)
+		for _, features := range []string{strings.Join(types, ","), ""} {
+			cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+features)
+			log.Printf("running: %q %q", cmd.Path, cmd.Args)
+			out, err := cmd.CombinedOutput()
+			if err != nil {
+				log.Printf("error: scontrol: %s (output was %q)", err, out)
+			}
+		}
+
+	case bytes.Contains(out, []byte(slurmErrBadGres)):
+		// Evidently our node-type feature names are all valid.
+
+	default:
+		log.Printf("warning: expected srun error %q or %q, but output was %q", slurmErrBadFeature, slurmErrBadGres, out)
+	}
+}

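As a concrete sketch of what slurmKludge runs (assuming two configured type names, a1.tiny and a1.small, and the default dummy node "compute0"; the code passes these arguments directly via exec.Command rather than through a shell):

    srun --gres=invalid-gres-specification --constraint=a1.tiny&a1.small true
    scontrol update NodeName=compute0 Features=a1.tiny,a1.small
    scontrol update NodeName=compute0 Features=

The two scontrol commands only run when srun's output contains "Invalid feature"; if it contains "Invalid generic resource" instead, the feature names are already known to SLURM and nothing is changed.
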
commit f9e1447e5ab21dbc0fdb08328c31811fca2fd327
Author: Tom Clegg <tclegg@veritasgenetics.com>
Date:   Tue Jan 23 14:25:21 2018 -0500

    12199: Add node size calculator.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

diff --git a/build/run-tests.sh b/build/run-tests.sh
index a02f732..57ce41e 100755
--- a/build/run-tests.sh
+++ b/build/run-tests.sh
@@ -77,6 +77,7 @@ lib/crunchstat
 services/api
 services/arv-git-httpd
 services/crunchstat
+services/dispatchcloud
 services/dockercleaner
 services/fuse
 services/health
@@ -869,6 +870,7 @@ gostuff=(
     sdk/go/stats
     services/arv-git-httpd
     services/crunchstat
+    services/dispatchcloud
     services/health
     services/keep-web
     services/keepstore
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index ca0df1f..9ed0eac 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -1,3 +1,7 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 package arvados
 
 import (
@@ -48,6 +52,16 @@ type Cluster struct {
 	ClusterID       string `json:"-"`
 	ManagementToken string
 	SystemNodes     map[string]SystemNode
+	InstanceTypes   []InstanceType
+}
+
+type InstanceType struct {
+	Name         string
+	ProviderType string
+	VCPUs        int
+	RAM          int64
+	Scratch      int64
+	Price        float64
 }
 
 // GetThisSystemNode returns a SystemNode for the node we're running
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index a541a8d..20d007c 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -41,9 +41,9 @@ type Mount struct {
 // CPU) and network connectivity.
 type RuntimeConstraints struct {
 	API          *bool
-	RAM          int `json:"ram"`
-	VCPUs        int `json:"vcpus"`
-	KeepCacheRAM int `json:"keep_cache_ram"`
+	RAM          int64 `json:"ram"`
+	VCPUs        int   `json:"vcpus"`
+	KeepCacheRAM int64 `json:"keep_cache_ram"`
 }
 
 // SchedulingParameters specify a container's scheduling parameters
diff --git a/services/dispatchcloud/gocheck_test.go b/services/dispatchcloud/gocheck_test.go
new file mode 100644
index 0000000..22f89f0
--- /dev/null
+++ b/services/dispatchcloud/gocheck_test.go
@@ -0,0 +1,16 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+	"testing"
+
+	check "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+	check.TestingT(t)
+}
diff --git a/services/dispatchcloud/node_size.go b/services/dispatchcloud/node_size.go
new file mode 100644
index 0000000..1abccc5
--- /dev/null
+++ b/services/dispatchcloud/node_size.go
@@ -0,0 +1,44 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+	"errors"
+
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+var (
+	ErrConstraintsNotSatisfiable  = errors.New("constraints not satisfiable by any configured instance type")
+	ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types")
+)
+
+// ChooseInstanceType returns the cheapest available
+// arvados.InstanceType big enough to run ctr.
+func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) {
+	needVCPUs := ctr.RuntimeConstraints.VCPUs
+	needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM
+
+	if len(cc.InstanceTypes) == 0 {
+		err = ErrInstanceTypesNotConfigured
+		return
+	}
+
+	err = ErrConstraintsNotSatisfiable
+	for _, it := range cc.InstanceTypes {
+		switch {
+		case err == nil && it.Price > best.Price:
+		case it.RAM < needRAM:
+		case it.VCPUs < needVCPUs:
+		case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
+			// Equal price, but worse specs
+		default:
+			// Lower price || (same price && better specs)
+			best = it
+			err = nil
+		}
+	}
+	return
+}
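
A minimal usage sketch of ChooseInstanceType (the instance types and constraints here are invented for illustration):

	cluster := &arvados.Cluster{InstanceTypes: []arvados.InstanceType{
		{Name: "small", Price: 0.05, RAM: 2000000000, VCPUs: 2},
		{Name: "large", Price: 0.20, RAM: 8000000000, VCPUs: 8},
	}}
	ctr := &arvados.Container{RuntimeConstraints: arvados.RuntimeConstraints{RAM: 1500000000, VCPUs: 2}}
	it, err := ChooseInstanceType(cluster, ctr)
	if err != nil {
		// ErrInstanceTypesNotConfigured or ErrConstraintsNotSatisfiable
		log.Fatal(err)
	}
	log.Printf("cheapest sufficient type: %s", it.Name) // "small"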
diff --git a/services/dispatchcloud/node_size_test.go b/services/dispatchcloud/node_size_test.go
new file mode 100644
index 0000000..bc628b5
--- /dev/null
+++ b/services/dispatchcloud/node_size_test.go
@@ -0,0 +1,73 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&NodeSizeSuite{})
+
+type NodeSizeSuite struct{}
+
+func (*NodeSizeSuite) TestChooseNotConfigured(c *check.C) {
+	_, err := ChooseInstanceType(&arvados.Cluster{}, &arvados.Container{
+		RuntimeConstraints: arvados.RuntimeConstraints{
+			RAM:   1234567890,
+			VCPUs: 2,
+		},
+	})
+	c.Check(err, check.Equals, ErrInstanceTypesNotConfigured)
+}
+
+func (*NodeSizeSuite) TestChooseUnsatisfiable(c *check.C) {
+	for _, rc := range []arvados.RuntimeConstraints{
+		{RAM: 9876543210, VCPUs: 2},
+		{RAM: 1234567890, VCPUs: 20},
+		{RAM: 1234567890, VCPUs: 2, KeepCacheRAM: 9876543210},
+	} {
+		_, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: []arvados.InstanceType{
+			{Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small1"},
+			{Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "small2"},
+			{Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "small4"},
+		}}, &arvados.Container{RuntimeConstraints: rc})
+		c.Check(err, check.Equals, ErrConstraintsNotSatisfiable)
+	}
+}
+
+func (*NodeSizeSuite) TestChoose(c *check.C) {
+	for _, menu := range [][]arvados.InstanceType{
+		{
+			{Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "costly"},
+			{Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "best"},
+			{Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small"},
+		},
+		{
+			{Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "costly"},
+			{Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "goodenough"},
+			{Price: 2.2, RAM: 4000000000, VCPUs: 4, Name: "best"},
+			{Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small"},
+		},
+		{
+			{Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small"},
+			{Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "goodenough"},
+			{Price: 2.2, RAM: 4000000000, VCPUs: 4, Name: "best"},
+			{Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "costly"},
+		},
+	} {
+		best, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
+			RuntimeConstraints: arvados.RuntimeConstraints{
+				VCPUs:        2,
+				RAM:          987654321,
+				KeepCacheRAM: 123456789,
+			},
+		})
+		c.Check(err, check.IsNil)
+		c.Check(best.Name, check.Equals, "best")
+		c.Check(best.RAM >= 1234567890, check.Equals, true)
+		c.Check(best.VCPUs >= 2, check.Equals, true)
+	}
+}

-----------------------------------------------------------------------


hooks/post-receive
-- 



