[ARVADOS] created: 1.1.2-131-g10f08c3
Git user
git at public.curoverse.com
Thu Jan 25 19:53:05 EST 2018
at 10f08c358c12468119dc2621c48b68d6d33417da (commit)
commit 10f08c358c12468119dc2621c48b68d6d33417da
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Thu Jan 25 19:13:18 2018 -0500
12199: Pass node type to sbatch --constraint argument if configured.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
index bcc8197..879cb78 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
@@ -131,6 +131,8 @@ func (cmd *command) Run(prog string, args []string) error {
log.Fatalf("error loading config: %s", err)
} else if cmd.cluster, err = siteConfig.GetCluster(""); err != nil {
log.Fatalf("config error: %s", err)
+ } else if len(cmd.cluster.InstanceTypes) > 0 {
+ go dispatchcloud.SlurmNodeTypeFeatureKludge(cmd.cluster)
}
if cmd.slurm == nil {
@@ -211,6 +213,16 @@ func (cmd *command) sbatchArgs(container arvados.Container) ([]string, error) {
sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ",")))
}
+ if cmd.cluster == nil {
+ // no cluster config loaded, so no instance types are configured
+ } else if it, err := dispatchcloud.ChooseInstanceType(cmd.cluster, &container); err == dispatchcloud.ErrInstanceTypesNotConfigured {
+ // cluster config has no instance types: submit without a --constraint argument
+ } else if err != nil {
+ return nil, err
+ } else {
+ sbatchArgs = append(sbatchArgs, "--constraint="+it.Name)
+ }
+
return sbatchArgs, nil
}
@@ -243,7 +255,13 @@ func (cmd *command) run(_ *dispatch.Dispatcher, ctr arvados.Container, status <-
if ctr.State == dispatch.Locked && !cmd.sqCheck.HasUUID(ctr.UUID) {
log.Printf("Submitting container %s to slurm", ctr.UUID)
if err := cmd.submit(ctr, cmd.CrunchRunCommand); err != nil {
- text := fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
+ var text string
+ if err == dispatchcloud.ErrConstraintsNotSatisfiable {
+ text = fmt.Sprintf("cannot run container %s: %s", ctr.UUID, err)
+ cmd.dispatcher.UpdateState(ctr.UUID, dispatch.Cancelled)
+ } else {
+ text = fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
+ }
log.Print(text)
lr := arvadosclient.Dict{"log": arvadosclient.Dict{
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 63f56f3..459b7c6 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -24,6 +24,7 @@ import (
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
"git.curoverse.com/arvados.git/sdk/go/arvadostest"
"git.curoverse.com/arvados.git/sdk/go/dispatch"
+ "git.curoverse.com/arvados.git/services/dispatchcloud"
. "gopkg.in/check.v1"
)
@@ -32,32 +33,19 @@ func Test(t *testing.T) {
TestingT(t)
}
-var _ = Suite(&TestSuite{})
-var _ = Suite(&MockArvadosServerSuite{})
+var _ = Suite(&IntegrationSuite{})
+var _ = Suite(&StubbedSuite{})
-type TestSuite struct {
+type IntegrationSuite struct {
cmd command
}
-var initialArgs []string
-
-func (s *TestSuite) SetUpSuite(c *C) {
- initialArgs = os.Args
-}
-
-func (s *TestSuite) TearDownSuite(c *C) {
-}
-
-func (s *TestSuite) SetUpTest(c *C) {
- args := []string{"crunch-dispatch-slurm"}
- os.Args = args
-
+func (s *IntegrationSuite) SetUpTest(c *C) {
arvadostest.StartAPI()
os.Setenv("ARVADOS_API_TOKEN", arvadostest.Dispatch1Token)
}
-func (s *TestSuite) TearDownTest(c *C) {
- os.Args = initialArgs
+func (s *IntegrationSuite) TearDownTest(c *C) {
arvadostest.ResetEnv()
arvadostest.StopAPI()
}
@@ -99,7 +87,7 @@ func (sf *slurmFake) Cancel(name string) error {
return nil
}
-func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
+func (s *IntegrationSuite) integrationTest(c *C, slurm *slurmFake,
expectBatch [][]string,
runContainer func(*dispatch.Dispatcher, arvados.Container)) arvados.Container {
arvadostest.ResetEnv()
@@ -159,7 +147,7 @@ func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
return container
}
-func (s *TestSuite) TestIntegrationNormal(c *C) {
+func (s *IntegrationSuite) TestNormal(c *C) {
container := s.integrationTest(c,
&slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"},
nil,
@@ -171,7 +159,7 @@ func (s *TestSuite) TestIntegrationNormal(c *C) {
c.Check(container.State, Equals, arvados.ContainerStateComplete)
}
-func (s *TestSuite) TestIntegrationCancel(c *C) {
+func (s *IntegrationSuite) TestCancel(c *C) {
slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
readyToCancel := make(chan bool)
slurm.onCancel = func() { <-readyToCancel }
@@ -193,7 +181,7 @@ func (s *TestSuite) TestIntegrationCancel(c *C) {
c.Check(slurm.didCancel[:2], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "zzzzz-dz642-queuedcontainer"})
}
-func (s *TestSuite) TestIntegrationMissingFromSqueue(c *C) {
+func (s *IntegrationSuite) TestMissingFromSqueue(c *C) {
container := s.integrationTest(c, &slurmFake{},
[][]string{{
fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
@@ -209,7 +197,7 @@ func (s *TestSuite) TestIntegrationMissingFromSqueue(c *C) {
c.Check(container.State, Equals, arvados.ContainerStateCancelled)
}
-func (s *TestSuite) TestSbatchFail(c *C) {
+func (s *IntegrationSuite) TestSbatchFail(c *C) {
container := s.integrationTest(c,
&slurmFake{errBatch: errors.New("something terrible happened")},
[][]string{{"--job-name=zzzzz-dz642-queuedcontainer", "--mem=11445", "--cpus-per-task=4", "--tmp=45777", "--nice=9990"}},
@@ -230,7 +218,7 @@ func (s *TestSuite) TestSbatchFail(c *C) {
c.Assert(len(ll.Items), Equals, 1)
}
-func (s *TestSuite) TestIntegrationChangePriority(c *C) {
+func (s *IntegrationSuite) TestChangePriority(c *C) {
slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
container := s.integrationTest(c, slurm, nil,
func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
@@ -248,15 +236,15 @@ func (s *TestSuite) TestIntegrationChangePriority(c *C) {
c.Check(slurm.didRenice[len(slurm.didRenice)-1], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "4000"})
}
-type MockArvadosServerSuite struct {
+type StubbedSuite struct {
cmd command
}
-func (s *MockArvadosServerSuite) TearDownTest(c *C) {
- arvadostest.ResetEnv()
+func (s *StubbedSuite) SetUpTest(c *C) {
+ s.cmd = command{}
}
-func (s *MockArvadosServerSuite) TestAPIErrorGettingContainers(c *C) {
+func (s *StubbedSuite) TestAPIErrorGettingContainers(c *C) {
apiStubResponses := make(map[string]arvadostest.StubResponse)
apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
@@ -264,7 +252,7 @@ func (s *MockArvadosServerSuite) TestAPIErrorGettingContainers(c *C) {
s.testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
}
-func (s *MockArvadosServerSuite) testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
+func (s *StubbedSuite) testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
apiStub := arvadostest.ServerStub{apiStubResponses}
api := httptest.NewServer(&apiStub)
@@ -312,12 +300,12 @@ func (s *MockArvadosServerSuite) testWithServerStub(c *C, apiStubResponses map[s
c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
}
-func (s *MockArvadosServerSuite) TestNoSuchConfigFile(c *C) {
+func (s *StubbedSuite) TestNoSuchConfigFile(c *C) {
err := s.cmd.readConfig("/nosuchdir89j7879/8hjwr7ojgyy7")
c.Assert(err, NotNil)
}
-func (s *MockArvadosServerSuite) TestBadSbatchArgsConfig(c *C) {
+func (s *StubbedSuite) TestBadSbatchArgsConfig(c *C) {
tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
c.Check(err, IsNil)
defer os.Remove(tmpfile.Name())
@@ -329,7 +317,7 @@ func (s *MockArvadosServerSuite) TestBadSbatchArgsConfig(c *C) {
c.Assert(err, NotNil)
}
-func (s *MockArvadosServerSuite) TestNoSuchArgInConfigIgnored(c *C) {
+func (s *StubbedSuite) TestNoSuchArgInConfigIgnored(c *C) {
tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
c.Check(err, IsNil)
defer os.Remove(tmpfile.Name())
@@ -342,7 +330,7 @@ func (s *MockArvadosServerSuite) TestNoSuchArgInConfigIgnored(c *C) {
c.Check(0, Equals, len(s.cmd.SbatchArguments))
}
-func (s *MockArvadosServerSuite) TestReadConfig(c *C) {
+func (s *StubbedSuite) TestReadConfig(c *C) {
tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
c.Check(err, IsNil)
defer os.Remove(tmpfile.Name())
@@ -357,40 +345,79 @@ func (s *MockArvadosServerSuite) TestReadConfig(c *C) {
c.Check(args, DeepEquals, s.cmd.SbatchArguments)
}
-func (s *MockArvadosServerSuite) TestSbatchFuncWithNoConfigArgs(c *C) {
- s.testSbatchFuncWithArgs(c, nil)
-}
-
-func (s *MockArvadosServerSuite) TestSbatchFuncWithEmptyConfigArgs(c *C) {
- s.testSbatchFuncWithArgs(c, []string{})
-}
+func (s *StubbedSuite) TestSbatchArgs(c *C) {
+ container := arvados.Container{
+ UUID: "123",
+ RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
+ Priority: 1,
+ }
-func (s *MockArvadosServerSuite) TestSbatchFuncWithConfigArgs(c *C) {
- s.testSbatchFuncWithArgs(c, []string{"--arg1=v1", "--arg2"})
+ for _, defaults := range [][]string{
+ nil,
+ {},
+ {"--arg1=v1", "--arg2"},
+ } {
+ c.Logf("%#v", defaults)
+ s.cmd.SbatchArguments = defaults
+
+ args, err := s.cmd.sbatchArgs(container)
+ c.Check(args, DeepEquals, append(defaults, "--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990"))
+ c.Check(err, IsNil)
+ }
}
-func (s *MockArvadosServerSuite) testSbatchFuncWithArgs(c *C, args []string) {
- s.cmd.SbatchArguments = append([]string(nil), args...)
-
+func (s *StubbedSuite) TestSbatchInstanceTypeConstraint(c *C) {
container := arvados.Container{
UUID: "123",
RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
- Priority: 1}
+ Priority: 1,
+ }
- var expected []string
- expected = append(expected, s.cmd.SbatchArguments...)
- expected = append(expected, "--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990")
- args, err := s.cmd.sbatchArgs(container)
- c.Check(args, DeepEquals, expected)
- c.Check(err, IsNil)
+ for _, trial := range []struct {
+ types []arvados.InstanceType
+ sbatchArgs []string
+ err error
+ }{
+ // Choose node type => use --constraint arg
+ {
+ types: []arvados.InstanceType{
+ {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
+ {Name: "a1.small", Price: 0.04, RAM: 256000000, VCPUs: 2},
+ {Name: "a1.medium", Price: 0.08, RAM: 512000000, VCPUs: 4},
+ },
+ sbatchArgs: []string{"--constraint=a1.small"},
+ },
+ // No node types configured => no slurm constraint
+ {
+ types: nil,
+ sbatchArgs: nil,
+ },
+ // No node type is big enough => error
+ {
+ types: []arvados.InstanceType{
+ {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
+ },
+ err: dispatchcloud.ErrConstraintsNotSatisfiable,
+ },
+ } {
+ c.Logf("%#v", trial)
+ s.cmd.cluster = &arvados.Cluster{InstanceTypes: trial.types}
+
+ args, err := s.cmd.sbatchArgs(container)
+ c.Check(err, Equals, trial.err)
+ if trial.err == nil {
+ c.Check(args, DeepEquals, append([]string{"--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990"}, trial.sbatchArgs...))
+ }
+ }
}
-func (s *MockArvadosServerSuite) TestSbatchPartition(c *C) {
+func (s *StubbedSuite) TestSbatchPartition(c *C) {
container := arvados.Container{
UUID: "123",
RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 1},
SchedulingParameters: arvados.SchedulingParameters{Partitions: []string{"blurb", "b2"}},
- Priority: 1}
+ Priority: 1,
+ }
args, err := s.cmd.sbatchArgs(container)
c.Check(args, DeepEquals, []string{
commit 0c9586ca048805b404dd762f5cd7cabe5d1ed227
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Thu Jan 25 19:12:49 2018 -0500
12199: Refactor to eliminate evil globals.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
index ae2ca58..bcc8197 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
@@ -21,13 +21,18 @@ import (
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
"git.curoverse.com/arvados.git/sdk/go/config"
"git.curoverse.com/arvados.git/sdk/go/dispatch"
+ "git.curoverse.com/arvados.git/services/dispatchcloud"
"github.com/coreos/go-systemd/daemon"
)
var version = "dev"
-// Config used by crunch-dispatch-slurm
-type Config struct {
+type command struct {
+ dispatcher *dispatch.Dispatcher
+ cluster *arvados.Cluster
+ sqCheck *SqueueChecker
+ slurm Slurm
+
Client arvados.Client
SbatchArguments []string
@@ -41,27 +46,19 @@ type Config struct {
// Minimum time between two attempts to run the same container
MinRetryPeriod arvados.Duration
-
- slurm Slurm
}
func main() {
- theConfig.slurm = &slurmCLI{}
- err := doMain()
+ err := (&command{}).Run(os.Args[0], os.Args[1:])
if err != nil {
log.Fatal(err)
}
}
-var (
- theConfig Config
- sqCheck = &SqueueChecker{}
-)
-
const defaultConfigPath = "/etc/arvados/crunch-dispatch-slurm/crunch-dispatch-slurm.yml"
-func doMain() error {
- flags := flag.NewFlagSet("crunch-dispatch-slurm", flag.ExitOnError)
+func (cmd *command) Run(prog string, args []string) error {
+ flags := flag.NewFlagSet(prog, flag.ExitOnError)
flags.Usage = func() { usage(flags) }
configPath := flags.String(
@@ -77,7 +74,7 @@ func doMain() error {
false,
"Print version information and exit.")
// Parse args; omit the first arg which is the command name
- flags.Parse(os.Args[1:])
+ flags.Parse(args)
// Print version information if requested
if *getVersion {
@@ -87,37 +84,37 @@ func doMain() error {
log.Printf("crunch-dispatch-slurm %s started", version)
- err := readConfig(&theConfig, *configPath)
+ err := cmd.readConfig(*configPath)
if err != nil {
return err
}
- if theConfig.CrunchRunCommand == nil {
- theConfig.CrunchRunCommand = []string{"crunch-run"}
+ if cmd.CrunchRunCommand == nil {
+ cmd.CrunchRunCommand = []string{"crunch-run"}
}
- if theConfig.PollPeriod == 0 {
- theConfig.PollPeriod = arvados.Duration(10 * time.Second)
+ if cmd.PollPeriod == 0 {
+ cmd.PollPeriod = arvados.Duration(10 * time.Second)
}
- if theConfig.Client.APIHost != "" || theConfig.Client.AuthToken != "" {
+ if cmd.Client.APIHost != "" || cmd.Client.AuthToken != "" {
// Copy real configs into env vars so [a]
// MakeArvadosClient() uses them, and [b] they get
// propagated to crunch-run via SLURM.
- os.Setenv("ARVADOS_API_HOST", theConfig.Client.APIHost)
- os.Setenv("ARVADOS_API_TOKEN", theConfig.Client.AuthToken)
+ os.Setenv("ARVADOS_API_HOST", cmd.Client.APIHost)
+ os.Setenv("ARVADOS_API_TOKEN", cmd.Client.AuthToken)
os.Setenv("ARVADOS_API_HOST_INSECURE", "")
- if theConfig.Client.Insecure {
+ if cmd.Client.Insecure {
os.Setenv("ARVADOS_API_HOST_INSECURE", "1")
}
- os.Setenv("ARVADOS_KEEP_SERVICES", strings.Join(theConfig.Client.KeepServiceURIs, " "))
+ os.Setenv("ARVADOS_KEEP_SERVICES", strings.Join(cmd.Client.KeepServiceURIs, " "))
os.Setenv("ARVADOS_EXTERNAL_CLIENT", "")
} else {
log.Printf("warning: Client credentials missing from config, so falling back on environment variables (deprecated).")
}
if *dumpConfig {
- log.Fatal(config.DumpAndExit(theConfig))
+ log.Fatal(config.DumpAndExit(cmd))
}
arv, err := arvadosclient.MakeArvadosClient()
@@ -127,23 +124,39 @@ func doMain() error {
}
arv.Retries = 25
- sqCheck = &SqueueChecker{Period: time.Duration(theConfig.PollPeriod)}
- defer sqCheck.Stop()
+ siteConfig, err := arvados.GetConfig(arvados.DefaultConfigFile)
+ if os.IsNotExist(err) {
+ log.Printf("warning: no cluster config file %q (%s), proceeding with no node types defined", arvados.DefaultConfigFile, err)
+ } else if err != nil {
+ log.Fatalf("error loading config: %s", err)
+ } else if cmd.cluster, err = siteConfig.GetCluster(""); err != nil {
+ log.Fatalf("config error: %s", err)
+ }
+
+ if cmd.slurm == nil {
+ cmd.slurm = &slurmCLI{}
+ }
+
+ cmd.sqCheck = &SqueueChecker{
+ Period: time.Duration(cmd.PollPeriod),
+ Slurm: cmd.slurm,
+ }
+ defer cmd.sqCheck.Stop()
- dispatcher := &dispatch.Dispatcher{
+ cmd.dispatcher = &dispatch.Dispatcher{
Arv: arv,
- RunContainer: run,
- PollPeriod: time.Duration(theConfig.PollPeriod),
- MinRetryPeriod: time.Duration(theConfig.MinRetryPeriod),
+ RunContainer: cmd.run,
+ PollPeriod: time.Duration(cmd.PollPeriod),
+ MinRetryPeriod: time.Duration(cmd.MinRetryPeriod),
}
if _, err := daemon.SdNotify(false, "READY=1"); err != nil {
log.Printf("Error notifying init daemon: %v", err)
}
- go checkSqueueForOrphans(dispatcher, sqCheck)
+ go cmd.checkSqueueForOrphans()
- return dispatcher.Run(context.Background())
+ return cmd.dispatcher.Run(context.Background())
}
var containerUuidPattern = regexp.MustCompile(`^[a-z0-9]{5}-dz642-[a-z0-9]{15}$`)
@@ -153,19 +166,19 @@ var containerUuidPattern = regexp.MustCompile(`^[a-z0-9]{5}-dz642-[a-z0-9]{15}$`
// jobs started by a previous dispatch process that never released
// their slurm allocations even though their container states are
// Cancelled or Complete. See https://dev.arvados.org/issues/10979
-func checkSqueueForOrphans(dispatcher *dispatch.Dispatcher, sqCheck *SqueueChecker) {
- for _, uuid := range sqCheck.All() {
+func (cmd *command) checkSqueueForOrphans() {
+ for _, uuid := range cmd.sqCheck.All() {
if !containerUuidPattern.MatchString(uuid) {
continue
}
- err := dispatcher.TrackContainer(uuid)
+ err := cmd.dispatcher.TrackContainer(uuid)
if err != nil {
log.Printf("checkSqueueForOrphans: TrackContainer(%s): %s", uuid, err)
}
}
}
-func niceness(priority int) int {
+func (cmd *command) niceness(priority int) int {
if priority > 1000 {
priority = 1000
}
@@ -176,7 +189,7 @@ func niceness(priority int) int {
return (1000 - priority) * 10
}
-func sbatchArgs(container arvados.Container) []string {
+func (cmd *command) sbatchArgs(container arvados.Container) ([]string, error) {
mem := int64(math.Ceil(float64(container.RuntimeConstraints.RAM+container.RuntimeConstraints.KeepCacheRAM) / float64(1048576)))
var disk int64
@@ -188,45 +201,48 @@ func sbatchArgs(container arvados.Container) []string {
disk = int64(math.Ceil(float64(disk) / float64(1048576)))
var sbatchArgs []string
- sbatchArgs = append(sbatchArgs, theConfig.SbatchArguments...)
+ sbatchArgs = append(sbatchArgs, cmd.SbatchArguments...)
sbatchArgs = append(sbatchArgs, fmt.Sprintf("--job-name=%s", container.UUID))
sbatchArgs = append(sbatchArgs, fmt.Sprintf("--mem=%d", mem))
sbatchArgs = append(sbatchArgs, fmt.Sprintf("--cpus-per-task=%d", container.RuntimeConstraints.VCPUs))
sbatchArgs = append(sbatchArgs, fmt.Sprintf("--tmp=%d", disk))
- sbatchArgs = append(sbatchArgs, fmt.Sprintf("--nice=%d", niceness(container.Priority)))
+ sbatchArgs = append(sbatchArgs, fmt.Sprintf("--nice=%d", cmd.niceness(container.Priority)))
if len(container.SchedulingParameters.Partitions) > 0 {
sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ",")))
}
- return sbatchArgs
+ return sbatchArgs, nil
}
-func submit(dispatcher *dispatch.Dispatcher, container arvados.Container, crunchRunCommand []string) error {
+func (cmd *command) submit(container arvados.Container, crunchRunCommand []string) error {
// append() here avoids modifying crunchRunCommand's
// underlying array, which is shared with other goroutines.
crArgs := append([]string(nil), crunchRunCommand...)
crArgs = append(crArgs, container.UUID)
crScript := strings.NewReader(execScript(crArgs))
- sqCheck.L.Lock()
- defer sqCheck.L.Unlock()
+ cmd.sqCheck.L.Lock()
+ defer cmd.sqCheck.L.Unlock()
- sbArgs := sbatchArgs(container)
+ sbArgs, err := cmd.sbatchArgs(container)
+ if err != nil {
+ return err
+ }
log.Printf("running sbatch %+q", sbArgs)
- return theConfig.slurm.Batch(crScript, sbArgs)
+ return cmd.slurm.Batch(crScript, sbArgs)
}
// Submit a container to the slurm queue (or resume monitoring if it's
// already in the queue). Cancel the slurm job if the container's
// priority changes to zero or its state indicates it's no longer
// running.
-func run(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+func (cmd *command) run(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
- if ctr.State == dispatch.Locked && !sqCheck.HasUUID(ctr.UUID) {
+ if ctr.State == dispatch.Locked && !cmd.sqCheck.HasUUID(ctr.UUID) {
log.Printf("Submitting container %s to slurm", ctr.UUID)
- if err := submit(disp, ctr, theConfig.CrunchRunCommand); err != nil {
+ if err := cmd.submit(ctr, cmd.CrunchRunCommand); err != nil {
text := fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
log.Print(text)
@@ -234,9 +250,9 @@ func run(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados
"object_uuid": ctr.UUID,
"event_type": "dispatch",
"properties": map[string]string{"text": text}}}
- disp.Arv.Create("logs", lr, nil)
+ cmd.dispatcher.Arv.Create("logs", lr, nil)
- disp.Unlock(ctr.UUID)
+ cmd.dispatcher.Unlock(ctr.UUID)
return
}
}
@@ -248,7 +264,7 @@ func run(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados
// no point in waiting for further dispatch updates: just
// clean up and return.
go func(uuid string) {
- for ctx.Err() == nil && sqCheck.HasUUID(uuid) {
+ for ctx.Err() == nil && cmd.sqCheck.HasUUID(uuid) {
}
cancel()
}(ctr.UUID)
@@ -257,68 +273,68 @@ func run(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados
select {
case <-ctx.Done():
// Disappeared from squeue
- if err := disp.Arv.Get("containers", ctr.UUID, nil, &ctr); err != nil {
+ if err := cmd.dispatcher.Arv.Get("containers", ctr.UUID, nil, &ctr); err != nil {
log.Printf("Error getting final container state for %s: %s", ctr.UUID, err)
}
switch ctr.State {
case dispatch.Running:
- disp.UpdateState(ctr.UUID, dispatch.Cancelled)
+ cmd.dispatcher.UpdateState(ctr.UUID, dispatch.Cancelled)
case dispatch.Locked:
- disp.Unlock(ctr.UUID)
+ cmd.dispatcher.Unlock(ctr.UUID)
}
return
case updated, ok := <-status:
if !ok {
log.Printf("Dispatcher says container %s is done: cancel slurm job", ctr.UUID)
- scancel(ctr)
+ cmd.scancel(ctr)
} else if updated.Priority == 0 {
log.Printf("Container %s has state %q, priority %d: cancel slurm job", ctr.UUID, updated.State, updated.Priority)
- scancel(ctr)
+ cmd.scancel(ctr)
} else {
- renice(updated)
+ cmd.renice(updated)
}
}
}
}
-func scancel(ctr arvados.Container) {
- sqCheck.L.Lock()
- err := theConfig.slurm.Cancel(ctr.UUID)
- sqCheck.L.Unlock()
+func (cmd *command) scancel(ctr arvados.Container) {
+ cmd.sqCheck.L.Lock()
+ err := cmd.slurm.Cancel(ctr.UUID)
+ cmd.sqCheck.L.Unlock()
if err != nil {
log.Printf("scancel: %s", err)
time.Sleep(time.Second)
- } else if sqCheck.HasUUID(ctr.UUID) {
+ } else if cmd.sqCheck.HasUUID(ctr.UUID) {
log.Printf("container %s is still in squeue after scancel", ctr.UUID)
time.Sleep(time.Second)
}
}
-func renice(ctr arvados.Container) {
- nice := niceness(ctr.Priority)
- oldnice := sqCheck.GetNiceness(ctr.UUID)
+func (cmd *command) renice(ctr arvados.Container) {
+ nice := cmd.niceness(ctr.Priority)
+ oldnice := cmd.sqCheck.GetNiceness(ctr.UUID)
if nice == oldnice || oldnice == -1 {
return
}
log.Printf("updating slurm nice value to %d (was %d)", nice, oldnice)
- sqCheck.L.Lock()
- err := theConfig.slurm.Renice(ctr.UUID, nice)
- sqCheck.L.Unlock()
+ cmd.sqCheck.L.Lock()
+ err := cmd.slurm.Renice(ctr.UUID, nice)
+ cmd.sqCheck.L.Unlock()
if err != nil {
log.Printf("renice: %s", err)
time.Sleep(time.Second)
return
}
- if sqCheck.HasUUID(ctr.UUID) {
+ if cmd.sqCheck.HasUUID(ctr.UUID) {
log.Printf("container %s has arvados priority %d, slurm nice %d",
- ctr.UUID, ctr.Priority, sqCheck.GetNiceness(ctr.UUID))
+ ctr.UUID, ctr.Priority, cmd.sqCheck.GetNiceness(ctr.UUID))
}
}
-func readConfig(dst interface{}, path string) error {
- err := config.LoadFile(dst, path)
+func (cmd *command) readConfig(path string) error {
+ err := config.LoadFile(cmd, path)
if err != nil && os.IsNotExist(err) && path == defaultConfigPath {
log.Printf("Config not specified. Continue with default configuration.")
err = nil
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 830976d..63f56f3 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -35,8 +35,9 @@ func Test(t *testing.T) {
var _ = Suite(&TestSuite{})
var _ = Suite(&MockArvadosServerSuite{})
-type TestSuite struct{}
-type MockArvadosServerSuite struct{}
+type TestSuite struct {
+ cmd command
+}
var initialArgs []string
@@ -61,10 +62,6 @@ func (s *TestSuite) TearDownTest(c *C) {
arvadostest.StopAPI()
}
-func (s *MockArvadosServerSuite) TearDownTest(c *C) {
- arvadostest.ResetEnv()
-}
-
type slurmFake struct {
didBatch [][]string
didCancel []string
@@ -110,10 +107,7 @@ func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
arv, err := arvadosclient.MakeArvadosClient()
c.Assert(err, IsNil)
- defer func(orig Slurm) {
- theConfig.slurm = orig
- }(theConfig.slurm)
- theConfig.slurm = slurm
+ s.cmd.slurm = slurm
// There should be one queued container
params := arvadosclient.Dict{
@@ -124,12 +118,12 @@ func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
c.Check(err, IsNil)
c.Check(len(containers.Items), Equals, 1)
- theConfig.CrunchRunCommand = []string{"echo"}
+ s.cmd.CrunchRunCommand = []string{"echo"}
ctx, cancel := context.WithCancel(context.Background())
doneRun := make(chan struct{})
- dispatcher := dispatch.Dispatcher{
+ s.cmd.dispatcher = &dispatch.Dispatcher{
Arv: arv,
PollPeriod: time.Duration(1) * time.Second,
RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
@@ -138,18 +132,18 @@ func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
slurm.queue = ""
doneRun <- struct{}{}
}()
- run(disp, ctr, status)
+ s.cmd.run(disp, ctr, status)
cancel()
},
}
- sqCheck = &SqueueChecker{Period: 500 * time.Millisecond}
+ s.cmd.sqCheck = &SqueueChecker{Period: 500 * time.Millisecond, Slurm: slurm}
- err = dispatcher.Run(ctx)
+ err = s.cmd.dispatcher.Run(ctx)
<-doneRun
c.Assert(err, Equals, context.Canceled)
- sqCheck.Stop()
+ s.cmd.sqCheck.Stop()
c.Check(slurm.didBatch, DeepEquals, expectBatch)
@@ -236,15 +230,41 @@ func (s *TestSuite) TestSbatchFail(c *C) {
c.Assert(len(ll.Items), Equals, 1)
}
+func (s *TestSuite) TestIntegrationChangePriority(c *C) {
+ slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
+ container := s.integrationTest(c, slurm, nil,
+ func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
+ dispatcher.UpdateState(container.UUID, dispatch.Running)
+ time.Sleep(time.Second)
+ dispatcher.Arv.Update("containers", container.UUID,
+ arvadosclient.Dict{
+ "container": arvadosclient.Dict{"priority": 600}},
+ nil)
+ time.Sleep(time.Second)
+ dispatcher.UpdateState(container.UUID, dispatch.Complete)
+ })
+ c.Check(container.State, Equals, arvados.ContainerStateComplete)
+ c.Assert(len(slurm.didRenice), Not(Equals), 0)
+ c.Check(slurm.didRenice[len(slurm.didRenice)-1], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "4000"})
+}
+
+type MockArvadosServerSuite struct {
+ cmd command
+}
+
+func (s *MockArvadosServerSuite) TearDownTest(c *C) {
+ arvadostest.ResetEnv()
+}
+
func (s *MockArvadosServerSuite) TestAPIErrorGettingContainers(c *C) {
apiStubResponses := make(map[string]arvadostest.StubResponse)
apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
- testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
+ s.testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
}
-func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
+func (s *MockArvadosServerSuite) testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
apiStub := arvadostest.ServerStub{apiStubResponses}
api := httptest.NewServer(&apiStub)
@@ -262,7 +282,7 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
log.SetOutput(io.MultiWriter(buf, os.Stderr))
defer log.SetOutput(os.Stderr)
- theConfig.CrunchRunCommand = []string{crunchCmd}
+ s.cmd.CrunchRunCommand = []string{crunchCmd}
ctx, cancel := context.WithCancel(context.Background())
dispatcher := dispatch.Dispatcher{
@@ -274,7 +294,7 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
disp.UpdateState(ctr.UUID, dispatch.Running)
disp.UpdateState(ctr.UUID, dispatch.Complete)
}()
- run(disp, ctr, status)
+ s.cmd.run(disp, ctr, status)
cancel()
},
}
@@ -293,14 +313,11 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
}
func (s *MockArvadosServerSuite) TestNoSuchConfigFile(c *C) {
- var config Config
- err := readConfig(&config, "/nosuchdir89j7879/8hjwr7ojgyy7")
+ err := s.cmd.readConfig("/nosuchdir89j7879/8hjwr7ojgyy7")
c.Assert(err, NotNil)
}
func (s *MockArvadosServerSuite) TestBadSbatchArgsConfig(c *C) {
- var config Config
-
tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
c.Check(err, IsNil)
defer os.Remove(tmpfile.Name())
@@ -308,13 +325,11 @@ func (s *MockArvadosServerSuite) TestBadSbatchArgsConfig(c *C) {
_, err = tmpfile.Write([]byte(`{"SbatchArguments": "oops this is not a string array"}`))
c.Check(err, IsNil)
- err = readConfig(&config, tmpfile.Name())
+ err = s.cmd.readConfig(tmpfile.Name())
c.Assert(err, NotNil)
}
func (s *MockArvadosServerSuite) TestNoSuchArgInConfigIgnored(c *C) {
- var config Config
-
tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
c.Check(err, IsNil)
defer os.Remove(tmpfile.Name())
@@ -322,14 +337,12 @@ func (s *MockArvadosServerSuite) TestNoSuchArgInConfigIgnored(c *C) {
_, err = tmpfile.Write([]byte(`{"NoSuchArg": "Nobody loves me, not one tiny hunk."}`))
c.Check(err, IsNil)
- err = readConfig(&config, tmpfile.Name())
+ err = s.cmd.readConfig(tmpfile.Name())
c.Assert(err, IsNil)
- c.Check(0, Equals, len(config.SbatchArguments))
+ c.Check(0, Equals, len(s.cmd.SbatchArguments))
}
func (s *MockArvadosServerSuite) TestReadConfig(c *C) {
- var config Config
-
tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
c.Check(err, IsNil)
defer os.Remove(tmpfile.Name())
@@ -339,27 +352,25 @@ func (s *MockArvadosServerSuite) TestReadConfig(c *C) {
_, err = tmpfile.Write([]byte(argsS))
c.Check(err, IsNil)
- err = readConfig(&config, tmpfile.Name())
+ err = s.cmd.readConfig(tmpfile.Name())
c.Assert(err, IsNil)
- c.Check(3, Equals, len(config.SbatchArguments))
- c.Check(args, DeepEquals, config.SbatchArguments)
+ c.Check(args, DeepEquals, s.cmd.SbatchArguments)
}
func (s *MockArvadosServerSuite) TestSbatchFuncWithNoConfigArgs(c *C) {
- testSbatchFuncWithArgs(c, nil)
+ s.testSbatchFuncWithArgs(c, nil)
}
func (s *MockArvadosServerSuite) TestSbatchFuncWithEmptyConfigArgs(c *C) {
- testSbatchFuncWithArgs(c, []string{})
+ s.testSbatchFuncWithArgs(c, []string{})
}
func (s *MockArvadosServerSuite) TestSbatchFuncWithConfigArgs(c *C) {
- testSbatchFuncWithArgs(c, []string{"--arg1=v1", "--arg2"})
+ s.testSbatchFuncWithArgs(c, []string{"--arg1=v1", "--arg2"})
}
-func testSbatchFuncWithArgs(c *C, args []string) {
- defer func() { theConfig.SbatchArguments = nil }()
- theConfig.SbatchArguments = append(theConfig.SbatchArguments, args...)
+func (s *MockArvadosServerSuite) testSbatchFuncWithArgs(c *C, args []string) {
+ s.cmd.SbatchArguments = append([]string(nil), args...)
container := arvados.Container{
UUID: "123",
@@ -367,9 +378,11 @@ func testSbatchFuncWithArgs(c *C, args []string) {
Priority: 1}
var expected []string
- expected = append(expected, theConfig.SbatchArguments...)
+ expected = append(expected, s.cmd.SbatchArguments...)
expected = append(expected, "--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990")
- c.Check(sbatchArgs(container), DeepEquals, expected)
+ args, err := s.cmd.sbatchArgs(container)
+ c.Check(args, DeepEquals, expected)
+ c.Check(err, IsNil)
}
func (s *MockArvadosServerSuite) TestSbatchPartition(c *C) {
@@ -379,26 +392,10 @@ func (s *MockArvadosServerSuite) TestSbatchPartition(c *C) {
SchedulingParameters: arvados.SchedulingParameters{Partitions: []string{"blurb", "b2"}},
Priority: 1}
- c.Check(sbatchArgs(container), DeepEquals, []string{
+ args, err := s.cmd.sbatchArgs(container)
+ c.Check(args, DeepEquals, []string{
"--job-name=123", "--mem=239", "--cpus-per-task=1", "--tmp=0", "--nice=9990",
"--partition=blurb,b2",
})
-}
-
-func (s *TestSuite) TestIntegrationChangePriority(c *C) {
- slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
- container := s.integrationTest(c, slurm, nil,
- func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
- dispatcher.UpdateState(container.UUID, dispatch.Running)
- time.Sleep(time.Second)
- dispatcher.Arv.Update("containers", container.UUID,
- arvadosclient.Dict{
- "container": arvadosclient.Dict{"priority": 600}},
- nil)
- time.Sleep(time.Second)
- dispatcher.UpdateState(container.UUID, dispatch.Complete)
- })
- c.Check(container.State, Equals, arvados.ContainerStateComplete)
- c.Assert(len(slurm.didRenice), Not(Equals), 0)
- c.Check(slurm.didRenice[len(slurm.didRenice)-1], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "4000"})
+ c.Check(err, IsNil)
}
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 5ecfe8f..adb620e 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -22,6 +22,7 @@ type jobPriority struct {
// command 'squeue'.
type SqueueChecker struct {
Period time.Duration
+ Slurm Slurm
uuids map[string]jobPriority
startOnce sync.Once
done chan struct{}
@@ -77,7 +78,7 @@ func (sqc *SqueueChecker) check() {
sqc.L.Lock()
defer sqc.L.Unlock()
- cmd := theConfig.slurm.QueueCommand([]string{"--all", "--format=%j %y %Q"})
+ cmd := sqc.Slurm.QueueCommand([]string{"--all", "--format=%j %y %Q"})
stdout, stderr := &bytes.Buffer{}, &bytes.Buffer{}
cmd.Stdout, cmd.Stderr = stdout, stderr
if err := cmd.Run(); err != nil {
commit bfa436c97990c1a0cf39907acc0235a8535b6a43
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Thu Jan 25 19:11:32 2018 -0500
12199: Add SlurmNodeTypeFeatureKludge.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/dispatchcloud/node_size.go b/services/dispatchcloud/node_size.go
index 1abccc5..9cb6d3f 100644
--- a/services/dispatchcloud/node_size.go
+++ b/services/dispatchcloud/node_size.go
@@ -5,7 +5,12 @@
package dispatchcloud
import (
+ "bytes"
"errors"
+ "log"
+ "os/exec"
+ "strings"
+ "time"
"git.curoverse.com/arvados.git/sdk/go/arvados"
)
@@ -42,3 +47,67 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
}
return
}
+
+// SlurmNodeTypeFeatureKludge ensures SLURM accepts every instance
+// type name as a valid feature name, even if no instances of that
+// type have appeared yet.
+//
+// It takes advantage of some SLURM peculiarities:
+//
+// (1) A feature is valid after it has been offered by a node, even if
+// it is no longer offered by any node. So, to make a feature name
+// valid, we can add it to a dummy node ("compute0"), then remove it.
+//
+// (2) when srun is given an invalid --gres argument and an invalid
+// --constraint argument, the error message mentions "Invalid feature
+// specification". So, to test whether a feature name is valid without
+// actually submitting a job, we can call srun with the feature name
+// and an invalid --gres argument.
+//
+// SlurmNodeTypeFeatureKludge does a test-and-fix operation
+// immediately, and then periodically, in case slurm restarts and
+// forgets the list of valid features. It never returns, so it should
+// generally be invoked with "go".
+func SlurmNodeTypeFeatureKludge(cc *arvados.Cluster) {
+ var types []string
+ for _, it := range cc.InstanceTypes {
+ types = append(types, it.Name)
+ }
+ for {
+ slurmKludge(types)
+ time.Sleep(time.Minute)
+ }
+}
+
+var (
+ slurmDummyNode = "compute0"
+ slurmErrBadFeature = "Invalid feature"
+ slurmErrBadGres = "Invalid generic resource"
+)
+
+func slurmKludge(types []string) {
+ cmd := exec.Command("srun", "--gres=invalid-gres-specification", "--constraint="+strings.Join(types, "&"), "true")
+ out, err := cmd.CombinedOutput()
+ switch {
+ case err == nil:
+ log.Printf("warning: guaranteed-to-fail srun command did not fail: %q %q", cmd.Path, cmd.Args)
+ log.Printf("output was: %q", out)
+
+ case bytes.Contains(out, []byte(slurmErrBadFeature)):
+ log.Printf("temporarily configuring node %q with all node type features", slurmDummyNode)
+ for _, features := range []string{strings.Join(types, ","), ""} {
+ cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+features)
+ log.Printf("running: %q %q", cmd.Path, cmd.Args)
+ out, err := cmd.CombinedOutput()
+ if err != nil {
+ log.Printf("error: scontrol: %s (output was %q)", err, out)
+ }
+ }
+
+ case bytes.Contains(out, []byte(slurmErrBadGres)):
+ // Evidently our node-type feature names are all valid.
+
+ default:
+ log.Printf("warning: expected srun error %q or %q, but output was %q", slurmErrBadFeature, slurmErrBadGres, out)
+ }
+}
commit f9e1447e5ab21dbc0fdb08328c31811fca2fd327
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Jan 23 14:25:21 2018 -0500
12199: Add node size calculator.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/build/run-tests.sh b/build/run-tests.sh
index a02f732..57ce41e 100755
--- a/build/run-tests.sh
+++ b/build/run-tests.sh
@@ -77,6 +77,7 @@ lib/crunchstat
services/api
services/arv-git-httpd
services/crunchstat
+services/dispatchcloud
services/dockercleaner
services/fuse
services/health
@@ -869,6 +870,7 @@ gostuff=(
sdk/go/stats
services/arv-git-httpd
services/crunchstat
+ services/dispatchcloud
services/health
services/keep-web
services/keepstore
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index ca0df1f..9ed0eac 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -1,3 +1,7 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
package arvados
import (
@@ -48,6 +52,16 @@ type Cluster struct {
ClusterID string `json:"-"`
ManagementToken string
SystemNodes map[string]SystemNode
+ InstanceTypes []InstanceType
+}
+
+type InstanceType struct {
+ Name string
+ ProviderType string
+ VCPUs int
+ RAM int64
+ Scratch int64
+ Price float64
}
// GetThisSystemNode returns a SystemNode for the node we're running
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index a541a8d..20d007c 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -41,9 +41,9 @@ type Mount struct {
// CPU) and network connectivity.
type RuntimeConstraints struct {
API *bool
- RAM int `json:"ram"`
- VCPUs int `json:"vcpus"`
- KeepCacheRAM int `json:"keep_cache_ram"`
+ RAM int64 `json:"ram"`
+ VCPUs int `json:"vcpus"`
+ KeepCacheRAM int64 `json:"keep_cache_ram"`
}
// SchedulingParameters specify a container's scheduling parameters
diff --git a/services/dispatchcloud/gocheck_test.go b/services/dispatchcloud/gocheck_test.go
new file mode 100644
index 0000000..22f89f0
--- /dev/null
+++ b/services/dispatchcloud/gocheck_test.go
@@ -0,0 +1,16 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+ "testing"
+
+ check "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+ check.TestingT(t)
+}
diff --git a/services/dispatchcloud/node_size.go b/services/dispatchcloud/node_size.go
new file mode 100644
index 0000000..1abccc5
--- /dev/null
+++ b/services/dispatchcloud/node_size.go
@@ -0,0 +1,44 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+ "errors"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+var (
+ ErrConstraintsNotSatisfiable = errors.New("constraints not satisfiable by any configured instance type")
+ ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types")
+)
+
+// ChooseInstanceType returns the cheapest available
+// arvados.InstanceType big enough to run ctr.
+func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) {
+ needVCPUs := ctr.RuntimeConstraints.VCPUs
+ needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM
+
+ if len(cc.InstanceTypes) == 0 {
+ err = ErrInstanceTypesNotConfigured
+ return
+ }
+
+ err = ErrConstraintsNotSatisfiable
+ for _, it := range cc.InstanceTypes {
+ switch {
+ case err == nil && it.Price > best.Price:
+ case it.RAM < needRAM:
+ case it.VCPUs < needVCPUs:
+ case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
+ // Equal price, but worse specs
+ default:
+ // Lower price || (same price && better specs)
+ best = it
+ err = nil
+ }
+ }
+ return
+}
diff --git a/services/dispatchcloud/node_size_test.go b/services/dispatchcloud/node_size_test.go
new file mode 100644
index 0000000..bc628b5
--- /dev/null
+++ b/services/dispatchcloud/node_size_test.go
@@ -0,0 +1,73 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dispatchcloud
+
+import (
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&NodeSizeSuite{})
+
+type NodeSizeSuite struct{}
+
+func (*NodeSizeSuite) TestChooseNotConfigured(c *check.C) {
+ _, err := ChooseInstanceType(&arvados.Cluster{}, &arvados.Container{
+ RuntimeConstraints: arvados.RuntimeConstraints{
+ RAM: 1234567890,
+ VCPUs: 2,
+ },
+ })
+ c.Check(err, check.Equals, ErrInstanceTypesNotConfigured)
+}
+
+func (*NodeSizeSuite) TestChooseUnsatisfiable(c *check.C) {
+ for _, rc := range []arvados.RuntimeConstraints{
+ {RAM: 9876543210, VCPUs: 2},
+ {RAM: 1234567890, VCPUs: 20},
+ {RAM: 1234567890, VCPUs: 2, KeepCacheRAM: 9876543210},
+ } {
+ _, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: []arvados.InstanceType{
+ {Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small1"},
+ {Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "small2"},
+ {Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "small4"},
+ }}, &arvados.Container{RuntimeConstraints: rc})
+ c.Check(err, check.Equals, ErrConstraintsNotSatisfiable)
+ }
+}
+
+func (*NodeSizeSuite) TestChoose(c *check.C) {
+ for _, menu := range [][]arvados.InstanceType{
+ {
+ {Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "costly"},
+ {Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "best"},
+ {Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small"},
+ },
+ {
+ {Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "costly"},
+ {Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "goodenough"},
+ {Price: 2.2, RAM: 4000000000, VCPUs: 4, Name: "best"},
+ {Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small"},
+ },
+ {
+ {Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small"},
+ {Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "goodenough"},
+ {Price: 2.2, RAM: 4000000000, VCPUs: 4, Name: "best"},
+ {Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "costly"},
+ },
+ } {
+ best, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
+ RuntimeConstraints: arvados.RuntimeConstraints{
+ VCPUs: 2,
+ RAM: 987654321,
+ KeepCacheRAM: 123456789,
+ },
+ })
+ c.Check(err, check.IsNil)
+ c.Check(best.Name, check.Equals, "best")
+ c.Check(best.RAM >= 1234567890, check.Equals, true)
+ c.Check(best.VCPUs >= 2, check.Equals, true)
+ }
+}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list