[ARVADOS] created: 1.1.4-363-g652b52a
Git user
git at public.curoverse.com
Thu Jun 7 13:55:43 EDT 2018
at 652b52a64f9f697394e81bdf1f91352a6131d0db (commit)
commit 652b52a64f9f697394e81bdf1f91352a6131d0db
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Thu Jun 7 14:51:26 2018 -0300
13581: Adds ErrConstraintsNotSatisfiable error reason to user logs.
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
index 9e3baab..f7c53c8 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
@@ -219,19 +219,20 @@ func (disp *Dispatcher) slurmConstraintArgs(container arvados.Container) []strin
}
}
-func (disp *Dispatcher) sbatchArgs(container arvados.Container) ([]string, error) {
+func (disp *Dispatcher) sbatchArgs(container arvados.Container) ([]string, string, error) {
var args []string
+ var reason string
args = append(args, disp.SbatchArguments...)
args = append(args, "--job-name="+container.UUID, fmt.Sprintf("--nice=%d", initialNiceValue))
if disp.cluster == nil {
// no instance types configured
args = append(args, disp.slurmConstraintArgs(container)...)
- } else if it, err := dispatchcloud.ChooseInstanceType(disp.cluster, &container); err == dispatchcloud.ErrInstanceTypesNotConfigured {
+ } else if it, reason, err := dispatchcloud.ChooseInstanceType(disp.cluster, &container); err == dispatchcloud.ErrInstanceTypesNotConfigured {
// ditto
args = append(args, disp.slurmConstraintArgs(container)...)
} else if err != nil {
- return nil, err
+ return nil, reason, err
} else {
// use instancetype constraint instead of slurm mem/cpu/tmp specs
args = append(args, "--constraint=instancetype="+it.Name)
@@ -241,10 +242,10 @@ func (disp *Dispatcher) sbatchArgs(container arvados.Container) ([]string, error
args = append(args, "--partition="+strings.Join(container.SchedulingParameters.Partitions, ","))
}
- return args, nil
+ return args, reason, nil
}
-func (disp *Dispatcher) submit(container arvados.Container, crunchRunCommand []string) error {
+func (disp *Dispatcher) submit(container arvados.Container, crunchRunCommand []string) (string, error) {
// append() here avoids modifying crunchRunCommand's
// underlying array, which is shared with other goroutines.
crArgs := append([]string(nil), crunchRunCommand...)
@@ -254,12 +255,12 @@ func (disp *Dispatcher) submit(container arvados.Container, crunchRunCommand []s
disp.sqCheck.L.Lock()
defer disp.sqCheck.L.Unlock()
- sbArgs, err := disp.sbatchArgs(container)
+ sbArgs, reason, err := disp.sbatchArgs(container)
if err != nil {
- return err
+ return reason, err
}
log.Printf("running sbatch %+q", sbArgs)
- return disp.slurm.Batch(crScript, sbArgs)
+ return reason, disp.slurm.Batch(crScript, sbArgs)
}
// Submit a container to the slurm queue (or resume monitoring if it's
@@ -272,10 +273,10 @@ func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain
if ctr.State == dispatch.Locked && !disp.sqCheck.HasUUID(ctr.UUID) {
log.Printf("Submitting container %s to slurm", ctr.UUID)
- if err := disp.submit(ctr, disp.CrunchRunCommand); err != nil {
+ if reason, err := disp.submit(ctr, disp.CrunchRunCommand); err != nil {
var text string
if err == dispatchcloud.ErrConstraintsNotSatisfiable {
- text = fmt.Sprintf("cannot run container %s: %s", ctr.UUID, err)
+ text = fmt.Sprintf("cannot run container %s: %s (%s)", ctr.UUID, err, reason)
disp.UpdateState(ctr.UUID, dispatch.Cancelled)
} else {
text = fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 85617cf..2753477 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -353,7 +353,7 @@ func (s *StubbedSuite) TestSbatchArgs(c *C) {
c.Logf("%#v", defaults)
s.disp.SbatchArguments = defaults
- args, err := s.disp.sbatchArgs(container)
+ args, _, err := s.disp.sbatchArgs(container)
c.Check(args, DeepEquals, append(defaults, "--job-name=123", "--nice=10000", "--mem=239", "--cpus-per-task=2", "--tmp=0"))
c.Check(err, IsNil)
}
@@ -369,6 +369,7 @@ func (s *StubbedSuite) TestSbatchInstanceTypeConstraint(c *C) {
for _, trial := range []struct {
types []arvados.InstanceType
sbatchArgs []string
+ reason string
err error
}{
// Choose node type => use --constraint arg
@@ -391,14 +392,16 @@ func (s *StubbedSuite) TestSbatchInstanceTypeConstraint(c *C) {
types: []arvados.InstanceType{
{Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
},
- err: dispatchcloud.ErrConstraintsNotSatisfiable,
+ reason: "max RAM available: 128000000",
+ err: dispatchcloud.ErrConstraintsNotSatisfiable,
},
} {
c.Logf("%#v", trial)
s.disp.cluster = &arvados.Cluster{InstanceTypes: trial.types}
- args, err := s.disp.sbatchArgs(container)
+ args, reason, err := s.disp.sbatchArgs(container)
c.Check(err, Equals, trial.err)
+ c.Check(reason, Equals, trial.reason)
if trial.err == nil {
c.Check(args, DeepEquals, append([]string{"--job-name=123", "--nice=10000"}, trial.sbatchArgs...))
}
@@ -413,7 +416,7 @@ func (s *StubbedSuite) TestSbatchPartition(c *C) {
Priority: 1,
}
- args, err := s.disp.sbatchArgs(container)
+ args, _, err := s.disp.sbatchArgs(container)
c.Check(args, DeepEquals, []string{
"--job-name=123", "--nice=10000",
"--mem=239", "--cpus-per-task=1", "--tmp=0",
commit 30e06a4b88088ab23b7082965088a770c799dff9
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Thu Jun 7 14:11:00 2018 -0300
13581: Updates ChooseInstanceType func to return error reasons.
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index 2ca4050..bfa6ec7 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -6,6 +6,7 @@ package dispatchcloud
import (
"errors"
+ "fmt"
"log"
"os/exec"
"strings"
@@ -22,7 +23,7 @@ var (
// ChooseInstanceType returns the cheapest available
// arvados.InstanceType big enough to run ctr.
-func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) {
+func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, errReason string, err error) {
if len(cc.InstanceTypes) == 0 {
err = ErrInstanceTypesNotConfigured
return
@@ -40,13 +41,21 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM
needRAM = (needRAM * 100) / int64(100-discountConfiguredRAMPercent)
+ maxSpecsAvailable := make(map[string]int64)
+
err = ErrConstraintsNotSatisfiable
for _, it := range cc.InstanceTypes {
+ maxSpecsAvailable["Scratch"] = maxInt64(maxSpecsAvailable["Scratch"], it.Scratch)
+ maxSpecsAvailable["VCPUs"] = maxInt64(maxSpecsAvailable["VCPUs"], int64(it.VCPUs))
+ maxSpecsAvailable["RAM"] = maxInt64(maxSpecsAvailable["RAM"], it.RAM)
switch {
case err == nil && it.Price > best.Price:
case it.Scratch < needScratch:
+ errReason = fmt.Sprintf("max scratch space available: %d", maxSpecsAvailable["Scratch"])
case it.RAM < needRAM:
+ errReason = fmt.Sprintf("max RAM available: %d", maxSpecsAvailable["RAM"])
case it.VCPUs < needVCPUs:
+ errReason = fmt.Sprintf("max VCPUs available: %d", maxSpecsAvailable["VCPUs"])
case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
// Equal price, but worse specs
default:
@@ -55,6 +64,9 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
err = nil
}
}
+ if err == nil {
+ errReason = ""
+ }
return
}
@@ -115,3 +127,10 @@ func slurmKludge(features []string) {
log.Printf("error: scontrol: %s (output was %q)", err, out)
}
}
+
+func maxInt64(a, b int64) int64 {
+ if a > b {
+ return a
+ }
+ return b
+}
diff --git a/lib/dispatchcloud/node_size_test.go b/lib/dispatchcloud/node_size_test.go
index 0c02a0e..0c6e5d6 100644
--- a/lib/dispatchcloud/node_size_test.go
+++ b/lib/dispatchcloud/node_size_test.go
@@ -16,7 +16,7 @@ const GiB = int64(1 << 30)
type NodeSizeSuite struct{}
func (*NodeSizeSuite) TestChooseNotConfigured(c *check.C) {
- _, err := ChooseInstanceType(&arvados.Cluster{}, &arvados.Container{
+ _, _, err := ChooseInstanceType(&arvados.Cluster{}, &arvados.Container{
RuntimeConstraints: arvados.RuntimeConstraints{
RAM: 1234567890,
VCPUs: 2,
@@ -27,12 +27,13 @@ func (*NodeSizeSuite) TestChooseNotConfigured(c *check.C) {
func (*NodeSizeSuite) TestChooseUnsatisfiable(c *check.C) {
checkUnsatisfiable := func(ctr *arvados.Container) {
- _, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: []arvados.InstanceType{
+ _, reason, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: []arvados.InstanceType{
{Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small1"},
{Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "small2"},
{Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "small4", Scratch: GiB},
}}, ctr)
c.Check(err, check.Equals, ErrConstraintsNotSatisfiable)
+ c.Check(reason, check.Not(check.Equals), "")
}
for _, rc := range []arvados.RuntimeConstraints{
@@ -74,7 +75,7 @@ func (*NodeSizeSuite) TestChoose(c *check.C) {
{Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly"},
},
} {
- best, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
+ best, _, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
Mounts: map[string]arvados.Mount{
"/tmp": {Kind: "tmp", Capacity: 2 * GiB},
},
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list