[arvados] updated: 2.7.0-5289-g88aedea4fd
git repository hosting
git at public.arvados.org
Thu Nov 2 19:47:19 UTC 2023
Summary of changes:
lib/cloud/ec2/ec2.go | 6 ++-
lib/cloud/ec2/ec2_test.go | 20 +++++++---
lib/dispatchcloud/node_size.go | 6 +--
lib/dispatchcloud/scheduler/run_queue.go | 66 +++++++++++++++++---------------
4 files changed, 56 insertions(+), 42 deletions(-)
via 88aedea4fdf827524c620830ec11681e5cd5f527 (commit)
via 66cee5a8021e73271650e0997ca7f757e419d169 (commit)
via da204dccd3df12b5c885068768f95c84e4703047 (commit)
from 1028c0630dac2a2bff363da1390bbf942e7fe7ae (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email, so we list those
revisions in full, below.
commit 88aedea4fdf827524c620830ec11681e5cd5f527
Author: Tom Clegg <tom at curii.com>
Date: Thu Nov 2 15:37:16 2023 -0400
20978: Add sort-by-scratch fallback.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index 0a5a79bc70..802bc65c28 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -178,9 +178,9 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) ([]arvados.
// if same price and RAM, prefer more VCPUs
return types[i].VCPUs > types[j].VCPUs
}
- if types[i].VCPUs != types[j].VCPUs {
- // if same price and RAM, prefer more VCPUs
- return types[i].VCPUs > types[j].VCPUs
+ if types[i].Scratch != types[j].Scratch {
+ // if same price and RAM and VCPUs, prefer more scratch
+ return types[i].Scratch > types[j].Scratch
}
// no preference, just sort the same way each time
return types[i].Name < types[j].Name
commit 66cee5a8021e73271650e0997ca7f757e419d169
Author: Tom Clegg <tom at curii.com>
Date: Thu Nov 2 15:35:43 2023 -0400
20978: Rearrange large if-else sequence for clarity.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 8264c5ef0a..2f1f175890 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -235,9 +235,28 @@ tryrun:
}
trying++
if unallocOK {
+ // We have a suitable instance type,
+ // so mark it as allocated, and try to
+ // start the container.
unalloc[unallocType]--
logger = logger.WithField("InstanceType", unallocType)
- } else if sch.pool.AtQuota() {
+ if dontstart[unallocType] {
+ // We already tried & failed to start
+ // a higher-priority container on the
+ // same instance type. Don't let this
+ // one sneak in ahead of it.
+ } else if sch.pool.KillContainer(ctr.UUID, "about to start") {
+ logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
+ } else if sch.pool.StartContainer(unallocType, ctr) {
+ logger.Trace("StartContainer => true")
+ } else {
+ logger.Trace("StartContainer => false")
+ containerAllocatedWorkerBootingCount += 1
+ dontstart[unallocType] = true
+ }
+ continue
+ }
+ if sch.pool.AtQuota() {
// Don't let lower-priority containers
// starve this one by keeping
// idle workers alive on different
@@ -245,7 +264,8 @@ tryrun:
logger.Trace("overquota")
overquota = sorted[i:]
break tryrun
- } else if !availableOK {
+ }
+ if !availableOK {
// Continue trying lower-priority
// containers in case they can run on
// different instance types that are
@@ -261,40 +281,24 @@ tryrun:
// container B now.
logger.Trace("all eligible types at capacity")
continue
- } else if logger = logger.WithField("InstanceType", availableType); sch.pool.Create(availableType) {
- // Success. (Note pool.Create works
- // asynchronously and does its own
- // logging about the eventual outcome,
- // so we don't need to.)
- logger.Info("creating new instance")
- // Don't bother trying to start the
- // container yet -- obviously the
- // instance will take some time to
- // boot and become ready.
- containerAllocatedWorkerBootingCount += 1
- dontstart[availableType] = true
- continue
- } else {
+ }
+ logger = logger.WithField("InstanceType", availableType)
+ if !sch.pool.Create(availableType) {
// Failed despite not being at quota,
// e.g., cloud ops throttled.
logger.Trace("pool declined to create new instance")
continue
}
-
- if dontstart[unallocType] {
- // We already tried & failed to start
- // a higher-priority container on the
- // same instance type. Don't let this
- // one sneak in ahead of it.
- } else if sch.pool.KillContainer(ctr.UUID, "about to start") {
- logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
- } else if sch.pool.StartContainer(unallocType, ctr) {
- logger.Trace("StartContainer => true")
- } else {
- logger.Trace("StartContainer => false")
- containerAllocatedWorkerBootingCount += 1
- dontstart[unallocType] = true
- }
+ // Success. (Note pool.Create works
+ // asynchronously and does its own logging
+ // about the eventual outcome, so we don't
+ // need to.)
+ logger.Info("creating new instance")
+ // Don't bother trying to start the container
+ // yet -- obviously the instance will take
+ // some time to boot and become ready.
+ containerAllocatedWorkerBootingCount += 1
+ dontstart[availableType] = true
}
}
commit da204dccd3df12b5c885068768f95c84e4703047
Author: Tom Clegg <tom at curii.com>
Date: Thu Nov 2 15:26:56 2023 -0400
20978: Treat "unsupported instance type" as capacity=0.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go
index 816df48d90..55f9a1e3a3 100644
--- a/lib/cloud/ec2/ec2.go
+++ b/lib/cloud/ec2/ec2.go
@@ -711,7 +711,8 @@ func isErrorSubnetSpecific(err error) bool {
code := aerr.Code()
return strings.Contains(code, "Subnet") ||
code == "InsufficientInstanceCapacity" ||
- code == "InsufficientVolumeCapacity"
+ code == "InsufficientVolumeCapacity" ||
+ code == "Unsupported"
}
type ec2QuotaError struct {
@@ -737,7 +738,8 @@ func wrapError(err error, throttleValue *atomic.Value) error {
return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
} else if isErrorQuota(err) {
return &ec2QuotaError{err}
- } else if aerr, ok := err.(awserr.Error); ok && aerr != nil && aerr.Code() == "InsufficientInstanceCapacity" {
+ } else if aerr, ok := err.(awserr.Error); ok && (aerr.Code() == "InsufficientInstanceCapacity" ||
+ (aerr.Code() == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))) {
return &capacityError{err, true}
} else if err != nil {
throttleValue.Store(time.Duration(0))
diff --git a/lib/cloud/ec2/ec2_test.go b/lib/cloud/ec2/ec2_test.go
index a57fcebf76..6ce5aa3cf9 100644
--- a/lib/cloud/ec2/ec2_test.go
+++ b/lib/cloud/ec2/ec2_test.go
@@ -513,10 +513,18 @@ func (*EC2InstanceSetSuite) TestWrapError(c *check.C) {
_, ok = wrapped.(cloud.QuotaError)
c.Check(ok, check.Equals, true)
- capacityError := awserr.New("InsufficientInstanceCapacity", "", nil)
- wrapped = wrapError(capacityError, nil)
- caperr, ok := wrapped.(cloud.CapacityError)
- c.Check(ok, check.Equals, true)
- c.Check(caperr.IsCapacityError(), check.Equals, true)
- c.Check(caperr.IsInstanceTypeSpecific(), check.Equals, true)
+ for _, trial := range []struct {
+ code string
+ msg string
+ }{
+ {"InsufficientInstanceCapacity", ""},
+ {"Unsupported", "Your requested instance type (t3.micro) is not supported in your requested Availability Zone (us-east-1e). Please retry your request by not specifying an Availability Zone or choosing us-east-1a, us-east-1b, us-east-1c, us-east-1d, us-east-1f."},
+ } {
+ capacityError := awserr.New(trial.code, trial.msg, nil)
+ wrapped = wrapError(capacityError, nil)
+ caperr, ok := wrapped.(cloud.CapacityError)
+ c.Check(ok, check.Equals, true)
+ c.Check(caperr.IsCapacityError(), check.Equals, true)
+ c.Check(caperr.IsInstanceTypeSpecific(), check.Equals, true)
+ }
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list