[arvados] updated: 2.6.1-8-g8b5478a1e
git repository hosting
git at public.arvados.org
Mon May 1 20:13:10 UTC 2023
Summary of changes:
lib/dispatchcloud/container/queue.go | 15 +++++++++++++++
lib/dispatchcloud/container/queue_test.go | 3 +++
lib/dispatchcloud/dispatcher_test.go | 8 +++++++-
lib/dispatchcloud/scheduler/run_queue.go | 2 +-
lib/dispatchcloud/test/queue.go | 14 +++++++++++++-
sdk/go/arvados/container.go | 1 +
6 files changed, 40 insertions(+), 3 deletions(-)
via 8b5478a1ebb168312c8c8aa9fe806954ded42fb3 (commit)
from abcc0cc2bb9f6b63a2b8a4d7f77a77a0f2a1f58a (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 8b5478a1ebb168312c8c8aa9fe806954ded42fb3
Author: Tom Clegg <tom at curii.com>
Date: Mon May 1 16:11:07 2023 -0400
Merge branch '20457-logs-and-mem-usage'
refs #20457
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 938ef915f..ab686e85c 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -233,6 +233,11 @@ func (cq *Queue) delEnt(uuid string, state arvados.ContainerState) {
// Caller must have lock.
func (cq *Queue) addEnt(uuid string, ctr arvados.Container) {
it, err := cq.chooseType(&ctr)
+
+ // Avoid wasting memory on a large Mounts attr (we don't need
+ // it after choosing type).
+ ctr.Mounts = nil
+
if err != nil && (ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked) {
// We assume here that any chooseType error is a hard
// error: it wouldn't help to try again, or to leave
@@ -490,6 +495,16 @@ func (cq *Queue) fetchAll(initialParams arvados.ResourceListParams) ([]arvados.C
break
}
+ // Conserve memory by deleting mounts that aren't
+ // relevant to choosing the instance type.
+ for _, c := range list.Items {
+ for path, mnt := range c.Mounts {
+ if mnt.Kind != "tmp" {
+ delete(c.Mounts, path)
+ }
+ }
+ }
+
results = append(results, list.Items...)
if len(params.Order) == 1 && params.Order == "uuid" {
params.Filters = append(initialParams.Filters, arvados.Filter{"uuid", ">", list.Items[len(list.Items)-1].UUID})
diff --git a/lib/dispatchcloud/container/queue_test.go b/lib/dispatchcloud/container/queue_test.go
index 0075ee324..ca1098353 100644
--- a/lib/dispatchcloud/container/queue_test.go
+++ b/lib/dispatchcloud/container/queue_test.go
@@ -41,6 +41,7 @@ func (suite *IntegrationSuite) TearDownTest(c *check.C) {
func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) {
typeChooser := func(ctr *arvados.Container) (arvados.InstanceType, error) {
+ c.Check(ctr.Mounts["/tmp"].Capacity, check.Equals, int64(24000000000))
return arvados.InstanceType{Name: "testType"}, nil
}
@@ -64,6 +65,8 @@ func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) {
c.Check(ent.InstanceType.Name, check.Equals, "testType")
c.Check(ent.Container.State, check.Equals, arvados.ContainerStateQueued)
c.Check(ent.Container.Priority > 0, check.Equals, true)
+ // Mounts should be deleted to avoid wasting memory
+ c.Check(ent.Container.Mounts, check.IsNil)
ctr, ok := cq.Get(uuid)
c.Check(ok, check.Equals, true)
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 7454b5784..273a3836d 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -105,7 +105,10 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
// Disable auto-retry
arvClient.Timeout = 0
- s.error503Server = httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusServiceUnavailable) }))
+ s.error503Server = httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ c.Logf("503 stub: returning 503")
+ w.WriteHeader(http.StatusServiceUnavailable)
+ }))
arvClient.Client = &http.Client{
Transport: &http.Transport{
Proxy: s.arvClientProxy(c),
@@ -136,6 +139,7 @@ func (s *DispatcherSuite) TearDownTest(c *check.C) {
func (s *DispatcherSuite) arvClientProxy(c *check.C) func(*http.Request) (*url.URL, error) {
return func(req *http.Request) (*url.URL, error) {
if req.URL.Path == "/503" {
+ c.Logf("arvClientProxy: proxying to 503 stub")
return url.Parse(s.error503Server.URL)
} else {
return nil, nil
@@ -151,6 +155,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
Drivers["test"] = s.stubDriver
s.disp.setupOnce.Do(s.disp.initialize)
queue := &test.Queue{
+ MaxDispatchAttempts: 5,
ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
return ChooseInstanceType(s.cluster, ctr)
},
@@ -185,6 +190,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
delete(waiting, ctr.UUID)
if len(waiting) == 100 {
// trigger scheduler maxConcurrency limit
+ c.Logf("test: requesting 503 in order to trigger maxConcurrency limit")
s.disp.ArvClient.RequestAndDecode(nil, "GET", "503", nil, nil)
}
if len(waiting) == 0 {
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index b8158579a..dda3630ee 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -185,7 +185,7 @@ tryrun:
_, toolate := running[ctr.UUID]
if ctr.State == arvados.ContainerStateLocked && !toolate {
logger := sch.logger.WithField("ContainerUUID", ctr.UUID)
- logger.Debug("unlock because pool capacity is used by higher priority containers")
+ logger.Info("unlock because pool capacity is used by higher priority containers")
err := sch.queue.Unlock(ctr.UUID)
if err != nil {
logger.WithError(err).Warn("error unlocking")
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index fcb2cfb33..2be8246bd 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -24,6 +24,9 @@ type Queue struct {
// must not be nil.
ChooseType func(*arvados.Container) (arvados.InstanceType, error)
+ // Mimic railsapi implementation of MaxDispatchAttempts config
+ MaxDispatchAttempts int
+
Logger logrus.FieldLogger
entries map[string]container.QueueEnt
@@ -133,7 +136,15 @@ func (q *Queue) changeState(uuid string, from, to arvados.ContainerState) error
q.entries[uuid] = ent
for i, ctr := range q.Containers {
if ctr.UUID == uuid {
- q.Containers[i].State = to
+ if max := q.MaxDispatchAttempts; max > 0 && ctr.LockCount >= max && to == arvados.ContainerStateQueued {
+ q.Containers[i].State = arvados.ContainerStateCancelled
+ q.Containers[i].RuntimeStatus = map[string]interface{}{"error": fmt.Sprintf("Failed to start: lock_count == %d", ctr.LockCount)}
+ } else {
+ q.Containers[i].State = to
+ if to == arvados.ContainerStateLocked {
+ q.Containers[i].LockCount++
+ }
+ }
break
}
}
@@ -157,6 +168,7 @@ func (q *Queue) Update() error {
upd[ctr.UUID] = ent
} else {
it, _ := q.ChooseType(&ctr)
+ ctr.Mounts = nil
upd[ctr.UUID] = container.QueueEnt{
Container: ctr,
InstanceType: it,
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 7b31726aa..2467e807a 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -19,6 +19,7 @@ type Container struct {
Cwd string `json:"cwd"`
Environment map[string]string `json:"environment"`
LockedByUUID string `json:"locked_by_uuid"`
+ LockCount int `json:"lock_count"`
Mounts map[string]Mount `json:"mounts"`
Output string `json:"output"`
OutputPath string `json:"output_path"`
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list