[arvados] updated: 2.6.1-8-g8b5478a1e

git repository hosting git at public.arvados.org
Mon May 1 20:13:10 UTC 2023


Summary of changes:
 lib/dispatchcloud/container/queue.go      | 15 +++++++++++++++
 lib/dispatchcloud/container/queue_test.go |  3 +++
 lib/dispatchcloud/dispatcher_test.go      |  8 +++++++-
 lib/dispatchcloud/scheduler/run_queue.go  |  2 +-
 lib/dispatchcloud/test/queue.go           | 14 +++++++++++++-
 sdk/go/arvados/container.go               |  1 +
 6 files changed, 40 insertions(+), 3 deletions(-)

       via  8b5478a1ebb168312c8c8aa9fe806954ded42fb3 (commit)
      from  abcc0cc2bb9f6b63a2b8a4d7f77a77a0f2a1f58a (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 8b5478a1ebb168312c8c8aa9fe806954ded42fb3
Author: Tom Clegg <tom at curii.com>
Date:   Mon May 1 16:11:07 2023 -0400

    Merge branch '20457-logs-and-mem-usage'
    
    refs #20457
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 938ef915f..ab686e85c 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -233,6 +233,11 @@ func (cq *Queue) delEnt(uuid string, state arvados.ContainerState) {
 // Caller must have lock.
 func (cq *Queue) addEnt(uuid string, ctr arvados.Container) {
 	it, err := cq.chooseType(&ctr)
+
+	// Avoid wasting memory on a large Mounts attr (we don't need
+	// it after choosing type).
+	ctr.Mounts = nil
+
 	if err != nil && (ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked) {
 		// We assume here that any chooseType error is a hard
 		// error: it wouldn't help to try again, or to leave
@@ -490,6 +495,16 @@ func (cq *Queue) fetchAll(initialParams arvados.ResourceListParams) ([]arvados.C
 			break
 		}
 
+		// Conserve memory by deleting mounts that aren't
+		// relevant to choosing the instance type.
+		for _, c := range list.Items {
+			for path, mnt := range c.Mounts {
+				if mnt.Kind != "tmp" {
+					delete(c.Mounts, path)
+				}
+			}
+		}
+
 		results = append(results, list.Items...)
 		if len(params.Order) == 1 && params.Order == "uuid" {
 			params.Filters = append(initialParams.Filters, arvados.Filter{"uuid", ">", list.Items[len(list.Items)-1].UUID})
diff --git a/lib/dispatchcloud/container/queue_test.go b/lib/dispatchcloud/container/queue_test.go
index 0075ee324..ca1098353 100644
--- a/lib/dispatchcloud/container/queue_test.go
+++ b/lib/dispatchcloud/container/queue_test.go
@@ -41,6 +41,7 @@ func (suite *IntegrationSuite) TearDownTest(c *check.C) {
 
 func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) {
 	typeChooser := func(ctr *arvados.Container) (arvados.InstanceType, error) {
+		c.Check(ctr.Mounts["/tmp"].Capacity, check.Equals, int64(24000000000))
 		return arvados.InstanceType{Name: "testType"}, nil
 	}
 
@@ -64,6 +65,8 @@ func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) {
 		c.Check(ent.InstanceType.Name, check.Equals, "testType")
 		c.Check(ent.Container.State, check.Equals, arvados.ContainerStateQueued)
 		c.Check(ent.Container.Priority > 0, check.Equals, true)
+		// Mounts should be deleted to avoid wasting memory
+		c.Check(ent.Container.Mounts, check.IsNil)
 
 		ctr, ok := cq.Get(uuid)
 		c.Check(ok, check.Equals, true)
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 7454b5784..273a3836d 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -105,7 +105,10 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 	// Disable auto-retry
 	arvClient.Timeout = 0
 
-	s.error503Server = httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusServiceUnavailable) }))
+	s.error503Server = httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		c.Logf("503 stub: returning 503")
+		w.WriteHeader(http.StatusServiceUnavailable)
+	}))
 	arvClient.Client = &http.Client{
 		Transport: &http.Transport{
 			Proxy: s.arvClientProxy(c),
@@ -136,6 +139,7 @@ func (s *DispatcherSuite) TearDownTest(c *check.C) {
 func (s *DispatcherSuite) arvClientProxy(c *check.C) func(*http.Request) (*url.URL, error) {
 	return func(req *http.Request) (*url.URL, error) {
 		if req.URL.Path == "/503" {
+			c.Logf("arvClientProxy: proxying to 503 stub")
 			return url.Parse(s.error503Server.URL)
 		} else {
 			return nil, nil
@@ -151,6 +155,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 	Drivers["test"] = s.stubDriver
 	s.disp.setupOnce.Do(s.disp.initialize)
 	queue := &test.Queue{
+		MaxDispatchAttempts: 5,
 		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
 			return ChooseInstanceType(s.cluster, ctr)
 		},
@@ -185,6 +190,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		delete(waiting, ctr.UUID)
 		if len(waiting) == 100 {
 			// trigger scheduler maxConcurrency limit
+			c.Logf("test: requesting 503 in order to trigger maxConcurrency limit")
 			s.disp.ArvClient.RequestAndDecode(nil, "GET", "503", nil, nil)
 		}
 		if len(waiting) == 0 {
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index b8158579a..dda3630ee 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -185,7 +185,7 @@ tryrun:
 			_, toolate := running[ctr.UUID]
 			if ctr.State == arvados.ContainerStateLocked && !toolate {
 				logger := sch.logger.WithField("ContainerUUID", ctr.UUID)
-				logger.Debug("unlock because pool capacity is used by higher priority containers")
+				logger.Info("unlock because pool capacity is used by higher priority containers")
 				err := sch.queue.Unlock(ctr.UUID)
 				if err != nil {
 					logger.WithError(err).Warn("error unlocking")
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index fcb2cfb33..2be8246bd 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -24,6 +24,9 @@ type Queue struct {
 	// must not be nil.
 	ChooseType func(*arvados.Container) (arvados.InstanceType, error)
 
+	// Mimic railsapi implementation of MaxDispatchAttempts config
+	MaxDispatchAttempts int
+
 	Logger logrus.FieldLogger
 
 	entries      map[string]container.QueueEnt
@@ -133,7 +136,15 @@ func (q *Queue) changeState(uuid string, from, to arvados.ContainerState) error
 	q.entries[uuid] = ent
 	for i, ctr := range q.Containers {
 		if ctr.UUID == uuid {
-			q.Containers[i].State = to
+			if max := q.MaxDispatchAttempts; max > 0 && ctr.LockCount >= max && to == arvados.ContainerStateQueued {
+				q.Containers[i].State = arvados.ContainerStateCancelled
+				q.Containers[i].RuntimeStatus = map[string]interface{}{"error": fmt.Sprintf("Failed to start: lock_count == %d", ctr.LockCount)}
+			} else {
+				q.Containers[i].State = to
+				if to == arvados.ContainerStateLocked {
+					q.Containers[i].LockCount++
+				}
+			}
 			break
 		}
 	}
@@ -157,6 +168,7 @@ func (q *Queue) Update() error {
 			upd[ctr.UUID] = ent
 		} else {
 			it, _ := q.ChooseType(&ctr)
+			ctr.Mounts = nil
 			upd[ctr.UUID] = container.QueueEnt{
 				Container:    ctr,
 				InstanceType: it,
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 7b31726aa..2467e807a 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -19,6 +19,7 @@ type Container struct {
 	Cwd                       string                 `json:"cwd"`
 	Environment               map[string]string      `json:"environment"`
 	LockedByUUID              string                 `json:"locked_by_uuid"`
+	LockCount                 int                    `json:"lock_count"`
 	Mounts                    map[string]Mount       `json:"mounts"`
 	Output                    string                 `json:"output"`
 	OutputPath                string                 `json:"output_path"`

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list