[arvados] created: 2.6.0-83-gaba09e96f

Git Repository Hosting <git at public.arvados.org>
Mon May 1 15:56:34 UTC 2023


        at  aba09e96f2bbb8e1e45c941a7e8c62bb772344e8 (commit)


commit aba09e96f2bbb8e1e45c941a7e8c62bb772344e8
Author: Tom Clegg <tom at curii.com>
Date:   Mon May 1 11:54:59 2023 -0400

    20457: Add logging in test case wrt 503 responses.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 15e545f8a..273a3836d 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -105,7 +105,10 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 	// Disable auto-retry
 	arvClient.Timeout = 0
 
-	s.error503Server = httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusServiceUnavailable) }))
+	s.error503Server = httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		c.Logf("503 stub: returning 503")
+		w.WriteHeader(http.StatusServiceUnavailable)
+	}))
 	arvClient.Client = &http.Client{
 		Transport: &http.Transport{
 			Proxy: s.arvClientProxy(c),
@@ -136,6 +139,7 @@ func (s *DispatcherSuite) TearDownTest(c *check.C) {
 func (s *DispatcherSuite) arvClientProxy(c *check.C) func(*http.Request) (*url.URL, error) {
 	return func(req *http.Request) (*url.URL, error) {
 		if req.URL.Path == "/503" {
+			c.Logf("arvClientProxy: proxying to 503 stub")
 			return url.Parse(s.error503Server.URL)
 		} else {
 			return nil, nil
@@ -186,6 +190,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		delete(waiting, ctr.UUID)
 		if len(waiting) == 100 {
 			// trigger scheduler maxConcurrency limit
+			c.Logf("test: requesting 503 in order to trigger maxConcurrency limit")
 			s.disp.ArvClient.RequestAndDecode(nil, "GET", "503", nil, nil)
 		}
 		if len(waiting) == 0 {

commit c19811454b70623c0f0f92e07ffc6c4b33deb63d
Author: Tom Clegg <tom at curii.com>
Date:   Mon May 1 11:54:23 2023 -0400

    20457: Don't keep ctr mounts in memory after computing node size.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 938ef915f..5f6fdefbd 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -233,6 +233,11 @@ func (cq *Queue) delEnt(uuid string, state arvados.ContainerState) {
 // Caller must have lock.
 func (cq *Queue) addEnt(uuid string, ctr arvados.Container) {
 	it, err := cq.chooseType(&ctr)
+
+	// Avoid wasting memory on a large Mounts attr (we don't need
+	// it after choosing type).
+	ctr.Mounts = nil
+
 	if err != nil && (ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked) {
 		// We assume here that any chooseType error is a hard
 		// error: it wouldn't help to try again, or to leave
diff --git a/lib/dispatchcloud/container/queue_test.go b/lib/dispatchcloud/container/queue_test.go
index 0075ee324..ca1098353 100644
--- a/lib/dispatchcloud/container/queue_test.go
+++ b/lib/dispatchcloud/container/queue_test.go
@@ -41,6 +41,7 @@ func (suite *IntegrationSuite) TearDownTest(c *check.C) {
 
 func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) {
 	typeChooser := func(ctr *arvados.Container) (arvados.InstanceType, error) {
+		c.Check(ctr.Mounts["/tmp"].Capacity, check.Equals, int64(24000000000))
 		return arvados.InstanceType{Name: "testType"}, nil
 	}
 
@@ -64,6 +65,8 @@ func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) {
 		c.Check(ent.InstanceType.Name, check.Equals, "testType")
 		c.Check(ent.Container.State, check.Equals, arvados.ContainerStateQueued)
 		c.Check(ent.Container.Priority > 0, check.Equals, true)
+		// Mounts should be deleted to avoid wasting memory
+		c.Check(ent.Container.Mounts, check.IsNil)
 
 		ctr, ok := cq.Get(uuid)
 		c.Check(ok, check.Equals, true)
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index e347338bf..2be8246bd 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -168,6 +168,7 @@ func (q *Queue) Update() error {
 			upd[ctr.UUID] = ent
 		} else {
 			it, _ := q.ChooseType(&ctr)
+			ctr.Mounts = nil
 			upd[ctr.UUID] = container.QueueEnt{
 				Container:    ctr,
 				InstanceType: it,

commit 9e6bf5e6b110c016423178c2ed452d148f21b3c3
Author: Tom Clegg <tom at curii.com>
Date:   Mon May 1 11:53:34 2023 -0400

    20457: Implement MaxDispatchAttempts in test stub.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 7454b5784..15e545f8a 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -151,6 +151,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 	Drivers["test"] = s.stubDriver
 	s.disp.setupOnce.Do(s.disp.initialize)
 	queue := &test.Queue{
+		MaxDispatchAttempts: 5,
 		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
 			return ChooseInstanceType(s.cluster, ctr)
 		},
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index fcb2cfb33..e347338bf 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -24,6 +24,9 @@ type Queue struct {
 	// must not be nil.
 	ChooseType func(*arvados.Container) (arvados.InstanceType, error)
 
+	// Mimic railsapi implementation of MaxDispatchAttempts config
+	MaxDispatchAttempts int
+
 	Logger logrus.FieldLogger
 
 	entries      map[string]container.QueueEnt
@@ -133,7 +136,15 @@ func (q *Queue) changeState(uuid string, from, to arvados.ContainerState) error
 	q.entries[uuid] = ent
 	for i, ctr := range q.Containers {
 		if ctr.UUID == uuid {
-			q.Containers[i].State = to
+			if max := q.MaxDispatchAttempts; max > 0 && ctr.LockCount >= max && to == arvados.ContainerStateQueued {
+				q.Containers[i].State = arvados.ContainerStateCancelled
+				q.Containers[i].RuntimeStatus = map[string]interface{}{"error": fmt.Sprintf("Failed to start: lock_count == %d", ctr.LockCount)}
+			} else {
+				q.Containers[i].State = to
+				if to == arvados.ContainerStateLocked {
+					q.Containers[i].LockCount++
+				}
+			}
 			break
 		}
 	}
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 7b31726aa..2467e807a 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -19,6 +19,7 @@ type Container struct {
 	Cwd                       string                 `json:"cwd"`
 	Environment               map[string]string      `json:"environment"`
 	LockedByUUID              string                 `json:"locked_by_uuid"`
+	LockCount                 int                    `json:"lock_count"`
 	Mounts                    map[string]Mount       `json:"mounts"`
 	Output                    string                 `json:"output"`
 	OutputPath                string                 `json:"output_path"`

commit a992f644b8f34203d0a8133af829e4c84c2c4174
Author: Tom Clegg <tom at curii.com>
Date:   Mon May 1 09:36:01 2023 -0400

    20457: Log at info level when flapping lock at concurrency limit.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index b8158579a..dda3630ee 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -185,7 +185,7 @@ tryrun:
 			_, toolate := running[ctr.UUID]
 			if ctr.State == arvados.ContainerStateLocked && !toolate {
 				logger := sch.logger.WithField("ContainerUUID", ctr.UUID)
-				logger.Debug("unlock because pool capacity is used by higher priority containers")
+				logger.Info("unlock because pool capacity is used by higher priority containers")
 				err := sch.queue.Unlock(ctr.UUID)
 				if err != nil {
 					logger.WithError(err).Warn("error unlocking")

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list