[arvados] created: 2.6.0-83-gaba09e96f
git repository hosting
git at public.arvados.org
Mon May 1 15:56:34 UTC 2023
at aba09e96f2bbb8e1e45c941a7e8c62bb772344e8 (commit)
commit aba09e96f2bbb8e1e45c941a7e8c62bb772344e8
Author: Tom Clegg <tom at curii.com>
Date: Mon May 1 11:54:59 2023 -0400
20457: Add logging in test case wrt 503 responses.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 15e545f8a..273a3836d 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -105,7 +105,10 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
// Disable auto-retry
arvClient.Timeout = 0
- s.error503Server = httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusServiceUnavailable) }))
+ s.error503Server = httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ c.Logf("503 stub: returning 503")
+ w.WriteHeader(http.StatusServiceUnavailable)
+ }))
arvClient.Client = &http.Client{
Transport: &http.Transport{
Proxy: s.arvClientProxy(c),
@@ -136,6 +139,7 @@ func (s *DispatcherSuite) TearDownTest(c *check.C) {
func (s *DispatcherSuite) arvClientProxy(c *check.C) func(*http.Request) (*url.URL, error) {
return func(req *http.Request) (*url.URL, error) {
if req.URL.Path == "/503" {
+ c.Logf("arvClientProxy: proxying to 503 stub")
return url.Parse(s.error503Server.URL)
} else {
return nil, nil
@@ -186,6 +190,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
delete(waiting, ctr.UUID)
if len(waiting) == 100 {
// trigger scheduler maxConcurrency limit
+ c.Logf("test: requesting 503 in order to trigger maxConcurrency limit")
s.disp.ArvClient.RequestAndDecode(nil, "GET", "503", nil, nil)
}
if len(waiting) == 0 {
commit c19811454b70623c0f0f92e07ffc6c4b33deb63d
Author: Tom Clegg <tom at curii.com>
Date: Mon May 1 11:54:23 2023 -0400
20457: Don't keep ctr mounts in memory after computing node size.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 938ef915f..5f6fdefbd 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -233,6 +233,11 @@ func (cq *Queue) delEnt(uuid string, state arvados.ContainerState) {
// Caller must have lock.
func (cq *Queue) addEnt(uuid string, ctr arvados.Container) {
it, err := cq.chooseType(&ctr)
+
+ // Avoid wasting memory on a large Mounts attr (we don't need
+ // it after choosing type).
+ ctr.Mounts = nil
+
if err != nil && (ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked) {
// We assume here that any chooseType error is a hard
// error: it wouldn't help to try again, or to leave
diff --git a/lib/dispatchcloud/container/queue_test.go b/lib/dispatchcloud/container/queue_test.go
index 0075ee324..ca1098353 100644
--- a/lib/dispatchcloud/container/queue_test.go
+++ b/lib/dispatchcloud/container/queue_test.go
@@ -41,6 +41,7 @@ func (suite *IntegrationSuite) TearDownTest(c *check.C) {
func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) {
typeChooser := func(ctr *arvados.Container) (arvados.InstanceType, error) {
+ c.Check(ctr.Mounts["/tmp"].Capacity, check.Equals, int64(24000000000))
return arvados.InstanceType{Name: "testType"}, nil
}
@@ -64,6 +65,8 @@ func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) {
c.Check(ent.InstanceType.Name, check.Equals, "testType")
c.Check(ent.Container.State, check.Equals, arvados.ContainerStateQueued)
c.Check(ent.Container.Priority > 0, check.Equals, true)
+ // Mounts should be deleted to avoid wasting memory
+ c.Check(ent.Container.Mounts, check.IsNil)
ctr, ok := cq.Get(uuid)
c.Check(ok, check.Equals, true)
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index e347338bf..2be8246bd 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -168,6 +168,7 @@ func (q *Queue) Update() error {
upd[ctr.UUID] = ent
} else {
it, _ := q.ChooseType(&ctr)
+ ctr.Mounts = nil
upd[ctr.UUID] = container.QueueEnt{
Container: ctr,
InstanceType: it,
commit 9e6bf5e6b110c016423178c2ed452d148f21b3c3
Author: Tom Clegg <tom at curii.com>
Date: Mon May 1 11:53:34 2023 -0400
20457: Implement MaxDispatchAttempts in test stub.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 7454b5784..15e545f8a 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -151,6 +151,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
Drivers["test"] = s.stubDriver
s.disp.setupOnce.Do(s.disp.initialize)
queue := &test.Queue{
+ MaxDispatchAttempts: 5,
ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
return ChooseInstanceType(s.cluster, ctr)
},
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index fcb2cfb33..e347338bf 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -24,6 +24,9 @@ type Queue struct {
// must not be nil.
ChooseType func(*arvados.Container) (arvados.InstanceType, error)
+ // Mimic railsapi implementation of MaxDispatchAttempts config
+ MaxDispatchAttempts int
+
Logger logrus.FieldLogger
entries map[string]container.QueueEnt
@@ -133,7 +136,15 @@ func (q *Queue) changeState(uuid string, from, to arvados.ContainerState) error
q.entries[uuid] = ent
for i, ctr := range q.Containers {
if ctr.UUID == uuid {
- q.Containers[i].State = to
+ if max := q.MaxDispatchAttempts; max > 0 && ctr.LockCount >= max && to == arvados.ContainerStateQueued {
+ q.Containers[i].State = arvados.ContainerStateCancelled
+ q.Containers[i].RuntimeStatus = map[string]interface{}{"error": fmt.Sprintf("Failed to start: lock_count == %d", ctr.LockCount)}
+ } else {
+ q.Containers[i].State = to
+ if to == arvados.ContainerStateLocked {
+ q.Containers[i].LockCount++
+ }
+ }
break
}
}
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 7b31726aa..2467e807a 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -19,6 +19,7 @@ type Container struct {
Cwd string `json:"cwd"`
Environment map[string]string `json:"environment"`
LockedByUUID string `json:"locked_by_uuid"`
+ LockCount int `json:"lock_count"`
Mounts map[string]Mount `json:"mounts"`
Output string `json:"output"`
OutputPath string `json:"output_path"`
commit a992f644b8f34203d0a8133af829e4c84c2c4174
Author: Tom Clegg <tom at curii.com>
Date: Mon May 1 09:36:01 2023 -0400
20457: Log at info level when flapping lock at concurrency limit.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index b8158579a..dda3630ee 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -185,7 +185,7 @@ tryrun:
_, toolate := running[ctr.UUID]
if ctr.State == arvados.ContainerStateLocked && !toolate {
logger := sch.logger.WithField("ContainerUUID", ctr.UUID)
- logger.Debug("unlock because pool capacity is used by higher priority containers")
+ logger.Info("unlock because pool capacity is used by higher priority containers")
err := sch.queue.Unlock(ctr.UUID)
if err != nil {
logger.WithError(err).Warn("error unlocking")
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list