[ARVADOS] updated: 1.3.0-183-g8052381fb

Git user git at public.curoverse.com
Fri Jan 18 16:33:34 EST 2019


Summary of changes:
 lib/dispatchcloud/container/queue.go      | 49 ++++++++++++++++++++++++++-----
 lib/dispatchcloud/container/queue_test.go |  4 +--
 sdk/go/arvados/container.go               | 31 +++++++++----------
 3 files changed, 59 insertions(+), 25 deletions(-)

  discards  3bf30090d15962ea34b761ce8ca5b43a972ba7f1 (commit)
       via  8052381fb4e7aceb52497e8378b596178cf5af7c (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (3bf30090d15962ea34b761ce8ca5b43a972ba7f1)
            \
             N -- N -- N (8052381fb4e7aceb52497e8378b596178cf5af7c)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 8052381fb4e7aceb52497e8378b596178cf5af7c
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Jan 18 16:28:45 2019 -0500

    14325: Cancel containers with unsatisfiable runtime constraints.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 965407e51..7a41d47c3 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -211,9 +211,41 @@ func (cq *Queue) Update() error {
 
 func (cq *Queue) addEnt(uuid string, ctr arvados.Container) {
 	it, err := cq.chooseType(&ctr)
-	if err != nil {
-		// FIXME: throttle warnings, cancel after timeout
-		cq.logger.Warnf("cannot run %s", &ctr)
+	if err != nil && (ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked) {
+		errorString := err.Error()
+		cq.logger.WithField("ContainerUUID", ctr.UUID).Warn("cancel container with no suitable instance type")
+		go func() {
+			var err error
+			defer func() {
+				if err == nil {
+					return
+				}
+				// On failure, check current container
+				// state, and don't log the error if
+				// the failure came from losing a
+				// race.
+				var latest arvados.Container
+				cq.client.RequestAndDecode(&latest, "GET", "arvados/v1/containers/"+ctr.UUID, nil, map[string][]string{"select": {"state"}})
+				if latest.State == arvados.ContainerStateCancelled {
+					return
+				}
+				cq.logger.WithField("ContainerUUID", ctr.UUID).WithError(err).Warn("error while trying to cancel unsatisfiable container")
+			}()
+			if ctr.State == arvados.ContainerStateQueued {
+				err = cq.Lock(ctr.UUID)
+				if err != nil {
+					return
+				}
+			}
+			err = cq.setRuntimeError(ctr.UUID, errorString)
+			if err != nil {
+				return
+			}
+			err = cq.Cancel(ctr.UUID)
+			if err != nil {
+				return
+			}
+		}()
 		return
 	}
 	cq.current[uuid] = QueueEnt{Container: ctr, InstanceType: it}
@@ -229,6 +261,18 @@ func (cq *Queue) Unlock(uuid string) error {
 	return cq.apiUpdate(uuid, "unlock")
 }
 
+// setRuntimeError sets runtime_status["error"] to the given value.
+// Container should already have state==Locked or Running.
+func (cq *Queue) setRuntimeError(uuid, errorString string) error {
+	return cq.client.RequestAndDecode(nil, "PUT", "arvados/v1/containers/"+uuid, nil, map[string]map[string]map[string]interface{}{
+		"container": {
+			"runtime_status": {
+				"error": errorString,
+			},
+		},
+	})
+}
+
 // Cancel cancels the given container.
 func (cq *Queue) Cancel(uuid string) error {
 	err := cq.client.RequestAndDecode(nil, "PUT", "arvados/v1/containers/"+uuid, nil, map[string]map[string]interface{}{
diff --git a/lib/dispatchcloud/container/queue_test.go b/lib/dispatchcloud/container/queue_test.go
index 9d2f83090..a84497424 100644
--- a/lib/dispatchcloud/container/queue_test.go
+++ b/lib/dispatchcloud/container/queue_test.go
@@ -5,6 +5,7 @@
 package container
 
 import (
+	"errors"
 	"sync"
 	"testing"
 	"time"
@@ -24,9 +25,18 @@ var _ = check.Suite(&IntegrationSuite{})
 
 type IntegrationSuite struct{}
 
-func (*IntegrationSuite) TestControllerBackedQueue(c *check.C) {
+func (suite *IntegrationSuite) TearDownTest(c *check.C) {
+	err := arvados.NewClientFromEnv().RequestAndDecode(nil, "POST", "database/reset", nil, nil)
+	c.Check(err, check.IsNil)
+}
+
+func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) {
+	typeChooser := func(ctr *arvados.Container) (arvados.InstanceType, error) {
+		return arvados.InstanceType{Name: "testType"}, nil
+	}
+
 	client := arvados.NewClientFromEnv()
-	cq := NewQueue(logrus.StandardLogger(), nil, testTypeChooser, client)
+	cq := NewQueue(logrus.StandardLogger(), nil, typeChooser, client)
 
 	err := cq.Update()
 	c.Check(err, check.IsNil)
@@ -77,6 +87,36 @@ func (*IntegrationSuite) TestControllerBackedQueue(c *check.C) {
 	c.Check(err, check.ErrorMatches, `.*State cannot change from Complete to Cancelled.*`)
 }
 
-func testTypeChooser(ctr *arvados.Container) (arvados.InstanceType, error) {
-	return arvados.InstanceType{Name: "testType"}, nil
+func (suite *IntegrationSuite) TestCancelIfNoInstanceType(c *check.C) {
+	errorTypeChooser := func(ctr *arvados.Container) (arvados.InstanceType, error) {
+		return arvados.InstanceType{}, errors.New("no suitable instance type")
+	}
+
+	client := arvados.NewClientFromEnv()
+	cq := NewQueue(logrus.StandardLogger(), nil, errorTypeChooser, client)
+
+	var ctr arvados.Container
+	err := client.RequestAndDecode(&ctr, "GET", "arvados/v1/containers/"+arvadostest.QueuedContainerUUID, nil, nil)
+	c.Check(err, check.IsNil)
+	c.Check(ctr.State, check.Equals, arvados.ContainerStateQueued)
+
+	cq.Update()
+
+	// Wait for the cancel operation to take effect. Container
+	// will have state=Cancelled or just disappear from the queue.
+	suite.waitfor(c, time.Second, func() bool {
+		err := client.RequestAndDecode(&ctr, "GET", "arvados/v1/containers/"+arvadostest.QueuedContainerUUID, nil, nil)
+		return err == nil && ctr.State == arvados.ContainerStateCancelled
+	})
+	c.Check(ctr.RuntimeStatus["error"], check.Equals, `no suitable instance type`)
+}
+
+func (suite *IntegrationSuite) waitfor(c *check.C, timeout time.Duration, fn func() bool) {
+	defer func() {
+		c.Check(fn(), check.Equals, true)
+	}()
+	deadline := time.Now().Add(timeout)
+	for !fn() && time.Now().Before(deadline) {
+		time.Sleep(timeout / 1000)
+	}
 }
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 02a0d76de..fb095481b 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -8,21 +8,22 @@ import "time"
 
 // Container is an arvados#container resource.
 type Container struct {
-	UUID                 string               `json:"uuid"`
-	CreatedAt            time.Time            `json:"created_at"`
-	Command              []string             `json:"command"`
-	ContainerImage       string               `json:"container_image"`
-	Cwd                  string               `json:"cwd"`
-	Environment          map[string]string    `json:"environment"`
-	LockedByUUID         string               `json:"locked_by_uuid"`
-	Mounts               map[string]Mount     `json:"mounts"`
-	Output               string               `json:"output"`
-	OutputPath           string               `json:"output_path"`
-	Priority             int64                `json:"priority"`
-	RuntimeConstraints   RuntimeConstraints   `json:"runtime_constraints"`
-	State                ContainerState       `json:"state"`
-	SchedulingParameters SchedulingParameters `json:"scheduling_parameters"`
-	ExitCode             int                  `json:"exit_code"`
+	UUID                 string                 `json:"uuid"`
+	CreatedAt            time.Time              `json:"created_at"`
+	Command              []string               `json:"command"`
+	ContainerImage       string                 `json:"container_image"`
+	Cwd                  string                 `json:"cwd"`
+	Environment          map[string]string      `json:"environment"`
+	LockedByUUID         string                 `json:"locked_by_uuid"`
+	Mounts               map[string]Mount       `json:"mounts"`
+	Output               string                 `json:"output"`
+	OutputPath           string                 `json:"output_path"`
+	Priority             int64                  `json:"priority"`
+	RuntimeConstraints   RuntimeConstraints     `json:"runtime_constraints"`
+	State                ContainerState         `json:"state"`
+	SchedulingParameters SchedulingParameters   `json:"scheduling_parameters"`
+	ExitCode             int                    `json:"exit_code"`
+	RuntimeStatus        map[string]interface{} `json:"runtime_status"`
 }
 
 // Container is an arvados#container resource.

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list