[arvados] created: 2.7.0-6101-g99a1c26447

git repository hosting git at public.arvados.org
Tue Mar 5 22:40:38 UTC 2024


        at  99a1c26447c7cc06f27ce9b79690cdb64752f2ca (commit)


commit 99a1c26447c7cc06f27ce9b79690cdb64752f2ca
Author: Tom Clegg <tom at curii.com>
Date:   Tue Mar 5 17:34:13 2024 -0500

    21123: Check container status in integration tests.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/controller/integration_test.go b/lib/controller/integration_test.go
index 4bf7a03447..53e6a90b8f 100644
--- a/lib/controller/integration_test.go
+++ b/lib/controller/integration_test.go
@@ -1244,10 +1244,22 @@ func (s *IntegrationSuite) runContainer(c *check.C, clusterID string, token stri
 
 	var ctr arvados.Container
 	var lastState arvados.ContainerState
+	var status, lastStatus arvados.ContainerStatus
+	var allStatus string
+	checkstatus := func() {
+		err := ac.RequestAndDecode(&status, "GET", "/arvados/v1/container_requests/"+cr.UUID+"/container_status", nil, nil)
+		c.Assert(err, check.IsNil)
+		if status != lastStatus {
+			c.Logf("container status: %s, %s", status.State, status.SchedulingStatus)
+			allStatus += fmt.Sprintf("%s, %s\n", status.State, status.SchedulingStatus)
+			lastStatus = status
+		}
+	}
 	deadline := time.Now().Add(time.Minute)
-	for cr.State != arvados.ContainerRequestStateFinal {
+	for cr.State != arvados.ContainerRequestStateFinal || (lastStatus.State != arvados.ContainerStateComplete && lastStatus.State != arvados.ContainerStateCancelled) {
 		err = ac.RequestAndDecode(&cr, "GET", "/arvados/v1/container_requests/"+cr.UUID, nil, nil)
 		c.Assert(err, check.IsNil)
+		checkstatus()
 		err = ac.RequestAndDecode(&ctr, "GET", "/arvados/v1/containers/"+cr.ContainerUUID, nil, nil)
 		if err != nil {
 			c.Logf("error getting container state: %s", err)
@@ -1267,6 +1279,7 @@ func (s *IntegrationSuite) runContainer(c *check.C, clusterID string, token stri
 			time.Sleep(time.Second / 2)
 		}
 	}
+	checkstatus()
 	c.Logf("cr.CumulativeCost == %f", cr.CumulativeCost)
 	c.Check(cr.CumulativeCost, check.Not(check.Equals), 0.0)
 	if expectExitCode >= 0 {
@@ -1274,6 +1287,13 @@ func (s *IntegrationSuite) runContainer(c *check.C, clusterID string, token stri
 		c.Check(ctr.ExitCode, check.Equals, expectExitCode)
 		err = ac.RequestAndDecode(&outcoll, "GET", "/arvados/v1/collections/"+cr.OutputUUID, nil, nil)
 		c.Assert(err, check.IsNil)
+		c.Check(allStatus, check.Matches, `Queued, waiting for dispatch\n`+
+			`(Queued, waiting.*\n)*`+
+			`(Locked, waiting for dispatch\n)?`+
+			`(Locked, waiting for new instance to be ready\n)?`+
+			`(Locked, preparing runtime environment\n)?`+
+			`(Running, \n)?`+
+			`Complete, \n`)
 	}
 	logcfs = showlogs(cr.LogUUID)
 	checkwebdavlogs(cr)

commit 60b6f4e619f1a22405e831eda2186d61f7f1ea48
Author: Tom Clegg <tom at curii.com>
Date:   Mon Mar 4 19:02:44 2024 -0500

    21123: Add container_requests/{uuid}/container_status endpoint.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/doc/api/methods/container_requests.html.textile.liquid b/doc/api/methods/container_requests.html.textile.liquid
index c108c32808..770b56b697 100644
--- a/doc/api/methods/container_requests.html.textile.liquid
+++ b/doc/api/methods/container_requests.html.textile.liquid
@@ -224,6 +224,26 @@ Setting the priority of a committed container_request to 0 may cancel a running
 See "Canceling a container request":{{site.baseurl}}/api/methods/container_requests.html#cancel_container for further details.
 {% include 'notebox_end' %}
 
+h3(#container_status). container_status
+
+Get container status.
+
+table(table table-bordered table-condensed).
+|_. Argument |_. Type |_. Description |_. Location |
+{background:#ccffcc}.|uuid|string|The UUID of the container request in question.|path|
+
+Example request: @GET /arvados/v1/container_requests/zzzzz-xvdhp-0123456789abcde/container_status@
+
+Response attributes:
+
+table(table table-bordered table-condensed).
+|_. Attribute|_. Type|_. Description|_. Examples|
+|uuid|string|The UUID of the container assigned to this request.||
+|container_state|string|The state of the container assigned to this request (see "container resource attributes":containers.html).||
+|scheduling_status|string|A brief explanation of the container's status in the dispatch queue. Empty if scheduling is not applicable, e.g., the container is running or finished.|@waiting for cloud resources: queue position 3@
+@creating new instance@
+@preparing runtime environment@|
+
 h3(#log). log
 
 Get container log data using WebDAV methods.
diff --git a/lib/controller/federation/conn.go b/lib/controller/federation/conn.go
index c5facdc7d9..949cc56dd2 100644
--- a/lib/controller/federation/conn.go
+++ b/lib/controller/federation/conn.go
@@ -510,6 +510,10 @@ func (conn *Conn) ContainerRequestDelete(ctx context.Context, options arvados.De
 	return conn.chooseBackend(options.UUID).ContainerRequestDelete(ctx, options)
 }
 
+func (conn *Conn) ContainerRequestContainerStatus(ctx context.Context, options arvados.GetOptions) (arvados.ContainerStatus, error) {
+	return conn.chooseBackend(options.UUID).ContainerRequestContainerStatus(ctx, options)
+}
+
 func (conn *Conn) ContainerRequestLog(ctx context.Context, options arvados.ContainerLogOptions) (http.Handler, error) {
 	return conn.chooseBackend(options.UUID).ContainerRequestLog(ctx, options)
 }
diff --git a/lib/controller/localdb/container_request.go b/lib/controller/localdb/container_request.go
index 49e21840ea..0234ee8fa6 100644
--- a/lib/controller/localdb/container_request.go
+++ b/lib/controller/localdb/container_request.go
@@ -6,8 +6,15 @@ package localdb
 
 import (
 	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/url"
 
+	"git.arvados.org/arvados.git/lib/dispatchcloud/scheduler"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/auth"
+	"git.arvados.org/arvados.git/sdk/go/httpserver"
 )
 
 // ContainerRequestCreate defers to railsProxy for everything except
@@ -54,3 +61,87 @@ func (conn *Conn) ContainerRequestDelete(ctx context.Context, opts arvados.Delet
 	conn.logActivity(ctx)
 	return conn.railsProxy.ContainerRequestDelete(ctx, opts)
 }
+
+func (conn *Conn) ContainerRequestContainerStatus(ctx context.Context, opts arvados.GetOptions) (arvados.ContainerStatus, error) {
+	conn.logActivity(ctx)
+	var ret arvados.ContainerStatus
+	cr, err := conn.railsProxy.ContainerRequestGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "container_uuid", "log_uuid"}})
+	if err != nil {
+		return ret, err
+	}
+	if cr.ContainerUUID == "" {
+		ret.SchedulingStatus = "no container assigned"
+		return ret, nil
+	}
+	// We use admin credentials to get the container record so we
+	// don't get an error when we're in a race with auto-retry and
+	// the container became user-unreadable since we fetched the
+	// CR above.
+	ctxRoot := auth.NewContext(ctx, &auth.Credentials{Tokens: []string{conn.cluster.SystemRootToken}})
+	ctr, err := conn.railsProxy.ContainerGet(ctxRoot, arvados.GetOptions{UUID: cr.ContainerUUID, Select: []string{"uuid", "state", "priority"}})
+	if err != nil {
+		return ret, err
+	}
+	ret.UUID = ctr.UUID
+	ret.State = ctr.State
+	if ctr.State != arvados.ContainerStateQueued && ctr.State != arvados.ContainerStateLocked {
+		// Scheduling status is not a thing once the container
+		// is in running state.
+		return ret, nil
+	}
+	var lastErr error
+	for dispatchurl := range conn.cluster.Services.DispatchCloud.InternalURLs {
+		baseurl := url.URL(dispatchurl)
+		apiurl, err := baseurl.Parse("/arvados/v1/dispatch/container?container_uuid=" + cr.ContainerUUID)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiurl.String(), nil)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+		req.Header.Set("Authorization", "Bearer "+conn.cluster.ManagementToken)
+		resp, err := http.DefaultClient.Do(req)
+		if err != nil {
+			lastErr = fmt.Errorf("error getting status from dispatcher: %w", err)
+			continue
+		}
+		if resp.StatusCode == http.StatusNotFound {
+			continue
+		} else if resp.StatusCode != http.StatusOK {
+			lastErr = fmt.Errorf("error getting status from dispatcher: %s", resp.Status)
+			continue
+		}
+		var qent scheduler.QueueEnt
+		err = json.NewDecoder(resp.Body).Decode(&qent)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+		ret.State = qent.Container.State // Prefer dispatcher's view of state if not equal to ctr.State
+		ret.SchedulingStatus = qent.SchedulingStatus
+		return ret, nil
+	}
+	if lastErr != nil {
+		// If we got a non-nil error from a dispatchcloud
+		// service, and the container state suggests
+		// dispatchcloud should know about it, then we return
+		// an error so the client knows to retry.
+		return ret, httpserver.ErrorWithStatus(lastErr, http.StatusBadGateway)
+	}
+	// All running dispatchcloud services confirm they don't have
+	// this container (the dispatcher hasn't yet noticed it
+	// appearing in the queue) or there are no dispatchcloud
+	// services configured. Either way, all we can say is that
+	// it's queued.
+	if ctr.State == arvados.ContainerStateQueued && ctr.Priority < 1 {
+		// If it hasn't been picked up by a dispatcher
+		// already, it won't be -- it's just on hold.
+		// Scheduling status does not apply.
+		return ret, nil
+	}
+	ret.SchedulingStatus = "waiting for dispatch"
+	return ret, nil
+}
diff --git a/lib/controller/router/router.go b/lib/controller/router/router.go
index d39f493a95..054bcffaf7 100644
--- a/lib/controller/router/router.go
+++ b/lib/controller/router/router.go
@@ -318,6 +318,13 @@ func (rtr *router) addRoutes() {
 				return rtr.backend.ContainerRequestDelete(ctx, *opts.(*arvados.DeleteOptions))
 			},
 		},
+		{
+			arvados.EndpointContainerRequestContainerStatus,
+			func() interface{} { return &arvados.GetOptions{} },
+			func(ctx context.Context, opts interface{}) (interface{}, error) {
+				return rtr.backend.ContainerRequestContainerStatus(ctx, *opts.(*arvados.GetOptions))
+			},
+		},
 		{
 			arvados.EndpointContainerRequestLog,
 			func() interface{} { return &arvados.ContainerLogOptions{} },
diff --git a/lib/controller/rpc/conn.go b/lib/controller/rpc/conn.go
index 9f518d9c7a..c6be679a25 100644
--- a/lib/controller/rpc/conn.go
+++ b/lib/controller/rpc/conn.go
@@ -529,6 +529,13 @@ func (conn *Conn) ContainerRequestDelete(ctx context.Context, options arvados.De
 	return resp, err
 }
 
+func (conn *Conn) ContainerRequestContainerStatus(ctx context.Context, options arvados.GetOptions) (arvados.ContainerStatus, error) {
+	ep := arvados.EndpointContainerRequestContainerStatus
+	var resp arvados.ContainerStatus
+	err := conn.requestAndDecode(ctx, &resp, ep, nil, options)
+	return resp, err
+}
+
 func (conn *Conn) ContainerRequestLog(ctx context.Context, options arvados.ContainerLogOptions) (resp http.Handler, err error) {
 	proxy := &httputil.ReverseProxy{
 		Transport: conn.httpClient.Transport,
diff --git a/sdk/go/arvados/api.go b/sdk/go/arvados/api.go
index e7310818f7..c3d0ea8aef 100644
--- a/sdk/go/arvados/api.go
+++ b/sdk/go/arvados/api.go
@@ -23,90 +23,91 @@ type APIEndpoint struct {
 }
 
 var (
-	EndpointConfigGet                     = APIEndpoint{"GET", "arvados/v1/config", ""}
-	EndpointVocabularyGet                 = APIEndpoint{"GET", "arvados/v1/vocabulary", ""}
-	EndpointDiscoveryDocument             = APIEndpoint{"GET", "discovery/v1/apis/arvados/v1/rest", ""}
-	EndpointLogin                         = APIEndpoint{"GET", "login", ""}
-	EndpointLogout                        = APIEndpoint{"GET", "logout", ""}
-	EndpointAuthorizedKeyCreate           = APIEndpoint{"POST", "arvados/v1/authorized_keys", "authorized_key"}
-	EndpointAuthorizedKeyUpdate           = APIEndpoint{"PATCH", "arvados/v1/authorized_keys/{uuid}", "authorized_key"}
-	EndpointAuthorizedKeyGet              = APIEndpoint{"GET", "arvados/v1/authorized_keys/{uuid}", ""}
-	EndpointAuthorizedKeyList             = APIEndpoint{"GET", "arvados/v1/authorized_keys", ""}
-	EndpointAuthorizedKeyDelete           = APIEndpoint{"DELETE", "arvados/v1/authorized_keys/{uuid}", ""}
-	EndpointCollectionCreate              = APIEndpoint{"POST", "arvados/v1/collections", "collection"}
-	EndpointCollectionUpdate              = APIEndpoint{"PATCH", "arvados/v1/collections/{uuid}", "collection"}
-	EndpointCollectionGet                 = APIEndpoint{"GET", "arvados/v1/collections/{uuid}", ""}
-	EndpointCollectionList                = APIEndpoint{"GET", "arvados/v1/collections", ""}
-	EndpointCollectionProvenance          = APIEndpoint{"GET", "arvados/v1/collections/{uuid}/provenance", ""}
-	EndpointCollectionUsedBy              = APIEndpoint{"GET", "arvados/v1/collections/{uuid}/used_by", ""}
-	EndpointCollectionDelete              = APIEndpoint{"DELETE", "arvados/v1/collections/{uuid}", ""}
-	EndpointCollectionTrash               = APIEndpoint{"POST", "arvados/v1/collections/{uuid}/trash", ""}
-	EndpointCollectionUntrash             = APIEndpoint{"POST", "arvados/v1/collections/{uuid}/untrash", ""}
-	EndpointSpecimenCreate                = APIEndpoint{"POST", "arvados/v1/specimens", "specimen"}
-	EndpointSpecimenUpdate                = APIEndpoint{"PATCH", "arvados/v1/specimens/{uuid}", "specimen"}
-	EndpointSpecimenGet                   = APIEndpoint{"GET", "arvados/v1/specimens/{uuid}", ""}
-	EndpointSpecimenList                  = APIEndpoint{"GET", "arvados/v1/specimens", ""}
-	EndpointSpecimenDelete                = APIEndpoint{"DELETE", "arvados/v1/specimens/{uuid}", ""}
-	EndpointContainerCreate               = APIEndpoint{"POST", "arvados/v1/containers", "container"}
-	EndpointContainerUpdate               = APIEndpoint{"PATCH", "arvados/v1/containers/{uuid}", "container"}
-	EndpointContainerPriorityUpdate       = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/update_priority", "container"}
-	EndpointContainerGet                  = APIEndpoint{"GET", "arvados/v1/containers/{uuid}", ""}
-	EndpointContainerList                 = APIEndpoint{"GET", "arvados/v1/containers", ""}
-	EndpointContainerDelete               = APIEndpoint{"DELETE", "arvados/v1/containers/{uuid}", ""}
-	EndpointContainerLock                 = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/lock", ""}
-	EndpointContainerUnlock               = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/unlock", ""}
-	EndpointContainerSSH                  = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/ssh", ""}
-	EndpointContainerSSHCompat            = APIEndpoint{"POST", "arvados/v1/connect/{uuid}/ssh", ""} // for compatibility with arvados <2.7
-	EndpointContainerGatewayTunnel        = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/gateway_tunnel", ""}
-	EndpointContainerGatewayTunnelCompat  = APIEndpoint{"POST", "arvados/v1/connect/{uuid}/gateway_tunnel", ""} // for compatibility with arvados <2.7
-	EndpointContainerRequestCreate        = APIEndpoint{"POST", "arvados/v1/container_requests", "container_request"}
-	EndpointContainerRequestUpdate        = APIEndpoint{"PATCH", "arvados/v1/container_requests/{uuid}", "container_request"}
-	EndpointContainerRequestGet           = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}", ""}
-	EndpointContainerRequestList          = APIEndpoint{"GET", "arvados/v1/container_requests", ""}
-	EndpointContainerRequestDelete        = APIEndpoint{"DELETE", "arvados/v1/container_requests/{uuid}", ""}
-	EndpointContainerRequestLog           = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}/log{path:|/.*}", ""}
-	EndpointGroupCreate                   = APIEndpoint{"POST", "arvados/v1/groups", "group"}
-	EndpointGroupUpdate                   = APIEndpoint{"PATCH", "arvados/v1/groups/{uuid}", "group"}
-	EndpointGroupGet                      = APIEndpoint{"GET", "arvados/v1/groups/{uuid}", ""}
-	EndpointGroupList                     = APIEndpoint{"GET", "arvados/v1/groups", ""}
-	EndpointGroupContents                 = APIEndpoint{"GET", "arvados/v1/groups/contents", ""}
-	EndpointGroupContentsUUIDInPath       = APIEndpoint{"GET", "arvados/v1/groups/{uuid}/contents", ""} // Alternative HTTP route; client-side code should always use EndpointGroupContents instead
-	EndpointGroupShared                   = APIEndpoint{"GET", "arvados/v1/groups/shared", ""}
-	EndpointGroupDelete                   = APIEndpoint{"DELETE", "arvados/v1/groups/{uuid}", ""}
-	EndpointGroupTrash                    = APIEndpoint{"POST", "arvados/v1/groups/{uuid}/trash", ""}
-	EndpointGroupUntrash                  = APIEndpoint{"POST", "arvados/v1/groups/{uuid}/untrash", ""}
-	EndpointLinkCreate                    = APIEndpoint{"POST", "arvados/v1/links", "link"}
-	EndpointLinkUpdate                    = APIEndpoint{"PATCH", "arvados/v1/links/{uuid}", "link"}
-	EndpointLinkGet                       = APIEndpoint{"GET", "arvados/v1/links/{uuid}", ""}
-	EndpointLinkList                      = APIEndpoint{"GET", "arvados/v1/links", ""}
-	EndpointLinkDelete                    = APIEndpoint{"DELETE", "arvados/v1/links/{uuid}", ""}
-	EndpointLogCreate                     = APIEndpoint{"POST", "arvados/v1/logs", "log"}
-	EndpointLogUpdate                     = APIEndpoint{"PATCH", "arvados/v1/logs/{uuid}", "log"}
-	EndpointLogGet                        = APIEndpoint{"GET", "arvados/v1/logs/{uuid}", ""}
-	EndpointLogList                       = APIEndpoint{"GET", "arvados/v1/logs", ""}
-	EndpointLogDelete                     = APIEndpoint{"DELETE", "arvados/v1/logs/{uuid}", ""}
-	EndpointSysTrashSweep                 = APIEndpoint{"POST", "sys/trash_sweep", ""}
-	EndpointUserActivate                  = APIEndpoint{"POST", "arvados/v1/users/{uuid}/activate", ""}
-	EndpointUserCreate                    = APIEndpoint{"POST", "arvados/v1/users", "user"}
-	EndpointUserCurrent                   = APIEndpoint{"GET", "arvados/v1/users/current", ""}
-	EndpointUserDelete                    = APIEndpoint{"DELETE", "arvados/v1/users/{uuid}", ""}
-	EndpointUserGet                       = APIEndpoint{"GET", "arvados/v1/users/{uuid}", ""}
-	EndpointUserGetCurrent                = APIEndpoint{"GET", "arvados/v1/users/current", ""}
-	EndpointUserGetSystem                 = APIEndpoint{"GET", "arvados/v1/users/system", ""}
-	EndpointUserList                      = APIEndpoint{"GET", "arvados/v1/users", ""}
-	EndpointUserMerge                     = APIEndpoint{"POST", "arvados/v1/users/merge", ""}
-	EndpointUserSetup                     = APIEndpoint{"POST", "arvados/v1/users/setup", "user"}
-	EndpointUserSystem                    = APIEndpoint{"GET", "arvados/v1/users/system", ""}
-	EndpointUserUnsetup                   = APIEndpoint{"POST", "arvados/v1/users/{uuid}/unsetup", ""}
-	EndpointUserUpdate                    = APIEndpoint{"PATCH", "arvados/v1/users/{uuid}", "user"}
-	EndpointUserBatchUpdate               = APIEndpoint{"PATCH", "arvados/v1/users/batch_update", ""}
-	EndpointUserAuthenticate              = APIEndpoint{"POST", "arvados/v1/users/authenticate", ""}
-	EndpointAPIClientAuthorizationCurrent = APIEndpoint{"GET", "arvados/v1/api_client_authorizations/current", ""}
-	EndpointAPIClientAuthorizationCreate  = APIEndpoint{"POST", "arvados/v1/api_client_authorizations", "api_client_authorization"}
-	EndpointAPIClientAuthorizationUpdate  = APIEndpoint{"PUT", "arvados/v1/api_client_authorizations/{uuid}", "api_client_authorization"}
-	EndpointAPIClientAuthorizationList    = APIEndpoint{"GET", "arvados/v1/api_client_authorizations", ""}
-	EndpointAPIClientAuthorizationDelete  = APIEndpoint{"DELETE", "arvados/v1/api_client_authorizations/{uuid}", ""}
-	EndpointAPIClientAuthorizationGet     = APIEndpoint{"GET", "arvados/v1/api_client_authorizations/{uuid}", ""}
+	EndpointConfigGet                       = APIEndpoint{"GET", "arvados/v1/config", ""}
+	EndpointVocabularyGet                   = APIEndpoint{"GET", "arvados/v1/vocabulary", ""}
+	EndpointDiscoveryDocument               = APIEndpoint{"GET", "discovery/v1/apis/arvados/v1/rest", ""}
+	EndpointLogin                           = APIEndpoint{"GET", "login", ""}
+	EndpointLogout                          = APIEndpoint{"GET", "logout", ""}
+	EndpointAuthorizedKeyCreate             = APIEndpoint{"POST", "arvados/v1/authorized_keys", "authorized_key"}
+	EndpointAuthorizedKeyUpdate             = APIEndpoint{"PATCH", "arvados/v1/authorized_keys/{uuid}", "authorized_key"}
+	EndpointAuthorizedKeyGet                = APIEndpoint{"GET", "arvados/v1/authorized_keys/{uuid}", ""}
+	EndpointAuthorizedKeyList               = APIEndpoint{"GET", "arvados/v1/authorized_keys", ""}
+	EndpointAuthorizedKeyDelete             = APIEndpoint{"DELETE", "arvados/v1/authorized_keys/{uuid}", ""}
+	EndpointCollectionCreate                = APIEndpoint{"POST", "arvados/v1/collections", "collection"}
+	EndpointCollectionUpdate                = APIEndpoint{"PATCH", "arvados/v1/collections/{uuid}", "collection"}
+	EndpointCollectionGet                   = APIEndpoint{"GET", "arvados/v1/collections/{uuid}", ""}
+	EndpointCollectionList                  = APIEndpoint{"GET", "arvados/v1/collections", ""}
+	EndpointCollectionProvenance            = APIEndpoint{"GET", "arvados/v1/collections/{uuid}/provenance", ""}
+	EndpointCollectionUsedBy                = APIEndpoint{"GET", "arvados/v1/collections/{uuid}/used_by", ""}
+	EndpointCollectionDelete                = APIEndpoint{"DELETE", "arvados/v1/collections/{uuid}", ""}
+	EndpointCollectionTrash                 = APIEndpoint{"POST", "arvados/v1/collections/{uuid}/trash", ""}
+	EndpointCollectionUntrash               = APIEndpoint{"POST", "arvados/v1/collections/{uuid}/untrash", ""}
+	EndpointSpecimenCreate                  = APIEndpoint{"POST", "arvados/v1/specimens", "specimen"}
+	EndpointSpecimenUpdate                  = APIEndpoint{"PATCH", "arvados/v1/specimens/{uuid}", "specimen"}
+	EndpointSpecimenGet                     = APIEndpoint{"GET", "arvados/v1/specimens/{uuid}", ""}
+	EndpointSpecimenList                    = APIEndpoint{"GET", "arvados/v1/specimens", ""}
+	EndpointSpecimenDelete                  = APIEndpoint{"DELETE", "arvados/v1/specimens/{uuid}", ""}
+	EndpointContainerCreate                 = APIEndpoint{"POST", "arvados/v1/containers", "container"}
+	EndpointContainerUpdate                 = APIEndpoint{"PATCH", "arvados/v1/containers/{uuid}", "container"}
+	EndpointContainerPriorityUpdate         = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/update_priority", "container"}
+	EndpointContainerGet                    = APIEndpoint{"GET", "arvados/v1/containers/{uuid}", ""}
+	EndpointContainerList                   = APIEndpoint{"GET", "arvados/v1/containers", ""}
+	EndpointContainerDelete                 = APIEndpoint{"DELETE", "arvados/v1/containers/{uuid}", ""}
+	EndpointContainerLock                   = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/lock", ""}
+	EndpointContainerUnlock                 = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/unlock", ""}
+	EndpointContainerSSH                    = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/ssh", ""}
+	EndpointContainerSSHCompat              = APIEndpoint{"POST", "arvados/v1/connect/{uuid}/ssh", ""} // for compatibility with arvados <2.7
+	EndpointContainerGatewayTunnel          = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/gateway_tunnel", ""}
+	EndpointContainerGatewayTunnelCompat    = APIEndpoint{"POST", "arvados/v1/connect/{uuid}/gateway_tunnel", ""} // for compatibility with arvados <2.7
+	EndpointContainerRequestCreate          = APIEndpoint{"POST", "arvados/v1/container_requests", "container_request"}
+	EndpointContainerRequestUpdate          = APIEndpoint{"PATCH", "arvados/v1/container_requests/{uuid}", "container_request"}
+	EndpointContainerRequestGet             = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}", ""}
+	EndpointContainerRequestList            = APIEndpoint{"GET", "arvados/v1/container_requests", ""}
+	EndpointContainerRequestDelete          = APIEndpoint{"DELETE", "arvados/v1/container_requests/{uuid}", ""}
+	EndpointContainerRequestContainerStatus = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}/container_status", ""}
+	EndpointContainerRequestLog             = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}/log{path:|/.*}", ""}
+	EndpointGroupCreate                     = APIEndpoint{"POST", "arvados/v1/groups", "group"}
+	EndpointGroupUpdate                     = APIEndpoint{"PATCH", "arvados/v1/groups/{uuid}", "group"}
+	EndpointGroupGet                        = APIEndpoint{"GET", "arvados/v1/groups/{uuid}", ""}
+	EndpointGroupList                       = APIEndpoint{"GET", "arvados/v1/groups", ""}
+	EndpointGroupContents                   = APIEndpoint{"GET", "arvados/v1/groups/contents", ""}
+	EndpointGroupContentsUUIDInPath         = APIEndpoint{"GET", "arvados/v1/groups/{uuid}/contents", ""} // Alternative HTTP route; client-side code should always use EndpointGroupContents instead
+	EndpointGroupShared                     = APIEndpoint{"GET", "arvados/v1/groups/shared", ""}
+	EndpointGroupDelete                     = APIEndpoint{"DELETE", "arvados/v1/groups/{uuid}", ""}
+	EndpointGroupTrash                      = APIEndpoint{"POST", "arvados/v1/groups/{uuid}/trash", ""}
+	EndpointGroupUntrash                    = APIEndpoint{"POST", "arvados/v1/groups/{uuid}/untrash", ""}
+	EndpointLinkCreate                      = APIEndpoint{"POST", "arvados/v1/links", "link"}
+	EndpointLinkUpdate                      = APIEndpoint{"PATCH", "arvados/v1/links/{uuid}", "link"}
+	EndpointLinkGet                         = APIEndpoint{"GET", "arvados/v1/links/{uuid}", ""}
+	EndpointLinkList                        = APIEndpoint{"GET", "arvados/v1/links", ""}
+	EndpointLinkDelete                      = APIEndpoint{"DELETE", "arvados/v1/links/{uuid}", ""}
+	EndpointLogCreate                       = APIEndpoint{"POST", "arvados/v1/logs", "log"}
+	EndpointLogUpdate                       = APIEndpoint{"PATCH", "arvados/v1/logs/{uuid}", "log"}
+	EndpointLogGet                          = APIEndpoint{"GET", "arvados/v1/logs/{uuid}", ""}
+	EndpointLogList                         = APIEndpoint{"GET", "arvados/v1/logs", ""}
+	EndpointLogDelete                       = APIEndpoint{"DELETE", "arvados/v1/logs/{uuid}", ""}
+	EndpointSysTrashSweep                   = APIEndpoint{"POST", "sys/trash_sweep", ""}
+	EndpointUserActivate                    = APIEndpoint{"POST", "arvados/v1/users/{uuid}/activate", ""}
+	EndpointUserCreate                      = APIEndpoint{"POST", "arvados/v1/users", "user"}
+	EndpointUserCurrent                     = APIEndpoint{"GET", "arvados/v1/users/current", ""}
+	EndpointUserDelete                      = APIEndpoint{"DELETE", "arvados/v1/users/{uuid}", ""}
+	EndpointUserGet                         = APIEndpoint{"GET", "arvados/v1/users/{uuid}", ""}
+	EndpointUserGetCurrent                  = APIEndpoint{"GET", "arvados/v1/users/current", ""}
+	EndpointUserGetSystem                   = APIEndpoint{"GET", "arvados/v1/users/system", ""}
+	EndpointUserList                        = APIEndpoint{"GET", "arvados/v1/users", ""}
+	EndpointUserMerge                       = APIEndpoint{"POST", "arvados/v1/users/merge", ""}
+	EndpointUserSetup                       = APIEndpoint{"POST", "arvados/v1/users/setup", "user"}
+	EndpointUserSystem                      = APIEndpoint{"GET", "arvados/v1/users/system", ""}
+	EndpointUserUnsetup                     = APIEndpoint{"POST", "arvados/v1/users/{uuid}/unsetup", ""}
+	EndpointUserUpdate                      = APIEndpoint{"PATCH", "arvados/v1/users/{uuid}", "user"}
+	EndpointUserBatchUpdate                 = APIEndpoint{"PATCH", "arvados/v1/users/batch_update", ""}
+	EndpointUserAuthenticate                = APIEndpoint{"POST", "arvados/v1/users/authenticate", ""}
+	EndpointAPIClientAuthorizationCurrent   = APIEndpoint{"GET", "arvados/v1/api_client_authorizations/current", ""}
+	EndpointAPIClientAuthorizationCreate    = APIEndpoint{"POST", "arvados/v1/api_client_authorizations", "api_client_authorization"}
+	EndpointAPIClientAuthorizationUpdate    = APIEndpoint{"PUT", "arvados/v1/api_client_authorizations/{uuid}", "api_client_authorization"}
+	EndpointAPIClientAuthorizationList      = APIEndpoint{"GET", "arvados/v1/api_client_authorizations", ""}
+	EndpointAPIClientAuthorizationDelete    = APIEndpoint{"DELETE", "arvados/v1/api_client_authorizations/{uuid}", ""}
+	EndpointAPIClientAuthorizationGet       = APIEndpoint{"GET", "arvados/v1/api_client_authorizations/{uuid}", ""}
 )
 
 type ContainerSSHOptions struct {
@@ -310,6 +311,7 @@ type API interface {
 	ContainerRequestGet(ctx context.Context, options GetOptions) (ContainerRequest, error)
 	ContainerRequestList(ctx context.Context, options ListOptions) (ContainerRequestList, error)
 	ContainerRequestDelete(ctx context.Context, options DeleteOptions) (ContainerRequest, error)
+	ContainerRequestContainerStatus(ctx context.Context, options GetOptions) (ContainerStatus, error)
 	ContainerRequestLog(ctx context.Context, options ContainerLogOptions) (http.Handler, error)
 	GroupCreate(ctx context.Context, options CreateOptions) (Group, error)
 	GroupUpdate(ctx context.Context, options UpdateOptions) (Group, error)
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 2467e807a1..91c8fbfe29 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -160,3 +160,9 @@ const (
 	ContainerRequestStateCommitted  = ContainerRequestState("Committed")
 	ContainerRequestStateFinal      = ContainerRequestState("Final")
 )
+
+type ContainerStatus struct {
+	UUID             string         `json:"uuid"`
+	State            ContainerState `json:"container_state"`
+	SchedulingStatus string         `json:"scheduling_status"`
+}
diff --git a/sdk/go/arvadostest/api.go b/sdk/go/arvadostest/api.go
index 3ba794380f..e1827b5d1f 100644
--- a/sdk/go/arvadostest/api.go
+++ b/sdk/go/arvadostest/api.go
@@ -168,6 +168,10 @@ func (as *APIStub) ContainerRequestDelete(ctx context.Context, options arvados.D
 	as.appendCall(ctx, as.ContainerRequestDelete, options)
 	return arvados.ContainerRequest{}, as.Error
 }
+func (as *APIStub) ContainerRequestContainerStatus(ctx context.Context, options arvados.GetOptions) (arvados.ContainerStatus, error) {
+	as.appendCall(ctx, as.ContainerRequestContainerStatus, options)
+	return arvados.ContainerStatus{}, as.Error
+}
 func (as *APIStub) ContainerRequestLog(ctx context.Context, options arvados.ContainerLogOptions) (http.Handler, error) {
 	as.appendCall(ctx, as.ContainerRequestLog, options)
 	// Return a handler that responds with the configured
diff --git a/services/api/config/routes.rb b/services/api/config/routes.rb
index 87e2737575..b87e86f664 100644
--- a/services/api/config/routes.rb
+++ b/services/api/config/routes.rb
@@ -44,7 +44,9 @@ Rails.application.routes.draw do
         get 'secret_mounts', on: :member
         get 'current', on: :collection
       end
-      resources :container_requests
+      resources :container_requests do
+        get 'container_status', on: :member
+      end
       resources :jobs do
         get 'queue', on: :collection
         get 'queue_size', on: :collection

commit 7345838cb097f11e2ba8239020762ae867591510
Author: Tom Clegg <tom at curii.com>
Date:   Fri Mar 1 16:00:09 2024 -0500

    21123: Add container status API to cloud dispatcher.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/doc/api/dispatch.html.textile.liquid b/doc/api/dispatch.html.textile.liquid
index b06136db9a..488545c7d4 100644
--- a/doc/api/dispatch.html.textile.liquid
+++ b/doc/api/dispatch.html.textile.liquid
@@ -32,6 +32,7 @@ Return a list of containers that are either ready to dispatch, or being started/
 Each entry in the returned list of @items@ includes:
 * an @instance_type@ entry with the name and attributes of the instance type that will be used to schedule the container (chosen from the @InstanceTypes@ section of your cluster config file); and
 * a @container@ entry with selected attributes of the container itself, including @uuid@, @priority@, @runtime_constraints@, and @state@. Other fields of the container records are not loaded by the dispatcher, and will have empty/zero values here (e.g., @{...,"created_at":"0001-01-01T00:00:00Z","command":[],...}@).
+* a @scheduling_status@ entry: a brief explanation of the container's status in the dispatch queue, or empty if scheduling is not applicable, e.g., the container has already started running.
 
 Example response:
 
@@ -56,12 +57,31 @@ Example response:
         "AddedScratch": 0,
         "Price": 0.146,
         "Preemptible": false
-      }
+      },
+      "scheduling_status": "waiting for new instance to be ready"
     },
     ...
   ]
 }</pre></notextile>
 
+h3. Get specified container
+
+@GET /arvados/v1/dispatch/container?container_uuid={uuid}@
+
+Return the same information as "list containers" above, but for a single specified container.
+
+Example response:
+
+<notextile><pre>{
+  "container": {
+    ...
+  },
+  "instance_type": {
+    ...
+  },
+  "scheduling_status": "waiting for new instance to be ready"
+}</pre></notextile>
+
 h3. Terminate a container
 
 @POST /arvados/v1/dispatch/containers/kill?container_uuid={uuid}&reason={string}@
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index 47e60abdee..611d13306f 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -61,14 +61,22 @@ type dispatcher struct {
 	instanceSet cloud.InstanceSet
 	pool        pool
 	queue       scheduler.ContainerQueue
+	sched       *scheduler.Scheduler
 	httpHandler http.Handler
 	sshKey      ssh.Signer
 
 	setupOnce sync.Once
 	stop      chan struct{}
 	stopped   chan struct{}
+
+	sQueueMtx       sync.Mutex
+	sQueueRefreshed time.Time
+	sQueue          []scheduler.QueueEnt
+	sQueueMap       map[string]scheduler.QueueEnt
 }
 
+var sQueueRefresh = time.Second
+
 // Start starts the dispatcher. Start can be called multiple times
 // with no ill effect.
 func (disp *dispatcher) Start() {
@@ -155,7 +163,22 @@ func (disp *dispatcher) initialize() {
 	dblock.Dispatch.Lock(disp.Context, disp.dbConnector.GetDB)
 	disp.instanceSet = instanceSet
 	disp.pool = worker.NewPool(disp.logger, disp.ArvClient, disp.Registry, disp.InstanceSetID, disp.instanceSet, disp.newExecutor, installPublicKey, disp.Cluster)
-	disp.queue = container.NewQueue(disp.logger, disp.Registry, disp.typeChooser, disp.ArvClient)
+	if disp.queue == nil {
+		disp.queue = container.NewQueue(disp.logger, disp.Registry, disp.typeChooser, disp.ArvClient)
+	}
+
+	staleLockTimeout := time.Duration(disp.Cluster.Containers.StaleLockTimeout)
+	if staleLockTimeout == 0 {
+		staleLockTimeout = defaultStaleLockTimeout
+	}
+	pollInterval := time.Duration(disp.Cluster.Containers.CloudVMs.PollInterval)
+	if pollInterval <= 0 {
+		pollInterval = defaultPollInterval
+	}
+	disp.sched = scheduler.New(disp.Context, disp.ArvClient, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval,
+		disp.Cluster.Containers.CloudVMs.InitialQuotaEstimate,
+		disp.Cluster.Containers.CloudVMs.MaxInstances,
+		disp.Cluster.Containers.CloudVMs.SupervisorFraction)
 
 	if disp.Cluster.ManagementToken == "" {
 		disp.httpHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -164,6 +187,7 @@ func (disp *dispatcher) initialize() {
 	} else {
 		mux := httprouter.New()
 		mux.HandlerFunc("GET", "/arvados/v1/dispatch/containers", disp.apiContainers)
+		mux.HandlerFunc("GET", "/arvados/v1/dispatch/container", disp.apiContainer)
 		mux.HandlerFunc("POST", "/arvados/v1/dispatch/containers/kill", disp.apiContainerKill)
 		mux.HandlerFunc("GET", "/arvados/v1/dispatch/instances", disp.apiInstances)
 		mux.HandlerFunc("POST", "/arvados/v1/dispatch/instances/hold", disp.apiInstanceHold)
@@ -190,36 +214,53 @@ func (disp *dispatcher) run() {
 	defer disp.instanceSet.Stop()
 	defer disp.pool.Stop()
 
-	staleLockTimeout := time.Duration(disp.Cluster.Containers.StaleLockTimeout)
-	if staleLockTimeout == 0 {
-		staleLockTimeout = defaultStaleLockTimeout
-	}
-	pollInterval := time.Duration(disp.Cluster.Containers.CloudVMs.PollInterval)
-	if pollInterval <= 0 {
-		pollInterval = defaultPollInterval
-	}
-	sched := scheduler.New(disp.Context, disp.ArvClient, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval,
-		disp.Cluster.Containers.CloudVMs.InitialQuotaEstimate,
-		disp.Cluster.Containers.CloudVMs.MaxInstances,
-		disp.Cluster.Containers.CloudVMs.SupervisorFraction)
-	sched.Start()
-	defer sched.Stop()
+	disp.sched.Start()
+	defer disp.sched.Stop()
 
 	<-disp.stop
 }
 
-// Management API: all active and queued containers.
+// Get a snapshot of the scheduler's queue, no older than
+// sQueueRefresh.
+//
+// First return value is in the sorted order used by the scheduler.
+// Second return value is a map of the same entries, for efficiently
+// looking up a single container.
+func (disp *dispatcher) sQueueCurrent() ([]scheduler.QueueEnt, map[string]scheduler.QueueEnt) {
+	disp.sQueueMtx.Lock()
+	defer disp.sQueueMtx.Unlock()
+	if time.Since(disp.sQueueRefreshed) > sQueueRefresh {
+		disp.sQueue = disp.sched.Queue()
+		disp.sQueueMap = make(map[string]scheduler.QueueEnt)
+		for _, ent := range disp.sQueue {
+			disp.sQueueMap[ent.Container.UUID] = ent
+		}
+		disp.sQueueRefreshed = time.Now()
+	}
+	return disp.sQueue, disp.sQueueMap
+}
+
+// Management API: scheduling queue entries for all active and queued
+// containers.
 func (disp *dispatcher) apiContainers(w http.ResponseWriter, r *http.Request) {
 	var resp struct {
-		Items []container.QueueEnt `json:"items"`
-	}
-	qEntries, _ := disp.queue.Entries()
-	for _, ent := range qEntries {
-		resp.Items = append(resp.Items, ent)
+		Items []scheduler.QueueEnt `json:"items"`
 	}
+	resp.Items, _ = disp.sQueueCurrent()
 	json.NewEncoder(w).Encode(resp)
 }
 
+// Management API: scheduling queue entry for a specified container.
+func (disp *dispatcher) apiContainer(w http.ResponseWriter, r *http.Request) {
+	_, sq := disp.sQueueCurrent()
+	ent, ok := sq[r.FormValue("container_uuid")]
+	if !ok {
+		httpserver.Error(w, "container not found", http.StatusNotFound)
+		return
+	}
+	json.NewEncoder(w).Encode(ent)
+}
+
 // Management API: all active instances (cloud VMs).
 func (disp *dispatcher) apiInstances(w http.ResponseWriter, r *http.Request) {
 	var resp struct {
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 20185554b8..e7465d65b1 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -8,12 +8,14 @@ import (
 	"context"
 	"crypto/tls"
 	"encoding/json"
+	"fmt"
 	"io/ioutil"
 	"math/rand"
 	"net/http"
 	"net/http/httptest"
 	"net/url"
 	"os"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -159,7 +161,6 @@ func (s *DispatcherSuite) arvClientProxy(c *check.C) func(*http.Request) (*url.U
 // artificial errors in order to exercise a variety of code paths.
 func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 	Drivers["test"] = s.stubDriver
-	s.disp.setupOnce.Do(s.disp.initialize)
 	queue := &test.Queue{
 		MaxDispatchAttempts: 5,
 		ChooseType: func(ctr *arvados.Container) ([]arvados.InstanceType, error) {
@@ -179,6 +180,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		})
 	}
 	s.disp.queue = queue
+	s.disp.setupOnce.Do(s.disp.initialize)
 
 	var mtx sync.Mutex
 	done := make(chan struct{})
@@ -323,7 +325,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 	c.Check(resp.Body.String(), check.Matches, `(?ms).*max_concurrent_containers [1-9][0-9e+.]*`)
 }
 
-func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
+func (s *DispatcherSuite) TestManagementAPI_Permissions(c *check.C) {
 	s.cluster.ManagementToken = "abcdefgh"
 	Drivers["test"] = s.stubDriver
 	s.disp.setupOnce.Do(s.disp.initialize)
@@ -345,7 +347,7 @@ func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
 	}
 }
 
-func (s *DispatcherSuite) TestAPIDisabled(c *check.C) {
+func (s *DispatcherSuite) TestManagementAPI_Disabled(c *check.C) {
 	s.cluster.ManagementToken = ""
 	Drivers["test"] = s.stubDriver
 	s.disp.setupOnce.Do(s.disp.initialize)
@@ -363,13 +365,122 @@ func (s *DispatcherSuite) TestAPIDisabled(c *check.C) {
 	}
 }
 
-func (s *DispatcherSuite) TestInstancesAPI(c *check.C) {
+func (s *DispatcherSuite) TestManagementAPI_Containers(c *check.C) {
+	s.cluster.ManagementToken = "abcdefgh"
+	s.cluster.Containers.CloudVMs.InitialQuotaEstimate = 4
+	Drivers["test"] = s.stubDriver
+	queue := &test.Queue{
+		MaxDispatchAttempts: 5,
+		ChooseType: func(ctr *arvados.Container) ([]arvados.InstanceType, error) {
+			return ChooseInstanceType(s.cluster, ctr)
+		},
+		Logger: ctxlog.TestLogger(c),
+	}
+	s.stubDriver.Queue = queue
+	s.stubDriver.QuotaMaxInstances = 4
+	s.stubDriver.SetupVM = func(stubvm *test.StubVM) error {
+		if stubvm.Instance().ProviderType() >= test.InstanceType(4).ProviderType {
+			return test.CapacityError{InstanceTypeSpecific: true}
+		}
+		stubvm.ExecuteContainer = func(ctr arvados.Container) int {
+			time.Sleep(5 * time.Second)
+			return 0
+		}
+		return nil
+	}
+	s.disp.queue = queue
+	s.disp.setupOnce.Do(s.disp.initialize)
+
+	go s.disp.run()
+
+	type queueEnt struct {
+		Container        arvados.Container
+		InstanceType     arvados.InstanceType `json:"instance_type"`
+		SchedulingStatus string               `json:"scheduling_status"`
+	}
+	type containersResponse struct {
+		Items []queueEnt
+	}
+	getContainers := func() containersResponse {
+		sQueueRefresh = time.Millisecond
+		req := httptest.NewRequest("GET", "/arvados/v1/dispatch/containers", nil)
+		req.Header.Set("Authorization", "Bearer abcdefgh")
+		resp := httptest.NewRecorder()
+		s.disp.ServeHTTP(resp, req)
+		var cresp containersResponse
+		c.Check(resp.Code, check.Equals, http.StatusOK)
+		err := json.Unmarshal(resp.Body.Bytes(), &cresp)
+		c.Check(err, check.IsNil)
+		return cresp
+	}
+
+	c.Check(getContainers().Items, check.HasLen, 0)
+
+	for i := 0; i < 20; i++ {
+		queue.Containers = append(queue.Containers, arvados.Container{
+			UUID:     test.ContainerUUID(i),
+			State:    arvados.ContainerStateQueued,
+			Priority: int64(100 - i),
+			RuntimeConstraints: arvados.RuntimeConstraints{
+				RAM:   int64(i%3+1) << 30,
+				VCPUs: i%8 + 1,
+			},
+		})
+	}
+	queue.Update()
+
+	expect := `
+ 0 zzzzz-dz642-000000000000000 (Running) ""
+ 1 zzzzz-dz642-000000000000001 (Running) ""
+ 2 zzzzz-dz642-000000000000002 (Locked) "waiting for suitable instance type to become available: queue position 1"
+ 3 zzzzz-dz642-000000000000003 (Locked) "waiting for suitable instance type to become available: queue position 2"
+ 4 zzzzz-dz642-000000000000004 (Queued) "waiting while cluster is running at capacity: queue position 3"
+ 5 zzzzz-dz642-000000000000005 (Queued) "waiting while cluster is running at capacity: queue position 4"
+ 6 zzzzz-dz642-000000000000006 (Queued) "waiting while cluster is running at capacity: queue position 5"
+ 7 zzzzz-dz642-000000000000007 (Queued) "waiting while cluster is running at capacity: queue position 6"
+ 8 zzzzz-dz642-000000000000008 (Queued) "waiting while cluster is running at capacity: queue position 7"
+ 9 zzzzz-dz642-000000000000009 (Queued) "waiting while cluster is running at capacity: queue position 8"
+ 10 zzzzz-dz642-000000000000010 (Queued) "waiting while cluster is running at capacity: queue position 9"
+ 11 zzzzz-dz642-000000000000011 (Queued) "waiting while cluster is running at capacity: queue position 10"
+ 12 zzzzz-dz642-000000000000012 (Queued) "waiting while cluster is running at capacity: queue position 11"
+ 13 zzzzz-dz642-000000000000013 (Queued) "waiting while cluster is running at capacity: queue position 12"
+ 14 zzzzz-dz642-000000000000014 (Queued) "waiting while cluster is running at capacity: queue position 13"
+ 15 zzzzz-dz642-000000000000015 (Queued) "waiting while cluster is running at capacity: queue position 14"
+ 16 zzzzz-dz642-000000000000016 (Queued) "waiting while cluster is running at capacity: queue position 15"
+ 17 zzzzz-dz642-000000000000017 (Queued) "waiting while cluster is running at capacity: queue position 16"
+ 18 zzzzz-dz642-000000000000018 (Queued) "waiting while cluster is running at capacity: queue position 17"
+ 19 zzzzz-dz642-000000000000019 (Queued) "waiting while cluster is running at capacity: queue position 18"
+`
+	sequence := make(map[string][]string)
+	var summary string
+	for deadline := time.Now().Add(time.Second); time.Now().Before(deadline); time.Sleep(time.Millisecond) {
+		cresp := getContainers()
+		summary = "\n"
+		for i, ent := range cresp.Items {
+			summary += fmt.Sprintf("% 2d %s (%s) %q\n", i, ent.Container.UUID, ent.Container.State, ent.SchedulingStatus)
+			s := sequence[ent.Container.UUID]
+			if len(s) == 0 || s[len(s)-1] != ent.SchedulingStatus {
+				sequence[ent.Container.UUID] = append(s, ent.SchedulingStatus)
+			}
+		}
+		if summary == expect {
+			break
+		}
+	}
+	c.Check(summary, check.Equals, expect)
+	for i := 0; i < 5; i++ {
+		c.Logf("sequence for container %d:\n... %s", i, strings.Join(sequence[test.ContainerUUID(i)], "\n... "))
+	}
+}
+
+func (s *DispatcherSuite) TestManagementAPI_Instances(c *check.C) {
 	s.cluster.ManagementToken = "abcdefgh"
 	s.cluster.Containers.CloudVMs.TimeoutBooting = arvados.Duration(time.Second)
 	Drivers["test"] = s.stubDriver
 	s.disp.setupOnce.Do(s.disp.initialize)
 	s.disp.queue = &test.Queue{}
 	go s.disp.run()
+	defer s.disp.Close()
 
 	type instance struct {
 		Instance             string
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 03fa592777..2f4bce8987 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -5,6 +5,7 @@
 package scheduler
 
 import (
+	"fmt"
 	"sort"
 	"time"
 
@@ -15,6 +16,20 @@ import (
 
 var quietAfter503 = time.Minute
 
+type QueueEnt struct {
+	container.QueueEnt
+
+	// Human-readable scheduling status as of the last scheduling
+	// iteration.
+	SchedulingStatus string `json:"scheduling_status"`
+}
+
+// Queue returns the sorted queue from the last scheduling iteration.
+func (sch *Scheduler) Queue() []QueueEnt {
+	ents, _ := sch.lastQueue.Load().([]QueueEnt)
+	return ents
+}
+
 func (sch *Scheduler) runQueue() {
 	running := sch.pool.Running()
 	unalloc := sch.pool.Unallocated()
@@ -25,9 +40,9 @@ func (sch *Scheduler) runQueue() {
 	}
 
 	unsorted, _ := sch.queue.Entries()
-	sorted := make([]container.QueueEnt, 0, len(unsorted))
+	sorted := make([]QueueEnt, 0, len(unsorted))
 	for _, ent := range unsorted {
-		sorted = append(sorted, ent)
+		sorted = append(sorted, QueueEnt{QueueEnt: ent})
 	}
 	sort.Slice(sorted, func(i, j int) bool {
 		_, irunning := running[sorted[i].Container.UUID]
@@ -149,9 +164,9 @@ func (sch *Scheduler) runQueue() {
 	}).Debug("runQueue")
 
 	dontstart := map[arvados.InstanceType]bool{}
-	var atcapacity = map[string]bool{}    // ProviderTypes reported as AtCapacity during this runQueue() invocation
-	var overquota []container.QueueEnt    // entries that are unmappable because of worker pool quota
-	var overmaxsuper []container.QueueEnt // unmappable because max supervisors (these are not included in overquota)
+	var atcapacity = map[string]bool{} // ProviderTypes reported as AtCapacity during this runQueue() invocation
+	var overquota []QueueEnt           // entries that are unmappable because of worker pool quota
+	var overmaxsuper []QueueEnt        // unmappable because max supervisors (these are not included in overquota)
 	var containerAllocatedWorkerBootingCount int
 
 	// trying is #containers running + #containers we're trying to
@@ -159,6 +174,7 @@ func (sch *Scheduler) runQueue() {
 	// reaches the dynamic maxConcurrency limit.
 	trying := len(running)
 
+	qpos := 0
 	supervisors := 0
 
 tryrun:
@@ -169,12 +185,20 @@ tryrun:
 		})
 		if ctr.SchedulingParameters.Supervisor {
 			supervisors += 1
-			if maxSupervisors > 0 && supervisors > maxSupervisors {
-				overmaxsuper = append(overmaxsuper, sorted[i])
-				continue
+		}
+		if _, running := running[ctr.UUID]; running {
+			if ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked {
+				sorted[i].SchedulingStatus = "preparing runtime environment"
 			}
+			continue
 		}
-		if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
+		if ctr.Priority < 1 {
+			sorted[i].SchedulingStatus = "not scheduling: priority 0, state " + string(ctr.State)
+			continue
+		}
+		if ctr.SchedulingParameters.Supervisor && maxSupervisors > 0 && supervisors > maxSupervisors {
+			overmaxsuper = append(overmaxsuper, sorted[i])
+			sorted[i].SchedulingStatus = "not starting: supervisor container limit has been reached"
 			continue
 		}
 		// If we have unalloc instances of any of the eligible
@@ -214,7 +238,7 @@ tryrun:
 			}
 			trying++
 			if !unallocOK && sch.pool.AtQuota() {
-				logger.Trace("not locking: AtQuota and no unalloc workers")
+				logger.Trace("not starting: AtQuota and no unalloc workers")
 				overquota = sorted[i:]
 				break tryrun
 			}
@@ -246,10 +270,13 @@ tryrun:
 					// same instance type. Don't let this
 					// one sneak in ahead of it.
 				} else if sch.pool.KillContainer(ctr.UUID, "about to start") {
+					sorted[i].SchedulingStatus = "waiting for previous attempt to exit"
 					logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
 				} else if sch.pool.StartContainer(unallocType, ctr) {
+					sorted[i].SchedulingStatus = "preparing runtime environment"
 					logger.Trace("StartContainer => true")
 				} else {
+					sorted[i].SchedulingStatus = "waiting for new instance to be ready"
 					logger.Trace("StartContainer => false")
 					containerAllocatedWorkerBootingCount += 1
 					dontstart[unallocType] = true
@@ -279,6 +306,8 @@ tryrun:
 				// container A on the next call to
 				// runQueue(), rather than run
 				// container B now.
+				qpos++
+				sorted[i].SchedulingStatus = fmt.Sprintf("waiting for suitable instance type to become available: queue position %d", qpos)
 				logger.Trace("all eligible types at capacity")
 				continue
 			}
@@ -293,6 +322,7 @@ tryrun:
 			// asynchronously and does its own logging
 			// about the eventual outcome, so we don't
 			// need to.)
+			sorted[i].SchedulingStatus = "waiting for new instance to be ready"
 			logger.Info("creating new instance")
 			// Don't bother trying to start the container
 			// yet -- obviously the instance will take
@@ -305,12 +335,26 @@ tryrun:
 	sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
 	sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota) + len(overmaxsuper)))
 
+	var qreason string
+	if sch.pool.AtQuota() {
+		qreason = "waiting for cloud resources"
+	} else {
+		qreason = "waiting while cluster is running at capacity"
+	}
+	for i, ent := range sorted {
+		if ent.SchedulingStatus == "" && (ent.Container.State == arvados.ContainerStateQueued || ent.Container.State == arvados.ContainerStateLocked) {
+			qpos++
+			sorted[i].SchedulingStatus = fmt.Sprintf("%s: queue position %d", qreason, qpos)
+		}
+	}
+	sch.lastQueue.Store(sorted)
+
 	if len(overquota)+len(overmaxsuper) > 0 {
 		// Unlock any containers that are unmappable while
 		// we're at quota (but if they have already been
 		// scheduled and they're loading docker images etc.,
 		// let them run).
-		var unlock []container.QueueEnt
+		var unlock []QueueEnt
 		unlock = append(unlock, overmaxsuper...)
 		if totalInstances > 0 && len(overquota) > 1 {
 			// We don't unlock the next-in-line container
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index ee7ab50883..bc6574a21a 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -9,6 +9,7 @@ package scheduler
 import (
 	"context"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"git.arvados.org/arvados.git/sdk/go/arvados"
@@ -57,6 +58,8 @@ type Scheduler struct {
 	mLongestWaitTimeSinceQueue       prometheus.Gauge
 	mLast503Time                     prometheus.Gauge
 	mMaxContainerConcurrency         prometheus.Gauge
+
+	lastQueue atomic.Value // stores a []QueueEnt
 }
 
 // New returns a new unstarted Scheduler.

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list