[arvados] created: 2.7.0-6101-g99a1c26447
git repository hosting
git at public.arvados.org
Tue Mar 5 22:40:38 UTC 2024
at 99a1c26447c7cc06f27ce9b79690cdb64752f2ca (commit)
commit 99a1c26447c7cc06f27ce9b79690cdb64752f2ca
Author: Tom Clegg <tom at curii.com>
Date: Tue Mar 5 17:34:13 2024 -0500
21123: Check container status in integration tests.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/controller/integration_test.go b/lib/controller/integration_test.go
index 4bf7a03447..53e6a90b8f 100644
--- a/lib/controller/integration_test.go
+++ b/lib/controller/integration_test.go
@@ -1244,10 +1244,22 @@ func (s *IntegrationSuite) runContainer(c *check.C, clusterID string, token stri
var ctr arvados.Container
var lastState arvados.ContainerState
+ var status, lastStatus arvados.ContainerStatus
+ var allStatus string
+ checkstatus := func() {
+ err := ac.RequestAndDecode(&status, "GET", "/arvados/v1/container_requests/"+cr.UUID+"/container_status", nil, nil)
+ c.Assert(err, check.IsNil)
+ if status != lastStatus {
+ c.Logf("container status: %s, %s", status.State, status.SchedulingStatus)
+ allStatus += fmt.Sprintf("%s, %s\n", status.State, status.SchedulingStatus)
+ lastStatus = status
+ }
+ }
deadline := time.Now().Add(time.Minute)
- for cr.State != arvados.ContainerRequestStateFinal {
+ for cr.State != arvados.ContainerRequestStateFinal || (lastStatus.State != arvados.ContainerStateComplete && lastStatus.State != arvados.ContainerStateCancelled) {
err = ac.RequestAndDecode(&cr, "GET", "/arvados/v1/container_requests/"+cr.UUID, nil, nil)
c.Assert(err, check.IsNil)
+ checkstatus()
err = ac.RequestAndDecode(&ctr, "GET", "/arvados/v1/containers/"+cr.ContainerUUID, nil, nil)
if err != nil {
c.Logf("error getting container state: %s", err)
@@ -1267,6 +1279,7 @@ func (s *IntegrationSuite) runContainer(c *check.C, clusterID string, token stri
time.Sleep(time.Second / 2)
}
}
+ checkstatus()
c.Logf("cr.CumulativeCost == %f", cr.CumulativeCost)
c.Check(cr.CumulativeCost, check.Not(check.Equals), 0.0)
if expectExitCode >= 0 {
@@ -1274,6 +1287,13 @@ func (s *IntegrationSuite) runContainer(c *check.C, clusterID string, token stri
c.Check(ctr.ExitCode, check.Equals, expectExitCode)
err = ac.RequestAndDecode(&outcoll, "GET", "/arvados/v1/collections/"+cr.OutputUUID, nil, nil)
c.Assert(err, check.IsNil)
+ c.Check(allStatus, check.Matches, `Queued, waiting for dispatch\n`+
+ `(Queued, waiting.*\n)*`+
+ `(Locked, waiting for dispatch\n)?`+
+ `(Locked, waiting for new instance to be ready\n)?`+
+ `(Locked, preparing runtime environment\n)?`+
+ `(Running, \n)?`+
+ `Complete, \n`)
}
logcfs = showlogs(cr.LogUUID)
checkwebdavlogs(cr)
commit 60b6f4e619f1a22405e831eda2186d61f7f1ea48
Author: Tom Clegg <tom at curii.com>
Date: Mon Mar 4 19:02:44 2024 -0500
21123: Add container_requests/{uuid}/container_status endpoint.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/doc/api/methods/container_requests.html.textile.liquid b/doc/api/methods/container_requests.html.textile.liquid
index c108c32808..770b56b697 100644
--- a/doc/api/methods/container_requests.html.textile.liquid
+++ b/doc/api/methods/container_requests.html.textile.liquid
@@ -224,6 +224,26 @@ Setting the priority of a committed container_request to 0 may cancel a running
See "Canceling a container request":{{site.baseurl}}/api/methods/container_requests.html#cancel_container for further details.
{% include 'notebox_end' %}
+h3(#container_status). container_status
+
+Get container status.
+
+table(table table-bordered table-condensed).
+|_. Argument |_. Type |_. Description |_. Location |
+{background:#ccffcc}.|uuid|string|The UUID of the container request in question.|path|
+
+Example request: @GET /arvados/v1/container_requests/zzzzz-xvdhp-0123456789abcde/container_status@
+
+Response attributes:
+
+table(table table-bordered table-condensed).
+|_. Attribute|_. Type|_. Description|_. Examples|
+|uuid|string|The UUID of the container assigned to this request.||
+|state|string|The state of the container assigned to this request (see "container resource attributes":containers.html).||
+|scheduling_status|string|A brief explanation of the container's status in the dispatch queue. Empty if scheduling is not applicable, e.g., the container is running or finished.|@waiting for cloud resources: queue position 3@
+@creating new instance@
+@preparing runtime environment@|
+
h3(#log). log
Get container log data using WebDAV methods.
diff --git a/lib/controller/federation/conn.go b/lib/controller/federation/conn.go
index c5facdc7d9..949cc56dd2 100644
--- a/lib/controller/federation/conn.go
+++ b/lib/controller/federation/conn.go
@@ -510,6 +510,10 @@ func (conn *Conn) ContainerRequestDelete(ctx context.Context, options arvados.De
return conn.chooseBackend(options.UUID).ContainerRequestDelete(ctx, options)
}
+// ContainerRequestContainerStatus delegates the request to the backend
+// that chooseBackend selects for options.UUID, and returns that
+// backend's result unchanged.
+func (conn *Conn) ContainerRequestContainerStatus(ctx context.Context, options arvados.GetOptions) (arvados.ContainerStatus, error) {
+ return conn.chooseBackend(options.UUID).ContainerRequestContainerStatus(ctx, options)
+}
+
func (conn *Conn) ContainerRequestLog(ctx context.Context, options arvados.ContainerLogOptions) (http.Handler, error) {
return conn.chooseBackend(options.UUID).ContainerRequestLog(ctx, options)
}
diff --git a/lib/controller/localdb/container_request.go b/lib/controller/localdb/container_request.go
index 49e21840ea..0234ee8fa6 100644
--- a/lib/controller/localdb/container_request.go
+++ b/lib/controller/localdb/container_request.go
@@ -6,8 +6,15 @@ package localdb
import (
"context"
+ "encoding/json"
+ "fmt"
+ "net/http"
+ "net/url"
+ "git.arvados.org/arvados.git/lib/dispatchcloud/scheduler"
"git.arvados.org/arvados.git/sdk/go/arvados"
+ "git.arvados.org/arvados.git/sdk/go/auth"
+ "git.arvados.org/arvados.git/sdk/go/httpserver"
)
// ContainerRequestCreate defers to railsProxy for everything except
@@ -54,3 +61,87 @@ func (conn *Conn) ContainerRequestDelete(ctx context.Context, opts arvados.Delet
conn.logActivity(ctx)
return conn.railsProxy.ContainerRequestDelete(ctx, opts)
}
+
+// ContainerRequestContainerStatus returns the state and scheduling
+// status of the container assigned to the container request
+// identified by opts.UUID.
+//
+// If the request has no container assigned, only SchedulingStatus is
+// set ("no container assigned"). If the container is neither Queued
+// nor Locked, SchedulingStatus is left empty. Otherwise each
+// configured dispatchcloud service is asked for its view of the
+// container, and the first successful answer is returned. If no
+// dispatcher knows the container but at least one failed to answer,
+// an error with status 502 (Bad Gateway) is returned so the client
+// knows to retry.
+func (conn *Conn) ContainerRequestContainerStatus(ctx context.Context, opts arvados.GetOptions) (arvados.ContainerStatus, error) {
+	conn.logActivity(ctx)
+	var ret arvados.ContainerStatus
+	cr, err := conn.railsProxy.ContainerRequestGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "container_uuid", "log_uuid"}})
+	if err != nil {
+		return ret, err
+	}
+	if cr.ContainerUUID == "" {
+		ret.SchedulingStatus = "no container assigned"
+		return ret, nil
+	}
+	// We use admin credentials to get the container record so we
+	// don't get an error when we're in a race with auto-retry and
+	// the container became user-unreadable since we fetched the
+	// CR above.
+	ctxRoot := auth.NewContext(ctx, &auth.Credentials{Tokens: []string{conn.cluster.SystemRootToken}})
+	ctr, err := conn.railsProxy.ContainerGet(ctxRoot, arvados.GetOptions{UUID: cr.ContainerUUID, Select: []string{"uuid", "state", "priority"}})
+	if err != nil {
+		return ret, err
+	}
+	ret.UUID = ctr.UUID
+	ret.State = ctr.State
+	if ctr.State != arvados.ContainerStateQueued && ctr.State != arvados.ContainerStateLocked {
+		// Scheduling status is not a thing once the container
+		// is in running state.
+		return ret, nil
+	}
+	var lastErr error
+	for dispatchurl := range conn.cluster.Services.DispatchCloud.InternalURLs {
+		baseurl := url.URL(dispatchurl)
+		apiurl, err := baseurl.Parse("/arvados/v1/dispatch/container?container_uuid=" + cr.ContainerUUID)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiurl.String(), nil)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+		req.Header.Set("Authorization", "Bearer "+conn.cluster.ManagementToken)
+		resp, err := http.DefaultClient.Do(req)
+		if err != nil {
+			lastErr = fmt.Errorf("error getting status from dispatcher: %w", err)
+			continue
+		}
+		// Only a 200 response carries a queue entry worth
+		// decoding. err is nil here, so after this step it can
+		// only hold a decode error.
+		var qent scheduler.QueueEnt
+		if resp.StatusCode == http.StatusOK {
+			err = json.NewDecoder(resp.Body).Decode(&qent)
+		}
+		// Close the body on every path (not found, error status,
+		// decode failure, success) so a connection is not leaked
+		// for each dispatcher we query.
+		resp.Body.Close()
+		switch {
+		case resp.StatusCode == http.StatusNotFound:
+			// This dispatcher doesn't have the container;
+			// that is not an error, try the next one.
+			continue
+		case resp.StatusCode != http.StatusOK:
+			lastErr = fmt.Errorf("error getting status from dispatcher: %s", resp.Status)
+			continue
+		case err != nil:
+			lastErr = err
+			continue
+		}
+		ret.State = qent.Container.State // Prefer dispatcher's view of state if not equal to ctr.State
+		ret.SchedulingStatus = qent.SchedulingStatus
+		return ret, nil
+	}
+	if lastErr != nil {
+		// If we got a non-nil error from a dispatchcloud
+		// service, and the container state suggests
+		// dispatchcloud should know about it, then we return
+		// an error so the client knows to retry.
+		return ret, httpserver.ErrorWithStatus(lastErr, http.StatusBadGateway)
+	}
+	// All running dispatchcloud services confirm they don't have
+	// this container (the dispatcher hasn't yet noticed it
+	// appearing in the queue) or there are no dispatchcloud
+	// services configured. Either way, all we can say is that
+	// it's queued.
+	if ctr.State == arvados.ContainerStateQueued && ctr.Priority < 1 {
+		// If it hasn't been picked up by a dispatcher
+		// already, it won't be -- it's just on hold.
+		// Scheduling status does not apply.
+		return ret, nil
+	}
+	ret.SchedulingStatus = "waiting for dispatch"
+	return ret, nil
+}
diff --git a/lib/controller/router/router.go b/lib/controller/router/router.go
index d39f493a95..054bcffaf7 100644
--- a/lib/controller/router/router.go
+++ b/lib/controller/router/router.go
@@ -318,6 +318,13 @@ func (rtr *router) addRoutes() {
return rtr.backend.ContainerRequestDelete(ctx, *opts.(*arvados.DeleteOptions))
},
},
+ {
+ arvados.EndpointContainerRequestContainerStatus,
+ func() interface{} { return &arvados.GetOptions{} },
+ func(ctx context.Context, opts interface{}) (interface{}, error) {
+ return rtr.backend.ContainerRequestContainerStatus(ctx, *opts.(*arvados.GetOptions))
+ },
+ },
{
arvados.EndpointContainerRequestLog,
func() interface{} { return &arvados.ContainerLogOptions{} },
diff --git a/lib/controller/rpc/conn.go b/lib/controller/rpc/conn.go
index 9f518d9c7a..c6be679a25 100644
--- a/lib/controller/rpc/conn.go
+++ b/lib/controller/rpc/conn.go
@@ -529,6 +529,13 @@ func (conn *Conn) ContainerRequestDelete(ctx context.Context, options arvados.De
return resp, err
}
+// ContainerRequestContainerStatus issues the
+// EndpointContainerRequestContainerStatus call through
+// requestAndDecode and returns the decoded status together with any
+// error from that call.
+func (conn *Conn) ContainerRequestContainerStatus(ctx context.Context, options arvados.GetOptions) (arvados.ContainerStatus, error) {
+	var status arvados.ContainerStatus
+	err := conn.requestAndDecode(ctx, &status, arvados.EndpointContainerRequestContainerStatus, nil, options)
+	return status, err
+}
+
func (conn *Conn) ContainerRequestLog(ctx context.Context, options arvados.ContainerLogOptions) (resp http.Handler, err error) {
proxy := &httputil.ReverseProxy{
Transport: conn.httpClient.Transport,
diff --git a/sdk/go/arvados/api.go b/sdk/go/arvados/api.go
index e7310818f7..c3d0ea8aef 100644
--- a/sdk/go/arvados/api.go
+++ b/sdk/go/arvados/api.go
@@ -23,90 +23,91 @@ type APIEndpoint struct {
}
var (
- EndpointConfigGet = APIEndpoint{"GET", "arvados/v1/config", ""}
- EndpointVocabularyGet = APIEndpoint{"GET", "arvados/v1/vocabulary", ""}
- EndpointDiscoveryDocument = APIEndpoint{"GET", "discovery/v1/apis/arvados/v1/rest", ""}
- EndpointLogin = APIEndpoint{"GET", "login", ""}
- EndpointLogout = APIEndpoint{"GET", "logout", ""}
- EndpointAuthorizedKeyCreate = APIEndpoint{"POST", "arvados/v1/authorized_keys", "authorized_key"}
- EndpointAuthorizedKeyUpdate = APIEndpoint{"PATCH", "arvados/v1/authorized_keys/{uuid}", "authorized_key"}
- EndpointAuthorizedKeyGet = APIEndpoint{"GET", "arvados/v1/authorized_keys/{uuid}", ""}
- EndpointAuthorizedKeyList = APIEndpoint{"GET", "arvados/v1/authorized_keys", ""}
- EndpointAuthorizedKeyDelete = APIEndpoint{"DELETE", "arvados/v1/authorized_keys/{uuid}", ""}
- EndpointCollectionCreate = APIEndpoint{"POST", "arvados/v1/collections", "collection"}
- EndpointCollectionUpdate = APIEndpoint{"PATCH", "arvados/v1/collections/{uuid}", "collection"}
- EndpointCollectionGet = APIEndpoint{"GET", "arvados/v1/collections/{uuid}", ""}
- EndpointCollectionList = APIEndpoint{"GET", "arvados/v1/collections", ""}
- EndpointCollectionProvenance = APIEndpoint{"GET", "arvados/v1/collections/{uuid}/provenance", ""}
- EndpointCollectionUsedBy = APIEndpoint{"GET", "arvados/v1/collections/{uuid}/used_by", ""}
- EndpointCollectionDelete = APIEndpoint{"DELETE", "arvados/v1/collections/{uuid}", ""}
- EndpointCollectionTrash = APIEndpoint{"POST", "arvados/v1/collections/{uuid}/trash", ""}
- EndpointCollectionUntrash = APIEndpoint{"POST", "arvados/v1/collections/{uuid}/untrash", ""}
- EndpointSpecimenCreate = APIEndpoint{"POST", "arvados/v1/specimens", "specimen"}
- EndpointSpecimenUpdate = APIEndpoint{"PATCH", "arvados/v1/specimens/{uuid}", "specimen"}
- EndpointSpecimenGet = APIEndpoint{"GET", "arvados/v1/specimens/{uuid}", ""}
- EndpointSpecimenList = APIEndpoint{"GET", "arvados/v1/specimens", ""}
- EndpointSpecimenDelete = APIEndpoint{"DELETE", "arvados/v1/specimens/{uuid}", ""}
- EndpointContainerCreate = APIEndpoint{"POST", "arvados/v1/containers", "container"}
- EndpointContainerUpdate = APIEndpoint{"PATCH", "arvados/v1/containers/{uuid}", "container"}
- EndpointContainerPriorityUpdate = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/update_priority", "container"}
- EndpointContainerGet = APIEndpoint{"GET", "arvados/v1/containers/{uuid}", ""}
- EndpointContainerList = APIEndpoint{"GET", "arvados/v1/containers", ""}
- EndpointContainerDelete = APIEndpoint{"DELETE", "arvados/v1/containers/{uuid}", ""}
- EndpointContainerLock = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/lock", ""}
- EndpointContainerUnlock = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/unlock", ""}
- EndpointContainerSSH = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/ssh", ""}
- EndpointContainerSSHCompat = APIEndpoint{"POST", "arvados/v1/connect/{uuid}/ssh", ""} // for compatibility with arvados <2.7
- EndpointContainerGatewayTunnel = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/gateway_tunnel", ""}
- EndpointContainerGatewayTunnelCompat = APIEndpoint{"POST", "arvados/v1/connect/{uuid}/gateway_tunnel", ""} // for compatibility with arvados <2.7
- EndpointContainerRequestCreate = APIEndpoint{"POST", "arvados/v1/container_requests", "container_request"}
- EndpointContainerRequestUpdate = APIEndpoint{"PATCH", "arvados/v1/container_requests/{uuid}", "container_request"}
- EndpointContainerRequestGet = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}", ""}
- EndpointContainerRequestList = APIEndpoint{"GET", "arvados/v1/container_requests", ""}
- EndpointContainerRequestDelete = APIEndpoint{"DELETE", "arvados/v1/container_requests/{uuid}", ""}
- EndpointContainerRequestLog = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}/log{path:|/.*}", ""}
- EndpointGroupCreate = APIEndpoint{"POST", "arvados/v1/groups", "group"}
- EndpointGroupUpdate = APIEndpoint{"PATCH", "arvados/v1/groups/{uuid}", "group"}
- EndpointGroupGet = APIEndpoint{"GET", "arvados/v1/groups/{uuid}", ""}
- EndpointGroupList = APIEndpoint{"GET", "arvados/v1/groups", ""}
- EndpointGroupContents = APIEndpoint{"GET", "arvados/v1/groups/contents", ""}
- EndpointGroupContentsUUIDInPath = APIEndpoint{"GET", "arvados/v1/groups/{uuid}/contents", ""} // Alternative HTTP route; client-side code should always use EndpointGroupContents instead
- EndpointGroupShared = APIEndpoint{"GET", "arvados/v1/groups/shared", ""}
- EndpointGroupDelete = APIEndpoint{"DELETE", "arvados/v1/groups/{uuid}", ""}
- EndpointGroupTrash = APIEndpoint{"POST", "arvados/v1/groups/{uuid}/trash", ""}
- EndpointGroupUntrash = APIEndpoint{"POST", "arvados/v1/groups/{uuid}/untrash", ""}
- EndpointLinkCreate = APIEndpoint{"POST", "arvados/v1/links", "link"}
- EndpointLinkUpdate = APIEndpoint{"PATCH", "arvados/v1/links/{uuid}", "link"}
- EndpointLinkGet = APIEndpoint{"GET", "arvados/v1/links/{uuid}", ""}
- EndpointLinkList = APIEndpoint{"GET", "arvados/v1/links", ""}
- EndpointLinkDelete = APIEndpoint{"DELETE", "arvados/v1/links/{uuid}", ""}
- EndpointLogCreate = APIEndpoint{"POST", "arvados/v1/logs", "log"}
- EndpointLogUpdate = APIEndpoint{"PATCH", "arvados/v1/logs/{uuid}", "log"}
- EndpointLogGet = APIEndpoint{"GET", "arvados/v1/logs/{uuid}", ""}
- EndpointLogList = APIEndpoint{"GET", "arvados/v1/logs", ""}
- EndpointLogDelete = APIEndpoint{"DELETE", "arvados/v1/logs/{uuid}", ""}
- EndpointSysTrashSweep = APIEndpoint{"POST", "sys/trash_sweep", ""}
- EndpointUserActivate = APIEndpoint{"POST", "arvados/v1/users/{uuid}/activate", ""}
- EndpointUserCreate = APIEndpoint{"POST", "arvados/v1/users", "user"}
- EndpointUserCurrent = APIEndpoint{"GET", "arvados/v1/users/current", ""}
- EndpointUserDelete = APIEndpoint{"DELETE", "arvados/v1/users/{uuid}", ""}
- EndpointUserGet = APIEndpoint{"GET", "arvados/v1/users/{uuid}", ""}
- EndpointUserGetCurrent = APIEndpoint{"GET", "arvados/v1/users/current", ""}
- EndpointUserGetSystem = APIEndpoint{"GET", "arvados/v1/users/system", ""}
- EndpointUserList = APIEndpoint{"GET", "arvados/v1/users", ""}
- EndpointUserMerge = APIEndpoint{"POST", "arvados/v1/users/merge", ""}
- EndpointUserSetup = APIEndpoint{"POST", "arvados/v1/users/setup", "user"}
- EndpointUserSystem = APIEndpoint{"GET", "arvados/v1/users/system", ""}
- EndpointUserUnsetup = APIEndpoint{"POST", "arvados/v1/users/{uuid}/unsetup", ""}
- EndpointUserUpdate = APIEndpoint{"PATCH", "arvados/v1/users/{uuid}", "user"}
- EndpointUserBatchUpdate = APIEndpoint{"PATCH", "arvados/v1/users/batch_update", ""}
- EndpointUserAuthenticate = APIEndpoint{"POST", "arvados/v1/users/authenticate", ""}
- EndpointAPIClientAuthorizationCurrent = APIEndpoint{"GET", "arvados/v1/api_client_authorizations/current", ""}
- EndpointAPIClientAuthorizationCreate = APIEndpoint{"POST", "arvados/v1/api_client_authorizations", "api_client_authorization"}
- EndpointAPIClientAuthorizationUpdate = APIEndpoint{"PUT", "arvados/v1/api_client_authorizations/{uuid}", "api_client_authorization"}
- EndpointAPIClientAuthorizationList = APIEndpoint{"GET", "arvados/v1/api_client_authorizations", ""}
- EndpointAPIClientAuthorizationDelete = APIEndpoint{"DELETE", "arvados/v1/api_client_authorizations/{uuid}", ""}
- EndpointAPIClientAuthorizationGet = APIEndpoint{"GET", "arvados/v1/api_client_authorizations/{uuid}", ""}
+ EndpointConfigGet = APIEndpoint{"GET", "arvados/v1/config", ""}
+ EndpointVocabularyGet = APIEndpoint{"GET", "arvados/v1/vocabulary", ""}
+ EndpointDiscoveryDocument = APIEndpoint{"GET", "discovery/v1/apis/arvados/v1/rest", ""}
+ EndpointLogin = APIEndpoint{"GET", "login", ""}
+ EndpointLogout = APIEndpoint{"GET", "logout", ""}
+ EndpointAuthorizedKeyCreate = APIEndpoint{"POST", "arvados/v1/authorized_keys", "authorized_key"}
+ EndpointAuthorizedKeyUpdate = APIEndpoint{"PATCH", "arvados/v1/authorized_keys/{uuid}", "authorized_key"}
+ EndpointAuthorizedKeyGet = APIEndpoint{"GET", "arvados/v1/authorized_keys/{uuid}", ""}
+ EndpointAuthorizedKeyList = APIEndpoint{"GET", "arvados/v1/authorized_keys", ""}
+ EndpointAuthorizedKeyDelete = APIEndpoint{"DELETE", "arvados/v1/authorized_keys/{uuid}", ""}
+ EndpointCollectionCreate = APIEndpoint{"POST", "arvados/v1/collections", "collection"}
+ EndpointCollectionUpdate = APIEndpoint{"PATCH", "arvados/v1/collections/{uuid}", "collection"}
+ EndpointCollectionGet = APIEndpoint{"GET", "arvados/v1/collections/{uuid}", ""}
+ EndpointCollectionList = APIEndpoint{"GET", "arvados/v1/collections", ""}
+ EndpointCollectionProvenance = APIEndpoint{"GET", "arvados/v1/collections/{uuid}/provenance", ""}
+ EndpointCollectionUsedBy = APIEndpoint{"GET", "arvados/v1/collections/{uuid}/used_by", ""}
+ EndpointCollectionDelete = APIEndpoint{"DELETE", "arvados/v1/collections/{uuid}", ""}
+ EndpointCollectionTrash = APIEndpoint{"POST", "arvados/v1/collections/{uuid}/trash", ""}
+ EndpointCollectionUntrash = APIEndpoint{"POST", "arvados/v1/collections/{uuid}/untrash", ""}
+ EndpointSpecimenCreate = APIEndpoint{"POST", "arvados/v1/specimens", "specimen"}
+ EndpointSpecimenUpdate = APIEndpoint{"PATCH", "arvados/v1/specimens/{uuid}", "specimen"}
+ EndpointSpecimenGet = APIEndpoint{"GET", "arvados/v1/specimens/{uuid}", ""}
+ EndpointSpecimenList = APIEndpoint{"GET", "arvados/v1/specimens", ""}
+ EndpointSpecimenDelete = APIEndpoint{"DELETE", "arvados/v1/specimens/{uuid}", ""}
+ EndpointContainerCreate = APIEndpoint{"POST", "arvados/v1/containers", "container"}
+ EndpointContainerUpdate = APIEndpoint{"PATCH", "arvados/v1/containers/{uuid}", "container"}
+ EndpointContainerPriorityUpdate = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/update_priority", "container"}
+ EndpointContainerGet = APIEndpoint{"GET", "arvados/v1/containers/{uuid}", ""}
+ EndpointContainerList = APIEndpoint{"GET", "arvados/v1/containers", ""}
+ EndpointContainerDelete = APIEndpoint{"DELETE", "arvados/v1/containers/{uuid}", ""}
+ EndpointContainerLock = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/lock", ""}
+ EndpointContainerUnlock = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/unlock", ""}
+ EndpointContainerSSH = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/ssh", ""}
+ EndpointContainerSSHCompat = APIEndpoint{"POST", "arvados/v1/connect/{uuid}/ssh", ""} // for compatibility with arvados <2.7
+ EndpointContainerGatewayTunnel = APIEndpoint{"POST", "arvados/v1/containers/{uuid}/gateway_tunnel", ""}
+ EndpointContainerGatewayTunnelCompat = APIEndpoint{"POST", "arvados/v1/connect/{uuid}/gateway_tunnel", ""} // for compatibility with arvados <2.7
+ EndpointContainerRequestCreate = APIEndpoint{"POST", "arvados/v1/container_requests", "container_request"}
+ EndpointContainerRequestUpdate = APIEndpoint{"PATCH", "arvados/v1/container_requests/{uuid}", "container_request"}
+ EndpointContainerRequestGet = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}", ""}
+ EndpointContainerRequestList = APIEndpoint{"GET", "arvados/v1/container_requests", ""}
+ EndpointContainerRequestDelete = APIEndpoint{"DELETE", "arvados/v1/container_requests/{uuid}", ""}
+ EndpointContainerRequestContainerStatus = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}/container_status", ""}
+ EndpointContainerRequestLog = APIEndpoint{"GET", "arvados/v1/container_requests/{uuid}/log{path:|/.*}", ""}
+ EndpointGroupCreate = APIEndpoint{"POST", "arvados/v1/groups", "group"}
+ EndpointGroupUpdate = APIEndpoint{"PATCH", "arvados/v1/groups/{uuid}", "group"}
+ EndpointGroupGet = APIEndpoint{"GET", "arvados/v1/groups/{uuid}", ""}
+ EndpointGroupList = APIEndpoint{"GET", "arvados/v1/groups", ""}
+ EndpointGroupContents = APIEndpoint{"GET", "arvados/v1/groups/contents", ""}
+ EndpointGroupContentsUUIDInPath = APIEndpoint{"GET", "arvados/v1/groups/{uuid}/contents", ""} // Alternative HTTP route; client-side code should always use EndpointGroupContents instead
+ EndpointGroupShared = APIEndpoint{"GET", "arvados/v1/groups/shared", ""}
+ EndpointGroupDelete = APIEndpoint{"DELETE", "arvados/v1/groups/{uuid}", ""}
+ EndpointGroupTrash = APIEndpoint{"POST", "arvados/v1/groups/{uuid}/trash", ""}
+ EndpointGroupUntrash = APIEndpoint{"POST", "arvados/v1/groups/{uuid}/untrash", ""}
+ EndpointLinkCreate = APIEndpoint{"POST", "arvados/v1/links", "link"}
+ EndpointLinkUpdate = APIEndpoint{"PATCH", "arvados/v1/links/{uuid}", "link"}
+ EndpointLinkGet = APIEndpoint{"GET", "arvados/v1/links/{uuid}", ""}
+ EndpointLinkList = APIEndpoint{"GET", "arvados/v1/links", ""}
+ EndpointLinkDelete = APIEndpoint{"DELETE", "arvados/v1/links/{uuid}", ""}
+ EndpointLogCreate = APIEndpoint{"POST", "arvados/v1/logs", "log"}
+ EndpointLogUpdate = APIEndpoint{"PATCH", "arvados/v1/logs/{uuid}", "log"}
+ EndpointLogGet = APIEndpoint{"GET", "arvados/v1/logs/{uuid}", ""}
+ EndpointLogList = APIEndpoint{"GET", "arvados/v1/logs", ""}
+ EndpointLogDelete = APIEndpoint{"DELETE", "arvados/v1/logs/{uuid}", ""}
+ EndpointSysTrashSweep = APIEndpoint{"POST", "sys/trash_sweep", ""}
+ EndpointUserActivate = APIEndpoint{"POST", "arvados/v1/users/{uuid}/activate", ""}
+ EndpointUserCreate = APIEndpoint{"POST", "arvados/v1/users", "user"}
+ EndpointUserCurrent = APIEndpoint{"GET", "arvados/v1/users/current", ""}
+ EndpointUserDelete = APIEndpoint{"DELETE", "arvados/v1/users/{uuid}", ""}
+ EndpointUserGet = APIEndpoint{"GET", "arvados/v1/users/{uuid}", ""}
+ EndpointUserGetCurrent = APIEndpoint{"GET", "arvados/v1/users/current", ""}
+ EndpointUserGetSystem = APIEndpoint{"GET", "arvados/v1/users/system", ""}
+ EndpointUserList = APIEndpoint{"GET", "arvados/v1/users", ""}
+ EndpointUserMerge = APIEndpoint{"POST", "arvados/v1/users/merge", ""}
+ EndpointUserSetup = APIEndpoint{"POST", "arvados/v1/users/setup", "user"}
+ EndpointUserSystem = APIEndpoint{"GET", "arvados/v1/users/system", ""}
+ EndpointUserUnsetup = APIEndpoint{"POST", "arvados/v1/users/{uuid}/unsetup", ""}
+ EndpointUserUpdate = APIEndpoint{"PATCH", "arvados/v1/users/{uuid}", "user"}
+ EndpointUserBatchUpdate = APIEndpoint{"PATCH", "arvados/v1/users/batch_update", ""}
+ EndpointUserAuthenticate = APIEndpoint{"POST", "arvados/v1/users/authenticate", ""}
+ EndpointAPIClientAuthorizationCurrent = APIEndpoint{"GET", "arvados/v1/api_client_authorizations/current", ""}
+ EndpointAPIClientAuthorizationCreate = APIEndpoint{"POST", "arvados/v1/api_client_authorizations", "api_client_authorization"}
+ EndpointAPIClientAuthorizationUpdate = APIEndpoint{"PUT", "arvados/v1/api_client_authorizations/{uuid}", "api_client_authorization"}
+ EndpointAPIClientAuthorizationList = APIEndpoint{"GET", "arvados/v1/api_client_authorizations", ""}
+ EndpointAPIClientAuthorizationDelete = APIEndpoint{"DELETE", "arvados/v1/api_client_authorizations/{uuid}", ""}
+ EndpointAPIClientAuthorizationGet = APIEndpoint{"GET", "arvados/v1/api_client_authorizations/{uuid}", ""}
)
type ContainerSSHOptions struct {
@@ -310,6 +311,7 @@ type API interface {
ContainerRequestGet(ctx context.Context, options GetOptions) (ContainerRequest, error)
ContainerRequestList(ctx context.Context, options ListOptions) (ContainerRequestList, error)
ContainerRequestDelete(ctx context.Context, options DeleteOptions) (ContainerRequest, error)
+ ContainerRequestContainerStatus(ctx context.Context, options GetOptions) (ContainerStatus, error)
ContainerRequestLog(ctx context.Context, options ContainerLogOptions) (http.Handler, error)
GroupCreate(ctx context.Context, options CreateOptions) (Group, error)
GroupUpdate(ctx context.Context, options UpdateOptions) (Group, error)
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 2467e807a1..91c8fbfe29 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -160,3 +160,9 @@ const (
ContainerRequestStateCommitted = ContainerRequestState("Committed")
ContainerRequestStateFinal = ContainerRequestState("Final")
)
+
+// ContainerStatus is the value returned by the
+// ContainerRequestContainerStatus API call: the state and scheduling
+// status of the container assigned to a container request.
+//
+// NOTE(review): the container_status API documentation lists the
+// state attribute as "state", but the JSON key used here is
+// "container_state" -- confirm which one is intended.
+type ContainerStatus struct {
+ // UUID of the container assigned to the request.
+ UUID string `json:"uuid"`
+ // State of the assigned container.
+ State ContainerState `json:"container_state"`
+ // Brief explanation of the container's status in the dispatch
+ // queue; empty when scheduling status does not apply.
+ SchedulingStatus string `json:"scheduling_status"`
+}
diff --git a/sdk/go/arvadostest/api.go b/sdk/go/arvadostest/api.go
index 3ba794380f..e1827b5d1f 100644
--- a/sdk/go/arvadostest/api.go
+++ b/sdk/go/arvadostest/api.go
@@ -168,6 +168,10 @@ func (as *APIStub) ContainerRequestDelete(ctx context.Context, options arvados.D
as.appendCall(ctx, as.ContainerRequestDelete, options)
return arvados.ContainerRequest{}, as.Error
}
+// ContainerRequestContainerStatus registers the call via appendCall
+// and returns a zero-valued ContainerStatus together with as.Error.
+func (as *APIStub) ContainerRequestContainerStatus(ctx context.Context, options arvados.GetOptions) (arvados.ContainerStatus, error) {
+ as.appendCall(ctx, as.ContainerRequestContainerStatus, options)
+ return arvados.ContainerStatus{}, as.Error
+}
func (as *APIStub) ContainerRequestLog(ctx context.Context, options arvados.ContainerLogOptions) (http.Handler, error) {
as.appendCall(ctx, as.ContainerRequestLog, options)
// Return a handler that responds with the configured
diff --git a/services/api/config/routes.rb b/services/api/config/routes.rb
index 87e2737575..b87e86f664 100644
--- a/services/api/config/routes.rb
+++ b/services/api/config/routes.rb
@@ -44,7 +44,9 @@ Rails.application.routes.draw do
get 'secret_mounts', on: :member
get 'current', on: :collection
end
- resources :container_requests
+ resources :container_requests do
+ get 'container_status', on: :member
+ end
resources :jobs do
get 'queue', on: :collection
get 'queue_size', on: :collection
commit 7345838cb097f11e2ba8239020762ae867591510
Author: Tom Clegg <tom at curii.com>
Date: Fri Mar 1 16:00:09 2024 -0500
21123: Add container status API to cloud dispatcher.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/doc/api/dispatch.html.textile.liquid b/doc/api/dispatch.html.textile.liquid
index b06136db9a..488545c7d4 100644
--- a/doc/api/dispatch.html.textile.liquid
+++ b/doc/api/dispatch.html.textile.liquid
@@ -32,6 +32,7 @@ Return a list of containers that are either ready to dispatch, or being started/
Each entry in the returned list of @items@ includes:
* an @instance_type@ entry with the name and attributes of the instance type that will be used to schedule the container (chosen from the @InstanceTypes@ section of your cluster config file); and
* a @container@ entry with selected attributes of the container itself, including @uuid@, @priority@, @runtime_constraints@, and @state@. Other fields of the container records are not loaded by the dispatcher, and will have empty/zero values here (e.g., @{...,"created_at":"0001-01-01T00:00:00Z","command":[],...}@).
+* a @scheduling_status@ entry: a brief explanation of the container's status in the dispatch queue, or empty if scheduling is not applicable, e.g., the container has already started running.
Example response:
@@ -56,12 +57,31 @@ Example response:
"AddedScratch": 0,
"Price": 0.146,
"Preemptible": false
- }
+ },
+ "scheduling_status": "waiting for new instance to be ready"
},
...
]
}</pre></notextile>
+h3. Get specified container
+
+@GET /arvados/v1/dispatch/container?container_uuid={uuid}@
+
+Return the same information as "list containers" above, but for a single specified container.
+
+Example response:
+
+<notextile><pre>{
+ "container": {
+ ...
+ },
+ "instance_type": {
+ ...
+ },
+ "scheduling_status": "waiting for new instance to be ready"
+}</pre></notextile>
+
h3. Terminate a container
@POST /arvados/v1/dispatch/containers/kill?container_uuid={uuid}&reason={string}@
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index 47e60abdee..611d13306f 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -61,14 +61,22 @@ type dispatcher struct {
instanceSet cloud.InstanceSet
pool pool
queue scheduler.ContainerQueue
+ sched *scheduler.Scheduler
httpHandler http.Handler
sshKey ssh.Signer
setupOnce sync.Once
stop chan struct{}
stopped chan struct{}
+
+ sQueueMtx sync.Mutex
+ sQueueRefreshed time.Time
+ sQueue []scheduler.QueueEnt
+ sQueueMap map[string]scheduler.QueueEnt
}
+var sQueueRefresh = time.Second
+
// Start starts the dispatcher. Start can be called multiple times
// with no ill effect.
func (disp *dispatcher) Start() {
@@ -155,7 +163,22 @@ func (disp *dispatcher) initialize() {
dblock.Dispatch.Lock(disp.Context, disp.dbConnector.GetDB)
disp.instanceSet = instanceSet
disp.pool = worker.NewPool(disp.logger, disp.ArvClient, disp.Registry, disp.InstanceSetID, disp.instanceSet, disp.newExecutor, installPublicKey, disp.Cluster)
- disp.queue = container.NewQueue(disp.logger, disp.Registry, disp.typeChooser, disp.ArvClient)
+ if disp.queue == nil {
+ disp.queue = container.NewQueue(disp.logger, disp.Registry, disp.typeChooser, disp.ArvClient)
+ }
+
+ staleLockTimeout := time.Duration(disp.Cluster.Containers.StaleLockTimeout)
+ if staleLockTimeout == 0 {
+ staleLockTimeout = defaultStaleLockTimeout
+ }
+ pollInterval := time.Duration(disp.Cluster.Containers.CloudVMs.PollInterval)
+ if pollInterval <= 0 {
+ pollInterval = defaultPollInterval
+ }
+ disp.sched = scheduler.New(disp.Context, disp.ArvClient, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval,
+ disp.Cluster.Containers.CloudVMs.InitialQuotaEstimate,
+ disp.Cluster.Containers.CloudVMs.MaxInstances,
+ disp.Cluster.Containers.CloudVMs.SupervisorFraction)
if disp.Cluster.ManagementToken == "" {
disp.httpHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -164,6 +187,7 @@ func (disp *dispatcher) initialize() {
} else {
mux := httprouter.New()
mux.HandlerFunc("GET", "/arvados/v1/dispatch/containers", disp.apiContainers)
+ mux.HandlerFunc("GET", "/arvados/v1/dispatch/container", disp.apiContainer)
mux.HandlerFunc("POST", "/arvados/v1/dispatch/containers/kill", disp.apiContainerKill)
mux.HandlerFunc("GET", "/arvados/v1/dispatch/instances", disp.apiInstances)
mux.HandlerFunc("POST", "/arvados/v1/dispatch/instances/hold", disp.apiInstanceHold)
@@ -190,36 +214,53 @@ func (disp *dispatcher) run() {
defer disp.instanceSet.Stop()
defer disp.pool.Stop()
- staleLockTimeout := time.Duration(disp.Cluster.Containers.StaleLockTimeout)
- if staleLockTimeout == 0 {
- staleLockTimeout = defaultStaleLockTimeout
- }
- pollInterval := time.Duration(disp.Cluster.Containers.CloudVMs.PollInterval)
- if pollInterval <= 0 {
- pollInterval = defaultPollInterval
- }
- sched := scheduler.New(disp.Context, disp.ArvClient, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval,
- disp.Cluster.Containers.CloudVMs.InitialQuotaEstimate,
- disp.Cluster.Containers.CloudVMs.MaxInstances,
- disp.Cluster.Containers.CloudVMs.SupervisorFraction)
- sched.Start()
- defer sched.Stop()
+ disp.sched.Start()
+ defer disp.sched.Stop()
<-disp.stop
}
-// Management API: all active and queued containers.
+// Get a snapshot of the scheduler's queue, no older than
+// sQueueRefresh.
+//
+// First return value is in the sorted order used by the scheduler.
+// Second return value is a map of the same entries, for efficiently
+// looking up a single container.
+func (disp *dispatcher) sQueueCurrent() ([]scheduler.QueueEnt, map[string]scheduler.QueueEnt) {
+ disp.sQueueMtx.Lock()
+ defer disp.sQueueMtx.Unlock()
+ if time.Since(disp.sQueueRefreshed) > sQueueRefresh {
+ disp.sQueue = disp.sched.Queue()
+ disp.sQueueMap = make(map[string]scheduler.QueueEnt)
+ for _, ent := range disp.sQueue {
+ disp.sQueueMap[ent.Container.UUID] = ent
+ }
+ disp.sQueueRefreshed = time.Now()
+ }
+ return disp.sQueue, disp.sQueueMap
+}
+
+// Management API: scheduling queue entries for all active and queued
+// containers.
func (disp *dispatcher) apiContainers(w http.ResponseWriter, r *http.Request) {
var resp struct {
- Items []container.QueueEnt `json:"items"`
- }
- qEntries, _ := disp.queue.Entries()
- for _, ent := range qEntries {
- resp.Items = append(resp.Items, ent)
+ Items []scheduler.QueueEnt `json:"items"`
}
+ resp.Items, _ = disp.sQueueCurrent()
json.NewEncoder(w).Encode(resp)
}
+// Management API: scheduling queue entry for a specified container.
+func (disp *dispatcher) apiContainer(w http.ResponseWriter, r *http.Request) {
+ _, sq := disp.sQueueCurrent()
+ ent, ok := sq[r.FormValue("container_uuid")]
+ if !ok {
+ httpserver.Error(w, "container not found", http.StatusNotFound)
+ return
+ }
+ json.NewEncoder(w).Encode(ent)
+}
+
// Management API: all active instances (cloud VMs).
func (disp *dispatcher) apiInstances(w http.ResponseWriter, r *http.Request) {
var resp struct {
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 20185554b8..e7465d65b1 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -8,12 +8,14 @@ import (
"context"
"crypto/tls"
"encoding/json"
+ "fmt"
"io/ioutil"
"math/rand"
"net/http"
"net/http/httptest"
"net/url"
"os"
+ "strings"
"sync"
"sync/atomic"
"time"
@@ -159,7 +161,6 @@ func (s *DispatcherSuite) arvClientProxy(c *check.C) func(*http.Request) (*url.U
// artificial errors in order to exercise a variety of code paths.
func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
Drivers["test"] = s.stubDriver
- s.disp.setupOnce.Do(s.disp.initialize)
queue := &test.Queue{
MaxDispatchAttempts: 5,
ChooseType: func(ctr *arvados.Container) ([]arvados.InstanceType, error) {
@@ -179,6 +180,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
})
}
s.disp.queue = queue
+ s.disp.setupOnce.Do(s.disp.initialize)
var mtx sync.Mutex
done := make(chan struct{})
@@ -323,7 +325,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
c.Check(resp.Body.String(), check.Matches, `(?ms).*max_concurrent_containers [1-9][0-9e+.]*`)
}
-func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
+func (s *DispatcherSuite) TestManagementAPI_Permissions(c *check.C) {
s.cluster.ManagementToken = "abcdefgh"
Drivers["test"] = s.stubDriver
s.disp.setupOnce.Do(s.disp.initialize)
@@ -345,7 +347,7 @@ func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
}
}
-func (s *DispatcherSuite) TestAPIDisabled(c *check.C) {
+func (s *DispatcherSuite) TestManagementAPI_Disabled(c *check.C) {
s.cluster.ManagementToken = ""
Drivers["test"] = s.stubDriver
s.disp.setupOnce.Do(s.disp.initialize)
@@ -363,13 +365,122 @@ func (s *DispatcherSuite) TestAPIDisabled(c *check.C) {
}
}
-func (s *DispatcherSuite) TestInstancesAPI(c *check.C) {
+func (s *DispatcherSuite) TestManagementAPI_Containers(c *check.C) {
+ s.cluster.ManagementToken = "abcdefgh"
+ s.cluster.Containers.CloudVMs.InitialQuotaEstimate = 4
+ Drivers["test"] = s.stubDriver
+ queue := &test.Queue{
+ MaxDispatchAttempts: 5,
+ ChooseType: func(ctr *arvados.Container) ([]arvados.InstanceType, error) {
+ return ChooseInstanceType(s.cluster, ctr)
+ },
+ Logger: ctxlog.TestLogger(c),
+ }
+ s.stubDriver.Queue = queue
+ s.stubDriver.QuotaMaxInstances = 4
+ s.stubDriver.SetupVM = func(stubvm *test.StubVM) error {
+ if stubvm.Instance().ProviderType() >= test.InstanceType(4).ProviderType {
+ return test.CapacityError{InstanceTypeSpecific: true}
+ }
+ stubvm.ExecuteContainer = func(ctr arvados.Container) int {
+ time.Sleep(5 * time.Second)
+ return 0
+ }
+ return nil
+ }
+ s.disp.queue = queue
+ s.disp.setupOnce.Do(s.disp.initialize)
+
+ go s.disp.run()
+
+ type queueEnt struct {
+ Container arvados.Container
+ InstanceType arvados.InstanceType `json:"instance_type"`
+ SchedulingStatus string `json:"scheduling_status"`
+ }
+ type containersResponse struct {
+ Items []queueEnt
+ }
+ getContainers := func() containersResponse {
+ sQueueRefresh = time.Millisecond
+ req := httptest.NewRequest("GET", "/arvados/v1/dispatch/containers", nil)
+ req.Header.Set("Authorization", "Bearer abcdefgh")
+ resp := httptest.NewRecorder()
+ s.disp.ServeHTTP(resp, req)
+ var cresp containersResponse
+ c.Check(resp.Code, check.Equals, http.StatusOK)
+ err := json.Unmarshal(resp.Body.Bytes(), &cresp)
+ c.Check(err, check.IsNil)
+ return cresp
+ }
+
+ c.Check(getContainers().Items, check.HasLen, 0)
+
+ for i := 0; i < 20; i++ {
+ queue.Containers = append(queue.Containers, arvados.Container{
+ UUID: test.ContainerUUID(i),
+ State: arvados.ContainerStateQueued,
+ Priority: int64(100 - i),
+ RuntimeConstraints: arvados.RuntimeConstraints{
+ RAM: int64(i%3+1) << 30,
+ VCPUs: i%8 + 1,
+ },
+ })
+ }
+ queue.Update()
+
+ expect := `
+ 0 zzzzz-dz642-000000000000000 (Running) ""
+ 1 zzzzz-dz642-000000000000001 (Running) ""
+ 2 zzzzz-dz642-000000000000002 (Locked) "waiting for suitable instance type to become available: queue position 1"
+ 3 zzzzz-dz642-000000000000003 (Locked) "waiting for suitable instance type to become available: queue position 2"
+ 4 zzzzz-dz642-000000000000004 (Queued) "waiting while cluster is running at capacity: queue position 3"
+ 5 zzzzz-dz642-000000000000005 (Queued) "waiting while cluster is running at capacity: queue position 4"
+ 6 zzzzz-dz642-000000000000006 (Queued) "waiting while cluster is running at capacity: queue position 5"
+ 7 zzzzz-dz642-000000000000007 (Queued) "waiting while cluster is running at capacity: queue position 6"
+ 8 zzzzz-dz642-000000000000008 (Queued) "waiting while cluster is running at capacity: queue position 7"
+ 9 zzzzz-dz642-000000000000009 (Queued) "waiting while cluster is running at capacity: queue position 8"
+ 10 zzzzz-dz642-000000000000010 (Queued) "waiting while cluster is running at capacity: queue position 9"
+ 11 zzzzz-dz642-000000000000011 (Queued) "waiting while cluster is running at capacity: queue position 10"
+ 12 zzzzz-dz642-000000000000012 (Queued) "waiting while cluster is running at capacity: queue position 11"
+ 13 zzzzz-dz642-000000000000013 (Queued) "waiting while cluster is running at capacity: queue position 12"
+ 14 zzzzz-dz642-000000000000014 (Queued) "waiting while cluster is running at capacity: queue position 13"
+ 15 zzzzz-dz642-000000000000015 (Queued) "waiting while cluster is running at capacity: queue position 14"
+ 16 zzzzz-dz642-000000000000016 (Queued) "waiting while cluster is running at capacity: queue position 15"
+ 17 zzzzz-dz642-000000000000017 (Queued) "waiting while cluster is running at capacity: queue position 16"
+ 18 zzzzz-dz642-000000000000018 (Queued) "waiting while cluster is running at capacity: queue position 17"
+ 19 zzzzz-dz642-000000000000019 (Queued) "waiting while cluster is running at capacity: queue position 18"
+`
+ sequence := make(map[string][]string)
+ var summary string
+ for deadline := time.Now().Add(time.Second); time.Now().Before(deadline); time.Sleep(time.Millisecond) {
+ cresp := getContainers()
+ summary = "\n"
+ for i, ent := range cresp.Items {
+ summary += fmt.Sprintf("% 2d %s (%s) %q\n", i, ent.Container.UUID, ent.Container.State, ent.SchedulingStatus)
+ s := sequence[ent.Container.UUID]
+ if len(s) == 0 || s[len(s)-1] != ent.SchedulingStatus {
+ sequence[ent.Container.UUID] = append(s, ent.SchedulingStatus)
+ }
+ }
+ if summary == expect {
+ break
+ }
+ }
+ c.Check(summary, check.Equals, expect)
+ for i := 0; i < 5; i++ {
+ c.Logf("sequence for container %d:\n... %s", i, strings.Join(sequence[test.ContainerUUID(i)], "\n... "))
+ }
+}
+
+func (s *DispatcherSuite) TestManagementAPI_Instances(c *check.C) {
s.cluster.ManagementToken = "abcdefgh"
s.cluster.Containers.CloudVMs.TimeoutBooting = arvados.Duration(time.Second)
Drivers["test"] = s.stubDriver
s.disp.setupOnce.Do(s.disp.initialize)
s.disp.queue = &test.Queue{}
go s.disp.run()
+ defer s.disp.Close()
type instance struct {
Instance string
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 03fa592777..2f4bce8987 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -5,6 +5,7 @@
package scheduler
import (
+ "fmt"
"sort"
"time"
@@ -15,6 +16,20 @@ import (
var quietAfter503 = time.Minute
+type QueueEnt struct {
+ container.QueueEnt
+
+ // Human-readable scheduling status as of the last scheduling
+ // iteration.
+ SchedulingStatus string `json:"scheduling_status"`
+}
+
+// Queue returns the sorted queue from the last scheduling iteration.
+func (sch *Scheduler) Queue() []QueueEnt {
+ ents, _ := sch.lastQueue.Load().([]QueueEnt)
+ return ents
+}
+
func (sch *Scheduler) runQueue() {
running := sch.pool.Running()
unalloc := sch.pool.Unallocated()
@@ -25,9 +40,9 @@ func (sch *Scheduler) runQueue() {
}
unsorted, _ := sch.queue.Entries()
- sorted := make([]container.QueueEnt, 0, len(unsorted))
+ sorted := make([]QueueEnt, 0, len(unsorted))
for _, ent := range unsorted {
- sorted = append(sorted, ent)
+ sorted = append(sorted, QueueEnt{QueueEnt: ent})
}
sort.Slice(sorted, func(i, j int) bool {
_, irunning := running[sorted[i].Container.UUID]
@@ -149,9 +164,9 @@ func (sch *Scheduler) runQueue() {
}).Debug("runQueue")
dontstart := map[arvados.InstanceType]bool{}
- var atcapacity = map[string]bool{} // ProviderTypes reported as AtCapacity during this runQueue() invocation
- var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
- var overmaxsuper []container.QueueEnt // unmappable because max supervisors (these are not included in overquota)
+ var atcapacity = map[string]bool{} // ProviderTypes reported as AtCapacity during this runQueue() invocation
+ var overquota []QueueEnt // entries that are unmappable because of worker pool quota
+ var overmaxsuper []QueueEnt // unmappable because max supervisors (these are not included in overquota)
var containerAllocatedWorkerBootingCount int
// trying is #containers running + #containers we're trying to
@@ -159,6 +174,7 @@ func (sch *Scheduler) runQueue() {
// reaches the dynamic maxConcurrency limit.
trying := len(running)
+ qpos := 0
supervisors := 0
tryrun:
@@ -169,12 +185,20 @@ tryrun:
})
if ctr.SchedulingParameters.Supervisor {
supervisors += 1
- if maxSupervisors > 0 && supervisors > maxSupervisors {
- overmaxsuper = append(overmaxsuper, sorted[i])
- continue
+ }
+ if _, running := running[ctr.UUID]; running {
+ if ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked {
+ sorted[i].SchedulingStatus = "preparing runtime environment"
}
+ continue
}
- if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
+ if ctr.Priority < 1 {
+ sorted[i].SchedulingStatus = "not scheduling: priority 0, state " + string(ctr.State)
+ continue
+ }
+ if ctr.SchedulingParameters.Supervisor && maxSupervisors > 0 && supervisors > maxSupervisors {
+ overmaxsuper = append(overmaxsuper, sorted[i])
+ sorted[i].SchedulingStatus = "not starting: supervisor container limit has been reached"
continue
}
// If we have unalloc instances of any of the eligible
@@ -214,7 +238,7 @@ tryrun:
}
trying++
if !unallocOK && sch.pool.AtQuota() {
- logger.Trace("not locking: AtQuota and no unalloc workers")
+ logger.Trace("not starting: AtQuota and no unalloc workers")
overquota = sorted[i:]
break tryrun
}
@@ -246,10 +270,13 @@ tryrun:
// same instance type. Don't let this
// one sneak in ahead of it.
} else if sch.pool.KillContainer(ctr.UUID, "about to start") {
+ sorted[i].SchedulingStatus = "waiting for previous attempt to exit"
logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
} else if sch.pool.StartContainer(unallocType, ctr) {
+ sorted[i].SchedulingStatus = "preparing runtime environment"
logger.Trace("StartContainer => true")
} else {
+ sorted[i].SchedulingStatus = "waiting for new instance to be ready"
logger.Trace("StartContainer => false")
containerAllocatedWorkerBootingCount += 1
dontstart[unallocType] = true
@@ -279,6 +306,8 @@ tryrun:
// container A on the next call to
// runQueue(), rather than run
// container B now.
+ qpos++
+ sorted[i].SchedulingStatus = fmt.Sprintf("waiting for suitable instance type to become available: queue position %d", qpos)
logger.Trace("all eligible types at capacity")
continue
}
@@ -293,6 +322,7 @@ tryrun:
// asynchronously and does its own logging
// about the eventual outcome, so we don't
// need to.)
+ sorted[i].SchedulingStatus = "waiting for new instance to be ready"
logger.Info("creating new instance")
// Don't bother trying to start the container
// yet -- obviously the instance will take
@@ -305,12 +335,26 @@ tryrun:
sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota) + len(overmaxsuper)))
+ var qreason string
+ if sch.pool.AtQuota() {
+ qreason = "waiting for cloud resources"
+ } else {
+ qreason = "waiting while cluster is running at capacity"
+ }
+ for i, ent := range sorted {
+ if ent.SchedulingStatus == "" && (ent.Container.State == arvados.ContainerStateQueued || ent.Container.State == arvados.ContainerStateLocked) {
+ qpos++
+ sorted[i].SchedulingStatus = fmt.Sprintf("%s: queue position %d", qreason, qpos)
+ }
+ }
+ sch.lastQueue.Store(sorted)
+
if len(overquota)+len(overmaxsuper) > 0 {
// Unlock any containers that are unmappable while
// we're at quota (but if they have already been
// scheduled and they're loading docker images etc.,
// let them run).
- var unlock []container.QueueEnt
+ var unlock []QueueEnt
unlock = append(unlock, overmaxsuper...)
if totalInstances > 0 && len(overquota) > 1 {
// We don't unlock the next-in-line container
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index ee7ab50883..bc6574a21a 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -9,6 +9,7 @@ package scheduler
import (
"context"
"sync"
+ "sync/atomic"
"time"
"git.arvados.org/arvados.git/sdk/go/arvados"
@@ -57,6 +58,8 @@ type Scheduler struct {
mLongestWaitTimeSinceQueue prometheus.Gauge
mLast503Time prometheus.Gauge
mMaxContainerConcurrency prometheus.Gauge
+
+ lastQueue atomic.Value // stores a []QueueEnt
}
// New returns a new unstarted Scheduler.
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list