[ARVADOS] created: 1.3.0-1121-g49717fb59
Git user
git at public.curoverse.com
Tue Jun 18 19:25:29 UTC 2019
at 49717fb59156c2b276ccc2fde0b9f2de71e812a6 (commit)
commit 49717fb59156c2b276ccc2fde0b9f2de71e812a6
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Jun 18 15:25:20 2019 -0400
15345: Add .../containers/kill management API to dispatcher.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index 12c60ecb1..731c6d25d 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -148,6 +148,7 @@ func (disp *dispatcher) initialize() {
} else {
mux := httprouter.New()
mux.HandlerFunc("GET", "/arvados/v1/dispatch/containers", disp.apiContainers)
+ mux.HandlerFunc("POST", "/arvados/v1/dispatch/containers/kill", disp.apiInstanceKill)
mux.HandlerFunc("GET", "/arvados/v1/dispatch/instances", disp.apiInstances)
mux.HandlerFunc("POST", "/arvados/v1/dispatch/instances/hold", disp.apiInstanceHold)
mux.HandlerFunc("POST", "/arvados/v1/dispatch/instances/drain", disp.apiInstanceDrain)
@@ -232,6 +233,20 @@ func (disp *dispatcher) apiInstanceKill(w http.ResponseWriter, r *http.Request)
}
}
+// Management API: send SIGTERM to the specified container's crunch-run
+// process now.
+func (disp *dispatcher) apiContainerKill(w http.ResponseWriter, r *http.Request) {
+ uuid := r.FormValue("container_uuid")
+ if uuid == "" {
+ httpserver.Error(w, "container_uuid parameter not provided", http.StatusBadRequest)
+ return
+ }
+ if !disp.pool.KillContainer(uuid, "via management API: "+r.FormValue("reason")) {
+ httpserver.Error(w, "container not found", http.StatusNotFound)
+ return
+ }
+}
+
func (disp *dispatcher) apiInstanceIdleBehavior(w http.ResponseWriter, r *http.Request, want worker.IdleBehavior) {
id := cloud.InstanceID(r.FormValue("instance_id"))
if id == "" {
diff --git a/lib/dispatchcloud/scheduler/interfaces.go b/lib/dispatchcloud/scheduler/interfaces.go
index 307807e32..6e00911bd 100644
--- a/lib/dispatchcloud/scheduler/interfaces.go
+++ b/lib/dispatchcloud/scheduler/interfaces.go
@@ -38,7 +38,8 @@ type WorkerPool interface {
Create(arvados.InstanceType) bool
Shutdown(arvados.InstanceType) bool
StartContainer(arvados.InstanceType, arvados.Container) bool
- KillContainer(uuid, reason string)
+ KillContainer(uuid, reason string) bool
+ ForgetContainer(uuid string)
Subscribe() <-chan struct{}
Unsubscribe(<-chan struct{})
}
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index dab324579..c683b704d 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -77,10 +77,13 @@ func (p *stubPool) Create(it arvados.InstanceType) bool {
p.unalloc[it]++
return true
}
-func (p *stubPool) KillContainer(uuid, reason string) {
+func (p *stubPool) ForgetContainer(uuid string) {
+}
+func (p *stubPool) KillContainer(uuid, reason string) bool {
p.Lock()
defer p.Unlock()
delete(p.running, uuid)
+ return true
}
func (p *stubPool) Shutdown(arvados.InstanceType) bool {
p.shutdowns++
diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index 99bee484c..78f099549 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -99,6 +99,7 @@ func (sch *Scheduler) cancel(uuid string, reason string) {
func (sch *Scheduler) kill(uuid string, reason string) {
sch.pool.KillContainer(uuid, reason)
+ sch.pool.ForgetContainer(uuid)
}
func (sch *Scheduler) requeue(ent container.QueueEnt, reason string) {
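
Note the division of labor this hunk establishes: the scheduler's kill()
both signals the process and clears the exited-at placeholder, while the
management-API handler above calls KillContainer alone, so Running() keeps
reporting the container until the sync loop catches up. A hedged sketch of
the combined pattern against the WorkerPool interface from this commit
(the helper name and import path are illustrative):

package example

import (
	"log"

	"git.curoverse.com/arvados.git/lib/dispatchcloud/scheduler"
)

// killAndForget mirrors sch.kill above: signal the container's
// crunch-run process, then drop any "exited at time T" placeholder
// so Running() stops reporting the container.
func killAndForget(pool scheduler.WorkerPool, uuid, reason string) {
	if !pool.KillContainer(uuid, reason) {
		// Process already ended; only the placeholder (if any)
		// remains to be cleared.
		log.Printf("%s: no crunch-run process to kill", uuid)
	}
	pool.ForgetContainer(uuid)
}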
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 201e8aad2..97ca7f60a 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -154,7 +154,7 @@ type Pool struct {
creating map[string]createCall // unfinished (cloud.InstanceSet)Create calls (key is instance secret)
workers map[cloud.InstanceID]*worker
loaded bool // loaded list of instances from InstanceSet at least once
- exited map[string]time.Time // containers whose crunch-run proc has exited, but KillContainer has not been called
+ exited map[string]time.Time // containers whose crunch-run proc has exited, but ForgetContainer has not been called
atQuotaUntil time.Time
atQuotaErr cloud.QuotaError
stop chan bool
@@ -446,7 +446,7 @@ func (wp *Pool) CountWorkers() map[State]int {
// In the returned map, the time value indicates when the Pool
// observed that the container process had exited. A container that
// has not yet exited has a zero time value. The caller should use
-// KillContainer() to garbage-collect the entries for exited
+// ForgetContainer() to garbage-collect the entries for exited
// containers.
func (wp *Pool) Running() map[string]time.Time {
wp.setupOnce.Do(wp.setup)
@@ -493,18 +493,15 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
//
// KillContainer returns immediately; the act of killing the container
// takes some time, and runs in the background.
-func (wp *Pool) KillContainer(uuid string, reason string) {
+//
+// KillContainer returns false if the container has already ended.
+func (wp *Pool) KillContainer(uuid string, reason string) bool {
wp.mtx.Lock()
defer wp.mtx.Unlock()
logger := wp.logger.WithFields(logrus.Fields{
"ContainerUUID": uuid,
"Reason": reason,
})
- if _, ok := wp.exited[uuid]; ok {
- logger.Debug("clearing placeholder for exited crunch-run process")
- delete(wp.exited, uuid)
- return
- }
for _, wkr := range wp.workers {
rr := wkr.running[uuid]
if rr == nil {
@@ -512,10 +509,30 @@ func (wp *Pool) KillContainer(uuid string, reason string) {
}
if rr != nil {
rr.Kill(reason)
- return
+ return true
}
}
logger.Debug("cannot kill: already disappeared")
+ return false
+}
+
+// ForgetContainer clears the placeholder for the given exited
+// container, so it isn't returned by subsequent calls to Running().
+//
+// ForgetContainer has no effect if the container has not yet exited.
+//
+// The "container exited at time T" placeholder (which necessitates
+// ForgetContainer) exists to make it easier for the caller
+// (scheduler) to distinguish a container that exited without
+// finalizing its state from a container that exited too recently for
+// its final state to have appeared in the scheduler's queue cache.
+func (wp *Pool) ForgetContainer(uuid string) {
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ if _, ok := wp.exited[uuid]; ok {
+ wp.logger.WithField("ContainerUUID", uuid).Debug("clearing placeholder for exited crunch-run process")
+ delete(wp.exited, uuid)
+ }
}
func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
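
The Running()/ForgetContainer() contract above implies a caller-side
sweep over exited containers. A minimal sketch, with an assumed
one-minute grace period standing in for the scheduler's real check
against its queue cache:

package example

import (
	"time"

	"git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
)

// sweepExited garbage-collects placeholders per the Running() comment
// above: entries with a nonzero time value have exited, and the caller
// clears them with ForgetContainer() once it no longer needs the
// "exited at time T" marker. The grace period is an assumption.
func sweepExited(pool *worker.Pool) {
	for uuid, exited := range pool.Running() {
		if exited.IsZero() {
			continue // container process still running
		}
		if time.Since(exited) > time.Minute {
			pool.ForgetContainer(uuid)
		}
	}
}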
-----------------------------------------------------------------------
hooks/post-receive