[ARVADOS] updated: 1.3.0-535-g41365ac59
Git user
git at public.curoverse.com
Tue Mar 19 17:28:47 UTC 2019
Summary of changes:
lib/dispatchcloud/dispatcher_test.go | 1 -
lib/dispatchcloud/worker/pool.go | 3 ---
lib/dispatchcloud/worker/runner.go | 30 +++++++++++++++++-------------
lib/dispatchcloud/worker/worker.go | 4 ++--
sdk/go/arvados/config.go | 7 ++-----
services/crunch-run/background.go | 7 +++++++
6 files changed, 28 insertions(+), 24 deletions(-)
via 41365ac598721e31fc88c462934e0a06cafe2aae (commit)
via 3384d541acca51c346f8ebfe08b0dc93a33936ff (commit)
from 96f73954851d44ab715fa0b34861a93c00aa0519 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 41365ac598721e31fc88c462934e0a06cafe2aae
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Mar 19 13:28:39 2019 -0400
14807: Don't send SIGKILL to crunch-run processes.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index d0b4efefa..7268f106a 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -64,7 +64,6 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
MaxProbesPerSecond: 1000,
TimeoutSignal: arvados.Duration(3 * time.Millisecond),
TimeoutTERM: arvados.Duration(20 * time.Millisecond),
- TimeoutKILL: arvados.Duration(20 * time.Millisecond),
},
InstanceTypes: arvados.InstanceTypeMap{
test.InstanceType(1).Name: test.InstanceType(1),
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index d13e29c1e..81a658535 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -69,7 +69,6 @@ const (
defaultTimeoutProbe = time.Minute * 10
defaultTimeoutShutdown = time.Second * 10
defaultTimeoutTERM = time.Minute * 2
- defaultTimeoutKILL = time.Second * 20
defaultTimeoutSignal = time.Second * 5
// Time after a quota error to try again anyway, even if no
@@ -109,7 +108,6 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
timeoutProbe: duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
timeoutShutdown: duration(cluster.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
timeoutTERM: duration(cluster.Dispatch.TimeoutTERM, defaultTimeoutTERM),
- timeoutKILL: duration(cluster.Dispatch.TimeoutKILL, defaultTimeoutKILL),
timeoutSignal: duration(cluster.Dispatch.TimeoutSignal, defaultTimeoutSignal),
installPublicKey: installPublicKey,
stop: make(chan bool),
@@ -143,7 +141,6 @@ type Pool struct {
timeoutProbe time.Duration
timeoutShutdown time.Duration
timeoutTERM time.Duration
- timeoutKILL time.Duration
timeoutSignal time.Duration
installPublicKey ssh.PublicKey
diff --git a/lib/dispatchcloud/worker/runner.go b/lib/dispatchcloud/worker/runner.go
index bf1632a6a..69048096f 100644
--- a/lib/dispatchcloud/worker/runner.go
+++ b/lib/dispatchcloud/worker/runner.go
@@ -1,3 +1,7 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
package worker
import (
@@ -19,14 +23,13 @@ type remoteRunner struct {
arvClient *arvados.Client
remoteUser string
timeoutTERM time.Duration
- timeoutKILL time.Duration
timeoutSignal time.Duration
- onUnkillable func(uuid string) // callback invoked when giving up on SIGKILL
- onKilled func(uuid string) // callback invoked when process exits after SIGTERM/SIGKILL
+ onUnkillable func(uuid string) // callback invoked when giving up on SIGTERM
+ onKilled func(uuid string) // callback invoked when process exits after SIGTERM
logger logrus.FieldLogger
stopping bool // true if Stop() has been called
- sentKILL bool // true if SIGKILL has been sent
+ givenup bool // true if timeoutTERM has been reached
closed chan struct{} // channel is closed if Close() has been called
}
@@ -39,7 +42,6 @@ func newRemoteRunner(uuid string, wkr *worker) *remoteRunner {
arvClient: wkr.wp.arvClient,
remoteUser: wkr.instance.RemoteUser(),
timeoutTERM: wkr.wp.timeoutTERM,
- timeoutKILL: wkr.wp.timeoutKILL,
timeoutSignal: wkr.wp.timeoutSignal,
onUnkillable: wkr.onUnkillable,
onKilled: wkr.onKilled,
@@ -88,9 +90,13 @@ func (rr *remoteRunner) Close() {
close(rr.closed)
}
-// Kill starts a background task to kill the remote process,
-// escalating from SIGTERM to SIGKILL to onUnkillable() according to
-// the configured timeouts.
+// Kill starts a background task to kill the remote process, first
+// trying SIGTERM until reaching timeoutTERM, then calling
+// onUnkillable().
+//
+// SIGKILL is not used. It would merely kill the crunch-run supervisor
+// and thereby make the docker container, arv-mount, etc. invisible to
+// us without actually stopping them.
//
// Once Kill has been called, calling it again has no effect.
func (rr *remoteRunner) Kill(reason string) {
@@ -101,19 +107,17 @@ func (rr *remoteRunner) Kill(reason string) {
rr.logger.WithField("Reason", reason).Info("killing crunch-run process")
go func() {
termDeadline := time.Now().Add(rr.timeoutTERM)
- killDeadline := termDeadline.Add(rr.timeoutKILL)
t := time.NewTicker(rr.timeoutSignal)
defer t.Stop()
for range t.C {
switch {
case rr.isClosed():
return
- case time.Now().After(killDeadline):
+ case time.Now().After(termDeadline):
+ rr.logger.Debug("giving up")
+ rr.givenup = true
rr.onUnkillable(rr.uuid)
return
- case time.Now().After(termDeadline):
- rr.sentKILL = true
- rr.kill(syscall.SIGKILL)
default:
rr.kill(syscall.SIGTERM)
}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index b0b030a40..41117c1d4 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -394,12 +394,12 @@ func (wkr *worker) eligibleForShutdown() bool {
return false
}
for _, rr := range wkr.running {
- if !rr.sentKILL {
+ if !rr.givenup {
return false
}
}
for _, rr := range wkr.starting {
- if !rr.sentKILL {
+ if !rr.givenup {
return false
}
}
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 8bd29097f..73addb739 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -123,14 +123,11 @@ type Dispatch struct {
// Maximum total worker probes per second
MaxProbesPerSecond int
- // Time before repeating TERM/KILL signal
+ // Time before repeating SIGTERM when killing a container
TimeoutSignal Duration
- // Time to give up on TERM and move to KILL
+ // Time to give up on SIGTERM and write off the worker
TimeoutTERM Duration
-
- // Time to give up on KILL and write off the worker
- TimeoutKILL Duration
}
type CloudVMs struct {
commit 3384d541acca51c346f8ebfe08b0dc93a33936ff
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Tue Mar 19 11:57:32 2019 -0400
14807: Add comments to "error indicates success" code.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/crunch-run/background.go b/services/crunch-run/background.go
index b3c530e69..933692bdc 100644
--- a/services/crunch-run/background.go
+++ b/services/crunch-run/background.go
@@ -116,14 +116,21 @@ func kill(uuid string, signal syscall.Signal, stdout, stderr io.Writer) error {
proc, err := os.FindProcess(pi.PID)
if err != nil {
+ // FindProcess should have succeeded, even if the
+ // process does not exist.
return fmt.Errorf("%s: find process %d: %s", uuid, pi.PID, err)
}
+ // Send the requested signal once, then send signal 0 a few
+ // times. When proc.Signal() returns an error (process no
+ // longer exists), return success. If that doesn't happen
+ // within 1 second, return an error.
err = proc.Signal(signal)
for deadline := time.Now().Add(time.Second); err == nil && time.Now().Before(deadline); time.Sleep(time.Second / 100) {
err = proc.Signal(syscall.Signal(0))
}
if err == nil {
+ // Reached deadline without a proc.Signal() error.
return fmt.Errorf("%s: pid %d: sent signal %d (%s) but process is still alive", uuid, pi.PID, signal, signal)
}
fmt.Fprintf(stderr, "%s: pid %d: %s\n", uuid, pi.PID, err)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list