[ARVADOS] updated: 1.3.0-535-g41365ac59

Git user git at public.curoverse.com
Tue Mar 19 17:28:47 UTC 2019


Summary of changes:
 lib/dispatchcloud/dispatcher_test.go |  1 -
 lib/dispatchcloud/worker/pool.go     |  3 ---
 lib/dispatchcloud/worker/runner.go   | 30 +++++++++++++++++-------------
 lib/dispatchcloud/worker/worker.go   |  4 ++--
 sdk/go/arvados/config.go             |  7 ++-----
 services/crunch-run/background.go    |  7 +++++++
 6 files changed, 28 insertions(+), 24 deletions(-)

       via  41365ac598721e31fc88c462934e0a06cafe2aae (commit)
       via  3384d541acca51c346f8ebfe08b0dc93a33936ff (commit)
      from  96f73954851d44ab715fa0b34861a93c00aa0519 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 41365ac598721e31fc88c462934e0a06cafe2aae
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Mar 19 13:28:39 2019 -0400

    14807: Don't send SIGKILL to crunch-run processes.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index d0b4efefa..7268f106a 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -64,7 +64,6 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 			MaxProbesPerSecond: 1000,
 			TimeoutSignal:      arvados.Duration(3 * time.Millisecond),
 			TimeoutTERM:        arvados.Duration(20 * time.Millisecond),
-			TimeoutKILL:        arvados.Duration(20 * time.Millisecond),
 		},
 		InstanceTypes: arvados.InstanceTypeMap{
 			test.InstanceType(1).Name:  test.InstanceType(1),
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index d13e29c1e..81a658535 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -69,7 +69,6 @@ const (
 	defaultTimeoutProbe       = time.Minute * 10
 	defaultTimeoutShutdown    = time.Second * 10
 	defaultTimeoutTERM        = time.Minute * 2
-	defaultTimeoutKILL        = time.Second * 20
 	defaultTimeoutSignal      = time.Second * 5
 
 	// Time after a quota error to try again anyway, even if no
@@ -109,7 +108,6 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 		timeoutProbe:       duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
 		timeoutShutdown:    duration(cluster.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
 		timeoutTERM:        duration(cluster.Dispatch.TimeoutTERM, defaultTimeoutTERM),
-		timeoutKILL:        duration(cluster.Dispatch.TimeoutKILL, defaultTimeoutKILL),
 		timeoutSignal:      duration(cluster.Dispatch.TimeoutSignal, defaultTimeoutSignal),
 		installPublicKey:   installPublicKey,
 		stop:               make(chan bool),
@@ -143,7 +141,6 @@ type Pool struct {
 	timeoutProbe       time.Duration
 	timeoutShutdown    time.Duration
 	timeoutTERM        time.Duration
-	timeoutKILL        time.Duration
 	timeoutSignal      time.Duration
 	installPublicKey   ssh.PublicKey
 
diff --git a/lib/dispatchcloud/worker/runner.go b/lib/dispatchcloud/worker/runner.go
index bf1632a6a..69048096f 100644
--- a/lib/dispatchcloud/worker/runner.go
+++ b/lib/dispatchcloud/worker/runner.go
@@ -1,3 +1,7 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
 package worker
 
 import (
@@ -19,14 +23,13 @@ type remoteRunner struct {
 	arvClient     *arvados.Client
 	remoteUser    string
 	timeoutTERM   time.Duration
-	timeoutKILL   time.Duration
 	timeoutSignal time.Duration
-	onUnkillable  func(uuid string) // callback invoked when giving up on SIGKILL
-	onKilled      func(uuid string) // callback invoked when process exits after SIGTERM/SIGKILL
+	onUnkillable  func(uuid string) // callback invoked when giving up on SIGTERM
+	onKilled      func(uuid string) // callback invoked when process exits after SIGTERM
 	logger        logrus.FieldLogger
 
 	stopping bool          // true if Stop() has been called
-	sentKILL bool          // true if SIGKILL has been sent
+	givenup  bool          // true if timeoutTERM has been reached
 	closed   chan struct{} // channel is closed if Close() has been called
 }
 
@@ -39,7 +42,6 @@ func newRemoteRunner(uuid string, wkr *worker) *remoteRunner {
 		arvClient:     wkr.wp.arvClient,
 		remoteUser:    wkr.instance.RemoteUser(),
 		timeoutTERM:   wkr.wp.timeoutTERM,
-		timeoutKILL:   wkr.wp.timeoutKILL,
 		timeoutSignal: wkr.wp.timeoutSignal,
 		onUnkillable:  wkr.onUnkillable,
 		onKilled:      wkr.onKilled,
@@ -88,9 +90,13 @@ func (rr *remoteRunner) Close() {
 	close(rr.closed)
 }
 
-// Kill starts a background task to kill the remote process,
-// escalating from SIGTERM to SIGKILL to onUnkillable() according to
-// the configured timeouts.
+// Kill starts a background task to kill the remote process, first
+// trying SIGTERM until reaching timeoutTERM, then calling
+// onUnkillable().
+//
+// SIGKILL is not used. It would merely kill the crunch-run supervisor
+// and thereby make the docker container, arv-mount, etc. invisible to
+// us without actually stopping them.
 //
 // Once Kill has been called, calling it again has no effect.
 func (rr *remoteRunner) Kill(reason string) {
@@ -101,19 +107,17 @@ func (rr *remoteRunner) Kill(reason string) {
 	rr.logger.WithField("Reason", reason).Info("killing crunch-run process")
 	go func() {
 		termDeadline := time.Now().Add(rr.timeoutTERM)
-		killDeadline := termDeadline.Add(rr.timeoutKILL)
 		t := time.NewTicker(rr.timeoutSignal)
 		defer t.Stop()
 		for range t.C {
 			switch {
 			case rr.isClosed():
 				return
-			case time.Now().After(killDeadline):
+			case time.Now().After(termDeadline):
+				rr.logger.Debug("giving up")
+				rr.givenup = true
 				rr.onUnkillable(rr.uuid)
 				return
-			case time.Now().After(termDeadline):
-				rr.sentKILL = true
-				rr.kill(syscall.SIGKILL)
 			default:
 				rr.kill(syscall.SIGTERM)
 			}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index b0b030a40..41117c1d4 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -394,12 +394,12 @@ func (wkr *worker) eligibleForShutdown() bool {
 			return false
 		}
 		for _, rr := range wkr.running {
-			if !rr.sentKILL {
+			if !rr.givenup {
 				return false
 			}
 		}
 		for _, rr := range wkr.starting {
-			if !rr.sentKILL {
+			if !rr.givenup {
 				return false
 			}
 		}
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 8bd29097f..73addb739 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -123,14 +123,11 @@ type Dispatch struct {
 	// Maximum total worker probes per second
 	MaxProbesPerSecond int
 
-	// Time before repeating TERM/KILL signal
+	// Time before repeating SIGTERM when killing a container
 	TimeoutSignal Duration
 
-	// Time to give up on TERM and move to KILL
+	// Time to give up on SIGTERM and write off the worker
 	TimeoutTERM Duration
-
-	// Time to give up on KILL and write off the worker
-	TimeoutKILL Duration
 }
 
 type CloudVMs struct {

commit 3384d541acca51c346f8ebfe08b0dc93a33936ff
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Mar 19 11:57:32 2019 -0400

    14807: Add comments to "error indicates success" code.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-run/background.go b/services/crunch-run/background.go
index b3c530e69..933692bdc 100644
--- a/services/crunch-run/background.go
+++ b/services/crunch-run/background.go
@@ -116,14 +116,21 @@ func kill(uuid string, signal syscall.Signal, stdout, stderr io.Writer) error {
 
 	proc, err := os.FindProcess(pi.PID)
 	if err != nil {
+		// FindProcess should have succeeded, even if the
+		// process does not exist.
 		return fmt.Errorf("%s: find process %d: %s", uuid, pi.PID, err)
 	}
 
+	// Send the requested signal once, then send signal 0 a few
+	// times.  When proc.Signal() returns an error (process no
+	// longer exists), return success.  If that doesn't happen
+	// within 1 second, return an error.
 	err = proc.Signal(signal)
 	for deadline := time.Now().Add(time.Second); err == nil && time.Now().Before(deadline); time.Sleep(time.Second / 100) {
 		err = proc.Signal(syscall.Signal(0))
 	}
 	if err == nil {
+		// Reached deadline without a proc.Signal() error.
 		return fmt.Errorf("%s: pid %d: sent signal %d (%s) but process is still alive", uuid, pi.PID, signal, signal)
 	}
 	fmt.Fprintf(stderr, "%s: pid %d: %s\n", uuid, pi.PID, err)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list