[ARVADOS] created: 1.3.0-3135-gc22f6513d
From: Git user <git at public.arvados.org>
Date: Sat Sep 12 04:04:58 UTC 2020
at c22f6513da55abc41ec64099f50e24f8d7aef03f (commit)
commit c22f6513da55abc41ec64099f50e24f8d7aef03f
Author: Tom Clegg <tom at tomclegg.ca>
Date: Sat Sep 12 00:00:30 2020 -0400
16834: Drain instance if crunch-run exits but arv-mount doesn't.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>
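What the change does: crunch-run detaches from the dispatcher's session and
holds a lock on a per-container lockfile for as long as the container runs,
and "crunch-run --list" enumerates those lockfiles to report running
containers. If the lockfile is still locked but the PID recorded in it is
gone, the likely culprit is a wedged arv-mount child that inherited the lock
from crunch-run. The probe below detects that case by sending signal 0 to
the recorded PID and reports the container as "stale", so the dispatcher can
drain the instance. A minimal sketch of the signal-0 liveness check, with a
hypothetical pidAlive helper (not part of this commit):

    package main

    import (
        "fmt"
        "os"
        "syscall"
    )

    // pidAlive reports whether the given PID can still be signalled.
    // Signal 0 runs the kernel's existence and permission checks
    // without actually delivering a signal; a nil error means a
    // process with that PID is still running.
    func pidAlive(pid int) bool {
        proc, err := os.FindProcess(pid)
        if err != nil {
            return false // on Unix, FindProcess never fails
        }
        return proc.Signal(syscall.Signal(0)) == nil
    }

    func main() {
        fmt.Println(pidAlive(os.Getpid())) // true: we are running
        fmt.Println(pidAlive(1 << 30))     // almost certainly false
    }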
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index b1865a221..15e7c7c06 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -938,6 +938,11 @@ Clusters:
# Time before repeating SIGTERM when killing a container.
TimeoutSignal: 5s
+ # Time to give up on a process (most likely arv-mount) that
+ # still holds a container lockfile after its main supervisor
+ # process has exited, and declare the instance broken.
+ TimeoutStaleRunLock: 5s
+
# Time to give up on SIGTERM and write off the worker.
TimeoutTERM: 2m
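As with the other timeouts in this section, an unset (zero) value falls back
to a compiled-in default; the pool.go hunk further down wires
TimeoutStaleRunLock through the same value-or-default fallback as
TimeoutSignal and TimeoutTERM. A simplified sketch of that pattern (the real
helper in pool.go takes an arvados.Duration):

    package main

    import (
        "fmt"
        "time"
    )

    // duration returns the configured value, falling back to the
    // compiled-in default when the configuration leaves it unset
    // (zero).
    func duration(conf, def time.Duration) time.Duration {
        if conf > 0 {
            return conf
        }
        return def
    }

    func main() {
        fmt.Println(duration(0, 5*time.Second))                   // unset: default applies
        fmt.Println(duration(3*time.Millisecond, 5*time.Second))  // configured value wins
    }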
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 201ae3604..7ed332151 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -944,6 +944,11 @@ Clusters:
# Time before repeating SIGTERM when killing a container.
TimeoutSignal: 5s
+ # Time to give up on a process (most likely arv-mount) that
+ # still holds a container lockfile after its main supervisor
+ # process has exited, and declare the instance broken.
+ TimeoutStaleRunLock: 5s
+
# Time to give up on SIGTERM and write off the worker.
TimeoutTERM: 2m
diff --git a/lib/crunchrun/background.go b/lib/crunchrun/background.go
index bf039afa0..8cdba72c1 100644
--- a/lib/crunchrun/background.go
+++ b/lib/crunchrun/background.go
@@ -218,6 +218,24 @@ func ListProcesses(stdout, stderr io.Writer) int {
return nil
}
+ proc, err := os.FindProcess(pi.PID)
+ if err != nil {
+ // FindProcess should have succeeded, even if the
+ // process does not exist.
+ fmt.Fprintf(stderr, "%s: find process %d: %s\n", path, pi.PID, err)
+ return nil
+ }
+ err = proc.Signal(syscall.Signal(0))
+ if err != nil {
+ // Process is dead, even though lockfile was
+ // still locked. Most likely a stuck arv-mount
+ // process that inherited the lock from
+ // crunch-run. Report container UUID as
+ // "stale".
+ fmt.Fprintln(stdout, pi.UUID, "stale")
+ return nil
+ }
+
fmt.Fprintln(stdout, pi.UUID)
return nil
}))
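A note on why the lock can outlive crunch-run at all: flock(2)-style locks
belong to the open file description, not to the process, so any child that
inherited the descriptor (here, arv-mount) keeps the lock until its copy of
the fd closes, regardless of the parent's exit. An illustrative stand-alone
sketch of that inheritance, using a made-up lockfile path (not Arvados
code):

    package main

    import (
        "os"
        "os/exec"
        "syscall"
    )

    func main() {
        f, err := os.OpenFile("/tmp/demo.lock", os.O_CREATE|os.O_RDWR, 0666)
        if err != nil {
            panic(err)
        }
        if err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX); err != nil {
            panic(err)
        }
        cmd := exec.Command("sleep", "60")
        cmd.ExtraFiles = []*os.File{f} // child inherits the locked fd
        if err := cmd.Start(); err != nil {
            panic(err)
        }
        // When this parent exits, the lock is still held: another
        // process trying to flock /tmp/demo.lock will block until the
        // sleep child exits and its copy of the fd closes.
    }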
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 6e1850410..cb5cdf1cf 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -66,6 +66,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
ProbeInterval: arvados.Duration(5 * time.Millisecond),
MaxProbesPerSecond: 1000,
TimeoutSignal: arvados.Duration(3 * time.Millisecond),
+ TimeoutStaleRunLock: arvados.Duration(3 * time.Millisecond),
TimeoutTERM: arvados.Duration(20 * time.Millisecond),
ResourceTags: map[string]string{"testtag": "test value"},
TagKeyPrefix: "test:",
@@ -169,6 +170,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond)
default:
stubvm.CrunchRunCrashRate = 0.1
+ stubvm.ArvMountDeadlockRate = 0.1
}
}
s.stubDriver.Bugf = c.Errorf
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 132bd4d69..fe3056b48 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -189,6 +189,7 @@ type StubVM struct {
CrunchRunMissing bool
CrunchRunCrashRate float64
CrunchRunDetachDelay time.Duration
+ ArvMountDeadlockRate float64
ExecuteContainer func(arvados.Container) int
CrashRunningContainer func(arvados.Container)
@@ -201,6 +202,7 @@ type StubVM struct {
running map[string]int64
killing map[string]bool
lastPID int64
+ deadlocked string
sync.Mutex
}
@@ -281,6 +283,9 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
} else {
delete(svm.running, uuid)
+ if math_rand.Float64() < svm.ArvMountDeadlockRate {
+ svm.deadlocked = uuid + " stale\n"
+ }
}
if !completed {
logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
@@ -339,6 +344,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
if !svm.ReportBroken.IsZero() && svm.ReportBroken.Before(time.Now()) {
fmt.Fprintln(stdout, "broken")
}
+ fmt.Fprintln(stdout, svm.deadlocked)
return 0
}
if strings.HasPrefix(command, "crunch-run --kill ") {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 086887cb4..953195c59 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -64,15 +64,16 @@ type Executor interface {
}
const (
- defaultSyncInterval = time.Minute
- defaultProbeInterval = time.Second * 10
- defaultMaxProbesPerSecond = 10
- defaultTimeoutIdle = time.Minute
- defaultTimeoutBooting = time.Minute * 10
- defaultTimeoutProbe = time.Minute * 10
- defaultTimeoutShutdown = time.Second * 10
- defaultTimeoutTERM = time.Minute * 2
- defaultTimeoutSignal = time.Second * 5
+ defaultSyncInterval = time.Minute
+ defaultProbeInterval = time.Second * 10
+ defaultMaxProbesPerSecond = 10
+ defaultTimeoutIdle = time.Minute
+ defaultTimeoutBooting = time.Minute * 10
+ defaultTimeoutProbe = time.Minute * 10
+ defaultTimeoutShutdown = time.Second * 10
+ defaultTimeoutTERM = time.Minute * 2
+ defaultTimeoutSignal = time.Second * 5
+ defaultTimeoutStaleRunLock = time.Second * 5
// Time after a quota error to try again anyway, even if no
// instances have been shutdown.
@@ -115,6 +116,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
timeoutShutdown: duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
timeoutTERM: duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
timeoutSignal: duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+ timeoutStaleRunLock: duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
installPublicKey: installPublicKey,
tagKeyPrefix: cluster.Containers.CloudVMs.TagKeyPrefix,
stop: make(chan bool),
@@ -152,6 +154,7 @@ type Pool struct {
timeoutShutdown time.Duration
timeoutTERM time.Duration
timeoutSignal time.Duration
+ timeoutStaleRunLock time.Duration
installPublicKey ssh.PublicKey
tagKeyPrefix string
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 9199d4baf..fff0737b4 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -110,6 +110,7 @@ type worker struct {
probing chan struct{}
bootOutcomeReported bool
timeToReadyReported bool
+ staleRunLockSince time.Time
}
func (wkr *worker) onUnkillable(uuid string) {
@@ -382,13 +383,29 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
return
}
ok = true
+
+ staleRunLock := false
for _, s := range strings.Split(string(stdout), "\n") {
if s == "broken" {
reportsBroken = true
- } else if s != "" {
+ } else if s == "" {
+ } else if toks := strings.Split(s, " "); len(toks) == 1 {
running = append(running, s)
+ } else if toks[1] == "stale" {
+ wkr.logger.WithField("ContainerUUID", toks[0]).Warn("probe reported stale lockfile")
+ staleRunLock = true
}
}
+ wkr.mtx.Lock()
+ defer wkr.mtx.Unlock()
+ if !staleRunLock {
+ wkr.staleRunLockSince = time.Time{}
+ } else if wkr.staleRunLockSince.IsZero() {
+ wkr.staleRunLockSince = time.Now()
+ } else if dur := time.Now().Sub(wkr.staleRunLockSince); dur > wkr.wp.timeoutStaleRunLock {
+ wkr.logger.WithField("Duration", dur).Warn("reporting broken after reporting stale run lock for too long")
+ reportsBroken = true
+ }
return
}
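After this change a probe output line takes one of three forms: "broken", a
bare container UUID (still running), or "UUID stale" (lockfile held by an
orphan of a dead supervisor). Note the debounce: a single stale report only
starts a clock (staleRunLockSince); reportsBroken is set only once the
condition has persisted longer than TimeoutStaleRunLock, so a transiently
slow arv-mount shutdown does not condemn the instance. A condensed sketch of
that persist-longer-than-timeout pattern (type and names are illustrative,
not from the commit):

    package main

    import (
        "fmt"
        "time"
    )

    type staleTracker struct {
        since   time.Time
        timeout time.Duration
    }

    // observe records whether the condition holds right now and
    // reports whether it has held continuously for longer than the
    // timeout.
    func (t *staleTracker) observe(stale bool) bool {
        switch {
        case !stale:
            t.since = time.Time{} // cleared: reset the clock
        case t.since.IsZero():
            t.since = time.Now() // first sighting: start the clock
        case time.Since(t.since) > t.timeout:
            return true // persisted too long: act
        }
        return false
    }

    func main() {
        t := staleTracker{timeout: 10 * time.Millisecond}
        fmt.Println(t.observe(true)) // false: clock just started
        time.Sleep(20 * time.Millisecond)
        fmt.Println(t.observe(true)) // true: exceeded the timeout
    }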
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 363d09daf..394e30a73 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -462,6 +462,7 @@ type CloudVMsConfig struct {
TimeoutProbe Duration
TimeoutShutdown Duration
TimeoutSignal Duration
+ TimeoutStaleRunLock Duration
TimeoutTERM Duration
ResourceTags map[string]string
TagKeyPrefix string
-----------------------------------------------------------------------
hooks/post-receive