[ARVADOS] created: 1.3.0-3135-gc22f6513d

Git user git at public.arvados.org
Sat Sep 12 04:04:58 UTC 2020


        at  c22f6513da55abc41ec64099f50e24f8d7aef03f (commit)


commit c22f6513da55abc41ec64099f50e24f8d7aef03f
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Sat Sep 12 00:00:30 2020 -0400

    16834: Drain instance if crunch-run exits but arv-mount doesn't.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index b1865a221..15e7c7c06 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -938,6 +938,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 201ae3604..7ed332151 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -944,6 +944,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
diff --git a/lib/crunchrun/background.go b/lib/crunchrun/background.go
index bf039afa0..8cdba72c1 100644
--- a/lib/crunchrun/background.go
+++ b/lib/crunchrun/background.go
@@ -218,6 +218,24 @@ func ListProcesses(stdout, stderr io.Writer) int {
 			return nil
 		}
 
+		proc, err := os.FindProcess(pi.PID)
+		if err != nil {
+			// FindProcess should have succeeded, even if the
+			// process does not exist.
+			fmt.Fprintf(stderr, "%s: find process %d: %s", path, pi.PID, err)
+			return nil
+		}
+		err = proc.Signal(syscall.Signal(0))
+		if err != nil {
+			// Process is dead, even though lockfile was
+			// still locked. Most likely a stuck arv-mount
+			// process that inherited the lock from
+			// crunch-run. Report container UUID as
+			// "stale".
+			fmt.Fprintln(stdout, pi.UUID, "stale")
+			return nil
+		}
+
 		fmt.Fprintln(stdout, pi.UUID)
 		return nil
 	}))
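
The check added above is the conventional signal-0 liveness probe: sending signal 0 delivers nothing, but the kernel still performs the existence (and permission) check, so a dead PID comes back as an error. A minimal standalone sketch of the same technique follows; it is not part of this commit and the helper name is hypothetical.

    package main

    import (
        "fmt"
        "os"
        "syscall"
    )

    // pidAlive reports whether a process with the given PID still exists.
    // On Unix, os.FindProcess always succeeds, so the real test is Signal(0):
    // it returns an error (typically ESRCH) once the process is gone. EPERM
    // also counts as "not alive" here, which is fine for a sketch.
    func pidAlive(pid int) bool {
        proc, err := os.FindProcess(pid)
        if err != nil {
            return false
        }
        return proc.Signal(syscall.Signal(0)) == nil
    }

    func main() {
        fmt.Println(pidAlive(os.Getpid())) // prints "true": this process exists
    }
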
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 6e1850410..cb5cdf1cf 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -66,6 +66,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 				ProbeInterval:        arvados.Duration(5 * time.Millisecond),
 				MaxProbesPerSecond:   1000,
 				TimeoutSignal:        arvados.Duration(3 * time.Millisecond),
+				TimeoutStaleRunLock:  arvados.Duration(3 * time.Millisecond),
 				TimeoutTERM:          arvados.Duration(20 * time.Millisecond),
 				ResourceTags:         map[string]string{"testtag": "test value"},
 				TagKeyPrefix:         "test:",
@@ -169,6 +170,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond)
 		default:
 			stubvm.CrunchRunCrashRate = 0.1
+			stubvm.ArvMountDeadlockRate = 0.1
 		}
 	}
 	s.stubDriver.Bugf = c.Errorf
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 132bd4d69..fe3056b48 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -189,6 +189,7 @@ type StubVM struct {
 	CrunchRunMissing      bool
 	CrunchRunCrashRate    float64
 	CrunchRunDetachDelay  time.Duration
+	ArvMountDeadlockRate  float64
 	ExecuteContainer      func(arvados.Container) int
 	CrashRunningContainer func(arvados.Container)
 
@@ -201,6 +202,7 @@ type StubVM struct {
 	running      map[string]int64
 	killing      map[string]bool
 	lastPID      int64
+	deadlocked   string
 	sync.Mutex
 }
 
@@ -281,6 +283,9 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 					bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
 				} else {
 					delete(svm.running, uuid)
+					if math_rand.Float64() < svm.ArvMountDeadlockRate {
+						svm.deadlocked = uuid + " stale\n"
+					}
 				}
 				if !completed {
 					logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
@@ -339,6 +344,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 		if !svm.ReportBroken.IsZero() && svm.ReportBroken.Before(time.Now()) {
 			fmt.Fprintln(stdout, "broken")
 		}
+		fmt.Fprintln(stdout, svm.deadlocked)
 		return 0
 	}
 	if strings.HasPrefix(command, "crunch-run --kill ") {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 086887cb4..953195c59 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -64,15 +64,16 @@ type Executor interface {
 }
 
 const (
-	defaultSyncInterval       = time.Minute
-	defaultProbeInterval      = time.Second * 10
-	defaultMaxProbesPerSecond = 10
-	defaultTimeoutIdle        = time.Minute
-	defaultTimeoutBooting     = time.Minute * 10
-	defaultTimeoutProbe       = time.Minute * 10
-	defaultTimeoutShutdown    = time.Second * 10
-	defaultTimeoutTERM        = time.Minute * 2
-	defaultTimeoutSignal      = time.Second * 5
+	defaultSyncInterval        = time.Minute
+	defaultProbeInterval       = time.Second * 10
+	defaultMaxProbesPerSecond  = 10
+	defaultTimeoutIdle         = time.Minute
+	defaultTimeoutBooting      = time.Minute * 10
+	defaultTimeoutProbe        = time.Minute * 10
+	defaultTimeoutShutdown     = time.Second * 10
+	defaultTimeoutTERM         = time.Minute * 2
+	defaultTimeoutSignal       = time.Second * 5
+	defaultTimeoutStaleRunLock = time.Second * 5
 
 	// Time after a quota error to try again anyway, even if no
 	// instances have been shutdown.
@@ -115,6 +116,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 		timeoutShutdown:                duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
 		timeoutTERM:                    duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
 		timeoutSignal:                  duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+		timeoutStaleRunLock:            duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
 		installPublicKey:               installPublicKey,
 		tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
 		stop:                           make(chan bool),
@@ -152,6 +154,7 @@ type Pool struct {
 	timeoutShutdown                time.Duration
 	timeoutTERM                    time.Duration
 	timeoutSignal                  time.Duration
+	timeoutStaleRunLock            time.Duration
 	installPublicKey               ssh.PublicKey
 	tagKeyPrefix                   string
 
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 9199d4baf..fff0737b4 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -110,6 +110,7 @@ type worker struct {
 	probing             chan struct{}
 	bootOutcomeReported bool
 	timeToReadyReported bool
+	staleRunLockSince   time.Time
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -382,13 +383,29 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
 		return
 	}
 	ok = true
+
+	staleRunLock := false
 	for _, s := range strings.Split(string(stdout), "\n") {
 		if s == "broken" {
 			reportsBroken = true
-		} else if s != "" {
+		} else if s == "" {
+		} else if toks := strings.Split(s, " "); len(toks) == 1 {
 			running = append(running, s)
+		} else if toks[1] == "stale" {
+			wkr.logger.WithField("ContainerUUID", toks[0]).Warn("probe reported stale lockfile")
+			staleRunLock = true
 		}
 	}
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	if !staleRunLock {
+		wkr.staleRunLockSince = time.Time{}
+	} else if wkr.staleRunLockSince.IsZero() {
+		wkr.staleRunLockSince = time.Now()
+	} else if dur := time.Now().Sub(wkr.staleRunLockSince); dur > wkr.wp.timeoutStaleRunLock {
+		wkr.logger.WithField("Duration", dur).Warn("reporting broken after reporting stale run lock for too long")
+		reportsBroken = true
+	}
 	return
 }
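
After this change, probeRunning recognizes three line formats in the probe output: a bare container UUID (still running), the word "broken" (the instance asks to be drained), and "<uuid> stale" (the run lockfile is still held but its supervisor has exited, most likely a stuck arv-mount). The standalone sketch below classifies example lines the same way; the function name and the UUID are illustrative only, mirroring but not copied from the code above.

    package main

    import (
        "fmt"
        "strings"
    )

    // classifyProbeLine labels one line of "crunch-run --list" output using
    // the same three cases probeRunning distinguishes.
    func classifyProbeLine(line string) string {
        toks := strings.Split(line, " ")
        switch {
        case line == "":
            return "blank, ignored"
        case line == "broken":
            return "instance reports broken"
        case len(toks) == 1:
            return "running container " + toks[0]
        case toks[1] == "stale":
            return "stale run lock held for " + toks[0]
        default:
            return "unrecognized, ignored"
        }
    }

    func main() {
        for _, line := range []string{
            "zzzzz-dz642-examplecontainr", // example UUID, not a real one
            "broken",
            "zzzzz-dz642-examplecontainr stale",
        } {
            fmt.Printf("%q -> %s\n", line, classifyProbeLine(line))
        }
    }
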
 
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 363d09daf..394e30a73 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -462,6 +462,7 @@ type CloudVMsConfig struct {
 	TimeoutProbe                   Duration
 	TimeoutShutdown                Duration
 	TimeoutSignal                  Duration
+	TimeoutStaleRunLock            Duration
 	TimeoutTERM                    Duration
 	ResourceTags                   map[string]string
 	TagKeyPrefix                   string
