[ARVADOS] created: 1.3.0-3135-g224f4793b

Git user git@public.arvados.org
Sat Sep 12 19:51:50 UTC 2020


        at  224f4793b7c147cb3ef9a261e3a60cddb557e1f1 (commit)


commit 224f4793b7c147cb3ef9a261e3a60cddb557e1f1
Author: Tom Clegg <tom@tomclegg.ca>
Date:   Sat Sep 12 15:51:25 2020 -0400

    16834: Drain instance if crunch-run exits but arv-mount doesn't.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>
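
    Background note (not part of the commit): crunch-run takes a per-container
    lockfile, and an arv-mount child can inherit and keep holding that lock
    after crunch-run itself has exited. The standalone sketch below shows one
    way such inheritance happens with flock(2)-style locks; the lockfile path
    is illustrative, and whether crunch-run uses flock or fcntl locking is not
    shown in this diff.

        package main

        import (
            "fmt"
            "os"
            "syscall"
        )

        func main() {
            // Illustrative path only; not where crunch-run keeps its
            // per-container lockfiles.
            f, err := os.OpenFile("/tmp/demo.lock", os.O_CREATE|os.O_RDWR, 0600)
            if err != nil {
                fmt.Fprintln(os.Stderr, err)
                return
            }
            defer f.Close()
            // Non-blocking exclusive lock. A lock like this belongs to the
            // open file description, so a child process that inherits the
            // descriptor keeps the lock alive even after the parent exits,
            // which is how a stuck arv-mount can end up holding a stale
            // run lock.
            if err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); err != nil {
                fmt.Println("lockfile is already held by another process")
                return
            }
            fmt.Println("lock acquired")
        }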

diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index b1865a221..15e7c7c06 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -938,6 +938,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
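For reference, an operator who wanted a longer grace period than the 5s
default would set this key under the same Containers.CloudVMs section of the
cluster configuration. The cluster ID zzzzz and the 30s value below are
purely illustrative:

    Clusters:
      zzzzz:
        Containers:
          CloudVMs:
            TimeoutStaleRunLock: 30s
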
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 201ae3604..7ed332151 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -944,6 +944,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
diff --git a/lib/crunchrun/background.go b/lib/crunchrun/background.go
index bf039afa0..8cdba72c1 100644
--- a/lib/crunchrun/background.go
+++ b/lib/crunchrun/background.go
@@ -218,6 +218,24 @@ func ListProcesses(stdout, stderr io.Writer) int {
 			return nil
 		}
 
+		proc, err := os.FindProcess(pi.PID)
+		if err != nil {
+			// FindProcess should have succeeded, even if the
+			// process does not exist.
+			fmt.Fprintf(stderr, "%s: find process %d: %s", path, pi.PID, err)
+			return nil
+		}
+		err = proc.Signal(syscall.Signal(0))
+		if err != nil {
+			// Process is dead, even though lockfile was
+			// still locked. Most likely a stuck arv-mount
+			// process that inherited the lock from
+			// crunch-run. Report container UUID as
+			// "stale".
+			fmt.Fprintln(stdout, pi.UUID, "stale")
+			return nil
+		}
+
 		fmt.Fprintln(stdout, pi.UUID)
 		return nil
 	}))
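
    The liveness test added above uses the standard signal-0 idiom: sending
    signal 0 delivers nothing, but the error result reports whether the
    target PID still exists. A minimal standalone sketch (not part of the
    commit), using the current process's own PID as a stand-in for the PID
    read from the lockfile:

        package main

        import (
            "fmt"
            "os"
            "syscall"
        )

        func main() {
            pid := os.Getpid() // stand-in for the PID recorded in the lockfile
            proc, err := os.FindProcess(pid)
            if err != nil {
                // On Unix, FindProcess succeeds even for PIDs that no
                // longer exist, so this branch is unexpected.
                fmt.Fprintln(os.Stderr, "find process:", err)
                return
            }
            if err := proc.Signal(syscall.Signal(0)); err != nil {
                // No such process (or no permission): the lock holder is
                // gone, so "crunch-run --list" would report the container
                // as "stale".
                fmt.Println("stale")
                return
            }
            fmt.Println("running")
        }
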
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 6e1850410..cb5cdf1cf 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -66,6 +66,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 				ProbeInterval:        arvados.Duration(5 * time.Millisecond),
 				MaxProbesPerSecond:   1000,
 				TimeoutSignal:        arvados.Duration(3 * time.Millisecond),
+				TimeoutStaleRunLock:  arvados.Duration(3 * time.Millisecond),
 				TimeoutTERM:          arvados.Duration(20 * time.Millisecond),
 				ResourceTags:         map[string]string{"testtag": "test value"},
 				TagKeyPrefix:         "test:",
@@ -169,6 +170,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond)
 		default:
 			stubvm.CrunchRunCrashRate = 0.1
+			stubvm.ArvMountDeadlockRate = 0.1
 		}
 	}
 	s.stubDriver.Bugf = c.Errorf
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 132bd4d69..4d32cf221 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -131,7 +131,7 @@ func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID,
 		tags:         copyTags(tags),
 		providerType: it.ProviderType,
 		initCommand:  cmd,
-		running:      map[string]int64{},
+		running:      map[string]stubProcess{},
 		killing:      map[string]bool{},
 	}
 	svm.SSHService = SSHService{
@@ -189,6 +189,8 @@ type StubVM struct {
 	CrunchRunMissing      bool
 	CrunchRunCrashRate    float64
 	CrunchRunDetachDelay  time.Duration
+	ArvMountMaxExitLag    time.Duration
+	ArvMountDeadlockRate  float64
 	ExecuteContainer      func(arvados.Container) int
 	CrashRunningContainer func(arvados.Container)
 
@@ -198,12 +200,21 @@ type StubVM struct {
 	initCommand  cloud.InitCommand
 	providerType string
 	SSHService   SSHService
-	running      map[string]int64
+	running      map[string]stubProcess
 	killing      map[string]bool
 	lastPID      int64
+	deadlocked   string
 	sync.Mutex
 }
 
+type stubProcess struct {
+	pid int64
+
+	// crunch-run has exited, but arv-mount process (or something)
+	// still holds lock in /var/run/
+	exited bool
+}
+
 func (svm *StubVM) Instance() stubInstance {
 	svm.Lock()
 	defer svm.Unlock()
@@ -256,7 +267,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 		svm.Lock()
 		svm.lastPID++
 		pid := svm.lastPID
-		svm.running[uuid] = pid
+		svm.running[uuid] = stubProcess{pid: pid}
 		svm.Unlock()
 		time.Sleep(svm.CrunchRunDetachDelay)
 		fmt.Fprintf(stderr, "starting %s\n", uuid)
@@ -273,14 +284,13 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 				logger.Print("[test] exiting crunch-run stub")
 				svm.Lock()
 				defer svm.Unlock()
-				if svm.running[uuid] != pid {
+				if svm.running[uuid].pid != pid {
 					bugf := svm.sis.driver.Bugf
 					if bugf == nil {
 						bugf = logger.Warnf
 					}
-					bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
-				} else {
-					delete(svm.running, uuid)
+					bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s].pid==%d", pid, uuid, svm.running[uuid].pid)
+					return
 				}
 				if !completed {
 					logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
@@ -288,6 +298,15 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 						svm.CrashRunningContainer(ctr)
 					}
 				}
+				sproc := svm.running[uuid]
+				sproc.exited = true
+				svm.running[uuid] = sproc
+				svm.Unlock()
+				time.Sleep(svm.ArvMountMaxExitLag * time.Duration(math_rand.Float64()))
+				svm.Lock()
+				if math_rand.Float64() >= svm.ArvMountDeadlockRate {
+					delete(svm.running, uuid)
+				}
 			}()
 
 			crashluck := math_rand.Float64()
@@ -333,26 +352,31 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 	if command == "crunch-run --list" {
 		svm.Lock()
 		defer svm.Unlock()
-		for uuid := range svm.running {
-			fmt.Fprintf(stdout, "%s\n", uuid)
+		for uuid, sproc := range svm.running {
+			if sproc.exited {
+				fmt.Fprintf(stdout, "%s stale\n", uuid)
+			} else {
+				fmt.Fprintf(stdout, "%s\n", uuid)
+			}
 		}
 		if !svm.ReportBroken.IsZero() && svm.ReportBroken.Before(time.Now()) {
 			fmt.Fprintln(stdout, "broken")
 		}
+		fmt.Fprintln(stdout, svm.deadlocked)
 		return 0
 	}
 	if strings.HasPrefix(command, "crunch-run --kill ") {
 		svm.Lock()
-		_, running := svm.running[uuid]
-		if running {
+		sproc, running := svm.running[uuid]
+		if running && !sproc.exited {
 			svm.killing[uuid] = true
 			svm.Unlock()
 			time.Sleep(time.Duration(math_rand.Float64()*2) * time.Millisecond)
 			svm.Lock()
-			_, running = svm.running[uuid]
+			sproc, running = svm.running[uuid]
 		}
 		svm.Unlock()
-		if running {
+		if running && !sproc.exited {
 			fmt.Fprintf(stderr, "%s: container is running\n", uuid)
 			return 1
 		}
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 086887cb4..953195c59 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -64,15 +64,16 @@ type Executor interface {
 }
 
 const (
-	defaultSyncInterval       = time.Minute
-	defaultProbeInterval      = time.Second * 10
-	defaultMaxProbesPerSecond = 10
-	defaultTimeoutIdle        = time.Minute
-	defaultTimeoutBooting     = time.Minute * 10
-	defaultTimeoutProbe       = time.Minute * 10
-	defaultTimeoutShutdown    = time.Second * 10
-	defaultTimeoutTERM        = time.Minute * 2
-	defaultTimeoutSignal      = time.Second * 5
+	defaultSyncInterval        = time.Minute
+	defaultProbeInterval       = time.Second * 10
+	defaultMaxProbesPerSecond  = 10
+	defaultTimeoutIdle         = time.Minute
+	defaultTimeoutBooting      = time.Minute * 10
+	defaultTimeoutProbe        = time.Minute * 10
+	defaultTimeoutShutdown     = time.Second * 10
+	defaultTimeoutTERM         = time.Minute * 2
+	defaultTimeoutSignal       = time.Second * 5
+	defaultTimeoutStaleRunLock = time.Second * 5
 
 	// Time after a quota error to try again anyway, even if no
 	// instances have been shutdown.
@@ -115,6 +116,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 		timeoutShutdown:                duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
 		timeoutTERM:                    duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
 		timeoutSignal:                  duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+		timeoutStaleRunLock:            duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
 		installPublicKey:               installPublicKey,
 		tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
 		stop:                           make(chan bool),
@@ -152,6 +154,7 @@ type Pool struct {
 	timeoutShutdown                time.Duration
 	timeoutTERM                    time.Duration
 	timeoutSignal                  time.Duration
+	timeoutStaleRunLock            time.Duration
 	installPublicKey               ssh.PublicKey
 	tagKeyPrefix                   string
 
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 9199d4baf..f903c0a5a 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -110,6 +110,7 @@ type worker struct {
 	probing             chan struct{}
 	bootOutcomeReported bool
 	timeToReadyReported bool
+	staleRunLockSince   time.Time
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -382,13 +383,29 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
 		return
 	}
 	ok = true
+
+	staleRunLock := false
 	for _, s := range strings.Split(string(stdout), "\n") {
 		if s == "broken" {
 			reportsBroken = true
-		} else if s != "" {
+		} else if s == "" {
+		} else if toks := strings.Split(s, " "); len(toks) == 1 {
 			running = append(running, s)
+		} else if toks[1] == "stale" {
+			wkr.logger.WithField("ContainerUUID", toks[0]).Info("probe reported stale run lock")
+			staleRunLock = true
 		}
 	}
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	if !staleRunLock {
+		wkr.staleRunLockSince = time.Time{}
+	} else if wkr.staleRunLockSince.IsZero() {
+		wkr.staleRunLockSince = time.Now()
+	} else if dur := time.Now().Sub(wkr.staleRunLockSince); dur > wkr.wp.timeoutStaleRunLock {
+		wkr.logger.WithField("Duration", dur).Warn("reporting broken after reporting stale run lock for too long")
+		reportsBroken = true
+	}
 	return
 }
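
    The probe-side logic above debounces the report: a single stale probe
    only starts a clock, a clean probe resets it, and the instance is only
    written off once the condition has persisted longer than
    TimeoutStaleRunLock. A minimal standalone sketch of that state machine
    (the type and method names here are illustrative, not the dispatcher's
    real ones):

        package main

        import (
            "fmt"
            "time"
        )

        type staleTracker struct {
            since   time.Time
            timeout time.Duration
        }

        // observe records one probe result and reports whether the stale
        // lock has now persisted long enough to declare the worker broken.
        func (t *staleTracker) observe(stale bool, now time.Time) bool {
            if !stale {
                t.since = time.Time{} // lock cleared; reset the clock
                return false
            }
            if t.since.IsZero() {
                t.since = now // first stale report; start the clock
                return false
            }
            return now.Sub(t.since) > t.timeout
        }

        func main() {
            t := staleTracker{timeout: 5 * time.Second}
            start := time.Now()
            fmt.Println(t.observe(true, start))                     // false: clock starts
            fmt.Println(t.observe(true, start.Add(6*time.Second)))  // true: past timeout
            fmt.Println(t.observe(false, start.Add(7*time.Second))) // false: cleared, reset
        }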
 
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 363d09daf..394e30a73 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -462,6 +462,7 @@ type CloudVMsConfig struct {
 	TimeoutProbe                   Duration
 	TimeoutShutdown                Duration
 	TimeoutSignal                  Duration
+	TimeoutStaleRunLock            Duration
 	TimeoutTERM                    Duration
 	ResourceTags                   map[string]string
 	TagKeyPrefix                   string

-----------------------------------------------------------------------


hooks/post-receive