[ARVADOS] created: 1.3.0-2950-g49e86c93f

Git user git at public.arvados.org
Fri Aug 21 15:28:11 UTC 2020


        at  49e86c93f50583975d0265241c375c2ba8ec7ed1 (commit)


commit 49e86c93f50583975d0265241c375c2ba8ec7ed1
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 11:25:27 2020 -0400

    16723: Don't lock a requeued container while old crunch-runs exist.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 4447f084a..9396dda07 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -88,6 +88,9 @@ tryrun:
 				// a higher-priority container on the
 				// same instance type. Don't let this
 				// one sneak in ahead of it.
+			} else if sch.pool.KillContainer(ctr.UUID, "about to lock") {
+				logger.Debug("crunch-run process still exists from previous attempt")
+				dontstart[it] = true
 			} else if sch.pool.StartContainer(it, ctr) {
 				// Success.
 			} else {

commit 1de49feb938cc265ec90786c22a1456237796aef
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 11:24:50 2020 -0400

    16723: Fail test if a crunch-run stub is killed before finishing.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 3b45e75fd..f8940f720 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -171,6 +171,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			stubvm.CrunchRunCrashRate = 0.1
 		}
 	}
+	s.stubDriver.Bugf = c.Errorf
 
 	start := time.Now()
 	go s.disp.run()
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index f45d37352..f6e06d3f7 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -34,6 +34,11 @@ type StubDriver struct {
 	// VM's error rate and other behaviors.
 	SetupVM func(*StubVM)
 
+	// Bugf, if set, is called if a bug is detected in the caller
+	// or stub. Typically set to (*check.C)Errorf. If unset,
+	// logger.Warnf is called instead.
+	Bugf func(string, ...interface{})
+
 	// StubVM's fake crunch-run uses this Queue to read and update
 	// container state.
 	Queue *Queue
@@ -273,7 +278,11 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 				defer svm.Unlock()
 				if svm.running[uuid] != pid {
 					if !completed {
-						logger.Errorf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
+						bugf := svm.sis.driver.Bugf
+						if bugf == nil {
+							bugf = logger.Warnf
+						}
+						bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
 					}
 				} else {
 					delete(svm.running, uuid)

commit e49cf0412503926eccfe829f2135c5a072b6acc9
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 10:42:49 2020 -0400

    16723: Hold uuidLock in kill() to avoid racing with unlock+start.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index c047f9b87..fc683505f 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -109,6 +109,10 @@ func (sch *Scheduler) cancel(uuid string, reason string) {
 }
 
 func (sch *Scheduler) kill(uuid string, reason string) {
+	if !sch.uuidLock(uuid, "kill") {
+		return
+	}
+	defer sch.uuidUnlock(uuid)
 	sch.pool.KillContainer(uuid, reason)
 	sch.pool.ForgetContainer(uuid)
 }

commit d3fbb6b33a5b4a55dda969c7e7360cc31e1de0f6
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 10:38:25 2020 -0400

    16723: Fix double-container-running bug in test stub.
    
    Two parts:
    
    1. Update state to Complete before crunch-run exits, not after.
    
    2. Don't call the "crashed while running" hook from the test stub
    unless changing state to Running actually succeeded.
    
    Also, call the "crashed while running" hook from the test stub if
    changing state from Running to Complete fails.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 3e2f8efe0..f45d37352 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -265,49 +265,64 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 		})
 		logger.Printf("[test] starting crunch-run stub")
 		go func() {
+			var ctr arvados.Container
+			var started, completed bool
+			defer func() {
+				logger.Print("[test] exiting crunch-run stub")
+				svm.Lock()
+				defer svm.Unlock()
+				if svm.running[uuid] != pid {
+					if !completed {
+						logger.Errorf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
+					}
+				} else {
+					delete(svm.running, uuid)
+				}
+				if !completed {
+					logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
+					if started && svm.CrashRunningContainer != nil {
+						svm.CrashRunningContainer(ctr)
+					}
+				}
+			}()
+
 			crashluck := math_rand.Float64()
+			wantCrash := crashluck < svm.CrunchRunCrashRate
+			wantCrashEarly := crashluck < svm.CrunchRunCrashRate/2
+
 			ctr, ok := queue.Get(uuid)
 			if !ok {
 				logger.Print("[test] container not in queue")
 				return
 			}
 
-			defer func() {
-				if ctr.State == arvados.ContainerStateRunning && svm.CrashRunningContainer != nil {
-					svm.CrashRunningContainer(ctr)
-				}
-			}()
-
-			if crashluck > svm.CrunchRunCrashRate/2 {
-				time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
-				ctr.State = arvados.ContainerStateRunning
-				if !queue.Notify(ctr) {
-					ctr, _ = queue.Get(uuid)
-					logger.Print("[test] erroring out because state=Running update was rejected")
-					return
-				}
-			}
-
 			time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
 
 			svm.Lock()
-			defer svm.Unlock()
-			if svm.running[uuid] != pid {
-				logger.Print("[test] container was killed")
+			killed := svm.running[uuid] != pid
+			svm.Unlock()
+			if killed || wantCrashEarly {
 				return
 			}
-			delete(svm.running, uuid)
 
-			if crashluck < svm.CrunchRunCrashRate {
+			ctr.State = arvados.ContainerStateRunning
+			started = queue.Notify(ctr)
+			if !started {
+				ctr, _ = queue.Get(uuid)
+				logger.Print("[test] erroring out because state=Running update was rejected")
+				return
+			}
+
+			if wantCrash {
 				logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
-			} else {
-				if svm.ExecuteContainer != nil {
-					ctr.ExitCode = svm.ExecuteContainer(ctr)
-				}
-				logger.WithField("ExitCode", ctr.ExitCode).Print("[test] exiting crunch-run stub")
-				ctr.State = arvados.ContainerStateComplete
-				go queue.Notify(ctr)
+				return
+			}
+			if svm.ExecuteContainer != nil {
+				ctr.ExitCode = svm.ExecuteContainer(ctr)
 			}
+			logger.WithField("ExitCode", ctr.ExitCode).Print("[test] completing container")
+			ctr.State = arvados.ContainerStateComplete
+			completed = queue.Notify(ctr)
 		}()
 		return 0
 	}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list