[arvados] created: 2.7.0-5522-gb015c9e45f

Wed Dec 6 19:20:49 UTC 2023

at  b015c9e45f2a81b7069e5ecde3e0e9e0c5c619fa (commit)


commit b015c9e45f2a81b7069e5ecde3e0e9e0c5c619fa
Author: Tom Clegg <tom at curii.com>
Date:   Wed Dec 6 14:01:00 2023 -0500

    21258: Fix "guaranteed broken" stub instance not broken.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index e2f0db1efb..51c2c3d6a3 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -217,6 +217,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			return test.CapacityError{InstanceTypeSpecific: true}
 		}
 		n := atomic.AddInt32(&vmCount, 1)
+		c.Logf("SetupVM: instance %s n=%d", stubvm.Instance(), n)
 		stubvm.Boot = time.Now().Add(time.Duration(rand.Int63n(int64(5 * time.Millisecond))))
 		stubvm.CrunchRunDetachDelay = time.Duration(rand.Int63n(int64(10 * time.Millisecond)))
 		stubvm.ExecuteContainer = executeContainer
@@ -238,7 +239,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			// 1 instance is completely broken, ensuring
 			// the boot_outcomes{outcome="failure"} metric
 			// is not zero
-			stubvm.CrunchRunCrashRate = 1
+			stubvm.Broken = time.Now()
 		default:
 			stubvm.CrunchRunCrashRate = 0.1
 			stubvm.ArvMountDeadlockRate = 0.1

commit 95e0edffd5914e136fe9baac16c99858306aeeba
Author: Tom Clegg <tom at curii.com>
Date:   Wed Dec 6 12:47:53 2023 -0500

    21258: Fix "container completed twice" testing bug.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 0a74d97606..2265be6e16 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -239,6 +239,8 @@ type StubVM struct {
 	killing      map[string]bool
 	lastPID      int64
 	deadlocked   string
+	stubprocs    sync.WaitGroup
+	destroying   bool
 	sync.Mutex
 }
 
@@ -267,6 +269,17 @@ func (svm *StubVM) Instance() stubInstance {
 }
 
 func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
+	// Ensure we don't start any new stubprocs after Destroy()
+	// has started Wait()ing for stubprocs to end.
+	svm.Lock()
+	if svm.destroying {
+		svm.Unlock()
+		return 1
+	}
+	svm.stubprocs.Add(1)
+	defer svm.stubprocs.Done()
+	svm.Unlock()
+
 	stdinData, err := ioutil.ReadAll(stdin)
 	if err != nil {
 		fmt.Fprintf(stderr, "error reading stdin: %s\n", err)
@@ -304,7 +317,15 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 		pid := svm.lastPID
 		svm.running[uuid] = stubProcess{pid: pid}
 		svm.Unlock()
+
 		time.Sleep(svm.CrunchRunDetachDelay)
+
+		svm.Lock()
+		defer svm.Unlock()
+		if svm.destroying {
+			fmt.Fprint(stderr, "crunch-run: killed by system shutdown\n")
+			return 9
+		}
 		fmt.Fprintf(stderr, "starting %s\n", uuid)
 		logger := svm.sis.logger.WithFields(logrus.Fields{
 			"Instance":      svm.id,
@@ -312,13 +333,18 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 			"PID":           pid,
 		})
 		logger.Printf("[test] starting crunch-run stub")
+		svm.stubprocs.Add(1)
 		go func() {
+			defer svm.stubprocs.Done()
 			var ctr arvados.Container
 			var started, completed bool
 			defer func() {
 				logger.Print("[test] exiting crunch-run stub")
 				svm.Lock()
 				defer svm.Unlock()
+				if svm.destroying {
+					return
+				}
 				if svm.running[uuid].pid != pid {
 					bugf := svm.sis.driver.Bugf
 					if bugf == nil {
@@ -358,8 +384,10 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 
 			svm.Lock()
 			killed := svm.killing[uuid]
+			delete(svm.killing, uuid)
+			destroying := svm.destroying
 			svm.Unlock()
-			if killed || wantCrashEarly {
+			if killed || wantCrashEarly || destroying {
 				return
 			}
 
@@ -451,6 +479,10 @@ func (si stubInstance) Destroy() error {
 	if math_rand.Float64() < si.svm.sis.driver.ErrorRateDestroy {
 		return errors.New("instance could not be destroyed")
 	}
+	si.svm.Lock()
+	si.svm.destroying = true
+	si.svm.Unlock()
+	si.svm.stubprocs.Wait()
 	si.svm.SSHService.Close()
 	sis.mtx.Lock()
 	defer sis.mtx.Unlock()

commit 5f8b831515460a7fda4068cb3b2d8a4fa2aaccce
Author: Tom Clegg <tom at curii.com>
Date:   Wed Dec 6 10:22:24 2023 -0500

    21258: Fix flaky boot_outcomes metric test.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 33d7f4e9ac..e2f0db1efb 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -208,7 +208,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		return int(rand.Uint32() & 0x3)
 	}
 	var countCapacityErrors int64
-	n := 0
+	vmCount := int32(0)
 	s.stubDriver.Queue = queue
 	s.stubDriver.SetupVM = func(stubvm *test.StubVM) error {
 		if pt := stubvm.Instance().ProviderType(); pt == test.InstanceType(6).ProviderType {
@@ -216,7 +216,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			atomic.AddInt64(&countCapacityErrors, 1)
 			return test.CapacityError{InstanceTypeSpecific: true}
 		}
-		n++
+		n := atomic.AddInt32(&vmCount, 1)
 		stubvm.Boot = time.Now().Add(time.Duration(rand.Int63n(int64(5 * time.Millisecond))))
 		stubvm.CrunchRunDetachDelay = time.Duration(rand.Int63n(int64(10 * time.Millisecond)))
 		stubvm.ExecuteContainer = executeContainer

-----------------------------------------------------------------------


hooks/post-receive
--