[ARVADOS] created: 1.3.0-2950-g965b6fc27

Git user git at public.arvados.org
Fri Aug 21 15:36:07 UTC 2020


        at  965b6fc27b3eaf3a812132cea093b03b02f35896 (commit)


commit 965b6fc27b3eaf3a812132cea093b03b02f35896
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 11:30:04 2020 -0400

    16723: Don't lock a requeued container while old crunch-runs exist.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 4447f084a..dddb974b3 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -88,6 +88,8 @@ tryrun:
 				// a higher-priority container on the
 				// same instance type. Don't let this
 				// one sneak in ahead of it.
+			} else if sch.pool.KillContainer(ctr.UUID, "about to lock") {
+				logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
 			} else if sch.pool.StartContainer(it, ctr) {
 				// Success.
 			} else {

commit 1de49feb938cc265ec90786c22a1456237796aef
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 11:24:50 2020 -0400

    16723: Fail test if a crunch-run stub is killed before finishing.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 3b45e75fd..f8940f720 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -171,6 +171,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			stubvm.CrunchRunCrashRate = 0.1
 		}
 	}
+	s.stubDriver.Bugf = c.Errorf
 
 	start := time.Now()
 	go s.disp.run()
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index f45d37352..f6e06d3f7 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -34,6 +34,11 @@ type StubDriver struct {
 	// VM's error rate and other behaviors.
 	SetupVM func(*StubVM)
 
+	// Bugf, if set, is called if a bug is detected in the caller
+	// or stub. Typically set to (*check.C)Errorf. If unset,
+	// logger.Warnf is called instead.
+	Bugf func(string, ...interface{})
+
 	// StubVM's fake crunch-run uses this Queue to read and update
 	// container state.
 	Queue *Queue
@@ -273,7 +278,11 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 				defer svm.Unlock()
 				if svm.running[uuid] != pid {
 					if !completed {
-						logger.Errorf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
+						bugf := svm.sis.driver.Bugf
+						if bugf == nil {
+							bugf = logger.Warnf
+						}
+						bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
 					}
 				} else {
 					delete(svm.running, uuid)

commit e49cf0412503926eccfe829f2135c5a072b6acc9
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 10:42:49 2020 -0400

    16723: Hold uuidLock in kill() to avoid racing with unlock+start.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index c047f9b87..fc683505f 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -109,6 +109,10 @@ func (sch *Scheduler) cancel(uuid string, reason string) {
 }
 
 func (sch *Scheduler) kill(uuid string, reason string) {
+	if !sch.uuidLock(uuid, "kill") {
+		return
+	}
+	defer sch.uuidUnlock(uuid)
 	sch.pool.KillContainer(uuid, reason)
 	sch.pool.ForgetContainer(uuid)
 }

commit d3fbb6b33a5b4a55dda969c7e7360cc31e1de0f6
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 10:38:25 2020 -0400

    16723: Fix double-container-running bug in test stub.
    
    Two parts:
    
    1. Update state to Complete before crunch-run exits, not after.
    
    2. Don't call the "crashed while running" hook from the test stub
    unless changing state to Running actually succeeded.
    
    Also, call the "crashed while running" hook from the test stub if
    changing state from Running to Complete fails.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 3e2f8efe0..f45d37352 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -265,49 +265,64 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 		})
 		logger.Printf("[test] starting crunch-run stub")
 		go func() {
+			var ctr arvados.Container
+			var started, completed bool
+			defer func() {
+				logger.Print("[test] exiting crunch-run stub")
+				svm.Lock()
+				defer svm.Unlock()
+				if svm.running[uuid] != pid {
+					if !completed {
+						logger.Errorf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
+					}
+				} else {
+					delete(svm.running, uuid)
+				}
+				if !completed {
+					logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
+					if started && svm.CrashRunningContainer != nil {
+						svm.CrashRunningContainer(ctr)
+					}
+				}
+			}()
+
 			crashluck := math_rand.Float64()
+			wantCrash := crashluck < svm.CrunchRunCrashRate
+			wantCrashEarly := crashluck < svm.CrunchRunCrashRate/2
+
 			ctr, ok := queue.Get(uuid)
 			if !ok {
 				logger.Print("[test] container not in queue")
 				return
 			}
 
-			defer func() {
-				if ctr.State == arvados.ContainerStateRunning && svm.CrashRunningContainer != nil {
-					svm.CrashRunningContainer(ctr)
-				}
-			}()
-
-			if crashluck > svm.CrunchRunCrashRate/2 {
-				time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
-				ctr.State = arvados.ContainerStateRunning
-				if !queue.Notify(ctr) {
-					ctr, _ = queue.Get(uuid)
-					logger.Print("[test] erroring out because state=Running update was rejected")
-					return
-				}
-			}
-
 			time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
 
 			svm.Lock()
-			defer svm.Unlock()
-			if svm.running[uuid] != pid {
-				logger.Print("[test] container was killed")
+			killed := svm.running[uuid] != pid
+			svm.Unlock()
+			if killed || wantCrashEarly {
 				return
 			}
-			delete(svm.running, uuid)
 
-			if crashluck < svm.CrunchRunCrashRate {
+			ctr.State = arvados.ContainerStateRunning
+			started = queue.Notify(ctr)
+			if !started {
+				ctr, _ = queue.Get(uuid)
+				logger.Print("[test] erroring out because state=Running update was rejected")
+				return
+			}
+
+			if wantCrash {
 				logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
-			} else {
-				if svm.ExecuteContainer != nil {
-					ctr.ExitCode = svm.ExecuteContainer(ctr)
-				}
-				logger.WithField("ExitCode", ctr.ExitCode).Print("[test] exiting crunch-run stub")
-				ctr.State = arvados.ContainerStateComplete
-				go queue.Notify(ctr)
+				return
+			}
+			if svm.ExecuteContainer != nil {
+				ctr.ExitCode = svm.ExecuteContainer(ctr)
 			}
+			logger.WithField("ExitCode", ctr.ExitCode).Print("[test] completing container")
+			ctr.State = arvados.ContainerStateComplete
+			completed = queue.Notify(ctr)
 		}()
 		return 0
 	}

commit f6d0a07763cbffee6a03864d20b8d753b9aefb82
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 09:24:02 2020 -0400

    16723: Use more readable instance IDs in tests.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 7a1f42301..3e2f8efe0 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -99,6 +99,7 @@ type StubInstanceSet struct {
 
 	allowCreateCall    time.Time
 	allowInstancesCall time.Time
+	lastInstanceID     int
 }
 
 func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID, tags cloud.InstanceTags, cmd cloud.InitCommand, authKey ssh.PublicKey) (cloud.Instance, error) {
@@ -120,9 +121,10 @@ func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID,
 	if authKey != nil {
 		ak = append([]ssh.PublicKey{authKey}, ak...)
 	}
+	sis.lastInstanceID++
 	svm := &StubVM{
 		sis:          sis,
-		id:           cloud.InstanceID(fmt.Sprintf("stub-%s-%x", it.ProviderType, math_rand.Int63())),
+		id:           cloud.InstanceID(fmt.Sprintf("inst%d,%s", sis.lastInstanceID, it.ProviderType)),
 		tags:         copyTags(tags),
 		providerType: it.ProviderType,
 		initCommand:  cmd,

commit 60bbffb69b908f7a1146fc0c1904aae2775cdf1e
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Thu Aug 20 22:21:14 2020 -0400

    16723: Reject invalid updates in API stub.
    
    When dispatcher notices crunch-run has exited and the last known state
    is Locked, it requeues the container. If crunch-run changed the
    container state to Running before exiting (and dispatcher hasn't
    noticed yet), dispatcher relies on RailsAPI/controller to reject the
    requeue attempt.
    
    Without this, the scheduler's state=Queued call was being accepted
    even after losing a race to the stub's state=Running call,
    occasionally causing a container to run twice and fail the randomized
    simulation test.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index aa5f22a50..3b45e75fd 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -115,6 +115,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
 			return ChooseInstanceType(s.cluster, ctr)
 		},
+		Logger: ctxlog.TestLogger(c),
 	}
 	for i := 0; i < 200; i++ {
 		queue.Containers = append(queue.Containers, arvados.Container{
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index 11d410fb1..74b84122f 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -11,6 +11,7 @@ import (
 
 	"git.arvados.org/arvados.git/lib/dispatchcloud/container"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"github.com/sirupsen/logrus"
 )
 
 // Queue is a test stub for container.Queue. The caller specifies the
@@ -23,6 +24,8 @@ type Queue struct {
 	// must not be nil.
 	ChooseType func(*arvados.Container) (arvados.InstanceType, error)
 
+	Logger logrus.FieldLogger
+
 	entries     map[string]container.QueueEnt
 	updTime     time.Time
 	subscribers map[<-chan struct{}]chan struct{}
@@ -166,13 +169,36 @@ func (q *Queue) Notify(upd arvados.Container) bool {
 	defer q.mtx.Unlock()
 	for i, ctr := range q.Containers {
 		if ctr.UUID == upd.UUID {
-			if ctr.State != arvados.ContainerStateComplete && ctr.State != arvados.ContainerStateCancelled {
+			if allowContainerUpdate[ctr.State][upd.State] {
 				q.Containers[i] = upd
 				return true
+			} else {
+				if q.Logger != nil {
+					q.Logger.WithField("ContainerUUID", ctr.UUID).Infof("test.Queue rejected update from %s to %s", ctr.State, upd.State)
+				}
+				return false
 			}
-			return false
 		}
 	}
 	q.Containers = append(q.Containers, upd)
 	return true
 }
+
+var allowContainerUpdate = map[arvados.ContainerState]map[arvados.ContainerState]bool{
+	arvados.ContainerStateQueued: map[arvados.ContainerState]bool{
+		arvados.ContainerStateQueued:    true,
+		arvados.ContainerStateLocked:    true,
+		arvados.ContainerStateCancelled: true,
+	},
+	arvados.ContainerStateLocked: map[arvados.ContainerState]bool{
+		arvados.ContainerStateQueued:    true,
+		arvados.ContainerStateLocked:    true,
+		arvados.ContainerStateRunning:   true,
+		arvados.ContainerStateCancelled: true,
+	},
+	arvados.ContainerStateRunning: map[arvados.ContainerState]bool{
+		arvados.ContainerStateRunning:   true,
+		arvados.ContainerStateCancelled: true,
+		arvados.ContainerStateComplete:  true,
+	},
+}

commit 27babdaf446a70fdd7742104846de9afd9b75b22
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Thu Aug 20 21:34:40 2020 -0400

    16723: Fix wrong operation shown in debug log.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index 116ca7643..c047f9b87 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -115,7 +115,7 @@ func (sch *Scheduler) kill(uuid string, reason string) {
 
 func (sch *Scheduler) requeue(ent container.QueueEnt, reason string) {
 	uuid := ent.Container.UUID
-	if !sch.uuidLock(uuid, "cancel") {
+	if !sch.uuidLock(uuid, "requeue") {
 		return
 	}
 	defer sch.uuidUnlock(uuid)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list