[ARVADOS] created: 1.3.0-2947-g490560601

Git user git at public.arvados.org
Fri Aug 21 13:39:30 UTC 2020


        at  490560601acca1e2ea820ea3a0fe50d2ffffb601 (commit)


commit 490560601acca1e2ea820ea3a0fe50d2ffffb601
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 09:25:09 2020 -0400

    16723: Fix double-container-running bug in test stub.
    
    Two parts:
    
    1. Update state to Complete before crunch-run exits, not after.
    
    2. Don't call the "crashed while running" hook from the test stub
    unless changing state to Running actually succeeded.
    
    Also, call the "crashed while running" hook from the test stub if
    changing state from Running to Complete fails.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 3e2f8efe0..1c406e853 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -265,7 +265,23 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 		})
 		logger.Printf("[test] starting crunch-run stub")
 		go func() {
+			defer func() {
+				logger.Print("[test] exiting crunch-run stub")
+				svm.Lock()
+				defer svm.Unlock()
+				if svm.running[uuid] != pid {
+					logger.Warnf("test.StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
+				} else {
+					delete(svm.running, uuid)
+				}
+			}()
+
+			started := false
+			completed := false
 			crashluck := math_rand.Float64()
+			wantCrash := crashluck < svm.CrunchRunCrashRate
+			wantCrashEarly := crashluck < svm.CrunchRunCrashRate/2
+
 			ctr, ok := queue.Get(uuid)
 			if !ok {
 				logger.Print("[test] container not in queue")
@@ -273,41 +289,42 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 			}
 
 			defer func() {
-				if ctr.State == arvados.ContainerStateRunning && svm.CrashRunningContainer != nil {
+				logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
+				if started && !completed && svm.CrashRunningContainer != nil {
 					svm.CrashRunningContainer(ctr)
 				}
 			}()
 
-			if crashluck > svm.CrunchRunCrashRate/2 {
-				time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
-				ctr.State = arvados.ContainerStateRunning
-				if !queue.Notify(ctr) {
-					ctr, _ = queue.Get(uuid)
-					logger.Print("[test] erroring out because state=Running update was rejected")
-					return
-				}
-			}
-
 			time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
 
 			svm.Lock()
-			defer svm.Unlock()
-			if svm.running[uuid] != pid {
+			killed := svm.running[uuid] != pid
+			svm.Unlock()
+			if killed {
 				logger.Print("[test] container was killed")
 				return
+			} else if wantCrashEarly {
+				return
+			}
+
+			ctr.State = arvados.ContainerStateRunning
+			started = queue.Notify(ctr)
+			if !started {
+				ctr, _ = queue.Get(uuid)
+				logger.Print("[test] erroring out because state=Running update was rejected")
+				return
 			}
-			delete(svm.running, uuid)
 
-			if crashluck < svm.CrunchRunCrashRate {
+			if wantCrash {
 				logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
-			} else {
-				if svm.ExecuteContainer != nil {
-					ctr.ExitCode = svm.ExecuteContainer(ctr)
-				}
-				logger.WithField("ExitCode", ctr.ExitCode).Print("[test] exiting crunch-run stub")
-				ctr.State = arvados.ContainerStateComplete
-				go queue.Notify(ctr)
+				return
+			}
+			if svm.ExecuteContainer != nil {
+				ctr.ExitCode = svm.ExecuteContainer(ctr)
 			}
+			logger.WithField("ExitCode", ctr.ExitCode).Print("[test] completing container")
+			ctr.State = arvados.ContainerStateComplete
+			completed = queue.Notify(ctr)
 		}()
 		return 0
 	}

commit f6d0a07763cbffee6a03864d20b8d753b9aefb82
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Fri Aug 21 09:24:02 2020 -0400

    16723: Use more readable instance IDs in tests.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 7a1f42301..3e2f8efe0 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -99,6 +99,7 @@ type StubInstanceSet struct {
 
 	allowCreateCall    time.Time
 	allowInstancesCall time.Time
+	lastInstanceID     int
 }
 
 func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID, tags cloud.InstanceTags, cmd cloud.InitCommand, authKey ssh.PublicKey) (cloud.Instance, error) {
@@ -120,9 +121,10 @@ func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID,
 	if authKey != nil {
 		ak = append([]ssh.PublicKey{authKey}, ak...)
 	}
+	sis.lastInstanceID++
 	svm := &StubVM{
 		sis:          sis,
-		id:           cloud.InstanceID(fmt.Sprintf("stub-%s-%x", it.ProviderType, math_rand.Int63())),
+		id:           cloud.InstanceID(fmt.Sprintf("inst%d,%s", sis.lastInstanceID, it.ProviderType)),
 		tags:         copyTags(tags),
 		providerType: it.ProviderType,
 		initCommand:  cmd,

commit 60bbffb69b908f7a1146fc0c1904aae2775cdf1e
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Thu Aug 20 22:21:14 2020 -0400

    16723: Reject invalid updates in API stub.
    
    When dispatcher notices crunch-run has exited and the last known state
    is Locked, it requeues the container. If crunch-run changed the
    container state to Running before exiting (and dispatcher hasn't
    noticed yet), dispatcher relies on RailsAPI/controller to reject the
    requeue attempt.
    
    Without this, the scheduler's state=Queued call was being accepted
    even after losing a race to the stub's state=Running call,
    occasionally causing a container to run twice and fail the randomized
    simulation test.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index aa5f22a50..3b45e75fd 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -115,6 +115,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 		ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
 			return ChooseInstanceType(s.cluster, ctr)
 		},
+		Logger: ctxlog.TestLogger(c),
 	}
 	for i := 0; i < 200; i++ {
 		queue.Containers = append(queue.Containers, arvados.Container{
diff --git a/lib/dispatchcloud/test/queue.go b/lib/dispatchcloud/test/queue.go
index 11d410fb1..74b84122f 100644
--- a/lib/dispatchcloud/test/queue.go
+++ b/lib/dispatchcloud/test/queue.go
@@ -11,6 +11,7 @@ import (
 
 	"git.arvados.org/arvados.git/lib/dispatchcloud/container"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"github.com/sirupsen/logrus"
 )
 
 // Queue is a test stub for container.Queue. The caller specifies the
@@ -23,6 +24,8 @@ type Queue struct {
 	// must not be nil.
 	ChooseType func(*arvados.Container) (arvados.InstanceType, error)
 
+	Logger logrus.FieldLogger
+
 	entries     map[string]container.QueueEnt
 	updTime     time.Time
 	subscribers map[<-chan struct{}]chan struct{}
@@ -166,13 +169,36 @@ func (q *Queue) Notify(upd arvados.Container) bool {
 	defer q.mtx.Unlock()
 	for i, ctr := range q.Containers {
 		if ctr.UUID == upd.UUID {
-			if ctr.State != arvados.ContainerStateComplete && ctr.State != arvados.ContainerStateCancelled {
+			if allowContainerUpdate[ctr.State][upd.State] {
 				q.Containers[i] = upd
 				return true
+			} else {
+				if q.Logger != nil {
+					q.Logger.WithField("ContainerUUID", ctr.UUID).Infof("test.Queue rejected update from %s to %s", ctr.State, upd.State)
+				}
+				return false
 			}
-			return false
 		}
 	}
 	q.Containers = append(q.Containers, upd)
 	return true
 }
+
+var allowContainerUpdate = map[arvados.ContainerState]map[arvados.ContainerState]bool{
+	arvados.ContainerStateQueued: map[arvados.ContainerState]bool{
+		arvados.ContainerStateQueued:    true,
+		arvados.ContainerStateLocked:    true,
+		arvados.ContainerStateCancelled: true,
+	},
+	arvados.ContainerStateLocked: map[arvados.ContainerState]bool{
+		arvados.ContainerStateQueued:    true,
+		arvados.ContainerStateLocked:    true,
+		arvados.ContainerStateRunning:   true,
+		arvados.ContainerStateCancelled: true,
+	},
+	arvados.ContainerStateRunning: map[arvados.ContainerState]bool{
+		arvados.ContainerStateRunning:   true,
+		arvados.ContainerStateCancelled: true,
+		arvados.ContainerStateComplete:  true,
+	},
+}

commit 27babdaf446a70fdd7742104846de9afd9b75b22
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Thu Aug 20 21:34:40 2020 -0400

    16723: Fix wrong operation shown in debug log.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index 116ca7643..c047f9b87 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -115,7 +115,7 @@ func (sch *Scheduler) kill(uuid string, reason string) {
 
 func (sch *Scheduler) requeue(ent container.QueueEnt, reason string) {
 	uuid := ent.Container.UUID
-	if !sch.uuidLock(uuid, "cancel") {
+	if !sch.uuidLock(uuid, "requeue") {
 		return
 	}
 	defer sch.uuidUnlock(uuid)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list