[ARVADOS] created: 1.3.0-3065-gb30659d51

Git user git at public.arvados.org
Wed Sep 2 13:33:29 UTC 2020


        at  b30659d514ce281209fa7b99863413832fa8d44b (commit)


commit b30659d514ce281209fa7b99863413832fa8d44b
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Wed Sep 2 08:56:38 2020 -0400

    16795: Don't shut down idle nodes just because Create failed.
    
    Previous behavior would prematurely shut down booting instances when
    Create calls were being rate-limited.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index d77dcee94..0e8e1dc2e 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -61,30 +61,25 @@ tryrun:
 			if unalloc[it] > 0 {
 				unalloc[it]--
 			} else if sch.pool.AtQuota() {
-				logger.Debug("not starting: AtQuota and no unalloc workers")
+				// Don't let lower-priority containers
+				// starve this one by using keeping
+				// idle workers alive on different
+				// instance types.
+				logger.Debug("unlocking: AtQuota and no unalloc workers")
+				sch.queue.Unlock(ctr.UUID)
 				overquota = sorted[i:]
 				break tryrun
+			} else if logger.Info("creating new instance"); sch.pool.Create(it) {
+				// Success. (Note pool.Create works
+				// asynchronously and does its own
+				// logging, so we don't need to.)
 			} else {
-				logger.Info("creating new instance")
-				if !sch.pool.Create(it) {
-					// (Note pool.Create works
-					// asynchronously and logs its
-					// own failures, so we don't
-					// need to log this as a
-					// failure.)
-
-					sch.queue.Unlock(ctr.UUID)
-					// Don't let lower-priority
-					// containers starve this one
-					// by using keeping idle
-					// workers alive on different
-					// instance types.  TODO:
-					// avoid getting starved here
-					// if instances of a specific
-					// type always fail.
-					overquota = sorted[i:]
-					break tryrun
-				}
+				// Failed despite not being at quota,
+				// e.g., cloud ops throttled.  TODO:
+				// avoid getting starved here if
+				// instances of a specific type always
+				// fail.
+				continue
 			}
 
 			if dontstart[it] {

commit b35dfa1f2b6c2fe57b7bc8a6e107425ed4e44f2a
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Tue Sep 1 17:10:09 2020 -0400

    16795: Simplify "kill" behavior in stub driver.
    
    Now, "kill" just sets a flag, and the "run" stub exits when it notices
    the flag.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 2dcd6c128..132bd4d69 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -268,19 +268,17 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 		logger.Printf("[test] starting crunch-run stub")
 		go func() {
 			var ctr arvados.Container
-			var started, completed, killed bool
+			var started, completed bool
 			defer func() {
 				logger.Print("[test] exiting crunch-run stub")
 				svm.Lock()
 				defer svm.Unlock()
 				if svm.running[uuid] != pid {
-					if !completed && !killed {
-						bugf := svm.sis.driver.Bugf
-						if bugf == nil {
-							bugf = logger.Warnf
-						}
-						bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
+					bugf := svm.sis.driver.Bugf
+					if bugf == nil {
+						bugf = logger.Warnf
 					}
+					bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
 				} else {
 					delete(svm.running, uuid)
 				}
@@ -305,7 +303,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 			time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
 
 			svm.Lock()
-			killed = svm.running[uuid] != pid
+			killed := svm.killing[uuid]
 			svm.Unlock()
 			if killed || wantCrashEarly {
 				return
@@ -345,21 +343,9 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 	}
 	if strings.HasPrefix(command, "crunch-run --kill ") {
 		svm.Lock()
-		pid, running := svm.running[uuid]
-		if running && !svm.killing[uuid] {
+		_, running := svm.running[uuid]
+		if running {
 			svm.killing[uuid] = true
-			go func() {
-				time.Sleep(time.Duration(math_rand.Float64()*30) * time.Millisecond)
-				svm.Lock()
-				defer svm.Unlock()
-				if svm.running[uuid] == pid {
-					// Kill only if the running entry
-					// hasn't since been killed and
-					// replaced with a different one.
-					delete(svm.running, uuid)
-				}
-				delete(svm.killing, uuid)
-			}()
 			svm.Unlock()
 			time.Sleep(time.Duration(math_rand.Float64()*2) * time.Millisecond)
 			svm.Lock()

commit 76182f26191190c405077106becdde149c0ad7c5
Author: Tom Clegg <tom at tomclegg.ca>
Date:   Tue Sep 1 16:22:13 2020 -0400

    16795: Fix false-positive bug detection.
    
    The dispatcher simulation test occasionally fails with this error when
    a crunch-run stub is killed, and the same container is rescheduled on
    the same node before the first attempt's startup-phase sleep expires:
    
        bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
    ... Error: [test] StubDriver bug or caller bug: pid 9 exiting, running[zzzzz-dz642-000000000000184]==0
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>

diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 41eb20763..2dcd6c128 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -268,13 +268,13 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 		logger.Printf("[test] starting crunch-run stub")
 		go func() {
 			var ctr arvados.Container
-			var started, completed bool
+			var started, completed, killed bool
 			defer func() {
 				logger.Print("[test] exiting crunch-run stub")
 				svm.Lock()
 				defer svm.Unlock()
 				if svm.running[uuid] != pid {
-					if !completed {
+					if !completed && !killed {
 						bugf := svm.sis.driver.Bugf
 						if bugf == nil {
 							bugf = logger.Warnf
@@ -305,7 +305,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 			time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
 
 			svm.Lock()
-			killed := svm.running[uuid] != pid
+			killed = svm.running[uuid] != pid
 			svm.Unlock()
 			if killed || wantCrashEarly {
 				return

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list