[ARVADOS] created: 1.3.0-3065-gb30659d51
Git user
git at public.arvados.org
Wed Sep 2 13:33:29 UTC 2020
at b30659d514ce281209fa7b99863413832fa8d44b (commit)
commit b30659d514ce281209fa7b99863413832fa8d44b
Author: Tom Clegg <tom at tomclegg.ca>
Date: Wed Sep 2 08:56:38 2020 -0400
16795: Don't shut down idle nodes just because Create failed.
Previous behavior would prematurely shut down booting instances when
Create calls were being rate-limited.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index d77dcee94..0e8e1dc2e 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -61,30 +61,25 @@ tryrun:
if unalloc[it] > 0 {
unalloc[it]--
} else if sch.pool.AtQuota() {
- logger.Debug("not starting: AtQuota and no unalloc workers")
+ // Don't let lower-priority containers
+ // starve this one by using keeping
+ // idle workers alive on different
+ // instance types.
+ logger.Debug("unlocking: AtQuota and no unalloc workers")
+ sch.queue.Unlock(ctr.UUID)
overquota = sorted[i:]
break tryrun
+ } else if logger.Info("creating new instance"); sch.pool.Create(it) {
+ // Success. (Note pool.Create works
+ // asynchronously and does its own
+ // logging, so we don't need to.)
} else {
- logger.Info("creating new instance")
- if !sch.pool.Create(it) {
- // (Note pool.Create works
- // asynchronously and logs its
- // own failures, so we don't
- // need to log this as a
- // failure.)
-
- sch.queue.Unlock(ctr.UUID)
- // Don't let lower-priority
- // containers starve this one
- // by using keeping idle
- // workers alive on different
- // instance types. TODO:
- // avoid getting starved here
- // if instances of a specific
- // type always fail.
- overquota = sorted[i:]
- break tryrun
- }
+ // Failed despite not being at quota,
+ // e.g., cloud ops throttled. TODO:
+ // avoid getting starved here if
+ // instances of a specific type always
+ // fail.
+ continue
}
if dontstart[it] {
commit b35dfa1f2b6c2fe57b7bc8a6e107425ed4e44f2a
Author: Tom Clegg <tom at tomclegg.ca>
Date: Tue Sep 1 17:10:09 2020 -0400
16795: Simplify "kill" behavior in stub driver.
Now, "kill" just sets a flag, and the "run" stub exits when it notices
the flag.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 2dcd6c128..132bd4d69 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -268,19 +268,17 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
logger.Printf("[test] starting crunch-run stub")
go func() {
var ctr arvados.Container
- var started, completed, killed bool
+ var started, completed bool
defer func() {
logger.Print("[test] exiting crunch-run stub")
svm.Lock()
defer svm.Unlock()
if svm.running[uuid] != pid {
- if !completed && !killed {
- bugf := svm.sis.driver.Bugf
- if bugf == nil {
- bugf = logger.Warnf
- }
- bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
+ bugf := svm.sis.driver.Bugf
+ if bugf == nil {
+ bugf = logger.Warnf
}
+ bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
} else {
delete(svm.running, uuid)
}
@@ -305,7 +303,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
svm.Lock()
- killed = svm.running[uuid] != pid
+ killed := svm.killing[uuid]
svm.Unlock()
if killed || wantCrashEarly {
return
@@ -345,21 +343,9 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
}
if strings.HasPrefix(command, "crunch-run --kill ") {
svm.Lock()
- pid, running := svm.running[uuid]
- if running && !svm.killing[uuid] {
+ _, running := svm.running[uuid]
+ if running {
svm.killing[uuid] = true
- go func() {
- time.Sleep(time.Duration(math_rand.Float64()*30) * time.Millisecond)
- svm.Lock()
- defer svm.Unlock()
- if svm.running[uuid] == pid {
- // Kill only if the running entry
- // hasn't since been killed and
- // replaced with a different one.
- delete(svm.running, uuid)
- }
- delete(svm.killing, uuid)
- }()
svm.Unlock()
time.Sleep(time.Duration(math_rand.Float64()*2) * time.Millisecond)
svm.Lock()
commit 76182f26191190c405077106becdde149c0ad7c5
Author: Tom Clegg <tom at tomclegg.ca>
Date: Tue Sep 1 16:22:13 2020 -0400
16795: Fix false-positive bug detection.
The dispatcher simulation test occasionally fails with this error when
a crunch-run stub is killed, and the same container is rescheduled on
the same node before the first attempt's startup-phase sleep expires:
bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
... Error: [test] StubDriver bug or caller bug: pid 9 exiting, running[zzzzz-dz642-000000000000184]==0
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at tomclegg.ca>
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 41eb20763..2dcd6c128 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -268,13 +268,13 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
logger.Printf("[test] starting crunch-run stub")
go func() {
var ctr arvados.Container
- var started, completed bool
+ var started, completed, killed bool
defer func() {
logger.Print("[test] exiting crunch-run stub")
svm.Lock()
defer svm.Unlock()
if svm.running[uuid] != pid {
- if !completed {
+ if !completed && !killed {
bugf := svm.sis.driver.Bugf
if bugf == nil {
bugf = logger.Warnf
@@ -305,7 +305,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
svm.Lock()
- killed := svm.running[uuid] != pid
+ killed = svm.running[uuid] != pid
svm.Unlock()
if killed || wantCrashEarly {
return
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list