[ARVADOS] created: f8675ad473b45387b1286c6b7a41edf36148ebac
Git user
git at public.curoverse.com
Tue May 9 16:33:37 EDT 2017
at f8675ad473b45387b1286c6b7a41edf36148ebac (commit)
commit f8675ad473b45387b1286c6b7a41edf36148ebac
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri May 5 17:11:39 2017 -0400
11626: Log sbatch errors where the user can see them. Fix tests so there are no lingering goroutines.
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
index cca8b3f..296c0a3 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
@@ -226,13 +226,21 @@ func run(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados
if ctr.State == dispatch.Locked && !sqCheck.HasUUID(ctr.UUID) {
log.Printf("Submitting container %s to slurm", ctr.UUID)
if err := submit(disp, ctr, theConfig.CrunchRunCommand); err != nil {
- log.Printf("Error submitting container %s to slurm: %s", ctr.UUID, err)
+ text := fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
+ log.Printf(text)
+
+ lr := arvadosclient.Dict{"log": arvadosclient.Dict{
+ "object_uuid": ctr.UUID,
+ "event_type": "dispatch",
+ "properties": map[string]string{"text": text}}}
+ disp.Arv.Create("logs", lr, nil)
+
disp.Unlock(ctr.UUID)
return
}
}
- log.Printf("Start monitoring container %s", ctr.UUID)
+ log.Printf("Start monitoring container %v in state %q", ctr.UUID, ctr.State)
defer log.Printf("Done monitoring container %s", ctr.UUID)
// If the container disappears from the slurm queue, there is
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
index 1c366a0..d49daad 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
@@ -52,6 +52,7 @@ func (s *TestSuite) SetUpTest(c *C) {
func (s *TestSuite) TearDownTest(c *C) {
os.Args = initialArgs
+ arvadostest.ResetEnv()
arvadostest.StopAPI()
}
@@ -69,6 +70,8 @@ func (s *TestSuite) TestIntegrationNormal(c *C) {
return exec.Command("echo", "zzzzz-dz642-queuedcontainer")
}
},
+ nil,
+ nil,
[]string(nil),
func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
dispatcher.UpdateState(container.UUID, dispatch.Running)
@@ -82,19 +85,7 @@ func (s *TestSuite) TestIntegrationNormal(c *C) {
func (s *TestSuite) TestIntegrationCancel(c *C) {
var cmd *exec.Cmd
var scancelCmdLine []string
- defer func(orig func(arvados.Container) *exec.Cmd) {
- scancelCmd = orig
- }(scancelCmd)
attempt := 0
- scancelCmd = func(container arvados.Container) *exec.Cmd {
- if attempt++; attempt == 1 {
- return exec.Command("false")
- } else {
- scancelCmdLine = scancelFunc(container).Args
- cmd = exec.Command("echo")
- return cmd
- }
- }
container := s.integrationTest(c,
func() *exec.Cmd {
@@ -104,6 +95,16 @@ func (s *TestSuite) TestIntegrationCancel(c *C) {
return exec.Command("echo", "zzzzz-dz642-queuedcontainer")
}
},
+ func(container arvados.Container) *exec.Cmd {
+ if attempt++; attempt == 1 {
+ return exec.Command("false")
+ } else {
+ scancelCmdLine = scancelFunc(container).Args
+ cmd = exec.Command("echo")
+ return cmd
+ }
+ },
+ nil,
[]string(nil),
func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
dispatcher.UpdateState(container.UUID, dispatch.Running)
@@ -118,11 +119,15 @@ func (s *TestSuite) TestIntegrationCancel(c *C) {
}
func (s *TestSuite) TestIntegrationMissingFromSqueue(c *C) {
- container := s.integrationTest(c, func() *exec.Cmd { return exec.Command("echo") }, []string{"sbatch",
- fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
- fmt.Sprintf("--mem=%d", 11445),
- fmt.Sprintf("--cpus-per-task=%d", 4),
- fmt.Sprintf("--tmp=%d", 45777)},
+ container := s.integrationTest(c,
+ func() *exec.Cmd { return exec.Command("echo") },
+ nil,
+ nil,
+ []string{"sbatch",
+ fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
+ fmt.Sprintf("--mem=%d", 11445),
+ fmt.Sprintf("--cpus-per-task=%d", 4),
+ fmt.Sprintf("--tmp=%d", 45777)},
func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
dispatcher.UpdateState(container.UUID, dispatch.Running)
time.Sleep(3 * time.Second)
@@ -131,8 +136,35 @@ func (s *TestSuite) TestIntegrationMissingFromSqueue(c *C) {
c.Check(container.State, Equals, arvados.ContainerStateCancelled)
}
+func (s *TestSuite) TestSbatchFail(c *C) {
+ container := s.integrationTest(c,
+ func() *exec.Cmd { return exec.Command("echo") },
+ nil,
+ func(container arvados.Container) *exec.Cmd {
+ return exec.Command("false")
+ },
+ []string(nil),
+ func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
+ dispatcher.UpdateState(container.UUID, dispatch.Running)
+ dispatcher.UpdateState(container.UUID, dispatch.Complete)
+ })
+ c.Check(container.State, Equals, arvados.ContainerStateComplete)
+
+ arv, err := arvadosclient.MakeArvadosClient()
+ c.Assert(err, IsNil)
+
+ var ll arvados.LogList
+ err = arv.List("logs", arvadosclient.Dict{"filters": [][]string{
+ []string{"object_uuid", "=", container.UUID},
+ []string{"event_type", "=", "dispatch"},
+ }}, &ll)
+ c.Assert(len(ll.Items), Equals, 1)
+}
+
func (s *TestSuite) integrationTest(c *C,
newSqueueCmd func() *exec.Cmd,
+ newScancelCmd func(arvados.Container) *exec.Cmd,
+ newSbatchCmd func(arvados.Container) *exec.Cmd,
sbatchCmdComps []string,
runContainer func(*dispatch.Dispatcher, arvados.Container)) arvados.Container {
arvadostest.ResetEnv()
@@ -146,9 +178,14 @@ func (s *TestSuite) integrationTest(c *C,
defer func(orig func(arvados.Container) *exec.Cmd) {
sbatchCmd = orig
}(sbatchCmd)
- sbatchCmd = func(container arvados.Container) *exec.Cmd {
- sbatchCmdLine = sbatchFunc(container).Args
- return exec.Command("sh")
+
+ if newSbatchCmd != nil {
+ sbatchCmd = newSbatchCmd
+ } else {
+ sbatchCmd = func(container arvados.Container) *exec.Cmd {
+ sbatchCmdLine = sbatchFunc(container).Args
+ return exec.Command("sh")
+ }
}
// Override squeueCmd
@@ -157,6 +194,12 @@ func (s *TestSuite) integrationTest(c *C,
}(squeueCmd)
squeueCmd = newSqueueCmd
+ // Override scancel
+ defer func(orig func(arvados.Container) *exec.Cmd) {
+ scancelCmd = orig
+ }(scancelCmd)
+ scancelCmd = newScancelCmd
+
// There should be one queued container
params := arvadosclient.Dict{
"filters": [][]string{{"state", "=", "Queued"}},
@@ -169,11 +212,16 @@ func (s *TestSuite) integrationTest(c *C,
theConfig.CrunchRunCommand = []string{"echo"}
ctx, cancel := context.WithCancel(context.Background())
+ doneRun := make(chan struct{})
+
dispatcher := dispatch.Dispatcher{
Arv: arv,
PollPeriod: time.Duration(1) * time.Second,
RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
- go runContainer(disp, ctr)
+ go func() {
+ runContainer(disp, ctr)
+ doneRun <- struct{}{}
+ }()
run(disp, ctr, status)
cancel()
},
@@ -182,6 +230,7 @@ func (s *TestSuite) integrationTest(c *C,
sqCheck = &SqueueChecker{Period: 500 * time.Millisecond}
err = dispatcher.Run(ctx)
+ <-doneRun
c.Assert(err, Equals, context.Canceled)
sqCheck.Stop()
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list