[ARVADOS] updated: 386faadf691e444b71d6c96e7c00792d9a0ba2c7

Git user git at public.curoverse.com
Thu Mar 16 13:25:32 EDT 2017


Summary of changes:
 services/crunch-run/crunchrun.go      | 35 ++++++++++++++++++++++-------------
 services/crunch-run/crunchrun_test.go |  6 ++++--
 2 files changed, 26 insertions(+), 15 deletions(-)

       via  386faadf691e444b71d6c96e7c00792d9a0ba2c7 (commit)
      from  8513e042b0033599146546bd3a2ad903c67c9ff5 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 386faadf691e444b71d6c96e7c00792d9a0ba2c7
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Mar 16 13:24:58 2017 -0400

    10218: Wait for container to be started (not just created) before trying to cancel it.

diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go
index f71ec4d..9ca4f80 100644
--- a/services/crunch-run/crunchrun.go
+++ b/services/crunch-run/crunchrun.go
@@ -91,8 +91,6 @@ type ContainerRunner struct {
 	CleanupTempDir []string
 	Binds          []string
 	OutputPDH      *string
-	CancelLock     sync.Mutex
-	Cancelled      bool
 	SigChan        chan os.Signal
 	ArvMountExit   chan error
 	finalState     string
@@ -115,6 +113,10 @@ type ContainerRunner struct {
 	// parent to be X" feature even on sites where the "specify
 	// cgroup parent" feature breaks.
 	setCgroupParent string
+
+	cStateLock sync.Mutex
+	cStarted   bool // StartContainer() succeeded
+	cCancelled bool // StopContainer() invoked
 }
 
 // SetupSignals sets up signal handling to gracefully terminate the underlying
@@ -127,6 +129,7 @@ func (runner *ContainerRunner) SetupSignals() {
 
 	go func(sig chan os.Signal) {
 		<-sig
+		log.Print("signal handler calling runner.stop()")
 		runner.stop()
 		signal.Stop(sig)
 	}(runner.SigChan)
@@ -134,13 +137,13 @@ func (runner *ContainerRunner) SetupSignals() {
 
 // stop the underlying Docker container.
 func (runner *ContainerRunner) stop() {
-	runner.CancelLock.Lock()
-	defer runner.CancelLock.Unlock()
-	if runner.Cancelled {
+	runner.cStateLock.Lock()
+	defer runner.cStateLock.Unlock()
+	if runner.cCancelled {
 		return
 	}
-	runner.Cancelled = true
-	if runner.ContainerID != "" {
+	runner.cCancelled = true
+	if runner.cStarted {
 		err := runner.Docker.StopContainer(runner.ContainerID, 10)
 		if err != nil {
 			log.Printf("StopContainer failed: %s", err)
@@ -648,10 +651,16 @@ func (runner *ContainerRunner) CreateContainer() error {
 // StartContainer starts the docker container created by CreateContainer.
 func (runner *ContainerRunner) StartContainer() error {
 	runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
+	runner.cStateLock.Lock()
+	defer runner.cStateLock.Unlock()
+	if runner.cCancelled {
+		return ErrCancelled
+	}
 	err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig)
 	if err != nil {
 		return fmt.Errorf("could not start container: %v", err)
 	}
+	runner.cStarted = true
 	return nil
 }
 
@@ -896,9 +905,9 @@ func (runner *ContainerRunner) CommitLogs() error {
 
 // UpdateContainerRunning updates the container state to "Running"
 func (runner *ContainerRunner) UpdateContainerRunning() error {
-	runner.CancelLock.Lock()
-	defer runner.CancelLock.Unlock()
-	if runner.Cancelled {
+	runner.cStateLock.Lock()
+	defer runner.cStateLock.Unlock()
+	if runner.cCancelled {
 		return ErrCancelled
 	}
 	return runner.ArvClient.Update("containers", runner.Container.UUID,
@@ -942,9 +951,9 @@ func (runner *ContainerRunner) UpdateContainerFinal() error {
 
 // IsCancelled returns the value of Cancelled, with goroutine safety.
 func (runner *ContainerRunner) IsCancelled() bool {
-	runner.CancelLock.Lock()
-	defer runner.CancelLock.Unlock()
-	return runner.Cancelled
+	runner.cStateLock.Lock()
+	defer runner.cStateLock.Unlock()
+	return runner.cCancelled
 }
 
 // NewArvLogWriter creates an ArvLogWriter
diff --git a/services/crunch-run/crunchrun_test.go b/services/crunch-run/crunchrun_test.go
index 1286659..fcf2456 100644
--- a/services/crunch-run/crunchrun_test.go
+++ b/services/crunch-run/crunchrun_test.go
@@ -11,6 +11,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime/pprof"
 	"sort"
 	"strings"
 	"sync"
@@ -525,7 +526,7 @@ func (s *TestSuite) TestUpdateContainerCancelled(c *C) {
 	api := &ArvTestClient{}
 	kc := &KeepTestClient{}
 	cr := NewContainerRunner(api, kc, nil, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
-	cr.Cancelled = true
+	cr.cCancelled = true
 	cr.finalState = "Cancelled"
 
 	err := cr.UpdateContainerFinal()
@@ -748,7 +749,7 @@ func (s *TestSuite) TestFullRunSetCwd(c *C) {
 func (s *TestSuite) TestStopOnSignal(c *C) {
 	s.testStopContainer(c, func(cr *ContainerRunner) {
 		go func() {
-			for cr.ContainerID == "" {
+			for !cr.cStarted {
 				time.Sleep(time.Millisecond)
 			}
 			cr.SigChan <- syscall.SIGINT
@@ -802,6 +803,7 @@ func (s *TestSuite) testStopContainer(c *C, setup func(cr *ContainerRunner)) {
 	}()
 	select {
 	case <-time.After(20 * time.Second):
+		pprof.Lookup("goroutine").WriteTo(os.Stderr, 1)
 		c.Fatal("timed out")
 	case err = <-done:
 		c.Check(err, IsNil)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list