[ARVADOS] updated: 1.3.0-58-g076d8a0d4

Git user git at public.curoverse.com
Wed Dec 12 15:22:39 EST 2018


Summary of changes:
 services/crunch-run/background.go | 57 ++++++++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 9 deletions(-)

       via  076d8a0d4885f7bfa22962a745a5aacb48fa9be1 (commit)
      from  b205525d0b7c7b9042513fe77d2e8061534208ae (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 076d8a0d4885f7bfa22962a745a5aacb48fa9be1
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Dec 12 15:21:51 2018 -0500

    14360: Fix race between per-container lock and garbage collection.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/crunch-run/background.go b/services/crunch-run/background.go
index be5b2e685..a50853837 100644
--- a/services/crunch-run/background.go
+++ b/services/crunch-run/background.go
@@ -39,15 +39,31 @@ func Detach(uuid string, args []string, stdout, stderr io.Writer) int {
 	return exitcode(stderr, detach(uuid, args, stdout, stderr))
 }
 func detach(uuid string, args []string, stdout, stderr io.Writer) error {
-	lockfile, err := os.OpenFile(filepath.Join(lockdir, lockprefix+uuid+locksuffix), os.O_CREATE|os.O_RDWR, 0700)
+	lockfile, err := func() (*os.File, error) {
+		// We must hold the dir-level lock between
+		// opening/creating the lockfile and acquiring LOCK_EX
+		// on it, to avoid racing with the ListProcess's
+		// alive-checking and garbage collection.
+		dirlock, err := lockall()
+		if err != nil {
+			return nil, err
+		}
+		defer dirlock.Close()
+		lockfile, err := os.OpenFile(filepath.Join(lockdir, lockprefix+uuid+locksuffix), os.O_CREATE|os.O_RDWR, 0700)
+		if err != nil {
+			return nil, err
+		}
+		err = syscall.Flock(int(lockfile.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
+		if err != nil {
+			lockfile.Close()
+			return nil, err
+		}
+		return lockfile, nil
+	}()
 	if err != nil {
 		return err
 	}
 	defer lockfile.Close()
-	err = syscall.Flock(int(lockfile.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
-	if err != nil {
-		return err
-	}
 	lockfile.Truncate(0)
 
 	outfile, err := ioutil.TempFile("", "crunch-run-"+uuid+"-stdout-")
@@ -157,19 +173,24 @@ func ListProcesses(stdout, stderr io.Writer) int {
 		}
 		defer f.Close()
 
-		// TODO: Do this check without risk of disrupting lock
-		// acquisition during races, e.g., by connecting to a
-		// unix socket or checking /proc/$pid/fd/$n ->
-		// lockfile.
+		// Ensure other processes don't acquire this lockfile
+		// after we have decided it is abandoned but before we
+		// have deleted it.
+		dirlock, err := lockall()
+		if err != nil {
+			return err
+		}
 		err = syscall.Flock(int(f.Fd()), syscall.LOCK_SH|syscall.LOCK_NB)
 		if err == nil {
 			// lockfile is stale
 			err := os.Remove(path)
+			dirlock.Close()
 			if err != nil {
 				fmt.Fprintln(stderr, err)
 			}
 			return nil
 		}
+		dirlock.Close()
 
 		var pi procinfo
 		err = json.NewDecoder(f).Decode(&pi)
@@ -196,3 +217,21 @@ func exitcode(stderr io.Writer, err error) int {
 	}
 	return 0
 }
+
+// Acquire a dir-level lock. Must be held while creating or deleting
+// container-specific lockfiles, to avoid races during the intervals
+// when those container-specific lockfiles are open but not locked.
+//
+// Caller releases the lock by closing the returned file.
+func lockall() (*os.File, error) {
+	f, err := os.OpenFile(filepath.Join(lockdir, lockprefix+"all"+locksuffix), os.O_CREATE|os.O_RDWR, 0700)
+	if err != nil {
+		return nil, err
+	}
+	err = syscall.Flock(int(f.Fd()), syscall.LOCK_EX)
+	if err != nil {
+		f.Close()
+		return nil, err
+	}
+	return f, nil
+}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list