[ARVADOS] updated: 2.1.0-1818-g43be77c2f

Mon Jan 10 19:12:04 UTC 2022

Summary of changes:
 lib/crunchrun/cuda.go        | 36 ++++++++++++++++++++++--------------
 lib/crunchrun/docker.go      |  7 +++----
 lib/crunchrun/singularity.go |  9 ++++-----
 3 files changed, 29 insertions(+), 23 deletions(-)

       via  43be77c2f1b4b972113202bcd3d543fe0428778a (commit)
      from  c595d3cd2d9f117bc09cf66762d3698c95aebf86 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 43be77c2f1b4b972113202bcd3d543fe0428778a
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Mon Jan 10 14:11:37 2022 -0500

    12630: Report errors from nvidia-modprobe & use Getenv
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/lib/crunchrun/cuda.go b/lib/crunchrun/cuda.go
index 91949c588..8282359ea 100644
--- a/lib/crunchrun/cuda.go
+++ b/lib/crunchrun/cuda.go
@@ -5,6 +5,7 @@
 package crunchrun
 
 import (
+	"fmt"
 	"io"
 	"os/exec"
 )
@@ -41,23 +42,30 @@ func nvidiaModprobe(writer io.Writer) {
 	nvidiaSmi := exec.Command("nvidia-smi", "-L")
 	nvidiaSmi.Stdout = writer
 	nvidiaSmi.Stderr = writer
-	nvidiaSmi.Run()
+	err := nvidiaSmi.Run()
+	if err != nil {
+		writer.Write([]byte(fmt.Sprintf("nvidia-smi error: %v\n", err)))
+	}
 
 	// Load the kernel modules & devices associated with
 	// /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
-	// and /dev/nvidia-uvm-tools (-m, -l and -u).  Annoyingly, you
-	// don't have multiple devices but you need to supply "-c0"
-	// anyway or it won't make the device file.
-	exec.Command("nvidia-modprobe", "-c0", "-m").Run()
-	exec.Command("nvidia-modprobe", "-c0", "-l").Run()
-	exec.Command("nvidia-modprobe", "-c0", "-u").Run()
+	// and /dev/nvidia-uvm-tools (-m, -l and -u).  Annoyingly,
+	// these don't have multiple devices but you need to supply
+	// "-c0" anyway or it won't make the device file.
 
 	// Nvswitch devices are multi-GPU interconnects for up to 16
-	// GPUs.  Here we'll create /dev/nvidia-nvswitch0.  If someone
-	// runs Arvados on a system with multiple nvswitches
-	// (i.e. more than 16 GPUs) they can either ensure that the
-	// additional /dev/nvidia-nvswitch* devices exist before
-	// crunch-run starts or pay for support (because they clearly
-	// have the budget for it).
-	exec.Command("nvidia-modprobe", "-c0", "-s").Run()
+	// GPUs.  The "-c0 -s" flag will create /dev/nvidia-nvswitch0.
+	// If someone runs Arvados on a system with multiple
+	// nvswitches (i.e. more than 16 GPUs) they'll have to ensure
+	// that all the /dev/nvidia-nvswitch* devices exist before
+	// crunch-run starts.
+	for _, opt := range []string{"-m", "-l", "-u", "-s"} {
+		nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt)
+		nvmodprobe.Stdout = writer
+		nvmodprobe.Stderr = writer
+		err = nvmodprobe.Run()
+		if err != nil {
+			writer.Write([]byte(fmt.Sprintf("nvidia-modprobe error: %v\n", err)))
+		}
+	}
 }
diff --git a/lib/crunchrun/docker.go b/lib/crunchrun/docker.go
index c20f78bb1..06e5b5b1e 100644
--- a/lib/crunchrun/docker.go
+++ b/lib/crunchrun/docker.go
@@ -108,13 +108,12 @@ func (e *dockerExecutor) config(spec containerSpec) (dockercontainer.Config, doc
 	}
 	if spec.CUDADeviceCount != 0 {
 		var deviceIds []string
-		for _, s := range os.Environ() {
+		if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
 			// If a resource manager such as slurm or LSF told
 			// us to select specific devices we need to propagate that.
-			if strings.HasPrefix(s, "CUDA_VISIBLE_DEVICES=") {
-				deviceIds = strings.Split(strings.SplitN(s, "=", 2)[1], ",")
-			}
+			deviceIds = strings.Split(cudaVisibleDevices, ",")
 		}
+
 		deviceCount := spec.CUDADeviceCount
 		if len(deviceIds) > 0 {
 			// Docker won't accept both non-empty
diff --git a/lib/crunchrun/singularity.go b/lib/crunchrun/singularity.go
index 942de4300..64a377325 100644
--- a/lib/crunchrun/singularity.go
+++ b/lib/crunchrun/singularity.go
@@ -10,7 +10,6 @@ import (
 	"os"
 	"os/exec"
 	"sort"
-	"strings"
 	"syscall"
 	"time"
 
@@ -288,10 +287,10 @@ func (e *singularityExecutor) execCmd(path string) *exec.Cmd {
 	// Singularity always makes all nvidia devices visible to the
 	// container.  If a resource manager such as slurm or LSF told
 	// us to select specific devices we need to propagate that.
-	for _, s := range os.Environ() {
-		if strings.HasPrefix(s, "CUDA_VISIBLE_DEVICES=") {
-			env = append(env, "SINGULARITYENV_"+s)
-		}
+	if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
+		// If a resource manager such as slurm or LSF told
+		// us to select specific devices we need to propagate that.
+		env = append(env, "SINGULARITYENV_CUDA_VISIBLE_DEVICES="+cudaVisibleDevices)
 	}
 
 	args = append(args, e.imageFilename)

-----------------------------------------------------------------------


hooks/post-receive
--