[ARVADOS] updated: 2.1.0-1818-g43be77c2f
Git user
git at public.arvados.org
Mon Jan 10 19:12:04 UTC 2022
Summary of changes:
lib/crunchrun/cuda.go | 36 ++++++++++++++++++++++--------------
lib/crunchrun/docker.go | 7 +++----
lib/crunchrun/singularity.go | 9 ++++-----
3 files changed, 29 insertions(+), 23 deletions(-)
via 43be77c2f1b4b972113202bcd3d543fe0428778a (commit)
from c595d3cd2d9f117bc09cf66762d3698c95aebf86 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 43be77c2f1b4b972113202bcd3d543fe0428778a
Author: Peter Amstutz <peter.amstutz at curii.com>
Date: Mon Jan 10 14:11:37 2022 -0500
12630: Report errors from nvidia-modprobe & use Getenv
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>
diff --git a/lib/crunchrun/cuda.go b/lib/crunchrun/cuda.go
index 91949c588..8282359ea 100644
--- a/lib/crunchrun/cuda.go
+++ b/lib/crunchrun/cuda.go
@@ -5,6 +5,7 @@
package crunchrun
import (
+ "fmt"
"io"
"os/exec"
)
@@ -41,23 +42,30 @@ func nvidiaModprobe(writer io.Writer) {
nvidiaSmi := exec.Command("nvidia-smi", "-L")
nvidiaSmi.Stdout = writer
nvidiaSmi.Stderr = writer
- nvidiaSmi.Run()
+ err := nvidiaSmi.Run()
+ if err != nil {
+ writer.Write([]byte(fmt.Sprintf("nvidia-smi error: %v\n", err)))
+ }
// Load the kernel modules & devices associated with
// /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
- // and /dev/nvidia-uvm-tools (-m, -l and -u). Annoyingly, you
- // don't have multiple devices but you need to supply "-c0"
- // anyway or it won't make the device file.
- exec.Command("nvidia-modprobe", "-c0", "-m").Run()
- exec.Command("nvidia-modprobe", "-c0", "-l").Run()
- exec.Command("nvidia-modprobe", "-c0", "-u").Run()
+ // and /dev/nvidia-uvm-tools (-m, -l and -u). Annoyingly,
+ // these don't have multiple devices but you need to supply
+ // "-c0" anyway or it won't make the device file.
// Nvswitch devices are multi-GPU interconnects for up to 16
- // GPUs. Here we'll create /dev/nvidia-nvswitch0. If someone
- // runs Arvados on a system with multiple nvswitches
- // (i.e. more than 16 GPUs) they can either ensure that the
- // additional /dev/nvidia-nvswitch* devices exist before
- // crunch-run starts or pay for support (because they clearly
- // have the budget for it).
- exec.Command("nvidia-modprobe", "-c0", "-s").Run()
+ // GPUs. The "-c0 -s" flag will create /dev/nvidia-nvswitch0.
+ // If someone runs Arvados on a system with multiple
+ // nvswitches (i.e. more than 16 GPUs) they'll have to ensure
+ // that all the /dev/nvidia-nvswitch* devices exist before
+ // crunch-run starts.
+ for _, opt := range []string{"-m", "-l", "-u", "-s"} {
+ nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt)
+ nvmodprobe.Stdout = writer
+ nvmodprobe.Stderr = writer
+ err = nvmodprobe.Run()
+ if err != nil {
+ writer.Write([]byte(fmt.Sprintf("nvidia-modprobe error: %v\n", err)))
+ }
+ }
}
diff --git a/lib/crunchrun/docker.go b/lib/crunchrun/docker.go
index c20f78bb1..06e5b5b1e 100644
--- a/lib/crunchrun/docker.go
+++ b/lib/crunchrun/docker.go
@@ -108,13 +108,12 @@ func (e *dockerExecutor) config(spec containerSpec) (dockercontainer.Config, doc
}
if spec.CUDADeviceCount != 0 {
var deviceIds []string
- for _, s := range os.Environ() {
+ if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
// If a resource manager such as slurm or LSF told
// us to select specific devices we need to propagate that.
- if strings.HasPrefix(s, "CUDA_VISIBLE_DEVICES=") {
- deviceIds = strings.Split(strings.SplitN(s, "=", 2)[1], ",")
- }
+ deviceIds = strings.Split(cudaVisibleDevices, ",")
}
+
deviceCount := spec.CUDADeviceCount
if len(deviceIds) > 0 {
// Docker won't accept both non-empty
diff --git a/lib/crunchrun/singularity.go b/lib/crunchrun/singularity.go
index 942de4300..64a377325 100644
--- a/lib/crunchrun/singularity.go
+++ b/lib/crunchrun/singularity.go
@@ -10,7 +10,6 @@ import (
"os"
"os/exec"
"sort"
- "strings"
"syscall"
"time"
@@ -288,10 +287,10 @@ func (e *singularityExecutor) execCmd(path string) *exec.Cmd {
// Singularity always makes all nvidia devices visible to the
// container. If a resource manager such as slurm or LSF told
// us to select specific devices we need to propagate that.
- for _, s := range os.Environ() {
- if strings.HasPrefix(s, "CUDA_VISIBLE_DEVICES=") {
- env = append(env, "SINGULARITYENV_"+s)
- }
+ if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
+ // If a resource manager such as slurm or LSF told
+ // us to select specific devices we need to propagate that.
+ env = append(env, "SINGULARITYENV_CUDA_VISIBLE_DEVICES="+cudaVisibleDevices)
}
args = append(args, e.imageFilename)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list