[arvados] created: 2.5.0-158-g1472cb380

git repository hosting git at public.arvados.org
Mon Feb 13 16:02:01 UTC 2023


        at  1472cb3800e6545ef9c3059e6686768ec79b1386 (commit)


commit 1472cb3800e6545ef9c3059e6686768ec79b1386
Author: Brett Smith <brett.smith at curii.com>
Date:   Mon Feb 13 10:54:09 2023 -0500

    19986: Log resource maxima from crunch-run
    
    This aims to provide a human-friendly summary of resource usage at the
    end of logs.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go
index 41a69b406..4fb5d3208 100644
--- a/lib/crunchrun/crunchrun.go
+++ b/lib/crunchrun/crunchrun.go
@@ -728,6 +728,13 @@ func (runner *ContainerRunner) stopHoststat() error {
 		return nil
 	}
 	runner.hoststatReporter.Stop()
+	maxima := runner.hoststatReporter.GetMaxima()
+	for memKey, value := range maxima.Memory {
+		if memKey.ProcessName == "" {
+			continue
+		}
+		runner.reportMemoryMax(memKey.ProcessName, memKey.StatName, value, 0, "bytes")
+	}
 	err := runner.hoststatLogger.Close()
 	if err != nil {
 		return fmt.Errorf("error closing hoststat logs: %v", err)
@@ -1091,6 +1098,30 @@ func (runner *ContainerRunner) StartContainer() error {
 	return nil
 }
 
+func (runner *ContainerRunner) reportMemoryMax(
+	source string,
+	statName string,
+	value int64,
+	limit int64,
+	units string,
+) {
+	if units == "" {
+		units = "bytes"
+	}
+	if limit > 0 {
+		percentage := 100 * value / limit
+		runner.CrunchLog.Printf(
+			"Maximum %s memory %s usage was %d%%, %d/%d %s",
+			source, statName, percentage, value, limit, units,
+		)
+	} else {
+		runner.CrunchLog.Printf(
+			"Maximum %s memory %s usage was %d %s",
+			source, statName, value, units,
+		)
+	}
+}
+
 // WaitFinish waits for the container to terminate, capture the exit code, and
 // close the stdout/stderr logging.
 func (runner *ContainerRunner) WaitFinish() error {
@@ -1112,8 +1143,54 @@ func (runner *ContainerRunner) WaitFinish() error {
 		case <-ctx.Done():
 		}
 	}()
-	exitcode, err := runner.executor.Wait(ctx)
-	if err != nil {
+	// The order of operations here is precise: let the executor finish, THEN
+	// log crunchstat maxima, THEN fail if executor.Wait did.
+	exitcode, waitErr := runner.executor.Wait(ctx)
+	var err error
+	if runner.statReporter != nil {
+		runner.statReporter.Stop()
+
+		memLimits := map[string]int64{
+			"rss": runner.Container.RuntimeConstraints.RAM,
+		}
+		memUnits := map[string]string{
+			"pgmajfault": "faults",
+		}
+		maxima := runner.statReporter.GetMaxima()
+		runner.CrunchLog.Printf(
+			"Total CPU usage was %f user and %f sys on %d CPUs",
+			maxima.CPU.User, maxima.CPU.Sys, maxima.CPU.CPUCount,
+		)
+		for disk, sample := range maxima.DiskIO {
+			runner.CrunchLog.Printf(
+				"Total disk I/O on %s was %d bytes written and %d bytes read",
+				disk, sample.TXBytes, sample.RXBytes,
+			)
+		}
+		if maxima.DiskSpace.Total > 0 {
+			percentage := 100 * maxima.DiskSpace.Used / maxima.DiskSpace.Total
+			runner.CrunchLog.Printf(
+				"Maximum disk usage was %d%%, %d/%d bytes",
+				percentage, maxima.DiskSpace.Used, maxima.DiskSpace.Total,
+			)
+		}
+		for _, statName := range crunchstat.MemoryStats {
+			value, _ := maxima.GetMemstat(statName)
+			runner.reportMemoryMax("container", statName, value, memLimits[statName], memUnits[statName])
+		}
+		for ifname, sample := range maxima.NetIO {
+			runner.CrunchLog.Printf(
+				"Total network I/O on %s was %d bytes written and %d bytes read",
+				ifname, sample.TXBytes, sample.RXBytes,
+			)
+		}
+
+		err = runner.statLogger.Close()
+		if err != nil {
+			runner.CrunchLog.Printf("error closing crunchstat logs: %v", err)
+		}
+	}
+	if waitErr != nil {
 		runner.checkBrokenNode(err)
 		return err
 	}
@@ -1159,14 +1236,6 @@ func (runner *ContainerRunner) WaitFinish() error {
 			returnErr = err
 		}
 	}
-
-	if runner.statReporter != nil {
-		runner.statReporter.Stop()
-		err = runner.statLogger.Close()
-		if err != nil {
-			runner.CrunchLog.Printf("error closing crunchstat logs: %v", err)
-		}
-	}
 	return returnErr
 }
 
diff --git a/lib/crunchrun/crunchrun_test.go b/lib/crunchrun/crunchrun_test.go
index 0c69548e6..e3305a138 100644
--- a/lib/crunchrun/crunchrun_test.go
+++ b/lib/crunchrun/crunchrun_test.go
@@ -932,7 +932,7 @@ func (s *TestSuite) testLogRSSThresholds(c *C, ram int, expected []int, notExpec
 	var threshold int
 	for _, threshold = range expected {
 		err := s.searchLogs(logs, fmt.Sprintf(pattern, threshold, ram))
-		c.Check(err, IsNil, Commentf("%s", err))
+		c.Check(err, IsNil, Commentf("%s `%v`", err, pattern))
 	}
 	if notExpected > threshold {
 		err := s.searchLogs(logs, fmt.Sprintf(pattern, notExpected, ram))
@@ -953,6 +953,35 @@ func (s *TestSuite) TestLogAllRSSThresholds(c *C) {
 	s.testLogRSSThresholds(c, 734003299, []int{90, 95, 99}, 0)
 }
 
+func (s *TestSuite) TestLogMaximaAfterRun(c *C) {
+	s.runner.cgroupRoot = "testdata/fakestat"
+	s.runner.parentTemp = c.MkDir()
+	s.fullRunHelper(c, `{
+		"command": ["true"],
+		"container_image": "`+arvadostest.DockerImage112PDH+`",
+		"cwd": ".",
+		"environment": {},
+		"mounts": {"/tmp": {"kind": "tmp"} },
+		"output_path": "/tmp",
+		"priority": 1,
+		"runtime_constraints": {"ram": 7340032000},
+		"state": "Locked"
+	}`, nil, 0, func() {})
+	logs := s.api.Logs["crunch-run"]
+	for _, pattern := range []string{
+		`Total CPU usage was \d+\.\d+ user and \d+\.\d+ sys on \d+ CPUs\n$`,
+		`Maximum disk usage was \d+%, \d+/\d+ bytes\n$`,
+		`Maximum container memory cache usage was 73400320 bytes\n$`,
+		`Maximum container memory swap usage was 320 bytes\n$`,
+		`Maximum container memory pgmajfault usage was 20 faults\n$`,
+		`Maximum container memory rss usage was 10%, 734003200/7340032000 bytes\n$`,
+		`Maximum crunch-run memory rss usage was \d+ bytes\n$`,
+	} {
+		err := s.searchLogs(logs, `\dZ `+pattern)
+		c.Check(err, IsNil, Commentf("%s `%v`", err, pattern))
+	}
+}
+
 func (s *TestSuite) TestCommitNodeInfoBeforeStart(c *C) {
 	var collection_create, container_update arvadosclient.Dict
 	s.fullRunHelper(c, `{
diff --git a/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat b/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
index fff3333c4..22f0e13fa 100644
--- a/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
+++ b/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
@@ -1 +1,5 @@
 rss 734003200
+pgmajfault 3200
+total_cache 73400320
+total_pgmajfault 20
+total_swap 320

commit 41f3b2cd5cb209c70d405a88ec231d53aa2f2567
Author: Brett Smith <brett.smith at curii.com>
Date:   Mon Feb 13 10:20:24 2023 -0500

    19986: Add StatMaxima.GetMemstat method
    
    This makes it easier to report maxima consistently with the original
    crunchstat report.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index d6b337781..511e2cf4e 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -94,6 +94,18 @@ type StatMaxima struct {
 	NetIO     map[string]IOSample
 }
 
+// Report the best available maximum for a given cgroup memory stat.
+//
+// This checks the total_KEY stat before falling back to plain KEY.
+// This mirrors the logic in reportMemSample.
+func (maxima *StatMaxima) GetMemstat(statName string) (value int64, ok bool) {
+	value, ok = maxima.Memory[MemoryKey{StatName: "total_" + statName}]
+	if !ok {
+		value, ok = maxima.Memory[MemoryKey{StatName: statName}]
+	}
+	return
+}
+
 func newMaxima() StatMaxima {
 	return StatMaxima{
 		// Memory is the only field we need to initialize. Other maps are
diff --git a/lib/crunchstat/crunchstat_test.go b/lib/crunchstat/crunchstat_test.go
index e534b528c..993770b0f 100644
--- a/lib/crunchstat/crunchstat_test.go
+++ b/lib/crunchstat/crunchstat_test.go
@@ -251,3 +251,19 @@ func (s *suite) TestMaxima(c *C) {
 
 	c.Logf("%s", s.logbuf.String())
 }
+
+func (s *suite) TestMaximaGetMemstat(c *C) {
+	maxima := newMaxima()
+	maxima.Memory[MemoryKey{StatName: "total_rss"}] = 987
+	maxima.Memory[MemoryKey{StatName: "rss"}] = 654
+	maxima.Memory[MemoryKey{StatName: "swap"}] = 321
+	value, ok := maxima.GetMemstat("rss")
+	c.Check(value, Equals, int64(987))
+	c.Check(ok, Equals, true)
+	value, ok = maxima.GetMemstat("swap")
+	c.Check(value, Equals, int64(321))
+	c.Check(ok, Equals, true)
+	value, ok = maxima.GetMemstat("cache")
+	c.Check(value, Equals, int64(0))
+	c.Check(ok, Equals, false)
+}

commit 5e623a3a5abf2a527c8a24f5d4c3a0142c4454e8
Author: Brett Smith <brett.smith at curii.com>
Date:   Sun Feb 12 23:26:23 2023 -0500

    19986: Record maximum value crunchstat sees for each statistic
    
    See the comments on GetMaxima for details about how the maximum is
    defined for each statistic.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index 9c869ef2d..d6b337781 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -68,6 +68,7 @@ type Reporter struct {
 	lastDiskSpaceSample DiskSpaceSample
 	lastMemSample       memSample
 	rssThresholds       []ThresholdAlert
+	maxima              StatMaxima
 
 	reportPIDs   map[string]int
 	reportPIDsMu sync.Mutex
@@ -76,6 +77,32 @@ type Reporter struct {
 	flushed chan struct{} // closed when we have made our last report
 }
 
+// MemoryKey is a key into StatMaxima.Memory.
+// Initialize it with just StatName to get the host/cgroup maximum.
+// Initialize it with all fields to get that process' maximum.
+type MemoryKey struct {
+	ProcessID   int
+	ProcessName string
+	StatName    string
+}
+
+type StatMaxima struct {
+	CPU       CPUSample
+	DiskIO    map[string]IOSample
+	DiskSpace DiskSpaceSample
+	Memory    map[MemoryKey]int64
+	NetIO     map[string]IOSample
+}
+
+func newMaxima() StatMaxima {
+	return StatMaxima{
+		// Memory is the only field we need to initialize. Other maps are
+		// set unconditionally by GetMaxima after the Reporter runs, and
+		// zero value structs are fine for the rest.
+		Memory: make(map[MemoryKey]int64),
+	}
+}
+
 type ThresholdAlert struct {
 	// A short identifier for the statistic that passed a threshold
 	StatName string
@@ -133,6 +160,7 @@ func (r *Reporter) SetupRSSThresholds(total int64, percentages []int64) {
 //
 // Callers should not modify public data fields after calling Start.
 func (r *Reporter) Start() {
+	r.maxima = newMaxima()
 	r.ThresholdAlerts = make(chan ThresholdAlert, len(r.rssThresholds))
 	r.done = make(chan struct{})
 	r.flushed = make(chan struct{})
@@ -160,6 +188,25 @@ func (r *Reporter) Stop() {
 	close(r.ThresholdAlerts)
 }
 
+// Get the maximum value of each statistic seen during the reporting period.
+//
+// This function will not return until Stop has been called and the Reporter
+// has finished.
+//
+// For CPU and I/O stats, the maximum is the last reported value.
+//
+// For disk space, the maximum is the highest amount of TempDir used.
+//
+// Memory statistics are all reported individually. See the comments on
+// MemoryKey for details about how to access them.
+func (r *Reporter) GetMaxima() StatMaxima {
+	<-r.flushed
+	r.maxima.CPU = r.lastCPUSample
+	r.maxima.DiskIO = r.lastDiskIOSample
+	r.maxima.NetIO = r.lastNetSample
+	return r.maxima
+}
+
 func (r *Reporter) readAllOrWarn(in io.Reader) ([]byte, error) {
 	content, err := ioutil.ReadAll(in)
 	if err != nil {
@@ -317,6 +364,10 @@ func (r *Reporter) getMemSample() {
 			continue
 		}
 		thisSample.memStat[stat] = val
+		maxKey := MemoryKey{StatName: stat}
+		if val > r.maxima.Memory[maxKey] {
+			r.maxima.Memory[maxKey] = val
+		}
 	}
 
 	var thresholdsPassed uint
@@ -409,7 +460,12 @@ func (r *Reporter) doProcmemStats() {
 		if err != nil {
 			continue
 		}
-		procmem += fmt.Sprintf(" %d %s", rss*r.kernelPageSize, procname)
+		value := rss * r.kernelPageSize
+		procmem += fmt.Sprintf(" %d %s", value, procname)
+		maxKey := MemoryKey{pid, procname, "rss"}
+		if value > r.maxima.Memory[maxKey] {
+			r.maxima.Memory[maxKey] = value
+		}
 	}
 	if procmem != "" {
 		r.Logger.Printf("procmem%s\n", procmem)
@@ -482,6 +538,9 @@ func (r *Reporter) doDiskSpaceStats() {
 		Used:       (s.Blocks - s.Bfree) * bs,
 		Available:  s.Bavail * bs,
 	}
+	if nextSample.Used > r.maxima.DiskSpace.Used {
+		r.maxima.DiskSpace = nextSample
+	}
 
 	var delta string
 	if r.lastDiskSpaceSample.hasData {
diff --git a/lib/crunchstat/crunchstat_test.go b/lib/crunchstat/crunchstat_test.go
index f0bbd5aca..e534b528c 100644
--- a/lib/crunchstat/crunchstat_test.go
+++ b/lib/crunchstat/crunchstat_test.go
@@ -6,7 +6,9 @@ package crunchstat
 
 import (
 	"bytes"
+	"errors"
 	"os"
+	"path"
 	"regexp"
 	"strconv"
 	"testing"
@@ -41,9 +43,10 @@ var _ = Suite(&suite{
 })
 
 type suite struct {
-	logbuf    bytes.Buffer
-	logger    *logrus.Logger
-	startTime time.Time
+	cgroupRoot string
+	logbuf     bytes.Buffer
+	logger     *logrus.Logger
+	startTime  time.Time
 }
 
 func (s *suite) SetUpSuite(c *C) {
@@ -51,10 +54,50 @@ func (s *suite) SetUpSuite(c *C) {
 }
 
 func (s *suite) SetUpTest(c *C) {
+	s.cgroupRoot = ""
 	s.logbuf.Reset()
 	s.startTime = time.Now()
 }
 
+func (s *suite) tempCgroup(c *C, sourceDir string) error {
+	tempDir := c.MkDir()
+	dirents, err := os.ReadDir(sourceDir)
+	if err != nil {
+		return err
+	}
+	for _, dirent := range dirents {
+		srcData, err := os.ReadFile(path.Join(sourceDir, dirent.Name()))
+		if err != nil {
+			return err
+		}
+		destPath := path.Join(tempDir, dirent.Name())
+		err = os.WriteFile(destPath, srcData, 0o600)
+		if err != nil {
+			return err
+		}
+	}
+	s.cgroupRoot = tempDir
+	return nil
+}
+
+func (s *suite) addPidToCgroup(pid int) error {
+	if s.cgroupRoot == "" {
+		return errors.New("cgroup has not been set up for this test")
+	}
+	procsPath := path.Join(s.cgroupRoot, "cgroup.procs")
+	procsFile, err := os.OpenFile(procsPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o600)
+	if err != nil {
+		return err
+	}
+	pidLine := strconv.Itoa(pid) + "\n"
+	_, err = procsFile.Write([]byte(pidLine))
+	if err != nil {
+		procsFile.Close()
+		return err
+	}
+	return procsFile.Close()
+}
+
 func (s *suite) TestReadAllOrWarnFail(c *C) {
 	rep := Reporter{Logger: s.logger}
 
@@ -174,3 +217,37 @@ func (s *suite) TestMultipleRSSThresholdsSomePassed(c *C) {
 func (s *suite) TestMultipleRSSThresholdsAllPassed(c *C) {
 	s.testRSSThresholds(c, []int64{1, 2, 3}, 3)
 }
+
+func (s *suite) TestMaxima(c *C) {
+	err := s.tempCgroup(c, fakeRSS.cgroupRoot)
+	c.Assert(err, IsNil)
+	pid := os.Getpid()
+	err = s.addPidToCgroup(pid)
+	c.Assert(err, IsNil)
+
+	rep := Reporter{
+		CgroupRoot: s.cgroupRoot,
+		Logger:     s.logger,
+		PollPeriod: time.Second * 10,
+		TempDir:    s.cgroupRoot,
+	}
+	rep.ReportPID("MaximaTest", pid)
+	rep.Start()
+	rep.Stop()
+	actual := rep.GetMaxima()
+
+	// FIXME: Add checks for the remaining StatMaxima fields. I'm not doing
+	// that on the first pass because I don't feel confident they'll work
+	// consistently across developer machines.
+
+	c.Check(actual.DiskSpace.Total, Not(Equals), uint64(0))
+	c.Check(actual.DiskSpace.Used, Not(Equals), uint64(0))
+	c.Check(actual.DiskSpace.Available, Not(Equals), uint64(0))
+
+	memKey := MemoryKey{StatName: "rss"}
+	c.Check(actual.Memory[memKey], Equals, fakeRSS.value)
+	memKey = MemoryKey{pid, "MaximaTest", "rss"}
+	c.Check(actual.Memory[memKey], Not(Equals), int64(0))
+
+	c.Logf("%s", s.logbuf.String())
+}

commit 4040234f6cf3e2aa54c0896df4b72581d73f3fa1
Author: Brett Smith <brett.smith at curii.com>
Date:   Sun Feb 12 14:04:25 2023 -0500

    19986: Export some sample structs from crunchstat
    
    This is preparation to make maximum statistics public.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index 61ee72a66..9c869ef2d 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -23,6 +23,9 @@ import (
 	"time"
 )
 
+// crunchstat collects all memory statistics, but only reports these.
+var MemoryStats = [...]string{"cache", "swap", "pgmajfault", "rss"}
+
 // A Reporter gathers statistics for a cgroup and writes them to a
 // log.Logger.
 type Reporter struct {
@@ -59,10 +62,10 @@ type Reporter struct {
 
 	kernelPageSize      int64
 	reportedStatFile    map[string]string
-	lastNetSample       map[string]ioSample
-	lastDiskIOSample    map[string]ioSample
-	lastCPUSample       cpuSample
-	lastDiskSpaceSample diskSpaceSample
+	lastNetSample       map[string]IOSample
+	lastDiskIOSample    map[string]IOSample
+	lastCPUSample       CPUSample
+	lastDiskSpaceSample DiskSpaceSample
 	lastMemSample       memSample
 	rssThresholds       []ThresholdAlert
 
@@ -244,10 +247,10 @@ func (r *Reporter) getContainerNetStats() (io.Reader, error) {
 	return nil, errors.New("Could not read stats for any proc in container")
 }
 
-type ioSample struct {
+type IOSample struct {
 	sampleTime time.Time
-	txBytes    int64
-	rxBytes    int64
+	TXBytes    int64
+	RXBytes    int64
 }
 
 func (r *Reporter) doBlkIOStats() {
@@ -258,38 +261,38 @@ func (r *Reporter) doBlkIOStats() {
 	defer c.Close()
 	b := bufio.NewScanner(c)
 	var sampleTime = time.Now()
-	newSamples := make(map[string]ioSample)
+	newSamples := make(map[string]IOSample)
 	for b.Scan() {
 		var device, op string
 		var val int64
 		if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &val); err != nil {
 			continue
 		}
-		var thisSample ioSample
+		var thisSample IOSample
 		var ok bool
 		if thisSample, ok = newSamples[device]; !ok {
-			thisSample = ioSample{sampleTime, -1, -1}
+			thisSample = IOSample{sampleTime, -1, -1}
 		}
 		switch op {
 		case "Read":
-			thisSample.rxBytes = val
+			thisSample.RXBytes = val
 		case "Write":
-			thisSample.txBytes = val
+			thisSample.TXBytes = val
 		}
 		newSamples[device] = thisSample
 	}
 	for dev, sample := range newSamples {
-		if sample.txBytes < 0 || sample.rxBytes < 0 {
+		if sample.TXBytes < 0 || sample.RXBytes < 0 {
 			continue
 		}
 		delta := ""
 		if prev, ok := r.lastDiskIOSample[dev]; ok {
 			delta = fmt.Sprintf(" -- interval %.4f seconds %d write %d read",
 				sample.sampleTime.Sub(prev.sampleTime).Seconds(),
-				sample.txBytes-prev.txBytes,
-				sample.rxBytes-prev.rxBytes)
+				sample.TXBytes-prev.TXBytes,
+				sample.RXBytes-prev.RXBytes)
 		}
-		r.Logger.Printf("blkio:%s %d write %d read%s\n", dev, sample.txBytes, sample.rxBytes, delta)
+		r.Logger.Printf("blkio:%s %d write %d read%s\n", dev, sample.TXBytes, sample.RXBytes, delta)
 		r.lastDiskIOSample[dev] = sample
 	}
 }
@@ -333,8 +336,7 @@ func (r *Reporter) getMemSample() {
 
 func (r *Reporter) reportMemSample() {
 	var outstat bytes.Buffer
-	wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
-	for _, key := range wantStats {
+	for _, key := range MemoryStats {
 		// Use "total_X" stats (entire hierarchy) if enabled,
 		// otherwise just the single cgroup -- see
 		// https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
@@ -441,29 +443,29 @@ func (r *Reporter) doNetworkStats() {
 		if rx, err = strconv.ParseInt(words[1], 10, 64); err != nil {
 			continue
 		}
-		nextSample := ioSample{}
+		nextSample := IOSample{}
 		nextSample.sampleTime = sampleTime
-		nextSample.txBytes = tx
-		nextSample.rxBytes = rx
+		nextSample.TXBytes = tx
+		nextSample.RXBytes = rx
 		var delta string
 		if prev, ok := r.lastNetSample[ifName]; ok {
 			interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
 			delta = fmt.Sprintf(" -- interval %.4f seconds %d tx %d rx",
 				interval,
-				tx-prev.txBytes,
-				rx-prev.rxBytes)
+				tx-prev.TXBytes,
+				rx-prev.RXBytes)
 		}
 		r.Logger.Printf("net:%s %d tx %d rx%s\n", ifName, tx, rx, delta)
 		r.lastNetSample[ifName] = nextSample
 	}
 }
 
-type diskSpaceSample struct {
+type DiskSpaceSample struct {
 	hasData    bool
 	sampleTime time.Time
-	total      uint64
-	used       uint64
-	available  uint64
+	Total      uint64
+	Used       uint64
+	Available  uint64
 }
 
 func (r *Reporter) doDiskSpaceStats() {
@@ -473,12 +475,12 @@ func (r *Reporter) doDiskSpaceStats() {
 		return
 	}
 	bs := uint64(s.Bsize)
-	nextSample := diskSpaceSample{
+	nextSample := DiskSpaceSample{
 		hasData:    true,
 		sampleTime: time.Now(),
-		total:      s.Blocks * bs,
-		used:       (s.Blocks - s.Bfree) * bs,
-		available:  s.Bavail * bs,
+		Total:      s.Blocks * bs,
+		Used:       (s.Blocks - s.Bfree) * bs,
+		Available:  s.Bavail * bs,
 	}
 
 	var delta string
@@ -487,19 +489,19 @@ func (r *Reporter) doDiskSpaceStats() {
 		interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
 		delta = fmt.Sprintf(" -- interval %.4f seconds %d used",
 			interval,
-			int64(nextSample.used-prev.used))
+			int64(nextSample.Used-prev.Used))
 	}
 	r.Logger.Printf("statfs %d available %d used %d total%s\n",
-		nextSample.available, nextSample.used, nextSample.total, delta)
+		nextSample.Available, nextSample.Used, nextSample.Total, delta)
 	r.lastDiskSpaceSample = nextSample
 }
 
-type cpuSample struct {
+type CPUSample struct {
 	hasData    bool // to distinguish the zero value from real data
 	sampleTime time.Time
-	user       float64
-	sys        float64
-	cpus       int64
+	User       float64
+	Sys        float64
+	CPUCount   int64
 }
 
 // Return the number of CPUs available in the container. Return 0 if
@@ -542,23 +544,23 @@ func (r *Reporter) doCPUStats() {
 	var userTicks, sysTicks int64
 	fmt.Sscanf(string(b), "user %d\nsystem %d", &userTicks, &sysTicks)
 	userHz := float64(100)
-	nextSample := cpuSample{
+	nextSample := CPUSample{
 		hasData:    true,
 		sampleTime: time.Now(),
-		user:       float64(userTicks) / userHz,
-		sys:        float64(sysTicks) / userHz,
-		cpus:       r.getCPUCount(),
+		User:       float64(userTicks) / userHz,
+		Sys:        float64(sysTicks) / userHz,
+		CPUCount:   r.getCPUCount(),
 	}
 
 	delta := ""
 	if r.lastCPUSample.hasData {
 		delta = fmt.Sprintf(" -- interval %.4f seconds %.4f user %.4f sys",
 			nextSample.sampleTime.Sub(r.lastCPUSample.sampleTime).Seconds(),
-			nextSample.user-r.lastCPUSample.user,
-			nextSample.sys-r.lastCPUSample.sys)
+			nextSample.User-r.lastCPUSample.User,
+			nextSample.Sys-r.lastCPUSample.Sys)
 	}
 	r.Logger.Printf("cpu %.4f user %.4f sys %d cpus%s\n",
-		nextSample.user, nextSample.sys, nextSample.cpus, delta)
+		nextSample.User, nextSample.Sys, nextSample.CPUCount, delta)
 	r.lastCPUSample = nextSample
 }
 
@@ -582,8 +584,8 @@ func (r *Reporter) run() {
 		return
 	}
 
-	r.lastNetSample = make(map[string]ioSample)
-	r.lastDiskIOSample = make(map[string]ioSample)
+	r.lastNetSample = make(map[string]IOSample)
+	r.lastDiskIOSample = make(map[string]IOSample)
 
 	if len(r.TempDir) == 0 {
 		// Temporary dir not provided, try to get it from the environment.

commit c550b07df2f1567d1a0449480959d03bbc0404dd
Author: Brett Smith <brett.smith at curii.com>
Date:   Sat Feb 11 15:38:51 2023 -0500

    19986: Log when a container uses nearly max RAM
    
    This is meant to help users diagnose when their container likely failed
    because of OOM.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go
index 3def8851c..41a69b406 100644
--- a/lib/crunchrun/crunchrun.go
+++ b/lib/crunchrun/crunchrun.go
@@ -765,10 +765,30 @@ func (runner *ContainerRunner) startCrunchstat() error {
 		PollPeriod:   runner.statInterval,
 		TempDir:      runner.parentTemp,
 	}
+	runner.statReporter.SetupRSSThresholds(
+		runner.Container.RuntimeConstraints.RAM,
+		[]int64{90, 95, 99},
+	)
 	runner.statReporter.Start()
+	go runner.logCrunchstatAlerts()
 	return nil
 }
 
+func (runner *ContainerRunner) logCrunchstatAlerts() {
+	for {
+		alert := <-runner.statReporter.ThresholdAlerts
+		switch alert.StatName {
+		case "mem rss":
+			runner.CrunchLog.Printf(
+				"Container using over %d%% of memory (rss %d/%d %s)",
+				alert.ThresholdPercentage, alert.Value, alert.Total, alert.Unit,
+			)
+		case "":
+			return
+		}
+	}
+}
+
 type infoCommand struct {
 	label string
 	cmd   []string
diff --git a/lib/crunchrun/crunchrun_test.go b/lib/crunchrun/crunchrun_test.go
index aaba1c420..0c69548e6 100644
--- a/lib/crunchrun/crunchrun_test.go
+++ b/lib/crunchrun/crunchrun_test.go
@@ -17,6 +17,7 @@ import (
 	"os/exec"
 	"regexp"
 	"runtime/pprof"
+	"strconv"
 	"strings"
 	"sync"
 	"syscall"
@@ -898,6 +899,60 @@ func (s *TestSuite) TestLogVersionAndRuntime(c *C) {
 	c.Check(s.api.Logs["crunch-run"].String(), Matches, `(?ms).*Using container runtime: stub.*`)
 }
 
+func (s *TestSuite) searchLogs(logs *bytes.Buffer, pattern string) error {
+	var line string
+	var err error
+	re, err := regexp.Compile(pattern)
+	if err != nil {
+		return err
+	}
+	for ; err == nil; line, err = logs.ReadString('\n') {
+		if re.MatchString(line) {
+			return nil
+		}
+	}
+	return errors.New("logs did not match pattern")
+}
+
+func (s *TestSuite) testLogRSSThresholds(c *C, ram int, expected []int, notExpected int) {
+	s.runner.cgroupRoot = "testdata/fakestat"
+	s.fullRunHelper(c, `{
+		"command": ["true"],
+		"container_image": "`+arvadostest.DockerImage112PDH+`",
+		"cwd": ".",
+		"environment": {},
+		"mounts": {"/tmp": {"kind": "tmp"} },
+		"output_path": "/tmp",
+		"priority": 1,
+		"runtime_constraints": {"ram": `+strconv.Itoa(ram)+`},
+		"state": "Locked"
+	}`, nil, 0, func() {})
+	logs := s.api.Logs["crunch-run"]
+	pattern := `\dZ Container using over %d%% of memory \(rss 734003200/%d bytes\)\n$`
+	var threshold int
+	for _, threshold = range expected {
+		err := s.searchLogs(logs, fmt.Sprintf(pattern, threshold, ram))
+		c.Check(err, IsNil, Commentf("%s", err))
+	}
+	if notExpected > threshold {
+		err := s.searchLogs(logs, fmt.Sprintf(pattern, notExpected, ram))
+		c.Check(err, NotNil, Commentf("%d%% threshold logged", notExpected))
+	}
+}
+
+func (s *TestSuite) TestLogNoRSSThresholds(c *C) {
+	s.testLogRSSThresholds(c, 7340032000, []int{}, 90)
+}
+
+func (s *TestSuite) TestLogSomeRSSThresholds(c *C) {
+	onePercentRSS := 7340032
+	s.testLogRSSThresholds(c, 102*onePercentRSS, []int{90, 95}, 99)
+}
+
+func (s *TestSuite) TestLogAllRSSThresholds(c *C) {
+	s.testLogRSSThresholds(c, 734003299, []int{90, 95, 99}, 0)
+}
+
 func (s *TestSuite) TestCommitNodeInfoBeforeStart(c *C) {
 	var collection_create, container_update arvadosclient.Dict
 	s.fullRunHelper(c, `{
diff --git a/lib/crunchrun/testdata/fakestat/cgroup.procs b/lib/crunchrun/testdata/fakestat/cgroup.procs
new file mode 100644
index 000000000..e69de29bb
diff --git a/lib/crunchrun/testdata/fakestat/cgroupid/cgroup.procs b/lib/crunchrun/testdata/fakestat/cgroupid/cgroup.procs
new file mode 100644
index 000000000..e69de29bb
diff --git a/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat b/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
new file mode 100644
index 000000000..fff3333c4
--- /dev/null
+++ b/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
@@ -0,0 +1 @@
+rss 734003200

commit 6b8fde42caacac639ded1c1a7d5aebff61a29e34
Author: Brett Smith <brett.smith at curii.com>
Date:   Sun Feb 12 21:10:44 2023 -0500

    19986: Add RSS threshold alerts to crunchstat
    
    This provides a channel through which users (namely crunch-run) can be
    alerted when monitored statistics pass configured thresholds. Right now
    it only supports this for memory rss, but the design should be
    extensible to other statistics in the future.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/.licenseignore b/.licenseignore
index 6ddb5c009..3d24c4ee3 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -92,4 +92,5 @@ sdk/cwl/tests/wf/hello.txt
 sdk/cwl/tests/wf/indir1/hello2.txt
 sdk/cwl/tests/chipseq/data/Genomes/*
 CITATION.cff
-SECURITY.md
\ No newline at end of file
+SECURITY.md
+*/testdata/fakestat/*
diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index 5d2059f7e..61ee72a66 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -53,6 +53,10 @@ type Reporter struct {
 		Printf(fmt string, args ...interface{})
 	}
 
+	// When stats cross configured thresholds (set up by the Setup*Thresholds
+	// methods) reports are sent to this channel.
+	ThresholdAlerts chan ThresholdAlert
+
 	kernelPageSize      int64
 	reportedStatFile    map[string]string
 	lastNetSample       map[string]ioSample
@@ -60,6 +64,7 @@ type Reporter struct {
 	lastCPUSample       cpuSample
 	lastDiskSpaceSample diskSpaceSample
 	lastMemSample       memSample
+	rssThresholds       []ThresholdAlert
 
 	reportPIDs   map[string]int
 	reportPIDsMu sync.Mutex
@@ -68,6 +73,51 @@ type Reporter struct {
 	flushed chan struct{} // closed when we have made our last report
 }
 
+type ThresholdAlert struct {
+	// A short identifier for the statistic that passed a threshold
+	StatName string
+
+	// The percentage of available resource that the threshold represents
+	// (e.g., 50 == 50% of available whatever)
+	ThresholdPercentage int64
+
+	// The absolute value that the threshold represents, in the resource's
+	// native unit
+	// (e.g., Total available resource * ThresholdPercentage / 100 == Threshold)
+	Threshold int64
+
+	// The time that the threshold was passed
+	Time time.Time
+
+	// The total available resource, in the resource's native unit
+	Total int64
+
+	// A human-friendly string that identifies the unit of measurement
+	// for Threshold, Total, and Value
+	Unit string
+
+	// The value that the statistic had when it passed the threshold,
+	// in the resource's native unit
+	Value int64
+}
+
+// Set up threshold alerts for memory RSS.
+// total is the amount of RAM available, in bytes.
+// percentages is a slice of percentages, represented as integers
+// (e.g., 25 for 25%). An alert will be set up for each percentage of the
+// total available RAM. Note percentages MUST be sorted in ascending order.
+func (r *Reporter) SetupRSSThresholds(total int64, percentages []int64) {
+	for _, percentage := range percentages {
+		r.rssThresholds = append(r.rssThresholds, ThresholdAlert{
+			StatName:            "mem rss",
+			ThresholdPercentage: percentage,
+			Threshold:           total * percentage / 100,
+			Total:               total,
+			Unit:                "bytes",
+		})
+	}
+}
+
 // Start starts monitoring in a new goroutine, and returns
 // immediately.
 //
@@ -80,6 +130,7 @@ type Reporter struct {
 //
 // Callers should not modify public data fields after calling Start.
 func (r *Reporter) Start() {
+	r.ThresholdAlerts = make(chan ThresholdAlert, len(r.rssThresholds))
 	r.done = make(chan struct{})
 	r.flushed = make(chan struct{})
 	go r.run()
@@ -103,6 +154,7 @@ func (r *Reporter) ReportPID(name string, pid int) {
 func (r *Reporter) Stop() {
 	close(r.done)
 	<-r.flushed
+	close(r.ThresholdAlerts)
 }
 
 func (r *Reporter) readAllOrWarn(in io.Reader) ([]byte, error) {
@@ -263,6 +315,19 @@ func (r *Reporter) getMemSample() {
 		}
 		thisSample.memStat[stat] = val
 	}
+
+	var thresholdsPassed uint
+	rss, ok := thisSample.memStat["rss"]
+	for _, threshold := range r.rssThresholds {
+		if !ok || rss < threshold.Threshold {
+			break
+		}
+		threshold.Time = thisSample.sampleTime
+		threshold.Value = rss
+		r.ThresholdAlerts <- threshold
+		thresholdsPassed++
+	}
+	r.rssThresholds = r.rssThresholds[thresholdsPassed:]
 	r.lastMemSample = thisSample
 }
 
diff --git a/lib/crunchstat/crunchstat_test.go b/lib/crunchstat/crunchstat_test.go
index b4498a135..f0bbd5aca 100644
--- a/lib/crunchstat/crunchstat_test.go
+++ b/lib/crunchstat/crunchstat_test.go
@@ -16,6 +16,22 @@ import (
 	. "gopkg.in/check.v1"
 )
 
+const GiB = int64(1024 * 1024 * 1024)
+
+type fakeStat struct {
+	cgroupRoot string
+	statName   string
+	unit       string
+	value      int64
+}
+
+var fakeRSS = fakeStat{
+	cgroupRoot: "testdata/fakestat",
+	statName:   "mem rss",
+	unit:       "bytes",
+	value:      750 * 1024 * 1024,
+}
+
 func Test(t *testing.T) {
 	TestingT(t)
 }
@@ -25,8 +41,9 @@ var _ = Suite(&suite{
 })
 
 type suite struct {
-	logbuf bytes.Buffer
-	logger *logrus.Logger
+	logbuf    bytes.Buffer
+	logger    *logrus.Logger
+	startTime time.Time
 }
 
 func (s *suite) SetUpSuite(c *C) {
@@ -35,6 +52,7 @@ func (s *suite) SetUpSuite(c *C) {
 
 func (s *suite) SetUpTest(c *C) {
 	s.logbuf.Reset()
+	s.startTime = time.Now()
 }
 
 func (s *suite) TestReadAllOrWarnFail(c *C) {
@@ -90,3 +108,69 @@ func (s *suite) TestReportPIDs(c *C) {
 	}
 	c.Logf("%s", s.logbuf.String())
 }
+
+func (s *suite) testRSSThresholds(c *C, rssPercentages []int64, alertCount int) {
+	c.Assert(alertCount <= len(rssPercentages), Equals, true)
+	rep := Reporter{
+		Logger:     s.logger,
+		CgroupRoot: fakeRSS.cgroupRoot,
+		PollPeriod: time.Second * 10,
+	}
+	rep.SetupRSSThresholds(GiB, rssPercentages)
+	timeout := time.After(time.Second)
+
+	rep.Start()
+	for _, expectPercentage := range rssPercentages[:alertCount] {
+		select {
+		case alert := <-rep.ThresholdAlerts:
+			expected := ThresholdAlert{
+				StatName:            fakeRSS.statName,
+				ThresholdPercentage: expectPercentage,
+				Threshold:           GiB * expectPercentage / 100,
+				// This Time value makes the check a noop but it lets us easily
+				// use DeepEquals. We check it has a good value below.
+				Time:  alert.Time,
+				Total: GiB,
+				Unit:  fakeRSS.unit,
+				Value: fakeRSS.value,
+			}
+			c.Check(alert, DeepEquals, expected)
+			c.Check(alert.Time.After(s.startTime), Equals, true)
+		case <-timeout:
+			c.Errorf("timed out waiting for %d%% alert", expectPercentage)
+		}
+	}
+
+	rep.Stop()
+	select {
+	case alert := <-rep.ThresholdAlerts:
+		c.Check(alert, DeepEquals, ThresholdAlert{})
+	case <-timeout:
+		c.Error("timed out waiting for zero value alert after close")
+	}
+	c.Logf("%s", s.logbuf.String())
+}
+
+func (s *suite) TestZeroRSSThresholds(c *C) {
+	s.testRSSThresholds(c, []int64{}, 0)
+}
+
+func (s *suite) TestOneRSSThresholdPassed(c *C) {
+	s.testRSSThresholds(c, []int64{55}, 1)
+}
+
+func (s *suite) TestOneRSSThresholdNotPassed(c *C) {
+	s.testRSSThresholds(c, []int64{85}, 0)
+}
+
+func (s *suite) TestMultipleRSSThresholdsNonePassed(c *C) {
+	s.testRSSThresholds(c, []int64{95, 97, 99}, 0)
+}
+
+func (s *suite) TestMultipleRSSThresholdsSomePassed(c *C) {
+	s.testRSSThresholds(c, []int64{60, 70, 80, 90}, 2)
+}
+
+func (s *suite) TestMultipleRSSThresholdsAllPassed(c *C) {
+	s.testRSSThresholds(c, []int64{1, 2, 3}, 3)
+}
diff --git a/lib/crunchstat/testdata/fakestat/cgroup.procs b/lib/crunchstat/testdata/fakestat/cgroup.procs
new file mode 100644
index 000000000..e69de29bb
diff --git a/lib/crunchstat/testdata/fakestat/memory.stat b/lib/crunchstat/testdata/fakestat/memory.stat
new file mode 100644
index 000000000..4e6bed18b
--- /dev/null
+++ b/lib/crunchstat/testdata/fakestat/memory.stat
@@ -0,0 +1 @@
+rss 786432000

commit cec07f8e4d5019f69d085023f80f56b3d4c1e032
Author: Brett Smith <brett.smith at curii.com>
Date:   Fri Feb 10 14:47:19 2023 -0500

    19986: DRY up logger setup in crunchstat tests
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/lib/crunchstat/crunchstat_test.go b/lib/crunchstat/crunchstat_test.go
index 5e8e93de6..b4498a135 100644
--- a/lib/crunchstat/crunchstat_test.go
+++ b/lib/crunchstat/crunchstat_test.go
@@ -6,7 +6,6 @@ package crunchstat
 
 import (
 	"bytes"
-	"log"
 	"os"
 	"regexp"
 	"strconv"
@@ -21,13 +20,25 @@ func Test(t *testing.T) {
 	TestingT(t)
 }
 
-var _ = Suite(&suite{})
+var _ = Suite(&suite{
+	logger: logrus.New(),
+})
 
-type suite struct{}
+type suite struct {
+	logbuf bytes.Buffer
+	logger *logrus.Logger
+}
+
+func (s *suite) SetUpSuite(c *C) {
+	s.logger.Out = &s.logbuf
+}
+
+func (s *suite) SetUpTest(c *C) {
+	s.logbuf.Reset()
+}
 
 func (s *suite) TestReadAllOrWarnFail(c *C) {
-	var logger bytes.Buffer
-	rep := Reporter{Logger: log.New(&logger, "", 0)}
+	rep := Reporter{Logger: s.logger}
 
 	// The special file /proc/self/mem can be opened for
 	// reading, but reading from byte 0 returns an error.
@@ -36,12 +47,11 @@ func (s *suite) TestReadAllOrWarnFail(c *C) {
 	defer f.Close()
 	_, err = rep.readAllOrWarn(f)
 	c.Check(err, NotNil)
-	c.Check(logger.String(), Matches, "^warning: read /proc/self/mem: .*\n")
+	c.Check(s.logbuf.String(), Matches, ".* msg=\"warning: read /proc/self/mem: .*\n")
 }
 
 func (s *suite) TestReadAllOrWarnSuccess(c *C) {
-	var logbuf bytes.Buffer
-	rep := Reporter{Logger: log.New(&logbuf, "", 0)}
+	rep := Reporter{Logger: s.logger}
 
 	f, err := os.Open("./crunchstat_test.go")
 	c.Assert(err, IsNil)
@@ -49,15 +59,12 @@ func (s *suite) TestReadAllOrWarnSuccess(c *C) {
 	data, err := rep.readAllOrWarn(f)
 	c.Check(err, IsNil)
 	c.Check(string(data), Matches, "(?ms).*\npackage crunchstat\n.*")
-	c.Check(logbuf.String(), Equals, "")
+	c.Check(s.logbuf.String(), Equals, "")
 }
 
 func (s *suite) TestReportPIDs(c *C) {
-	var logbuf bytes.Buffer
-	logger := logrus.New()
-	logger.Out = &logbuf
 	r := Reporter{
-		Logger:     logger,
+		Logger:     s.logger,
 		CgroupRoot: "/sys/fs/cgroup",
 		PollPeriod: time.Second,
 	}
@@ -70,7 +77,7 @@ func (s *suite) TestReportPIDs(c *C) {
 			c.Error("timed out")
 			break
 		}
-		if m := regexp.MustCompile(`(?ms).*procmem \d+ init (\d+) test_process.*`).FindSubmatch(logbuf.Bytes()); len(m) > 0 {
+		if m := regexp.MustCompile(`(?ms).*procmem \d+ init (\d+) test_process.*`).FindSubmatch(s.logbuf.Bytes()); len(m) > 0 {
 			size, err := strconv.ParseInt(string(m[1]), 10, 64)
 			c.Check(err, IsNil)
 			// Expect >1 MiB and <100 MiB -- otherwise we
@@ -81,5 +88,5 @@ func (s *suite) TestReportPIDs(c *C) {
 			break
 		}
 	}
-	c.Logf("%s", logbuf.String())
+	c.Logf("%s", s.logbuf.String())
 }

commit 1c591a89e69d2b52d211b32be26fff4ac225479f
Author: Brett Smith <brett.smith at curii.com>
Date:   Fri Feb 10 10:28:07 2023 -0500

    19986: Separate collection of cgroup memory stats
    
    This is scaffolding to help us report promptly when a container is
    approaching OOM. This commit does not change any public interface or
    reporting.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index 3a473cab8..5d2059f7e 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -59,6 +59,7 @@ type Reporter struct {
 	lastDiskIOSample    map[string]ioSample
 	lastCPUSample       cpuSample
 	lastDiskSpaceSample diskSpaceSample
+	lastMemSample       memSample
 
 	reportPIDs   map[string]int
 	reportPIDsMu sync.Mutex
@@ -246,7 +247,7 @@ type memSample struct {
 	memStat    map[string]int64
 }
 
-func (r *Reporter) doMemoryStats() {
+func (r *Reporter) getMemSample() {
 	c, err := r.openStatFile("memory", "memory.stat", true)
 	if err != nil {
 		return
@@ -254,7 +255,6 @@ func (r *Reporter) doMemoryStats() {
 	defer c.Close()
 	b := bufio.NewScanner(c)
 	thisSample := memSample{time.Now(), make(map[string]int64)}
-	wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
 	for b.Scan() {
 		var stat string
 		var val int64
@@ -263,19 +263,26 @@ func (r *Reporter) doMemoryStats() {
 		}
 		thisSample.memStat[stat] = val
 	}
+	r.lastMemSample = thisSample
+}
+
+func (r *Reporter) reportMemSample() {
 	var outstat bytes.Buffer
+	wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
 	for _, key := range wantStats {
 		// Use "total_X" stats (entire hierarchy) if enabled,
 		// otherwise just the single cgroup -- see
 		// https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
-		if val, ok := thisSample.memStat["total_"+key]; ok {
+		if val, ok := r.lastMemSample.memStat["total_"+key]; ok {
 			fmt.Fprintf(&outstat, " %d %s", val, key)
-		} else if val, ok := thisSample.memStat[key]; ok {
+		} else if val, ok := r.lastMemSample.memStat[key]; ok {
 			fmt.Fprintf(&outstat, " %d %s", val, key)
 		}
 	}
 	r.Logger.Printf("mem%s\n", outstat.String())
+}
 
+func (r *Reporter) doProcmemStats() {
 	if r.kernelPageSize == 0 {
 		// assign "don't try again" value in case we give up
 		// and return without assigning the real value
@@ -490,6 +497,15 @@ func (r *Reporter) doCPUStats() {
 	r.lastCPUSample = nextSample
 }
 
+func (r *Reporter) doAllStats() {
+	r.reportMemSample()
+	r.doProcmemStats()
+	r.doCPUStats()
+	r.doBlkIOStats()
+	r.doNetworkStats()
+	r.doDiskSpaceStats()
+}
+
 // Report stats periodically until we learn (via r.done) that someone
 // called Stop.
 func (r *Reporter) run() {
@@ -512,17 +528,19 @@ func (r *Reporter) run() {
 		r.Logger.Printf("notice: monitoring temp dir %s\n", r.TempDir)
 	}
 
-	ticker := time.NewTicker(r.PollPeriod)
+	r.getMemSample()
+	r.doAllStats()
+
+	memTicker := time.NewTicker(time.Second)
+	mainTicker := time.NewTicker(r.PollPeriod)
 	for {
-		r.doMemoryStats()
-		r.doCPUStats()
-		r.doBlkIOStats()
-		r.doNetworkStats()
-		r.doDiskSpaceStats()
 		select {
 		case <-r.done:
 			return
-		case <-ticker.C:
+		case <-memTicker.C:
+			r.getMemSample()
+		case <-mainTicker.C:
+			r.doAllStats()
 		}
 	}
 }

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list