[arvados] created: 2.5.0-158-g9860f118e
git repository hosting <git at public.arvados.org>
Mon Feb 13 16:46:40 UTC 2023
at 9860f118ea1f95a0879519cd1a459186bd5161f8 (commit)
commit 9860f118ea1f95a0879519cd1a459186bd5161f8
Author: Brett Smith <brett.smith at curii.com>
Date: Mon Feb 13 10:54:09 2023 -0500
19986: Log resource maxima from crunch-run
This aims to provide a human-friendly summary of resource usage at the
end of the logs.
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
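For illustration, the end-of-run summary added here produces log lines along
these lines (values are made up; the exact set of lines depends on which
statistics the reporters actually collected):

    Total CPU usage was 12.340000 user and 1.560000 sys on 4 CPUs
    Total disk I/O on sda was 1048576 bytes written and 4194304 bytes read
    Maximum disk usage was 2%, 1234567890/41943040000 bytes
    Maximum container memory rss usage was 10%, 734003200/7340032000 bytes
    Maximum container memory pgmajfault usage was 20 faults
    Maximum crunch-run memory rss usage was 73400320 bytes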
diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go
index 41a69b406..1d6a347b4 100644
--- a/lib/crunchrun/crunchrun.go
+++ b/lib/crunchrun/crunchrun.go
@@ -728,6 +728,13 @@ func (runner *ContainerRunner) stopHoststat() error {
return nil
}
runner.hoststatReporter.Stop()
+ maxima := runner.hoststatReporter.GetMaxima()
+ for memKey, value := range maxima.Memory {
+ if memKey.ProcessName == "" {
+ continue
+ }
+ runner.reportMemoryMax(memKey.ProcessName, memKey.StatName, value, 0, "bytes")
+ }
err := runner.hoststatLogger.Close()
if err != nil {
return fmt.Errorf("error closing hoststat logs: %v", err)
@@ -1091,6 +1098,30 @@ func (runner *ContainerRunner) StartContainer() error {
return nil
}
+func (runner *ContainerRunner) reportMemoryMax(
+ source string,
+ statName string,
+ value int64,
+ limit int64,
+ units string,
+) {
+ if units == "" {
+ units = "bytes"
+ }
+ if limit > 0 {
+ percentage := 100 * value / limit
+ runner.CrunchLog.Printf(
+ "Maximum %s memory %s usage was %d%%, %d/%d %s",
+ source, statName, percentage, value, limit, units,
+ )
+ } else {
+ runner.CrunchLog.Printf(
+ "Maximum %s memory %s usage was %d %s",
+ source, statName, value, units,
+ )
+ }
+}
+
// WaitFinish waits for the container to terminate, captures the exit code, and
// closes the stdout/stderr logging.
func (runner *ContainerRunner) WaitFinish() error {
@@ -1112,9 +1143,55 @@ func (runner *ContainerRunner) WaitFinish() error {
case <-ctx.Done():
}
}()
- exitcode, err := runner.executor.Wait(ctx)
- if err != nil {
- runner.checkBrokenNode(err)
+ // The order of operations here is precise: let the executor finish, THEN
+ // log crunchstat maxima, THEN fail if executor.Wait did.
+ exitcode, waitErr := runner.executor.Wait(ctx)
+ var err error
+ if runner.statReporter != nil {
+ runner.statReporter.Stop()
+
+ memLimits := map[string]int64{
+ "rss": runner.Container.RuntimeConstraints.RAM,
+ }
+ memUnits := map[string]string{
+ "pgmajfault": "faults",
+ }
+ maxima := runner.statReporter.GetMaxima()
+ runner.CrunchLog.Printf(
+ "Total CPU usage was %f user and %f sys on %d CPUs",
+ maxima.CPU.User, maxima.CPU.Sys, maxima.CPU.CPUCount,
+ )
+ for disk, sample := range maxima.DiskIO {
+ runner.CrunchLog.Printf(
+ "Total disk I/O on %s was %d bytes written and %d bytes read",
+ disk, sample.TXBytes, sample.RXBytes,
+ )
+ }
+ if maxima.DiskSpace.Total > 0 {
+ percentage := 100 * maxima.DiskSpace.Used / maxima.DiskSpace.Total
+ runner.CrunchLog.Printf(
+ "Maximum disk usage was %d%%, %d/%d bytes",
+ percentage, maxima.DiskSpace.Used, maxima.DiskSpace.Total,
+ )
+ }
+ for _, statName := range crunchstat.MemoryStats {
+ value, _ := maxima.GetMemstat(statName)
+ runner.reportMemoryMax("container", statName, value, memLimits[statName], memUnits[statName])
+ }
+ for ifname, sample := range maxima.NetIO {
+ runner.CrunchLog.Printf(
+ "Total network I/O on %s was %d bytes written and %d bytes read",
+ ifname, sample.TXBytes, sample.RXBytes,
+ )
+ }
+
+ err = runner.statLogger.Close()
+ if err != nil {
+ runner.CrunchLog.Printf("error closing crunchstat logs: %v", err)
+ }
+ }
+ if waitErr != nil {
+ runner.checkBrokenNode(waitErr)
return err
}
runner.ExitCode = &exitcode
@@ -1159,14 +1236,6 @@ func (runner *ContainerRunner) WaitFinish() error {
returnErr = err
}
}
-
- if runner.statReporter != nil {
- runner.statReporter.Stop()
- err = runner.statLogger.Close()
- if err != nil {
- runner.CrunchLog.Printf("error closing crunchstat logs: %v", err)
- }
- }
return returnErr
}
diff --git a/lib/crunchrun/crunchrun_test.go b/lib/crunchrun/crunchrun_test.go
index 0c69548e6..e3305a138 100644
--- a/lib/crunchrun/crunchrun_test.go
+++ b/lib/crunchrun/crunchrun_test.go
@@ -932,7 +932,7 @@ func (s *TestSuite) testLogRSSThresholds(c *C, ram int, expected []int, notExpec
var threshold int
for _, threshold = range expected {
err := s.searchLogs(logs, fmt.Sprintf(pattern, threshold, ram))
- c.Check(err, IsNil, Commentf("%s", err))
+ c.Check(err, IsNil, Commentf("%s `%v`", err, pattern))
}
if notExpected > threshold {
err := s.searchLogs(logs, fmt.Sprintf(pattern, notExpected, ram))
@@ -953,6 +953,35 @@ func (s *TestSuite) TestLogAllRSSThresholds(c *C) {
s.testLogRSSThresholds(c, 734003299, []int{90, 95, 99}, 0)
}
+func (s *TestSuite) TestLogMaximaAfterRun(c *C) {
+ s.runner.cgroupRoot = "testdata/fakestat"
+ s.runner.parentTemp = c.MkDir()
+ s.fullRunHelper(c, `{
+ "command": ["true"],
+ "container_image": "`+arvadostest.DockerImage112PDH+`",
+ "cwd": ".",
+ "environment": {},
+ "mounts": {"/tmp": {"kind": "tmp"} },
+ "output_path": "/tmp",
+ "priority": 1,
+ "runtime_constraints": {"ram": 7340032000},
+ "state": "Locked"
+ }`, nil, 0, func() {})
+ logs := s.api.Logs["crunch-run"]
+ for _, pattern := range []string{
+ `Total CPU usage was \d+\.\d+ user and \d+\.\d+ sys on \d+ CPUs\n$`,
+ `Maximum disk usage was \d+%, \d+/\d+ bytes\n$`,
+ `Maximum container memory cache usage was 73400320 bytes\n$`,
+ `Maximum container memory swap usage was 320 bytes\n$`,
+ `Maximum container memory pgmajfault usage was 20 faults\n$`,
+ `Maximum container memory rss usage was 10%, 734003200/7340032000 bytes\n$`,
+ `Maximum crunch-run memory rss usage was \d+ bytes\n$`,
+ } {
+ err := s.searchLogs(logs, `\dZ `+pattern)
+ c.Check(err, IsNil, Commentf("%s `%v`", err, pattern))
+ }
+}
+
func (s *TestSuite) TestCommitNodeInfoBeforeStart(c *C) {
var collection_create, container_update arvadosclient.Dict
s.fullRunHelper(c, `{
diff --git a/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat b/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
index fff3333c4..22f0e13fa 100644
--- a/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
+++ b/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
@@ -1 +1,5 @@
rss 734003200
+pgmajfault 3200
+total_cache 73400320
+total_pgmajfault 20
+total_swap 320
commit 107e362180f4afe7dfd21ecc81998503e8b50961
Author: Brett Smith <brett.smith at curii.com>
Date: Mon Feb 13 10:20:24 2023 -0500
19986: Add StatMaxima.GetMemstat method
This makes it easier to report maxima consistently with the original
crunchstat report.
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
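A minimal sketch of the intended call pattern from a consumer such as
crunch-run (the surrounding reporter and logger variables are illustrative;
only GetMemstat and MemoryStats are crunchstat API):

    maxima := reporter.GetMaxima()
    for _, statName := range crunchstat.MemoryStats {
        // Prefer the hierarchy-wide "total_<stat>" value, falling back to the
        // plain per-cgroup value, mirroring reportMemSample.
        if value, ok := maxima.GetMemstat(statName); ok {
            logger.Printf("maximum %s was %d", statName, value)
        }
    }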
diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index f30602e8b..d60f533cf 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -94,6 +94,18 @@ type StatMaxima struct {
NetIO map[string]IOSample
}
+// Report the best available maximum for a given cgroup memory stat.
+//
+// This checks the total_KEY stat before falling back to plain KEY.
+// This mirrors the logic in reportMemSample.
+func (maxima *StatMaxima) GetMemstat(statName string) (value int64, ok bool) {
+ value, ok = maxima.Memory[MemoryKey{StatName: "total_" + statName}]
+ if !ok {
+ value, ok = maxima.Memory[MemoryKey{StatName: statName}]
+ }
+ return
+}
+
func newMaxima() StatMaxima {
return StatMaxima{
// Memory is the only field we need to initialize. Other maps are
diff --git a/lib/crunchstat/crunchstat_test.go b/lib/crunchstat/crunchstat_test.go
index f48276ce2..784214663 100644
--- a/lib/crunchstat/crunchstat_test.go
+++ b/lib/crunchstat/crunchstat_test.go
@@ -253,3 +253,19 @@ func (s *suite) TestMaxima(c *C) {
c.Logf("%s", s.logbuf.String())
}
+
+func (s *suite) TestMaximaGetMemstat(c *C) {
+ maxima := newMaxima()
+ maxima.Memory[MemoryKey{StatName: "total_rss"}] = 987
+ maxima.Memory[MemoryKey{StatName: "rss"}] = 654
+ maxima.Memory[MemoryKey{StatName: "swap"}] = 321
+ value, ok := maxima.GetMemstat("rss")
+ c.Check(value, Equals, int64(987))
+ c.Check(ok, Equals, true)
+ value, ok = maxima.GetMemstat("swap")
+ c.Check(value, Equals, int64(321))
+ c.Check(ok, Equals, true)
+ value, ok = maxima.GetMemstat("cache")
+ c.Check(value, Equals, int64(0))
+ c.Check(ok, Equals, false)
+}
commit 507bb11984f210752e47892bf8620e4a11bb4be9
Author: Brett Smith <brett.smith at curii.com>
Date: Sun Feb 12 23:26:23 2023 -0500
19986: Record maximum value crunchstat sees for each statistic
See the comments on GetMaxima for details about how the maximum is
defined for each statistic.
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
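A condensed sketch of how the new maxima are read back, following the pattern
in the TestMaxima test below (the cgroup root, temp dir, and logger here are
assumed to be set up elsewhere):

    rep := crunchstat.Reporter{
        CgroupRoot: cgroupRoot,
        Logger:     logger,
        PollPeriod: 10 * time.Second,
        TempDir:    tempDir,
    }
    rep.ReportPID("crunch-run", os.Getpid())
    rep.Start()
    // ... workload runs ...
    rep.Stop()
    maxima := rep.GetMaxima() // blocks until the final report has been flushed

    // Cgroup-wide maximum for a memory stat:
    rss := maxima.Memory[crunchstat.MemoryKey{StatName: "total_rss"}]
    // Per-process maximum, keyed by PID and the name given to ReportPID:
    procRSS := maxima.Memory[crunchstat.MemoryKey{
        ProcessID:   os.Getpid(),
        ProcessName: "crunch-run",
        StatName:    "rss",
    }]
    logger.Printf("max rss %d (crunch-run %d)", rss, procRSS)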
diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index 49c88b6c4..f30602e8b 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -68,6 +68,7 @@ type Reporter struct {
lastDiskSpaceSample DiskSpaceSample
lastMemSample memSample
rssThresholds []ThresholdAlert
+ maxima StatMaxima
reportPIDs map[string]int
reportPIDsMu sync.Mutex
@@ -76,6 +77,32 @@ type Reporter struct {
flushed chan struct{} // closed when we have made our last report
}
+// MemoryKey is a key into StatMaxima.Memory.
+// Initialize it with just StatName to get the host/cgroup maximum.
+// Initialize it with all fields to get that process' maximum.
+type MemoryKey struct {
+ ProcessID int
+ ProcessName string
+ StatName string
+}
+
+type StatMaxima struct {
+ CPU CPUSample
+ DiskIO map[string]IOSample
+ DiskSpace DiskSpaceSample
+ Memory map[MemoryKey]int64
+ NetIO map[string]IOSample
+}
+
+func newMaxima() StatMaxima {
+ return StatMaxima{
+ // Memory is the only field we need to initialize. Other maps are
+ // set unconditionally by GetMaxima after the Reporter runs, and
+ // zero value structs are fine for the rest.
+ Memory: make(map[MemoryKey]int64),
+ }
+}
+
type ThresholdAlert struct {
// A short identifier for the statistic that passed a threshold
StatName string
@@ -133,6 +160,7 @@ func (r *Reporter) SetupRSSThresholds(total int64, percentages []int64) {
//
// Callers should not modify public data fields after calling Start.
func (r *Reporter) Start() {
+ r.maxima = newMaxima()
r.ThresholdAlerts = make(chan ThresholdAlert, len(r.rssThresholds))
r.done = make(chan struct{})
r.flushed = make(chan struct{})
@@ -160,6 +188,25 @@ func (r *Reporter) Stop() {
close(r.ThresholdAlerts)
}
+// Get the maximum value of each statistic seen during the reporting period.
+//
+// This function will not return until Stop has been called and the Reporter
+// has finished.
+//
+// For CPU and I/O stats, the maximum is the last reported value.
+//
+// For disk space, the maximum is the highest amount of TempDir used.
+//
+// Memory statistics are all reported individually. See the comments on
+// MemoryKey for details about how to access them.
+func (r *Reporter) GetMaxima() StatMaxima {
+ <-r.flushed
+ r.maxima.CPU = r.lastCPUSample
+ r.maxima.DiskIO = r.lastDiskIOSample
+ r.maxima.NetIO = r.lastNetSample
+ return r.maxima
+}
+
func (r *Reporter) readAllOrWarn(in io.Reader) ([]byte, error) {
content, err := ioutil.ReadAll(in)
if err != nil {
@@ -317,6 +364,10 @@ func (r *Reporter) getMemSample() {
continue
}
thisSample.memStat[stat] = val
+ maxKey := MemoryKey{StatName: stat}
+ if val > r.maxima.Memory[maxKey] {
+ r.maxima.Memory[maxKey] = val
+ }
}
var thresholdsPassed uint
@@ -412,7 +463,12 @@ func (r *Reporter) doProcmemStats() {
if err != nil {
continue
}
- procmem += fmt.Sprintf(" %d %s", rss*r.kernelPageSize, procname)
+ value := rss * r.kernelPageSize
+ procmem += fmt.Sprintf(" %d %s", value, procname)
+ maxKey := MemoryKey{pid, procname, "rss"}
+ if value > r.maxima.Memory[maxKey] {
+ r.maxima.Memory[maxKey] = value
+ }
}
if procmem != "" {
r.Logger.Printf("procmem%s\n", procmem)
@@ -485,6 +541,9 @@ func (r *Reporter) doDiskSpaceStats() {
Used: (s.Blocks - s.Bfree) * bs,
Available: s.Bavail * bs,
}
+ if nextSample.Used > r.maxima.DiskSpace.Used {
+ r.maxima.DiskSpace = nextSample
+ }
var delta string
if r.lastDiskSpaceSample.hasData {
diff --git a/lib/crunchstat/crunchstat_test.go b/lib/crunchstat/crunchstat_test.go
index 30e16e4c8..f48276ce2 100644
--- a/lib/crunchstat/crunchstat_test.go
+++ b/lib/crunchstat/crunchstat_test.go
@@ -6,7 +6,9 @@ package crunchstat
import (
"bytes"
+ "errors"
"os"
+ "path"
"regexp"
"strconv"
"testing"
@@ -43,9 +45,10 @@ var _ = Suite(&suite{
})
type suite struct {
- logbuf bytes.Buffer
- logger *logrus.Logger
- startTime time.Time
+ cgroupRoot string
+ logbuf bytes.Buffer
+ logger *logrus.Logger
+ startTime time.Time
}
func (s *suite) SetUpSuite(c *C) {
@@ -53,10 +56,50 @@ func (s *suite) SetUpSuite(c *C) {
}
func (s *suite) SetUpTest(c *C) {
+ s.cgroupRoot = ""
s.logbuf.Reset()
s.startTime = time.Now()
}
+func (s *suite) tempCgroup(c *C, sourceDir string) error {
+ tempDir := c.MkDir()
+ dirents, err := os.ReadDir(sourceDir)
+ if err != nil {
+ return err
+ }
+ for _, dirent := range dirents {
+ srcData, err := os.ReadFile(path.Join(sourceDir, dirent.Name()))
+ if err != nil {
+ return err
+ }
+ destPath := path.Join(tempDir, dirent.Name())
+ err = os.WriteFile(destPath, srcData, 0o600)
+ if err != nil {
+ return err
+ }
+ }
+ s.cgroupRoot = tempDir
+ return nil
+}
+
+func (s *suite) addPidToCgroup(pid int) error {
+ if s.cgroupRoot == "" {
+ return errors.New("cgroup has not been set up for this test")
+ }
+ procsPath := path.Join(s.cgroupRoot, "cgroup.procs")
+ procsFile, err := os.OpenFile(procsPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o600)
+ if err != nil {
+ return err
+ }
+ pidLine := strconv.Itoa(pid) + "\n"
+ _, err = procsFile.Write([]byte(pidLine))
+ if err != nil {
+ procsFile.Close()
+ return err
+ }
+ return procsFile.Close()
+}
+
func (s *suite) TestReadAllOrWarnFail(c *C) {
rep := Reporter{Logger: s.logger}
@@ -176,3 +219,37 @@ func (s *suite) TestMultipleRSSThresholdsSomePassed(c *C) {
func (s *suite) TestMultipleRSSThresholdsAllPassed(c *C) {
s.testRSSThresholds(c, []int64{1, 2, 3}, 3)
}
+
+func (s *suite) TestMaxima(c *C) {
+ err := s.tempCgroup(c, fakeRSS.cgroupRoot)
+ c.Assert(err, IsNil)
+ pid := os.Getpid()
+ err = s.addPidToCgroup(pid)
+ c.Assert(err, IsNil)
+
+ rep := Reporter{
+ CgroupRoot: s.cgroupRoot,
+ Logger: s.logger,
+ PollPeriod: time.Second * 10,
+ TempDir: s.cgroupRoot,
+ }
+ rep.ReportPID("MaximaTest", pid)
+ rep.Start()
+ rep.Stop()
+ actual := rep.GetMaxima()
+
+ // FIXME: Add checks for the remaining StatMaxima fields. I'm not doing
+ // that on the first pass because I don't feel confident they'll work
+ // consistently across developer machines.
+
+ c.Check(actual.DiskSpace.Total, Not(Equals), uint64(0))
+ c.Check(actual.DiskSpace.Used, Not(Equals), uint64(0))
+ c.Check(actual.DiskSpace.Available, Not(Equals), uint64(0))
+
+ memKey := MemoryKey{StatName: "total_rss"}
+ c.Check(actual.Memory[memKey], Equals, fakeRSS.value)
+ memKey = MemoryKey{pid, "MaximaTest", "rss"}
+ c.Check(actual.Memory[memKey], Not(Equals), int64(0))
+
+ c.Logf("%s", s.logbuf.String())
+}
commit 7a8d0f824e7a58f630c0c877cb51322649c388ab
Author: Brett Smith <brett.smith at curii.com>
Date: Sun Feb 12 14:04:25 2023 -0500
19986: Export some sample structs from crunchstat
This is preparation to make maximum statistics public.
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index e381429b9..49c88b6c4 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -23,6 +23,9 @@ import (
"time"
)
+// crunchstat collects all memory statistics, but only reports these.
+var MemoryStats = [...]string{"cache", "swap", "pgmajfault", "rss"}
+
// A Reporter gathers statistics for a cgroup and writes them to a
// log.Logger.
type Reporter struct {
@@ -59,10 +62,10 @@ type Reporter struct {
kernelPageSize int64
reportedStatFile map[string]string
- lastNetSample map[string]ioSample
- lastDiskIOSample map[string]ioSample
- lastCPUSample cpuSample
- lastDiskSpaceSample diskSpaceSample
+ lastNetSample map[string]IOSample
+ lastDiskIOSample map[string]IOSample
+ lastCPUSample CPUSample
+ lastDiskSpaceSample DiskSpaceSample
lastMemSample memSample
rssThresholds []ThresholdAlert
@@ -244,10 +247,10 @@ func (r *Reporter) getContainerNetStats() (io.Reader, error) {
return nil, errors.New("Could not read stats for any proc in container")
}
-type ioSample struct {
+type IOSample struct {
sampleTime time.Time
- txBytes int64
- rxBytes int64
+ TXBytes int64
+ RXBytes int64
}
func (r *Reporter) doBlkIOStats() {
@@ -258,38 +261,38 @@ func (r *Reporter) doBlkIOStats() {
defer c.Close()
b := bufio.NewScanner(c)
var sampleTime = time.Now()
- newSamples := make(map[string]ioSample)
+ newSamples := make(map[string]IOSample)
for b.Scan() {
var device, op string
var val int64
if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &val); err != nil {
continue
}
- var thisSample ioSample
+ var thisSample IOSample
var ok bool
if thisSample, ok = newSamples[device]; !ok {
- thisSample = ioSample{sampleTime, -1, -1}
+ thisSample = IOSample{sampleTime, -1, -1}
}
switch op {
case "Read":
- thisSample.rxBytes = val
+ thisSample.RXBytes = val
case "Write":
- thisSample.txBytes = val
+ thisSample.TXBytes = val
}
newSamples[device] = thisSample
}
for dev, sample := range newSamples {
- if sample.txBytes < 0 || sample.rxBytes < 0 {
+ if sample.TXBytes < 0 || sample.RXBytes < 0 {
continue
}
delta := ""
if prev, ok := r.lastDiskIOSample[dev]; ok {
delta = fmt.Sprintf(" -- interval %.4f seconds %d write %d read",
sample.sampleTime.Sub(prev.sampleTime).Seconds(),
- sample.txBytes-prev.txBytes,
- sample.rxBytes-prev.rxBytes)
+ sample.TXBytes-prev.TXBytes,
+ sample.RXBytes-prev.RXBytes)
}
- r.Logger.Printf("blkio:%s %d write %d read%s\n", dev, sample.txBytes, sample.rxBytes, delta)
+ r.Logger.Printf("blkio:%s %d write %d read%s\n", dev, sample.TXBytes, sample.RXBytes, delta)
r.lastDiskIOSample[dev] = sample
}
}
@@ -336,8 +339,7 @@ func (r *Reporter) getMemSample() {
func (r *Reporter) reportMemSample() {
var outstat bytes.Buffer
- wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
- for _, key := range wantStats {
+ for _, key := range MemoryStats {
// Use "total_X" stats (entire hierarchy) if enabled,
// otherwise just the single cgroup -- see
// https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
@@ -444,29 +446,29 @@ func (r *Reporter) doNetworkStats() {
if rx, err = strconv.ParseInt(words[1], 10, 64); err != nil {
continue
}
- nextSample := ioSample{}
+ nextSample := IOSample{}
nextSample.sampleTime = sampleTime
- nextSample.txBytes = tx
- nextSample.rxBytes = rx
+ nextSample.TXBytes = tx
+ nextSample.RXBytes = rx
var delta string
if prev, ok := r.lastNetSample[ifName]; ok {
interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
delta = fmt.Sprintf(" -- interval %.4f seconds %d tx %d rx",
interval,
- tx-prev.txBytes,
- rx-prev.rxBytes)
+ tx-prev.TXBytes,
+ rx-prev.RXBytes)
}
r.Logger.Printf("net:%s %d tx %d rx%s\n", ifName, tx, rx, delta)
r.lastNetSample[ifName] = nextSample
}
}
-type diskSpaceSample struct {
+type DiskSpaceSample struct {
hasData bool
sampleTime time.Time
- total uint64
- used uint64
- available uint64
+ Total uint64
+ Used uint64
+ Available uint64
}
func (r *Reporter) doDiskSpaceStats() {
@@ -476,12 +478,12 @@ func (r *Reporter) doDiskSpaceStats() {
return
}
bs := uint64(s.Bsize)
- nextSample := diskSpaceSample{
+ nextSample := DiskSpaceSample{
hasData: true,
sampleTime: time.Now(),
- total: s.Blocks * bs,
- used: (s.Blocks - s.Bfree) * bs,
- available: s.Bavail * bs,
+ Total: s.Blocks * bs,
+ Used: (s.Blocks - s.Bfree) * bs,
+ Available: s.Bavail * bs,
}
var delta string
@@ -490,19 +492,19 @@ func (r *Reporter) doDiskSpaceStats() {
interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
delta = fmt.Sprintf(" -- interval %.4f seconds %d used",
interval,
- int64(nextSample.used-prev.used))
+ int64(nextSample.Used-prev.Used))
}
r.Logger.Printf("statfs %d available %d used %d total%s\n",
- nextSample.available, nextSample.used, nextSample.total, delta)
+ nextSample.Available, nextSample.Used, nextSample.Total, delta)
r.lastDiskSpaceSample = nextSample
}
-type cpuSample struct {
+type CPUSample struct {
hasData bool // to distinguish the zero value from real data
sampleTime time.Time
- user float64
- sys float64
- cpus int64
+ User float64
+ Sys float64
+ CPUCount int64
}
// Return the number of CPUs available in the container. Return 0 if
@@ -545,23 +547,23 @@ func (r *Reporter) doCPUStats() {
var userTicks, sysTicks int64
fmt.Sscanf(string(b), "user %d\nsystem %d", &userTicks, &sysTicks)
userHz := float64(100)
- nextSample := cpuSample{
+ nextSample := CPUSample{
hasData: true,
sampleTime: time.Now(),
- user: float64(userTicks) / userHz,
- sys: float64(sysTicks) / userHz,
- cpus: r.getCPUCount(),
+ User: float64(userTicks) / userHz,
+ Sys: float64(sysTicks) / userHz,
+ CPUCount: r.getCPUCount(),
}
delta := ""
if r.lastCPUSample.hasData {
delta = fmt.Sprintf(" -- interval %.4f seconds %.4f user %.4f sys",
nextSample.sampleTime.Sub(r.lastCPUSample.sampleTime).Seconds(),
- nextSample.user-r.lastCPUSample.user,
- nextSample.sys-r.lastCPUSample.sys)
+ nextSample.User-r.lastCPUSample.User,
+ nextSample.Sys-r.lastCPUSample.Sys)
}
r.Logger.Printf("cpu %.4f user %.4f sys %d cpus%s\n",
- nextSample.user, nextSample.sys, nextSample.cpus, delta)
+ nextSample.User, nextSample.Sys, nextSample.CPUCount, delta)
r.lastCPUSample = nextSample
}
@@ -585,8 +587,8 @@ func (r *Reporter) run() {
return
}
- r.lastNetSample = make(map[string]ioSample)
- r.lastDiskIOSample = make(map[string]ioSample)
+ r.lastNetSample = make(map[string]IOSample)
+ r.lastDiskIOSample = make(map[string]IOSample)
if len(r.TempDir) == 0 {
// Temporary dir not provided, try to get it from the environment.
commit 4d2d8c7d24ef440a739efd13692f67d83677976a
Author: Brett Smith <brett.smith at curii.com>
Date: Sat Feb 11 15:38:51 2023 -0500
19986: Log when a container uses nearly max RAM
This is meant to help users diagnose when their container likely failed
due to OOM.
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
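With the thresholds configured below (90, 95, and 99 percent of the container's
RAM constraint), the crunch-run log gains lines like the following as memory
use climbs (illustrative values for a 7340032000-byte constraint):

    Container using over 90% of memory (rss 6610223104/7340032000 bytes)
    Container using over 95% of memory (rss 7012352000/7340032000 bytes)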
diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go
index 3def8851c..41a69b406 100644
--- a/lib/crunchrun/crunchrun.go
+++ b/lib/crunchrun/crunchrun.go
@@ -765,10 +765,30 @@ func (runner *ContainerRunner) startCrunchstat() error {
PollPeriod: runner.statInterval,
TempDir: runner.parentTemp,
}
+ runner.statReporter.SetupRSSThresholds(
+ runner.Container.RuntimeConstraints.RAM,
+ []int64{90, 95, 99},
+ )
runner.statReporter.Start()
+ go runner.logCrunchstatAlerts()
return nil
}
+func (runner *ContainerRunner) logCrunchstatAlerts() {
+ for {
+ alert := <-runner.statReporter.ThresholdAlerts
+ switch alert.StatName {
+ case "mem rss":
+ runner.CrunchLog.Printf(
+ "Container using over %d%% of memory (rss %d/%d %s)",
+ alert.ThresholdPercentage, alert.Value, alert.Total, alert.Unit,
+ )
+ case "":
+ return
+ }
+ }
+}
+
type infoCommand struct {
label string
cmd []string
diff --git a/lib/crunchrun/crunchrun_test.go b/lib/crunchrun/crunchrun_test.go
index aaba1c420..0c69548e6 100644
--- a/lib/crunchrun/crunchrun_test.go
+++ b/lib/crunchrun/crunchrun_test.go
@@ -17,6 +17,7 @@ import (
"os/exec"
"regexp"
"runtime/pprof"
+ "strconv"
"strings"
"sync"
"syscall"
@@ -898,6 +899,60 @@ func (s *TestSuite) TestLogVersionAndRuntime(c *C) {
c.Check(s.api.Logs["crunch-run"].String(), Matches, `(?ms).*Using container runtime: stub.*`)
}
+func (s *TestSuite) searchLogs(logs *bytes.Buffer, pattern string) error {
+ var line string
+ var err error
+ re, err := regexp.Compile(pattern)
+ if err != nil {
+ return err
+ }
+ for ; err == nil; line, err = logs.ReadString('\n') {
+ if re.MatchString(line) {
+ return nil
+ }
+ }
+ return errors.New("logs did not match pattern")
+}
+
+func (s *TestSuite) testLogRSSThresholds(c *C, ram int, expected []int, notExpected int) {
+ s.runner.cgroupRoot = "testdata/fakestat"
+ s.fullRunHelper(c, `{
+ "command": ["true"],
+ "container_image": "`+arvadostest.DockerImage112PDH+`",
+ "cwd": ".",
+ "environment": {},
+ "mounts": {"/tmp": {"kind": "tmp"} },
+ "output_path": "/tmp",
+ "priority": 1,
+ "runtime_constraints": {"ram": `+strconv.Itoa(ram)+`},
+ "state": "Locked"
+ }`, nil, 0, func() {})
+ logs := s.api.Logs["crunch-run"]
+ pattern := `\dZ Container using over %d%% of memory \(rss 734003200/%d bytes\)\n$`
+ var threshold int
+ for _, threshold = range expected {
+ err := s.searchLogs(logs, fmt.Sprintf(pattern, threshold, ram))
+ c.Check(err, IsNil, Commentf("%s", err))
+ }
+ if notExpected > threshold {
+ err := s.searchLogs(logs, fmt.Sprintf(pattern, notExpected, ram))
+ c.Check(err, NotNil, Commentf("%d%% threshold logged", notExpected))
+ }
+}
+
+func (s *TestSuite) TestLogNoRSSThresholds(c *C) {
+ s.testLogRSSThresholds(c, 7340032000, []int{}, 90)
+}
+
+func (s *TestSuite) TestLogSomeRSSThresholds(c *C) {
+ onePercentRSS := 7340032
+ s.testLogRSSThresholds(c, 102*onePercentRSS, []int{90, 95}, 99)
+}
+
+func (s *TestSuite) TestLogAllRSSThresholds(c *C) {
+ s.testLogRSSThresholds(c, 734003299, []int{90, 95, 99}, 0)
+}
+
func (s *TestSuite) TestCommitNodeInfoBeforeStart(c *C) {
var collection_create, container_update arvadosclient.Dict
s.fullRunHelper(c, `{
diff --git a/lib/crunchrun/testdata/fakestat/cgroup.procs b/lib/crunchrun/testdata/fakestat/cgroup.procs
new file mode 100644
index 000000000..e69de29bb
diff --git a/lib/crunchrun/testdata/fakestat/cgroupid/cgroup.procs b/lib/crunchrun/testdata/fakestat/cgroupid/cgroup.procs
new file mode 100644
index 000000000..e69de29bb
diff --git a/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat b/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
new file mode 100644
index 000000000..fff3333c4
--- /dev/null
+++ b/lib/crunchrun/testdata/fakestat/cgroupid/memory.stat
@@ -0,0 +1 @@
+rss 734003200
commit 06a76dd5b65f1baa355c4f844bbeb3fd1445cf58
Author: Brett Smith <brett.smith at curii.com>
Date: Sun Feb 12 21:10:44 2023 -0500
19986: Add RSS threshold alerts to crunchstat
This provides a channel through which users (namely crunch-run) can be
alerted when monitored statistics pass configured thresholds. Right now
it only supports this for memory rss, but the design should be
extensible to other statistics in the future.
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
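A minimal usage sketch, assuming a logrus logger and a cgroup root are already
in hand (variable names are illustrative):

    rep := crunchstat.Reporter{
        Logger:     logger,
        CgroupRoot: cgroupRoot,
        PollPeriod: 10 * time.Second,
    }
    // Alert at 90%, 95%, and 99% of a 1 GiB limit; percentages must be ascending.
    rep.SetupRSSThresholds(1<<30, []int64{90, 95, 99})
    rep.Start()
    go func() {
        // Stop closes the channel, which ends this loop.
        for alert := range rep.ThresholdAlerts {
            logger.Printf("%s passed %d%%: %d/%d %s",
                alert.StatName, alert.ThresholdPercentage,
                alert.Value, alert.Total, alert.Unit)
        }
    }()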
diff --git a/.licenseignore b/.licenseignore
index 6ddb5c009..3d24c4ee3 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -92,4 +92,5 @@ sdk/cwl/tests/wf/hello.txt
sdk/cwl/tests/wf/indir1/hello2.txt
sdk/cwl/tests/chipseq/data/Genomes/*
CITATION.cff
-SECURITY.md
\ No newline at end of file
+SECURITY.md
+*/testdata/fakestat/*
diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index 5d2059f7e..e381429b9 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -53,6 +53,10 @@ type Reporter struct {
Printf(fmt string, args ...interface{})
}
+ // When stats cross configured thresholds (set up by the Setup*Thresholds
+ // methods) reports are sent to this channel.
+ ThresholdAlerts chan ThresholdAlert
+
kernelPageSize int64
reportedStatFile map[string]string
lastNetSample map[string]ioSample
@@ -60,6 +64,7 @@ type Reporter struct {
lastCPUSample cpuSample
lastDiskSpaceSample diskSpaceSample
lastMemSample memSample
+ rssThresholds []ThresholdAlert
reportPIDs map[string]int
reportPIDsMu sync.Mutex
@@ -68,6 +73,51 @@ type Reporter struct {
flushed chan struct{} // closed when we have made our last report
}
+type ThresholdAlert struct {
+ // A short identifier for the statistic that passed a threshold
+ StatName string
+
+ // The percentage of available resource that the threshold represents
+ // (e.g., 50 == 50% of available whatever)
+ ThresholdPercentage int64
+
+ // The absolute value that the threshold represents, in the resource's
+ // native unit
+ // (e.g., Total available resource * ThresholdPercentage / 100 == Threshold)
+ Threshold int64
+
+ // The time that the threshold was passed
+ Time time.Time
+
+ // The total available resource, in the resource's native unit
+ Total int64
+
+ // A human-friendly string that identifies the unit of measurement
+ // for Threshold, Total, and Value
+ Unit string
+
+ // The value that the statistic had when it passed the threshold,
+ // in the resource's native unit
+ Value int64
+}
+
+// Set up threshold alerts for memory RSS.
+// total is the amount of RAM available, in bytes.
+// percentages is a slice of percentages, represented as integers
+// (e.g., 25 for 25%). An alert will be set up for each percentage of the
+// total available RAM. Note percentages MUST be sorted in ascending order.
+func (r *Reporter) SetupRSSThresholds(total int64, percentages []int64) {
+ for _, percentage := range percentages {
+ r.rssThresholds = append(r.rssThresholds, ThresholdAlert{
+ StatName: "mem rss",
+ ThresholdPercentage: percentage,
+ Threshold: total * percentage / 100,
+ Total: total,
+ Unit: "bytes",
+ })
+ }
+}
+
// Start starts monitoring in a new goroutine, and returns
// immediately.
//
@@ -80,6 +130,7 @@ type Reporter struct {
//
// Callers should not modify public data fields after calling Start.
func (r *Reporter) Start() {
+ r.ThresholdAlerts = make(chan ThresholdAlert, len(r.rssThresholds))
r.done = make(chan struct{})
r.flushed = make(chan struct{})
go r.run()
@@ -103,6 +154,7 @@ func (r *Reporter) ReportPID(name string, pid int) {
func (r *Reporter) Stop() {
close(r.done)
<-r.flushed
+ close(r.ThresholdAlerts)
}
func (r *Reporter) readAllOrWarn(in io.Reader) ([]byte, error) {
@@ -263,6 +315,22 @@ func (r *Reporter) getMemSample() {
}
thisSample.memStat[stat] = val
}
+
+ var thresholdsPassed uint
+ rss, ok := thisSample.memStat["total_rss"]
+ if !ok {
+ rss, ok = thisSample.memStat["rss"]
+ }
+ for _, threshold := range r.rssThresholds {
+ if !ok || rss < threshold.Threshold {
+ break
+ }
+ threshold.Time = thisSample.sampleTime
+ threshold.Value = rss
+ r.ThresholdAlerts <- threshold
+ thresholdsPassed++
+ }
+ r.rssThresholds = r.rssThresholds[thresholdsPassed:]
r.lastMemSample = thisSample
}
diff --git a/lib/crunchstat/crunchstat_test.go b/lib/crunchstat/crunchstat_test.go
index b4498a135..30e16e4c8 100644
--- a/lib/crunchstat/crunchstat_test.go
+++ b/lib/crunchstat/crunchstat_test.go
@@ -16,6 +16,24 @@ import (
. "gopkg.in/check.v1"
)
+const GiB = int64(1024 * 1024 * 1024)
+
+type fakeStat struct {
+ cgroupRoot string
+ statName string
+ unit string
+ value int64
+}
+
+var fakeRSS = fakeStat{
+ cgroupRoot: "testdata/fakestat",
+ statName: "mem rss",
+ unit: "bytes",
+ // Note this is the value of total_rss, not rss, because that's what should
+ // always be reported for thresholds and maxima.
+ value: 750 * 1024 * 1024,
+}
+
func Test(t *testing.T) {
TestingT(t)
}
@@ -25,8 +43,9 @@ var _ = Suite(&suite{
})
type suite struct {
- logbuf bytes.Buffer
- logger *logrus.Logger
+ logbuf bytes.Buffer
+ logger *logrus.Logger
+ startTime time.Time
}
func (s *suite) SetUpSuite(c *C) {
@@ -35,6 +54,7 @@ func (s *suite) SetUpSuite(c *C) {
func (s *suite) SetUpTest(c *C) {
s.logbuf.Reset()
+ s.startTime = time.Now()
}
func (s *suite) TestReadAllOrWarnFail(c *C) {
@@ -90,3 +110,69 @@ func (s *suite) TestReportPIDs(c *C) {
}
c.Logf("%s", s.logbuf.String())
}
+
+func (s *suite) testRSSThresholds(c *C, rssPercentages []int64, alertCount int) {
+ c.Assert(alertCount <= len(rssPercentages), Equals, true)
+ rep := Reporter{
+ Logger: s.logger,
+ CgroupRoot: fakeRSS.cgroupRoot,
+ PollPeriod: time.Second * 10,
+ }
+ rep.SetupRSSThresholds(GiB, rssPercentages)
+ timeout := time.After(time.Second)
+
+ rep.Start()
+ for _, expectPercentage := range rssPercentages[:alertCount] {
+ select {
+ case alert := <-rep.ThresholdAlerts:
+ expected := ThresholdAlert{
+ StatName: fakeRSS.statName,
+ ThresholdPercentage: expectPercentage,
+ Threshold: GiB * expectPercentage / 100,
+ // This Time value makes the check a noop but it lets us easily
+ // use DeepEquals. We check it has a good value below.
+ Time: alert.Time,
+ Total: GiB,
+ Unit: fakeRSS.unit,
+ Value: fakeRSS.value,
+ }
+ c.Check(alert, DeepEquals, expected)
+ c.Check(alert.Time.After(s.startTime), Equals, true)
+ case <-timeout:
+ c.Fatalf("timed out waiting for %d%% alert", expectPercentage)
+ }
+ }
+
+ rep.Stop()
+ select {
+ case alert := <-rep.ThresholdAlerts:
+ c.Check(alert, DeepEquals, ThresholdAlert{})
+ case <-timeout:
+ c.Error("timed out waiting for zero value alert after close")
+ }
+ c.Logf("%s", s.logbuf.String())
+}
+
+func (s *suite) TestZeroRSSThresholds(c *C) {
+ s.testRSSThresholds(c, []int64{}, 0)
+}
+
+func (s *suite) TestOneRSSThresholdPassed(c *C) {
+ s.testRSSThresholds(c, []int64{55}, 1)
+}
+
+func (s *suite) TestOneRSSThresholdNotPassed(c *C) {
+ s.testRSSThresholds(c, []int64{85}, 0)
+}
+
+func (s *suite) TestMultipleRSSThresholdsNonePassed(c *C) {
+ s.testRSSThresholds(c, []int64{95, 97, 99}, 0)
+}
+
+func (s *suite) TestMultipleRSSThresholdsSomePassed(c *C) {
+ s.testRSSThresholds(c, []int64{60, 70, 80, 90}, 2)
+}
+
+func (s *suite) TestMultipleRSSThresholdsAllPassed(c *C) {
+ s.testRSSThresholds(c, []int64{1, 2, 3}, 3)
+}
diff --git a/lib/crunchstat/testdata/fakestat/cgroup.procs b/lib/crunchstat/testdata/fakestat/cgroup.procs
new file mode 100644
index 000000000..e69de29bb
diff --git a/lib/crunchstat/testdata/fakestat/memory.stat b/lib/crunchstat/testdata/fakestat/memory.stat
new file mode 100644
index 000000000..0540eea23
--- /dev/null
+++ b/lib/crunchstat/testdata/fakestat/memory.stat
@@ -0,0 +1,2 @@
+rss 990
+total_rss 786432000
commit cec07f8e4d5019f69d085023f80f56b3d4c1e032
Author: Brett Smith <brett.smith at curii.com>
Date: Fri Feb 10 14:47:19 2023 -0500
19986: DRY up logger setup in crunchstat tests
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
diff --git a/lib/crunchstat/crunchstat_test.go b/lib/crunchstat/crunchstat_test.go
index 5e8e93de6..b4498a135 100644
--- a/lib/crunchstat/crunchstat_test.go
+++ b/lib/crunchstat/crunchstat_test.go
@@ -6,7 +6,6 @@ package crunchstat
import (
"bytes"
- "log"
"os"
"regexp"
"strconv"
@@ -21,13 +20,25 @@ func Test(t *testing.T) {
TestingT(t)
}
-var _ = Suite(&suite{})
+var _ = Suite(&suite{
+ logger: logrus.New(),
+})
-type suite struct{}
+type suite struct {
+ logbuf bytes.Buffer
+ logger *logrus.Logger
+}
+
+func (s *suite) SetUpSuite(c *C) {
+ s.logger.Out = &s.logbuf
+}
+
+func (s *suite) SetUpTest(c *C) {
+ s.logbuf.Reset()
+}
func (s *suite) TestReadAllOrWarnFail(c *C) {
- var logger bytes.Buffer
- rep := Reporter{Logger: log.New(&logger, "", 0)}
+ rep := Reporter{Logger: s.logger}
// The special file /proc/self/mem can be opened for
// reading, but reading from byte 0 returns an error.
@@ -36,12 +47,11 @@ func (s *suite) TestReadAllOrWarnFail(c *C) {
defer f.Close()
_, err = rep.readAllOrWarn(f)
c.Check(err, NotNil)
- c.Check(logger.String(), Matches, "^warning: read /proc/self/mem: .*\n")
+ c.Check(s.logbuf.String(), Matches, ".* msg=\"warning: read /proc/self/mem: .*\n")
}
func (s *suite) TestReadAllOrWarnSuccess(c *C) {
- var logbuf bytes.Buffer
- rep := Reporter{Logger: log.New(&logbuf, "", 0)}
+ rep := Reporter{Logger: s.logger}
f, err := os.Open("./crunchstat_test.go")
c.Assert(err, IsNil)
@@ -49,15 +59,12 @@ func (s *suite) TestReadAllOrWarnSuccess(c *C) {
data, err := rep.readAllOrWarn(f)
c.Check(err, IsNil)
c.Check(string(data), Matches, "(?ms).*\npackage crunchstat\n.*")
- c.Check(logbuf.String(), Equals, "")
+ c.Check(s.logbuf.String(), Equals, "")
}
func (s *suite) TestReportPIDs(c *C) {
- var logbuf bytes.Buffer
- logger := logrus.New()
- logger.Out = &logbuf
r := Reporter{
- Logger: logger,
+ Logger: s.logger,
CgroupRoot: "/sys/fs/cgroup",
PollPeriod: time.Second,
}
@@ -70,7 +77,7 @@ func (s *suite) TestReportPIDs(c *C) {
c.Error("timed out")
break
}
- if m := regexp.MustCompile(`(?ms).*procmem \d+ init (\d+) test_process.*`).FindSubmatch(logbuf.Bytes()); len(m) > 0 {
+ if m := regexp.MustCompile(`(?ms).*procmem \d+ init (\d+) test_process.*`).FindSubmatch(s.logbuf.Bytes()); len(m) > 0 {
size, err := strconv.ParseInt(string(m[1]), 10, 64)
c.Check(err, IsNil)
// Expect >1 MiB and <100 MiB -- otherwise we
@@ -81,5 +88,5 @@ func (s *suite) TestReportPIDs(c *C) {
break
}
}
- c.Logf("%s", logbuf.String())
+ c.Logf("%s", s.logbuf.String())
}
commit 1c591a89e69d2b52d211b32be26fff4ac225479f
Author: Brett Smith <brett.smith at curii.com>
Date: Fri Feb 10 10:28:07 2023 -0500
19986: Separate collection of cgroup memory stats
This is scaffolding to help us report promptly when a container is
approaching OOM. This commit does not change any public interface or
reporting.
Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>
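Concretely, the run loop now samples memory on its own one-second ticker while
the full report still goes out at PollPeriod; a condensed view of the loop
after this change:

    memTicker := time.NewTicker(time.Second)   // frequent memory samples for later threshold checks
    mainTicker := time.NewTicker(r.PollPeriod) // full report at the configured interval
    for {
        select {
        case <-r.done:
            return
        case <-memTicker.C:
            r.getMemSample()
        case <-mainTicker.C:
            r.doAllStats()
        }
    }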
diff --git a/lib/crunchstat/crunchstat.go b/lib/crunchstat/crunchstat.go
index 3a473cab8..5d2059f7e 100644
--- a/lib/crunchstat/crunchstat.go
+++ b/lib/crunchstat/crunchstat.go
@@ -59,6 +59,7 @@ type Reporter struct {
lastDiskIOSample map[string]ioSample
lastCPUSample cpuSample
lastDiskSpaceSample diskSpaceSample
+ lastMemSample memSample
reportPIDs map[string]int
reportPIDsMu sync.Mutex
@@ -246,7 +247,7 @@ type memSample struct {
memStat map[string]int64
}
-func (r *Reporter) doMemoryStats() {
+func (r *Reporter) getMemSample() {
c, err := r.openStatFile("memory", "memory.stat", true)
if err != nil {
return
@@ -254,7 +255,6 @@ func (r *Reporter) doMemoryStats() {
defer c.Close()
b := bufio.NewScanner(c)
thisSample := memSample{time.Now(), make(map[string]int64)}
- wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
for b.Scan() {
var stat string
var val int64
@@ -263,19 +263,26 @@ func (r *Reporter) doMemoryStats() {
}
thisSample.memStat[stat] = val
}
+ r.lastMemSample = thisSample
+}
+
+func (r *Reporter) reportMemSample() {
var outstat bytes.Buffer
+ wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
for _, key := range wantStats {
// Use "total_X" stats (entire hierarchy) if enabled,
// otherwise just the single cgroup -- see
// https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
- if val, ok := thisSample.memStat["total_"+key]; ok {
+ if val, ok := r.lastMemSample.memStat["total_"+key]; ok {
fmt.Fprintf(&outstat, " %d %s", val, key)
- } else if val, ok := thisSample.memStat[key]; ok {
+ } else if val, ok := r.lastMemSample.memStat[key]; ok {
fmt.Fprintf(&outstat, " %d %s", val, key)
}
}
r.Logger.Printf("mem%s\n", outstat.String())
+}
+func (r *Reporter) doProcmemStats() {
if r.kernelPageSize == 0 {
// assign "don't try again" value in case we give up
// and return without assigning the real value
@@ -490,6 +497,15 @@ func (r *Reporter) doCPUStats() {
r.lastCPUSample = nextSample
}
+func (r *Reporter) doAllStats() {
+ r.reportMemSample()
+ r.doProcmemStats()
+ r.doCPUStats()
+ r.doBlkIOStats()
+ r.doNetworkStats()
+ r.doDiskSpaceStats()
+}
+
// Report stats periodically until we learn (via r.done) that someone
// called Stop.
func (r *Reporter) run() {
@@ -512,17 +528,19 @@ func (r *Reporter) run() {
r.Logger.Printf("notice: monitoring temp dir %s\n", r.TempDir)
}
- ticker := time.NewTicker(r.PollPeriod)
+ r.getMemSample()
+ r.doAllStats()
+
+ memTicker := time.NewTicker(time.Second)
+ mainTicker := time.NewTicker(r.PollPeriod)
for {
- r.doMemoryStats()
- r.doCPUStats()
- r.doBlkIOStats()
- r.doNetworkStats()
- r.doDiskSpaceStats()
select {
case <-r.done:
return
- case <-ticker.C:
+ case <-memTicker.C:
+ r.getMemSample()
+ case <-mainTicker.C:
+ r.doAllStats()
}
}
}
-----------------------------------------------------------------------
hooks/post-receive