[ARVADOS] created: 1.3.0-566-g743680ab1

Git user git at public.curoverse.com
Wed Mar 20 19:31:52 UTC 2019


        at  743680ab176697218aa41839149e02a160786bdf (commit)


commit 743680ab176697218aa41839149e02a160786bdf
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Wed Mar 20 01:50:10 2019 -0400

    14807: Give up if initial keep services list isn't loaded in 60s.
    
    Otherwise, MakeKeepClient blocks forever when ARVADOS_* env vars are
    bad.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/sdk/go/keepclient/discover.go b/sdk/go/keepclient/discover.go
index 2392fcde7..62936e718 100644
--- a/sdk/go/keepclient/discover.go
+++ b/sdk/go/keepclient/discover.go
@@ -6,6 +6,7 @@ package keepclient
 
 import (
 	"encoding/json"
+	"errors"
 	"fmt"
 	"log"
 	"os"
@@ -150,7 +151,12 @@ func (kc *KeepClient) discoverServices() error {
 	}
 	svcListCacheMtx.Unlock()
 
-	return kc.loadKeepServers(<-cacheEnt.latest)
+	select {
+	case <-time.After(time.Minute):
+		return errors.New("timed out while getting initial list of keep services")
+	case sl := <-cacheEnt.latest:
+		return kc.loadKeepServers(sl)
+	}
 }
 
 func (kc *KeepClient) RefreshServiceDiscovery() {

commit b0efcaab2225121f27aa18f5cb19cc5ce8f5beb7
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Mar 19 17:02:05 2019 -0400

    14807: Report queue metrics.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 4e807a12a..af17aaf39 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -53,7 +53,6 @@ func (c *QueueEnt) String() string {
 // cache up to date.
 type Queue struct {
 	logger     logrus.FieldLogger
-	reg        *prometheus.Registry
 	chooseType typeChooser
 	client     APIClient
 
@@ -79,14 +78,17 @@ type Queue struct {
 // Arvados cluster's queue during Update, chooseType will be called to
 // assign an appropriate arvados.InstanceType for the queue entry.
 func NewQueue(logger logrus.FieldLogger, reg *prometheus.Registry, chooseType typeChooser, client APIClient) *Queue {
-	return &Queue{
+	cq := &Queue{
 		logger:      logger,
-		reg:         reg,
 		chooseType:  chooseType,
 		client:      client,
 		current:     map[string]QueueEnt{},
 		subscribers: map[<-chan struct{}]chan struct{}{},
 	}
+	if reg != nil {
+		go cq.runMetrics(reg)
+	}
+	return cq
 }
 
 // Subscribe returns a channel that becomes ready to receive when an
@@ -487,3 +489,34 @@ func (cq *Queue) fetchAll(initialParams arvados.ResourceListParams) ([]arvados.C
 	}
 	return results, nil
 }
+
+func (cq *Queue) runMetrics(reg *prometheus.Registry) {
+	mEntries := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "queue_entries",
+		Help:      "Number of active container entries in the controller database.",
+	}, []string{"state", "instance_type"})
+	reg.MustRegister(mEntries)
+
+	type entKey struct {
+		state arvados.ContainerState
+		inst  string
+	}
+	count := map[entKey]int{}
+
+	ch := cq.Subscribe()
+	defer cq.Unsubscribe(ch)
+	for range ch {
+		for k := range count {
+			count[k] = 0
+		}
+		ents, _ := cq.Entries()
+		for _, ent := range ents {
+			count[entKey{ent.Container.State, ent.InstanceType.Name}]++
+		}
+		for k, v := range count {
+			mEntries.WithLabelValues(string(k.state), k.inst).Set(float64(v))
+		}
+	}
+}

commit 0e1e46da333f606b8b3293367641818fd3cc9ff4
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Tue Mar 19 13:40:10 2019 -0400

    14807: Add "kill instance" management API.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index adf1028b3..9245d5de3 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -39,6 +39,7 @@ type pool interface {
 	scheduler.WorkerPool
 	Instances() []worker.InstanceView
 	SetIdleBehavior(cloud.InstanceID, worker.IdleBehavior) error
+	KillInstance(id cloud.InstanceID, reason string) error
 	Stop()
 }
 
@@ -147,6 +148,7 @@ func (disp *dispatcher) initialize() {
 		mux.HandlerFunc("POST", "/arvados/v1/dispatch/instances/hold", disp.apiInstanceHold)
 		mux.HandlerFunc("POST", "/arvados/v1/dispatch/instances/drain", disp.apiInstanceDrain)
 		mux.HandlerFunc("POST", "/arvados/v1/dispatch/instances/run", disp.apiInstanceRun)
+		mux.HandlerFunc("POST", "/arvados/v1/dispatch/instances/kill", disp.apiInstanceKill)
 		metricsH := promhttp.HandlerFor(disp.reg, promhttp.HandlerOpts{
 			ErrorLog: disp.logger,
 		})
@@ -212,6 +214,20 @@ func (disp *dispatcher) apiInstanceRun(w http.ResponseWriter, r *http.Request) {
 	disp.apiInstanceIdleBehavior(w, r, worker.IdleBehaviorRun)
 }
 
+// Management API: shut down/destroy the specified instance now.
+func (disp *dispatcher) apiInstanceKill(w http.ResponseWriter, r *http.Request) {
+	id := cloud.InstanceID(r.FormValue("instance_id"))
+	if id == "" {
+		httpserver.Error(w, "instance_id parameter not provided", http.StatusBadRequest)
+		return
+	}
+	err := disp.pool.KillInstance(id, "via management API: "+r.FormValue("reason"))
+	if err != nil {
+		httpserver.Error(w, err.Error(), http.StatusNotFound)
+		return
+	}
+}
+
 func (disp *dispatcher) apiInstanceIdleBehavior(w http.ResponseWriter, r *http.Request, want worker.IdleBehavior) {
 	id := cloud.InstanceID(r.FormValue("instance_id"))
 	if id == "" {
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 81a658535..014ab93bf 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -691,6 +691,18 @@ func (wp *Pool) Instances() []InstanceView {
 	return r
 }
 
+// KillInstance destroys a cloud VM instance. It returns an error if
+// the given instance does not exist.
+func (wp *Pool) KillInstance(id cloud.InstanceID, reason string) error {
+	wkr, ok := wp.workers[id]
+	if !ok {
+		return errors.New("instance not found")
+	}
+	wkr.logger.WithField("Reason", reason).Info("shutting down")
+	wkr.shutdown()
+	return nil
+}
+
 func (wp *Pool) setup() {
 	wp.creating = map[string]createCall{}
 	wp.exited = map[string]time.Time{}

commit a614135cd8e4c2e03593a8f16210a30fdef5d90b
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Mar 18 17:27:40 2019 -0400

    14807: Configurable rate limit for cloud provider API calls.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 44d5a0ae7..d1ee4b135 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -49,12 +49,13 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 
 	s.cluster = &arvados.Cluster{
 		CloudVMs: arvados.CloudVMs{
-			Driver:          "test",
-			SyncInterval:    arvados.Duration(10 * time.Millisecond),
-			TimeoutIdle:     arvados.Duration(150 * time.Millisecond),
-			TimeoutBooting:  arvados.Duration(150 * time.Millisecond),
-			TimeoutProbe:    arvados.Duration(15 * time.Millisecond),
-			TimeoutShutdown: arvados.Duration(5 * time.Millisecond),
+			Driver:               "test",
+			SyncInterval:         arvados.Duration(10 * time.Millisecond),
+			TimeoutIdle:          arvados.Duration(150 * time.Millisecond),
+			TimeoutBooting:       arvados.Duration(150 * time.Millisecond),
+			TimeoutProbe:         arvados.Duration(15 * time.Millisecond),
+			TimeoutShutdown:      arvados.Duration(5 * time.Millisecond),
+			MaxCloudOpsPerSecond: 500,
 		},
 		Dispatch: arvados.Dispatch{
 			PrivateKey:         string(dispatchprivraw),
diff --git a/lib/dispatchcloud/driver.go b/lib/dispatchcloud/driver.go
index 0343f85b9..eb1e48737 100644
--- a/lib/dispatchcloud/driver.go
+++ b/lib/dispatchcloud/driver.go
@@ -6,12 +6,14 @@ package dispatchcloud
 
 import (
 	"fmt"
+	"time"
 
 	"git.curoverse.com/arvados.git/lib/cloud"
 	"git.curoverse.com/arvados.git/lib/cloud/azure"
 	"git.curoverse.com/arvados.git/lib/cloud/ec2"
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
 	"github.com/sirupsen/logrus"
+	"golang.org/x/crypto/ssh"
 )
 
 var drivers = map[string]cloud.Driver{
@@ -24,5 +26,33 @@ func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID, logger
 	if !ok {
 		return nil, fmt.Errorf("unsupported cloud driver %q", cluster.CloudVMs.Driver)
 	}
-	return driver.InstanceSet(cluster.CloudVMs.DriverParameters, setID, logger)
+	is, err := driver.InstanceSet(cluster.CloudVMs.DriverParameters, setID, logger)
+	if maxops := cluster.CloudVMs.MaxCloudOpsPerSecond; maxops > 0 {
+		is = &rateLimitedInstanceSet{
+			InstanceSet: is,
+			ticker:      time.NewTicker(time.Second / time.Duration(maxops)),
+		}
+	}
+	return is, err
+}
+
+type rateLimitedInstanceSet struct {
+	cloud.InstanceSet
+	ticker *time.Ticker
+}
+
+func (is rateLimitedInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID, tags cloud.InstanceTags, init cloud.InitCommand, pk ssh.PublicKey) (cloud.Instance, error) {
+	<-is.ticker.C
+	inst, err := is.InstanceSet.Create(it, image, tags, init, pk)
+	return &rateLimitedInstance{inst, is.ticker}, err
+}
+
+type rateLimitedInstance struct {
+	cloud.Instance
+	ticker *time.Ticker
+}
+
+func (inst *rateLimitedInstance) Destroy() error {
+	<-inst.ticker.C
+	return inst.Instance.Destroy()
 }
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 73addb739..7c87ff029 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -154,6 +154,9 @@ type CloudVMs struct {
 	// Time after shutdown to retry shutdown
 	TimeoutShutdown Duration
 
+	// Maximum create/destroy-instance operations per second
+	MaxCloudOpsPerSecond int
+
 	ImageID string
 
 	Driver           string

commit ef288ca2cdefe06e736972d06d5ffb177b0cac51
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Mon Mar 18 16:32:58 2019 -0400

    14807: Drain instances that crunch-run reports broken.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 7268f106a..44d5a0ae7 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -157,6 +157,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			stubvm.CrunchRunMissing = true
 		default:
 			stubvm.CrunchRunCrashRate = 0.1
+			stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)+200) * time.Millisecond)
 		}
 	}
 
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index a4521eab7..02346a970 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -181,6 +181,7 @@ func (e RateLimitError) EarliestRetry() time.Time { return e.Retry }
 type StubVM struct {
 	Boot                  time.Time
 	Broken                time.Time
+	ReportBroken          time.Time
 	CrunchRunMissing      bool
 	CrunchRunCrashRate    float64
 	CrunchRunDetachDelay  time.Duration
@@ -314,6 +315,9 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 		for uuid := range svm.running {
 			fmt.Fprintf(stdout, "%s\n", uuid)
 		}
+		if !svm.ReportBroken.IsZero() && svm.ReportBroken.Before(time.Now()) {
+			fmt.Fprintln(stdout, "broken")
+		}
 		return 0
 	}
 	if strings.HasPrefix(command, "crunch-run --kill ") {
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 41117c1d4..49c5057b3 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -5,7 +5,6 @@
 package worker
 
 import (
-	"bytes"
 	"fmt"
 	"strings"
 	"sync"
@@ -215,11 +214,16 @@ func (wkr *worker) probeAndUpdate() {
 			logger.Info("instance booted; will try probeRunning")
 		}
 	}
+	reportedBroken := false
 	if booted || wkr.state == StateUnknown {
-		ctrUUIDs, ok = wkr.probeRunning()
+		ctrUUIDs, reportedBroken, ok = wkr.probeRunning()
 	}
 	wkr.mtx.Lock()
 	defer wkr.mtx.Unlock()
+	if reportedBroken && wkr.idleBehavior == IdleBehaviorRun {
+		logger.Info("probe reported broken instance")
+		wkr.setIdleBehavior(IdleBehaviorDrain)
+	}
 	if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
 		if wkr.state == StateShutdown && wkr.updated.After(updated) {
 			// Skip the logging noise if shutdown was
@@ -313,7 +317,7 @@ func (wkr *worker) probeAndUpdate() {
 	go wkr.wp.notify()
 }
 
-func (wkr *worker) probeRunning() (running []string, ok bool) {
+func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
 	cmd := "crunch-run --list"
 	if u := wkr.instance.RemoteUser(); u != "root" {
 		cmd = "sudo " + cmd
@@ -325,13 +329,17 @@ func (wkr *worker) probeRunning() (running []string, ok bool) {
 			"stdout":  string(stdout),
 			"stderr":  string(stderr),
 		}).WithError(err).Warn("probe failed")
-		return nil, false
+		return
 	}
-	stdout = bytes.TrimRight(stdout, "\n")
-	if len(stdout) == 0 {
-		return nil, true
+	ok = true
+	for _, s := range strings.Split(string(stdout), "\n") {
+		if s == "broken" {
+			reportsBroken = true
+		} else if s != "" {
+			running = append(running, s)
+		}
 	}
-	return strings.Split(string(stdout), "\n"), true
+	return
 }
 
 func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
diff --git a/services/crunch-run/background.go b/services/crunch-run/background.go
index 933692bdc..852ccb6ec 100644
--- a/services/crunch-run/background.go
+++ b/services/crunch-run/background.go
@@ -20,6 +20,7 @@ var (
 	lockdir    = "/var/lock"
 	lockprefix = "crunch-run-"
 	locksuffix = ".lock"
+	brokenfile = "crunch-run-broken"
 )
 
 // procinfo is saved in each process's lockfile.
@@ -146,7 +147,10 @@ func ListProcesses(stdout, stderr io.Writer) int {
 		if info.IsDir() && path != walkdir {
 			return filepath.SkipDir
 		}
-		if name := info.Name(); !strings.HasPrefix(name, lockprefix) || !strings.HasSuffix(name, locksuffix) {
+		if name := info.Name(); name == brokenfile {
+			fmt.Fprintln(stdout, "broken")
+			return nil
+		} else if !strings.HasPrefix(name, lockprefix) || !strings.HasSuffix(name, locksuffix) {
 			return nil
 		}
 		if info.Size() == 0 {
diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go
index 0576337aa..3925b0b7b 100644
--- a/services/crunch-run/crunchrun.go
+++ b/services/crunch-run/crunchrun.go
@@ -222,7 +222,14 @@ var brokenNodeHook *string = flag.String("broken-node-hook", "", "Script to run
 
 func (runner *ContainerRunner) runBrokenNodeHook() {
 	if *brokenNodeHook == "" {
-		runner.CrunchLog.Printf("No broken node hook provided, cannot mark node as broken.")
+		path := filepath.Join(lockdir, brokenfile)
+		runner.CrunchLog.Printf("Writing %s to mark node as broken", path)
+		f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0700)
+		if err != nil {
+			runner.CrunchLog.Printf("Error writing %s: %s", path, err)
+			return
+		}
+		f.Close()
 	} else {
 		runner.CrunchLog.Printf("Running broken node hook %q", *brokenNodeHook)
 		// run killme script
diff --git a/services/crunch-run/crunchrun_test.go b/services/crunch-run/crunchrun_test.go
index 17e5e1458..60729c019 100644
--- a/services/crunch-run/crunchrun_test.go
+++ b/services/crunch-run/crunchrun_test.go
@@ -2049,7 +2049,7 @@ func (s *TestSuite) TestFullBrokenDocker2(c *C) {
 
 	c.Check(api.CalledWith("container.state", "Queued"), NotNil)
 	c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*unable to run containers.*")
-	c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*No broken node hook.*")
+	c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*Writing /var/lock/crunch-run-broken to mark node as broken.*")
 }
 
 func (s *TestSuite) TestFullBrokenDocker3(c *C) {

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list