[ARVADOS] created: 1.3.0-1618-g0f5693fea

Git user git at public.curoverse.com
Tue Sep 24 14:01:44 UTC 2019


        at  0f5693feaf2761a46186c3d43e7af294b433c039 (commit)


commit 0f5693feaf2761a46186c3d43e7af294b433c039
Author: Eric Biagiotti <ebiagiotti at veritasgenetics.com>
Date:   Tue Sep 24 09:58:40 2019 -0400

    14714: keep-balance uses cluster config
    
    - Removes dumpconfig flag.
    - Removes the options to specify a keep service list or type. Will now balance all keep services of type disk reported by the keep_services endpoint.
    - Debug flag removed. Uses SystemLogs.LogLevel from cluster config instead.
    - Reorganizes the Server struct to use lib/service to do generic service things.
    
    Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti <ebiagiotti at veritasgenetics.com>

diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
index 08a6c5881..f29522bb0 100644
--- a/services/keep-balance/balance.go
+++ b/services/keep-balance/balance.go
@@ -66,7 +66,7 @@ type Balancer struct {
 // Typical usage:
 //
 //   runOptions, err = (&Balancer{}).Run(config, runOptions)
-func (bal *Balancer) Run(config Config, runOptions RunOptions) (nextRunOptions RunOptions, err error) {
+func (bal *Balancer) Run(client *arvados.Client, cluster *arvados.Cluster, runOptions RunOptions) (nextRunOptions RunOptions, err error) {
 	nextRunOptions = runOptions
 
 	defer bal.time("sweep", "wall clock time to run one full sweep")()
@@ -95,24 +95,21 @@ func (bal *Balancer) Run(config Config, runOptions RunOptions) (nextRunOptions R
 		bal.lostBlocks = ioutil.Discard
 	}
 
-	if len(config.KeepServiceList.Items) > 0 {
-		err = bal.SetKeepServices(config.KeepServiceList)
-	} else {
-		err = bal.DiscoverKeepServices(&config.Client, config.KeepServiceTypes)
-	}
+	diskService := []string{"disk"}
+	err = bal.DiscoverKeepServices(client, diskService)
 	if err != nil {
 		return
 	}
 
 	for _, srv := range bal.KeepServices {
-		err = srv.discoverMounts(&config.Client)
+		err = srv.discoverMounts(client)
 		if err != nil {
 			return
 		}
 	}
 	bal.cleanupMounts()
 
-	if err = bal.CheckSanityEarly(&config.Client); err != nil {
+	if err = bal.CheckSanityEarly(client); err != nil {
 		return
 	}
 	rs := bal.rendezvousState()
@@ -121,7 +118,7 @@ func (bal *Balancer) Run(config Config, runOptions RunOptions) (nextRunOptions R
 			bal.logf("notice: KeepServices list has changed since last run")
 		}
 		bal.logf("clearing existing trash lists, in case the new rendezvous order differs from previous run")
-		if err = bal.ClearTrashLists(&config.Client); err != nil {
+		if err = bal.ClearTrashLists(client); err != nil {
 			return
 		}
 		// The current rendezvous state becomes "safe" (i.e.,
@@ -130,7 +127,7 @@ func (bal *Balancer) Run(config Config, runOptions RunOptions) (nextRunOptions R
 		// succeed in clearing existing trash lists.
 		nextRunOptions.SafeRendezvousState = rs
 	}
-	if err = bal.GetCurrentState(&config.Client, config.CollectionBatchSize, config.CollectionBuffers); err != nil {
+	if err = bal.GetCurrentState(client, cluster.Collections.BalanceCollectionBatch, cluster.Collections.BalanceCollectionBuffers); err != nil {
 		return
 	}
 	bal.ComputeChangeSets()
@@ -150,14 +147,14 @@ func (bal *Balancer) Run(config Config, runOptions RunOptions) (nextRunOptions R
 		lbFile = nil
 	}
 	if runOptions.CommitPulls {
-		err = bal.CommitPulls(&config.Client)
+		err = bal.CommitPulls(client)
 		if err != nil {
 			// Skip trash if we can't pull. (Too cautious?)
 			return
 		}
 	}
 	if runOptions.CommitTrash {
-		err = bal.CommitTrash(&config.Client)
+		err = bal.CommitTrash(client)
 	}
 	return
 }
diff --git a/services/keep-balance/main.go b/services/keep-balance/main.go
index 84516a821..606fde498 100644
--- a/services/keep-balance/main.go
+++ b/services/keep-balance/main.go
@@ -5,97 +5,90 @@
 package main
 
 import (
-	"encoding/json"
+	"context"
 	"flag"
 	"fmt"
+	"io"
 	"log"
-	"net/http"
 	"os"
-	"time"
 
+	"git.curoverse.com/arvados.git/lib/config"
+	"git.curoverse.com/arvados.git/lib/service"
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
-	"git.curoverse.com/arvados.git/sdk/go/config"
+	"git.curoverse.com/arvados.git/sdk/go/ctxlog"
 	"github.com/sirupsen/logrus"
 )
 
-var debugf = func(string, ...interface{}) {}
+var (
+	version = "dev"
+	debugf  = func(string, ...interface{}) {}
+	command = service.Command(arvados.ServiceNameKeepbalance, newHandler)
+	options RunOptions
+)
 
-func main() {
-	var cfg Config
-	var runOptions RunOptions
-
-	configPath := flag.String("config", defaultConfigPath,
-		"`path` of JSON or YAML configuration file")
-	serviceListPath := flag.String("config.KeepServiceList", "",
-		"`path` of JSON or YAML file with list of keep services to balance, as given by \"arv keep_service list\" "+
-			"(default: config[\"KeepServiceList\"], or if none given, get all available services and filter by config[\"KeepServiceTypes\"])")
-	flag.BoolVar(&runOptions.Once, "once", false,
-		"balance once and then exit")
-	flag.BoolVar(&runOptions.CommitPulls, "commit-pulls", false,
-		"send pull requests (make more replicas of blocks that are underreplicated or are not in optimal rendezvous probe order)")
-	flag.BoolVar(&runOptions.CommitTrash, "commit-trash", false,
-		"send trash requests (delete unreferenced old blocks, and excess replicas of overreplicated blocks)")
-	dumpConfig := flag.Bool("dump-config", false, "write current configuration to stdout and exit")
-	dumpFlag := flag.Bool("dump", false, "dump details for each block to stdout")
-	debugFlag := flag.Bool("debug", false, "enable debug messages")
-	getVersion := flag.Bool("version", false, "Print version information and exit.")
-	flag.Usage = usage
-	flag.Parse()
-
-	// Print version information if requested
-	if *getVersion {
-		fmt.Printf("keep-balance %s\n", version)
-		return
+func newHandler(ctx context.Context, cluster *arvados.Cluster, _ string) service.Handler {
+	if !options.Once && cluster.Collections.BalancePeriod == arvados.Duration(0) {
+		return service.ErrorHandler(ctx, cluster, fmt.Errorf("You must either run keep-balance with the -once flag, or set Collections.BalancePeriod in the config. "+
+			"If using the legacy keep-balance.yml config, RunPeriod is the equivalant of Collections.BalancePeriod."))
 	}
 
-	mustReadConfig(&cfg, *configPath)
-	if *serviceListPath != "" {
-		mustReadConfig(&cfg.KeepServiceList, *serviceListPath)
+	ac, err := arvados.NewClientFromConfig(cluster)
+	if err != nil {
+		return service.ErrorHandler(ctx, cluster, fmt.Errorf("error initializing client from cluster config: %s", err))
 	}
 
-	if *dumpConfig {
-		log.Fatal(config.DumpAndExit(cfg))
+	if cluster.SystemLogs.LogLevel == "debug" {
+		debugf = log.Printf
 	}
 
-	to := time.Duration(cfg.RequestTimeout)
-	if to == 0 {
-		to = 30 * time.Minute
+	srv := &Server{
+		Cluster:    cluster,
+		ArvClient:  ac,
+		RunOptions: options,
+		Metrics:    newMetrics(),
 	}
-	arvados.DefaultSecureClient.Timeout = to
-	arvados.InsecureHTTPClient.Timeout = to
-	http.DefaultClient.Timeout = to
 
-	log.Printf("keep-balance %s started", version)
+	go srv.Start(ctx)
+	return srv
+}
+
+func main() {
+	os.Exit(runCommand(os.Args[0], os.Args[1:], os.Stdin, os.Stdout, os.Stderr))
+}
+
+func runCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+	logger := ctxlog.FromContext(context.Background())
+
+	flags := flag.NewFlagSet(prog, flag.ExitOnError)
+	flags.BoolVar(&options.Once, "once", false,
+		"balance once and then exit")
+	flags.BoolVar(&options.CommitPulls, "commit-pulls", false,
+		"send pull requests (make more replicas of blocks that are underreplicated or are not in optimal rendezvous probe order)")
+	flags.BoolVar(&options.CommitTrash, "commit-trash", false,
+		"send trash requests (delete unreferenced old blocks, and excess replicas of overreplicated blocks)")
+	flags.Bool("version", false, "Write version information to stdout and exit 0")
+	dumpFlag := flags.Bool("dump", false, "dump details for each block to stdout")
+
+	loader := config.NewLoader(os.Stdin, logger)
+	loader.SetupFlags(flags)
+
+	munged := loader.MungeLegacyConfigArgs(logger, args, "-legacy-keepbalance-config")
+	flags.Parse(munged)
 
-	if *debugFlag {
-		debugf = log.Printf
-		if j, err := json.Marshal(cfg); err != nil {
-			log.Fatal(err)
-		} else {
-			log.Printf("config is %s", j)
-		}
-	}
 	if *dumpFlag {
 		dumper := logrus.New()
 		dumper.Out = os.Stdout
 		dumper.Formatter = &logrus.TextFormatter{}
-		runOptions.Dumper = dumper
-	}
-	srv, err := NewServer(cfg, runOptions)
-	if err != nil {
-		// (don't run)
-	} else if runOptions.Once {
-		_, err = srv.Run()
-	} else {
-		err = srv.RunForever(nil)
-	}
-	if err != nil {
-		log.Fatal(err)
+		options.Dumper = dumper
 	}
-}
 
-func mustReadConfig(dst interface{}, path string) {
-	if err := config.LoadFile(dst, path); err != nil {
-		log.Fatal(err)
-	}
+	// Only pass along the version flag, which gets handled in RunCommand
+	args = nil
+	flags.Visit(func(f *flag.Flag) {
+		if f.Name == "version" {
+			args = append(args, "-"+f.Name, f.Value.String())
+		}
+	})
+
+	return command.RunCommand(prog, args, stdin, stdout, stderr)
 }
diff --git a/services/keep-balance/server.go b/services/keep-balance/server.go
index e2f13a425..0f4bb7176 100644
--- a/services/keep-balance/server.go
+++ b/services/keep-balance/server.go
@@ -6,7 +6,6 @@ package main
 
 import (
 	"context"
-	"fmt"
 	"net/http"
 	"os"
 	"os/signal"
@@ -14,57 +13,10 @@ import (
 	"time"
 
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
-	"git.curoverse.com/arvados.git/sdk/go/auth"
 	"git.curoverse.com/arvados.git/sdk/go/ctxlog"
-	"git.curoverse.com/arvados.git/sdk/go/httpserver"
 	"github.com/sirupsen/logrus"
 )
 
-var version = "dev"
-
-const (
-	defaultConfigPath = "/etc/arvados/keep-balance/keep-balance.yml"
-	rfc3339NanoFixed  = "2006-01-02T15:04:05.000000000Z07:00"
-)
-
-// Config specifies site configuration, like API credentials and the
-// choice of which servers are to be balanced.
-//
-// Config is loaded from a JSON config file (see usage()).
-type Config struct {
-	// Arvados API endpoint and credentials.
-	Client arvados.Client
-
-	// List of service types (e.g., "disk") to balance.
-	KeepServiceTypes []string
-
-	KeepServiceList arvados.KeepServiceList
-
-	// address, address:port, or :port for management interface
-	Listen string
-
-	// token for management APIs
-	ManagementToken string
-
-	// How often to check
-	RunPeriod arvados.Duration
-
-	// Number of collections to request in each API call
-	CollectionBatchSize int
-
-	// Max collections to buffer in memory (bigger values consume
-	// more memory, but can reduce store-and-forward latency when
-	// fetching pages)
-	CollectionBuffers int
-
-	// Timeout for outgoing http request/response cycle.
-	RequestTimeout arvados.Duration
-
-	// Destination filename for the list of lost block hashes, one
-	// per line. Updated atomically during each successful run.
-	LostBlocksFile string
-}
-
 // RunOptions controls runtime behavior. The flags/options that belong
 // here are the ones that are useful for interactive use. For example,
 // "CommitTrash" is a runtime option rather than a config item because
@@ -87,100 +39,74 @@ type RunOptions struct {
 }
 
 type Server struct {
-	config     Config
-	runOptions RunOptions
-	metrics    *metrics
-	listening  string // for tests
+	http.Handler
+	Cluster    *arvados.Cluster
+	ArvClient  *arvados.Client
+	RunOptions RunOptions
+	Metrics    *metrics
 
 	Logger logrus.FieldLogger
 	Dumper logrus.FieldLogger
 }
 
-// NewServer returns a new Server that runs Balancers using the given
-// config and runOptions.
-func NewServer(config Config, runOptions RunOptions) (*Server, error) {
-	if len(config.KeepServiceList.Items) > 0 && config.KeepServiceTypes != nil {
-		return nil, fmt.Errorf("cannot specify both KeepServiceList and KeepServiceTypes in config")
-	}
-	if !runOptions.Once && config.RunPeriod == arvados.Duration(0) {
-		return nil, fmt.Errorf("you must either use the -once flag, or specify RunPeriod in config")
-	}
+// CheckHealth implements service.Handler.
+func (srv *Server) CheckHealth() error {
+	return nil
+}
 
-	if runOptions.Logger == nil {
-		log := logrus.New()
-		log.Formatter = &logrus.JSONFormatter{
-			TimestampFormat: rfc3339NanoFixed,
-		}
-		log.Out = os.Stderr
-		runOptions.Logger = log
+// Start sets up and runs the balancer.
+func (srv *Server) Start(ctx context.Context) {
+	if srv.RunOptions.Logger == nil {
+		srv.RunOptions.Logger = ctxlog.FromContext(ctx)
 	}
 
-	srv := &Server{
-		config:     config,
-		runOptions: runOptions,
-		metrics:    newMetrics(),
-		Logger:     runOptions.Logger,
-		Dumper:     runOptions.Dumper,
-	}
-	return srv, srv.start()
-}
+	srv.Logger = srv.RunOptions.Logger
+	srv.Dumper = srv.RunOptions.Dumper
 
-func (srv *Server) start() error {
-	if srv.config.Listen == "" {
-		return nil
-	}
-	ctx := ctxlog.Context(context.Background(), srv.Logger)
-	server := &httpserver.Server{
-		Server: http.Server{
-			Handler: httpserver.HandlerWithContext(ctx,
-				httpserver.LogRequests(
-					auth.RequireLiteralToken(srv.config.ManagementToken,
-						srv.metrics.Handler(srv.Logger)))),
-		},
-		Addr: srv.config.Listen,
+	var err error
+	if srv.RunOptions.Once {
+		_, err = srv.run()
+	} else {
+		err = srv.runForever(nil)
 	}
-	err := server.Start()
 	if err != nil {
-		return err
+		srv.Logger.Error(err)
 	}
-	srv.Logger.Printf("listening at %s", server.Addr)
-	srv.listening = server.Addr
-	return nil
 }
 
-func (srv *Server) Run() (*Balancer, error) {
+func (srv *Server) run() (*Balancer, error) {
 	bal := &Balancer{
 		Logger:         srv.Logger,
 		Dumper:         srv.Dumper,
-		Metrics:        srv.metrics,
-		LostBlocksFile: srv.config.LostBlocksFile,
+		Metrics:        srv.Metrics,
+		LostBlocksFile: srv.Cluster.Collections.BlobMissingReport,
 	}
 	var err error
-	srv.runOptions, err = bal.Run(srv.config, srv.runOptions)
+	srv.RunOptions, err = bal.Run(srv.ArvClient, srv.Cluster, srv.RunOptions)
 	return bal, err
 }
 
 // RunForever runs forever, or (for testing purposes) until the given
 // stop channel is ready to receive.
-func (srv *Server) RunForever(stop <-chan interface{}) error {
-	logger := srv.runOptions.Logger
+func (srv *Server) runForever(stop <-chan interface{}) error {
+	logger := srv.Logger
 
-	ticker := time.NewTicker(time.Duration(srv.config.RunPeriod))
+	ticker := time.NewTicker(time.Duration(srv.Cluster.Collections.BalancePeriod))
 
 	// The unbuffered channel here means we only hear SIGUSR1 if
 	// it arrives while we're waiting in select{}.
 	sigUSR1 := make(chan os.Signal)
 	signal.Notify(sigUSR1, syscall.SIGUSR1)
 
-	logger.Printf("starting up: will scan every %v and on SIGUSR1", srv.config.RunPeriod)
+	logger.Printf("starting up: will scan every %v and on SIGUSR1", srv.Cluster.Collections.BalancePeriod)
 
 	for {
-		if !srv.runOptions.CommitPulls && !srv.runOptions.CommitTrash {
+		if !srv.RunOptions.CommitPulls && !srv.RunOptions.CommitTrash {
 			logger.Print("WARNING: Will scan periodically, but no changes will be committed.")
 			logger.Print("=======  Consider using -commit-pulls and -commit-trash flags.")
 		}
 
-		_, err := srv.Run()
+		_, err := srv.run()
 		if err != nil {
 			logger.Print("run failed: ", err)
 		} else {
@@ -199,7 +125,7 @@ func (srv *Server) RunForever(stop <-chan interface{}) error {
 			// run too soon after the Nth run is triggered
 			// by SIGUSR1.
 			ticker.Stop()
-			ticker = time.NewTicker(time.Duration(srv.config.RunPeriod))
+			ticker = time.NewTicker(time.Duration(srv.Cluster.Collections.BalancePeriod))
 		}
 		logger.Print("starting next run")
 	}
diff --git a/services/keep-balance/usage.go b/services/keep-balance/usage.go
deleted file mode 100644
index b39e83905..000000000
--- a/services/keep-balance/usage.go
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright (C) The Arvados Authors. All rights reserved.
-//
-// SPDX-License-Identifier: AGPL-3.0
-
-package main
-
-import (
-	"flag"
-	"fmt"
-	"os"
-)
-
-var exampleConfigFile = []byte(`
-Client:
-    APIHost: zzzzz.arvadosapi.com:443
-    AuthToken: xyzzy
-    Insecure: false
-KeepServiceTypes:
-    - disk
-Listen: ":9005"
-ManagementToken: xyzzy
-RunPeriod: 600s
-CollectionBatchSize: 100000
-CollectionBuffers: 1000
-RequestTimeout: 30m`)
-
-func usage() {
-	fmt.Fprintf(os.Stderr, `
-
-keep-balance rebalances a set of keepstore servers. It creates new
-copies of underreplicated blocks, deletes excess copies of
-overreplicated and unreferenced blocks, and moves blocks to better
-positions (according to the rendezvous hash algorithm) so clients find
-them faster.
-
-Usage: keep-balance [options]
-
-Options:
-`)
-	flag.PrintDefaults()
-	fmt.Fprintf(os.Stderr, `
-Example config file:
-%s
-
-    Client.AuthToken must be recognized by Arvados as an admin token,
-    and must be recognized by all Keep services as a "data manager
-    key".
-
-    Client.Insecure should be true if your Arvados API endpoint uses
-    an unverifiable SSL/TLS certificate.
-
-Periodic scanning:
-
-    By default, keep-balance operates periodically, i.e.: do a
-    scan/balance operation, sleep, repeat.
-
-    RunPeriod determines the interval between start times of
-    successive scan/balance operations. If a scan/balance operation
-    takes longer than RunPeriod, the next one will follow it
-    immediately.
-
-    If SIGUSR1 is received during an idle period between operations,
-    the next operation will start immediately.
-
-One-time scanning:
-
-    Use the -once flag to do a single operation and then exit. The
-    exit code will be zero if the operation was successful.
-
-Committing:
-
-    By default, keep-service computes and reports changes but does not
-    implement them by sending pull and trash lists to the Keep
-    services.
-
-    Use the -commit-pull and -commit-trash flags to implement the
-    computed changes.
-
-Tuning resource usage:
-
-    CollectionBatchSize limits the number of collections retrieved per
-    API transaction. If this is zero or omitted, page size is
-    determined by the API server's own page size limits (see
-    max_items_per_response and max_index_database_read configs).
-
-    CollectionBuffers sets the size of an internal queue of
-    collections. Higher values use more memory, and improve throughput
-    by allowing keep-balance to fetch the next page of collections
-    while the current page is still being processed. If this is zero
-    or omitted, pages are processed serially.
-
-    RequestTimeout is the maximum time keep-balance will spend on a
-    single HTTP request (getting a page of collections, getting the
-    block index from a keepstore server, or sending a trash or pull
-    list to a keepstore server). Defaults to 30 minutes.
-
-Limitations:
-
-    keep-balance does not attempt to discover whether committed pull
-    and trash requests ever get carried out -- only that they are
-    accepted by the Keep services. If some services are full, new
-    copies of underreplicated blocks might never get made, only
-    repeatedly requested.
-
-`, exampleConfigFile)
-}

commit 78cf1c3d54179226e826ebca5b19dc5d0550771a
Author: Eric Biagiotti <ebiagiotti at veritasgenetics.com>
Date:   Fri Sep 6 14:39:39 2019 -0400

    14714: Adds keep-balance to cluster config loading
    
    Also adds a deprecated config loading test and fixes the service file
    
    Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti <ebiagiotti at veritasgenetics.com>

diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index c7a038bec..b2f753e11 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -347,6 +347,36 @@ Clusters:
       # The default is 2 weeks.
       BlobSigningTTL: 336h
 
+      # When running keep-balance, this is the destination filename for the
+      # list of lost block hashes if there are any, one per line. Updated atomically during
+      # each successful run.
+      BlobMissingReport: ""
+
+      # keep-balance operates periodically, i.e.: do a
+      # scan/balance operation, sleep, repeat.
+      #
+      # BalancePeriod determines the interval between start times of
+      # successive scan/balance operations. If a scan/balance operation
+      # takes longer than BalancePeriod, the next one will follow it
+      # immediately.
+      #
+      # If SIGUSR1 is received during an idle period between operations,
+      # the next operation will start immediately.
+      BalancePeriod: 10m
+
+      # Limits the number of collections retrieved by keep-balance per
+      # API transaction. If this is zero, page size is
+      # determined by the API server's own page size limits (see
+      # API.MaxItemsPerResponse and API.MaxIndexDatabaseRead).
+      BalanceCollectionBatch: 100000
+
+      # The size of keep-balance's internal queue of
+      # collections. Higher values use more memory and improve throughput
+      # by allowing keep-balance to fetch the next page of collections
+      # while the current page is still being processed. If this is zero
+      # or omitted, pages are processed serially.
+      BalanceCollectionBuffers: 1000
+
       # Default lifetime for ephemeral collections: 2 weeks. This must not
       # be less than BlobSigningTTL.
       DefaultTrashLifetime: 336h
diff --git a/lib/config/deprecated.go b/lib/config/deprecated.go
index 9eb8c40c1..c78fb9962 100644
--- a/lib/config/deprecated.go
+++ b/lib/config/deprecated.go
@@ -509,3 +509,71 @@ func (ldr *Loader) loadOldGitHttpdConfig(cfg *arvados.Config) error {
 	cfg.Clusters[cluster.ClusterID] = *cluster
 	return nil
 }
+
+const defaultKeepBalanceConfigPath = "/etc/arvados/keep-balance/keep-balance.yml"
+
+type oldKeepBalanceConfig struct {
+	Client              *arvados.Client
+	Listen              *string
+	KeepServiceTypes    *[]string
+	KeepServiceList     *arvados.KeepServiceList
+	RunPeriod           *arvados.Duration
+	CollectionBatchSize *int
+	CollectionBuffers   *int
+	RequestTimeout      *arvados.Duration
+	LostBlocksFile      *string
+	ManagementToken     *string
+}
+
+func (ldr *Loader) loadOldKeepBalanceConfig(cfg *arvados.Config) error {
+	if ldr.KeepBalancePath == "" {
+		return nil
+	}
+	var oc oldKeepBalanceConfig
+	err := ldr.loadOldConfigHelper("keep-balance", ldr.KeepBalancePath, &oc)
+	if os.IsNotExist(err) && ldr.KeepBalancePath == defaultKeepBalanceConfigPath {
+		return nil
+	} else if err != nil {
+		return err
+	}
+
+	cluster, err := cfg.GetCluster("")
+	if err != nil {
+		return err
+	}
+
+	loadOldClientConfig(cluster, oc.Client)
+
+	if oc.Listen != nil {
+		cluster.Services.Keepbalance.InternalURLs[arvados.URL{Host: *oc.Listen}] = arvados.ServiceInstance{}
+	}
+	if oc.ManagementToken != nil {
+		cluster.ManagementToken = *oc.ManagementToken
+	}
+	if oc.RunPeriod != nil {
+		cluster.Collections.BalancePeriod = *oc.RunPeriod
+	}
+	if oc.LostBlocksFile != nil {
+		cluster.Collections.BlobMissingReport = *oc.LostBlocksFile
+	}
+	if oc.CollectionBatchSize != nil {
+		cluster.Collections.BalanceCollectionBatch = *oc.CollectionBatchSize
+	}
+	if oc.CollectionBuffers != nil {
+		cluster.Collections.BalanceCollectionBuffers = *oc.CollectionBuffers
+	}
+	if oc.RequestTimeout != nil {
+		cluster.API.KeepServiceRequestTimeout = *oc.RequestTimeout
+	}
+
+	msg := "To balance specfic keep services, please update to the cluster config."
+	if oc.KeepServiceTypes != nil && len(*oc.KeepServiceTypes) > 0 {
+		ldr.Logger.Warnf("The KeepServiceType configuration option is not longer supported and is being ignored. %s", msg)
+	}
+	if oc.KeepServiceList != nil {
+		return fmt.Errorf("The KeepServiceList configuration option is no longer supported. Please remove it from your configuration file. %s", msg)
+	}
+
+	cfg.Clusters[cluster.ClusterID] = *cluster
+	return nil
+}
diff --git a/lib/config/deprecated_test.go b/lib/config/deprecated_test.go
index 5dda0ba94..db7e7b19a 100644
--- a/lib/config/deprecated_test.go
+++ b/lib/config/deprecated_test.go
@@ -216,3 +216,48 @@ func (s *LoadSuite) TestLegacyArvGitHttpdConfig(c *check.C) {
 	c.Check(cluster.Git.Repositories, check.Equals, "/test/reporoot")
 	c.Check(cluster.Services.Keepproxy.InternalURLs[arvados.URL{Host: ":9000"}], check.Equals, arvados.ServiceInstance{})
 }
+
+func (s *LoadSuite) TestLegacyKeepBalanceConfig(c *check.C) {
+	f := "-legacy-keepbalance-config"
+	content := []byte(fmtKeepBalanceConfig(""))
+	cluster, err := testLoadLegacyConfig(content, f, c)
+
+	c.Check(err, check.IsNil)
+	c.Check(cluster, check.NotNil)
+	c.Check(cluster.ManagementToken, check.Equals, "xyzzy")
+	c.Check(cluster.Services.Keepbalance.InternalURLs[arvados.URL{Host: ":80"}], check.Equals, arvados.ServiceInstance{})
+	c.Check(cluster.Collections.BalanceCollectionBuffers, check.Equals, 1000)
+	c.Check(cluster.Collections.BalanceCollectionBatch, check.Equals, 100000)
+	c.Check(cluster.Collections.BalancePeriod.String(), check.Equals, "10m")
+	c.Check(cluster.Collections.BlobMissingReport, check.Equals, "testfile")
+	c.Check(cluster.API.KeepServiceRequestTimeout.String(), check.Equals, "30m")
+
+	content = []byte(fmtKeepBalanceConfig(`"KeepServiceTypes":["disk"],`))
+	_, err = testLoadLegacyConfig(content, f, c)
+	c.Check(err, check.IsNil)
+
+	content = []byte(fmtKeepBalanceConfig(`"KeepServiceList":{},`))
+	_, err = testLoadLegacyConfig(content, f, c)
+	c.Check(err, check.NotNil)
+}
+
+func fmtKeepBalanceConfig(param string) string {
+	return fmt.Sprintf(`
+{
+	"Client": {
+		"Scheme": "",
+		"APIHost": "example.com",
+		"AuthToken": "abcdefg",
+		"Insecure": false
+	},
+	"Listen": ":80",
+	%s
+	"RunPeriod": "10m",
+	"CollectionBatchSize": 100000,
+	"CollectionBuffers": 1000,
+	"RequestTimeout": "30m",
+	"ManagementToken": "xyzzy",
+	"LostBlocksFile": "testfile"
+}
+`, param)
+}
diff --git a/lib/config/export.go b/lib/config/export.go
index 69aae2c62..09648512b 100644
--- a/lib/config/export.go
+++ b/lib/config/export.go
@@ -91,6 +91,10 @@ var whitelist = map[string]bool{
 	"Collections.TrashSweepInterval":               false,
 	"Collections.TrustAllContent":                  false,
 	"Collections.WebDAVCache":                      false,
+	"Collections.BalanceCollectionBatch":           false,
+	"Collections.BalancePeriod":                    false,
+	"Collections.BlobMissingReport":                false,
+	"Collections.BalanceCollectionBuffers":         false,
 	"Containers":                                   true,
 	"Containers.CloudVMs":                          false,
 	"Containers.CrunchRunCommand":                  false,
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index f8a0e097d..28ddd1a29 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -353,6 +353,36 @@ Clusters:
       # The default is 2 weeks.
       BlobSigningTTL: 336h
 
+      # When running keep-balance, this is the destination filename for the
+      # list of lost block hashes if there are any, one per line. Updated atomically during
+      # each successful run.
+      BlobMissingReport: ""
+
+      # keep-balance operates periodically, i.e.: do a
+      # scan/balance operation, sleep, repeat.
+      #
+      # BalancePeriod determines the interval between start times of
+      # successive scan/balance operations. If a scan/balance operation
+      # takes longer than BalancePeriod, the next one will follow it
+      # immediately.
+      #
+      # If SIGUSR1 is received during an idle period between operations,
+      # the next operation will start immediately.
+      BalancePeriod: 10m
+
+      # Limits the number of collections retrieved by keep-balance per
+      # API transaction. If this is zero, page size is
+      # determined by the API server's own page size limits (see
+      # API.MaxItemsPerResponse and API.MaxIndexDatabaseRead).
+      BalanceCollectionBatch: 100000
+
+      # The size of keep-balance's internal queue of
+      # collections. Higher values use more memory and improve throughput
+      # by allowing keep-balance to fetch the next page of collections
+      # while the current page is still being processed. If this is zero
+      # or omitted, pages are processed serially.
+      BalanceCollectionBuffers: 1000
+
       # Default lifetime for ephemeral collections: 2 weeks. This must not
       # be less than BlobSigningTTL.
       DefaultTrashLifetime: 336h
diff --git a/lib/config/load.go b/lib/config/load.go
index 7e4849393..d75734694 100644
--- a/lib/config/load.go
+++ b/lib/config/load.go
@@ -36,6 +36,7 @@ type Loader struct {
 	WebsocketPath           string
 	KeepproxyPath           string
 	GitHttpdPath            string
+	KeepBalancePath         string
 
 	configdata []byte
 }
@@ -68,6 +69,7 @@ func (ldr *Loader) SetupFlags(flagset *flag.FlagSet) {
 	flagset.StringVar(&ldr.WebsocketPath, "legacy-ws-config", defaultWebsocketConfigPath, "Legacy arvados-ws configuration `file`")
 	flagset.StringVar(&ldr.KeepproxyPath, "legacy-keepproxy-config", defaultKeepproxyConfigPath, "Legacy keepproxy configuration `file`")
 	flagset.StringVar(&ldr.GitHttpdPath, "legacy-git-httpd-config", defaultGitHttpdConfigPath, "Legacy arv-git-httpd configuration `file`")
+	flagset.StringVar(&ldr.KeepBalancePath, "legacy-keepbalance-config", defaultKeepBalanceConfigPath, "Legacy keep-balance configuration `file`")
 	flagset.BoolVar(&ldr.SkipLegacy, "skip-legacy", false, "Don't load legacy config files")
 }
 
@@ -148,6 +150,9 @@ func (ldr *Loader) MungeLegacyConfigArgs(lgr logrus.FieldLogger, args []string,
 	if legacyConfigArg != "-legacy-git-httpd-config" {
 		ldr.GitHttpdPath = ""
 	}
+	if legacyConfigArg != "-legacy-keepbalance-config" {
+		ldr.KeepBalancePath = ""
+	}
 
 	return munged
 }
@@ -250,6 +255,7 @@ func (ldr *Loader) Load() (*arvados.Config, error) {
 			ldr.loadOldWebsocketConfig(&cfg),
 			ldr.loadOldKeepproxyConfig(&cfg),
 			ldr.loadOldGitHttpdConfig(&cfg),
+			ldr.loadOldKeepBalanceConfig(&cfg),
 		} {
 			if err != nil {
 				return nil, err
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 29dd62ac1..b33bf19e9 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -111,6 +111,11 @@ type Cluster struct {
 		TrashSweepInterval    Duration
 		TrustAllContent       bool
 
+		BlobMissingReport        string
+		BalancePeriod            Duration
+		BalanceCollectionBatch   int
+		BalanceCollectionBuffers int
+
 		WebDAVCache WebDAVCacheConfig
 	}
 	Git struct {
diff --git a/services/keep-balance/keep-balance.service b/services/keep-balance/keep-balance.service
index 563871607..1b71fb4e4 100644
--- a/services/keep-balance/keep-balance.service
+++ b/services/keep-balance/keep-balance.service
@@ -6,7 +6,6 @@
 Description=Arvados Keep Balance
 Documentation=https://doc.arvados.org/
 After=network.target
-AssertPathExists=/etc/arvados/keep-balance/keep-balance.yml
 
 # systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
 StartLimitInterval=0

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list