[arvados] created: 2.1.0-2996-g08bf53a33
From: git repository hosting <git at public.arvados.org>
Date: Fri Oct 28 15:31:33 UTC 2022
at 08bf53a3396ab74e805d468ccbb9c0cea86a3d5a (commit)
commit 08bf53a3396ab74e805d468ccbb9c0cea86a3d5a
Author: Tom Clegg <tom at curii.com>
Date: Fri Oct 28 11:08:39 2022 -0400
18071: Improve test timeout.
The test can take longer when output is going to a terminal. Keep
waiting as long as some progress is made every 3s, rather than giving
up after 10s total.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
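
The diff below implements the pattern sketched here: fail only when a 3s
window passes with no progress, instead of enforcing a single 10s overall
deadline. This is a minimal standalone sketch, not Arvados code;
waitWithProgress and the remaining callback are illustrative names.

    import (
        "fmt"
        "time"
    )

    // waitWithProgress returns nil once remaining() reaches zero, and
    // returns an error only if a full window elapses with no progress.
    func waitWithProgress(remaining func() int, window time.Duration) error {
        last := remaining()
        for last > 0 {
            time.Sleep(window)
            now := remaining()
            if now >= last {
                return fmt.Errorf("no progress in %v; still waiting for %d", window, now)
            }
            last = now
        }
        return nil
    }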
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index bdbdf1ef9..2d486da5f 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -193,12 +193,18 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
err := s.disp.CheckHealth()
c.Check(err, check.IsNil)
- select {
- case <-done:
- c.Logf("containers finished (%s), waiting for instances to shutdown and queue to clear", time.Since(start))
- case <-time.After(10 * time.Second):
- c.Fatalf("timed out; still waiting for %d containers: %q", len(waiting), waiting)
+ for len(waiting) > 0 {
+ waswaiting := len(waiting)
+ select {
+ case <-done:
+ // loop will end because len(waiting)==0
+ case <-time.After(3 * time.Second):
+ if len(waiting) >= waswaiting {
+ c.Fatalf("timed out; no progress in 3s while waiting for %d containers: %q", len(waiting), waiting)
+ }
+ }
}
+ c.Logf("containers finished (%s), waiting for instances to shutdown and queue to clear", time.Since(start))
deadline := time.Now().Add(5 * time.Second)
for range time.NewTicker(10 * time.Millisecond).C {
commit 65b12213f740b117fb14822bce0dbb415257c355
Author: Tom Clegg <tom at curii.com>
Date: Fri Oct 28 10:35:27 2022 -0400
18071: Use dblock to prevent multiple dispatchers from competing.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
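
For context, the dblock mechanism used below relies on PostgreSQL
session-level advisory locks: the first session to succeed at
pg_try_advisory_lock(10005) holds the dispatcher lock until its
connection closes, so at most one dispatcher per cluster is active. A
minimal sketch of that acquire-and-retry loop, assuming database/sql
with lib/pq and an illustrative connection handle (not the actual
dblock implementation, which is shown in the diff):

    import (
        "context"
        "database/sql"
        "time"

        _ "github.com/lib/pq"
    )

    // acquireDispatchLock blocks until this process holds advisory lock
    // key 10005 ("any dispatcher running"). The returned *sql.Conn must
    // stay open for as long as the lock should be held.
    func acquireDispatchLock(ctx context.Context, db *sql.DB) (*sql.Conn, error) {
        conn, err := db.Conn(ctx)
        if err != nil {
            return nil, err
        }
        for {
            var locked bool
            err = conn.QueryRowContext(ctx, `SELECT pg_try_advisory_lock($1)`, 10005).Scan(&locked)
            if err != nil {
                conn.Close()
                return nil, err
            }
            if locked {
                return conn, nil // released via pg_advisory_unlock or on Close
            }
            time.Sleep(5 * time.Second) // same retryDelay dblock uses
        }
    }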
diff --git a/lib/controller/dblock/dblock.go b/lib/controller/dblock/dblock.go
index 6f5db1006..ad2733abf 100644
--- a/lib/controller/dblock/dblock.go
+++ b/lib/controller/dblock/dblock.go
@@ -21,6 +21,7 @@ var (
ContainerLogSweep = &DBLocker{key: 10002}
KeepBalanceService = &DBLocker{key: 10003} // keep-balance service in periodic-sweep loop
KeepBalanceActive = &DBLocker{key: 10004} // keep-balance sweep in progress (either -once=true or service loop)
+ Dispatch = &DBLocker{key: 10005} // any dispatcher running
retryDelay = 5 * time.Second
)
diff --git a/lib/controller/federation.go b/lib/controller/federation.go
index e7d6e29b8..93b8315a6 100644
--- a/lib/controller/federation.go
+++ b/lib/controller/federation.go
@@ -142,7 +142,7 @@ type CurrentUser struct {
// non-nil, true, nil -- if the token is valid
func (h *Handler) validateAPItoken(req *http.Request, token string) (*CurrentUser, bool, error) {
user := CurrentUser{Authorization: arvados.APIClientAuthorization{APIToken: token}}
- db, err := h.db(req.Context())
+ db, err := h.dbConnector.GetDB(req.Context())
if err != nil {
ctxlog.FromContext(req.Context()).WithError(err).Debugf("validateAPItoken(%s): database error", token)
return nil, false, err
@@ -179,7 +179,7 @@ func (h *Handler) validateAPItoken(req *http.Request, token string) (*CurrentUse
}
func (h *Handler) createAPItoken(req *http.Request, userUUID string, scopes []string) (*arvados.APIClientAuthorization, error) {
- db, err := h.db(req.Context())
+ db, err := h.dbConnector.GetDB(req.Context())
if err != nil {
return nil, err
}
diff --git a/lib/controller/handler.go b/lib/controller/handler.go
index e1392bef9..4c6fca7f7 100644
--- a/lib/controller/handler.go
+++ b/lib/controller/handler.go
@@ -6,7 +6,6 @@ package controller
import (
"context"
- "errors"
"fmt"
"net/http"
"net/http/httptest"
@@ -21,10 +20,8 @@ import (
"git.arvados.org/arvados.git/lib/controller/router"
"git.arvados.org/arvados.git/lib/ctrlctx"
"git.arvados.org/arvados.git/sdk/go/arvados"
- "git.arvados.org/arvados.git/sdk/go/ctxlog"
"git.arvados.org/arvados.git/sdk/go/health"
"git.arvados.org/arvados.git/sdk/go/httpserver"
- "github.com/jmoiron/sqlx"
// sqlx needs lib/pq to talk to PostgreSQL
_ "github.com/lib/pq"
@@ -40,8 +37,7 @@ type Handler struct {
proxy *proxy
secureClient *http.Client
insecureClient *http.Client
- pgdb *sqlx.DB
- pgdbMtx sync.Mutex
+ dbConnector ctrlctx.DBConnector
}
func (h *Handler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
@@ -65,7 +61,7 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
func (h *Handler) CheckHealth() error {
h.setupOnce.Do(h.setup)
- _, err := h.db(context.TODO())
+ _, err := h.dbConnector.GetDB(context.TODO())
if err != nil {
return err
}
@@ -97,17 +93,18 @@ func (h *Handler) setup() {
mux := http.NewServeMux()
healthFuncs := make(map[string]health.Func)
- oidcAuthorizer := localdb.OIDCAccessTokenAuthorizer(h.Cluster, h.db)
+ h.dbConnector = ctrlctx.DBConnector{PostgreSQL: h.Cluster.PostgreSQL}
+ oidcAuthorizer := localdb.OIDCAccessTokenAuthorizer(h.Cluster, h.dbConnector.GetDB)
h.federation = federation.New(h.Cluster, &healthFuncs)
rtr := router.New(h.federation, router.Config{
MaxRequestSize: h.Cluster.API.MaxRequestSize,
WrapCalls: api.ComposeWrappers(
- ctrlctx.WrapCallsInTransactions(h.db),
+ ctrlctx.WrapCallsInTransactions(h.dbConnector.GetDB),
oidcAuthorizer.WrapCalls,
ctrlctx.WrapCallsWithAuth(h.Cluster)),
})
- healthRoutes := health.Routes{"ping": func() error { _, err := h.db(context.TODO()); return err }}
+ healthRoutes := health.Routes{"ping": func() error { _, err := h.dbConnector.GetDB(context.TODO()); return err }}
for name, f := range healthFuncs {
healthRoutes[name] = f
}
@@ -158,31 +155,6 @@ func (h *Handler) setup() {
go h.containerLogSweepWorker()
}
-var errDBConnection = errors.New("database connection error")
-
-func (h *Handler) db(ctx context.Context) (*sqlx.DB, error) {
- h.pgdbMtx.Lock()
- defer h.pgdbMtx.Unlock()
- if h.pgdb != nil {
- return h.pgdb, nil
- }
-
- db, err := sqlx.Open("postgres", h.Cluster.PostgreSQL.Connection.String())
- if err != nil {
- ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect failed")
- return nil, errDBConnection
- }
- if p := h.Cluster.PostgreSQL.ConnectionPool; p > 0 {
- db.SetMaxOpenConns(p)
- }
- if err := db.Ping(); err != nil {
- ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect succeeded but ping failed")
- return nil, errDBConnection
- }
- h.pgdb = db
- return db, nil
-}
-
type middlewareFunc func(http.ResponseWriter, *http.Request, http.Handler)
func prepend(next http.Handler, middleware middlewareFunc) http.Handler {
diff --git a/lib/controller/handler_test.go b/lib/controller/handler_test.go
index 0ffe0255f..c9999fa28 100644
--- a/lib/controller/handler_test.go
+++ b/lib/controller/handler_test.go
@@ -477,7 +477,7 @@ func (s *HandlerSuite) TestTrashSweep(c *check.C) {
coll, err := s.handler.federation.CollectionCreate(ctx, arvados.CreateOptions{Attrs: map[string]interface{}{"name": "test trash sweep"}, EnsureUniqueName: true})
c.Assert(err, check.IsNil)
defer s.handler.federation.CollectionDelete(ctx, arvados.DeleteOptions{UUID: coll.UUID})
- db, err := s.handler.db(s.ctx)
+ db, err := s.handler.dbConnector.GetDB(s.ctx)
c.Assert(err, check.IsNil)
_, err = db.ExecContext(s.ctx, `update collections set trash_at = $1, delete_at = $2 where uuid = $3`, time.Now().UTC().Add(time.Second/10), time.Now().UTC().Add(time.Hour), coll.UUID)
c.Assert(err, check.IsNil)
@@ -550,7 +550,7 @@ func (s *HandlerSuite) TestLogActivity(c *check.C) {
c.Assert(err, check.IsNil)
}
}
- db, err := s.handler.db(s.ctx)
+ db, err := s.handler.dbConnector.GetDB(s.ctx)
c.Assert(err, check.IsNil)
for _, userUUID := range []string{arvadostest.ActiveUserUUID, arvadostest.SpectatorUserUUID} {
var rows int
diff --git a/lib/controller/trash.go b/lib/controller/trash.go
index afdf95b78..99e7aec0b 100644
--- a/lib/controller/trash.go
+++ b/lib/controller/trash.go
@@ -20,7 +20,7 @@ func (h *Handler) periodicWorker(workerName string, interval time.Duration, lock
logger.Debugf("interval is %v, not running worker", interval)
return
}
- if !locker.Lock(ctx, h.db) {
+ if !locker.Lock(ctx, h.dbConnector.GetDB) {
// context canceled
return
}
@@ -47,7 +47,7 @@ func (h *Handler) trashSweepWorker() {
func (h *Handler) containerLogSweepWorker() {
h.periodicWorker("container log sweep", h.Cluster.Containers.Logging.SweepInterval.Duration(), dblock.ContainerLogSweep, func(ctx context.Context) error {
- db, err := h.db(ctx)
+ db, err := h.dbConnector.GetDB(ctx)
if err != nil {
return err
}
diff --git a/lib/ctrlctx/db.go b/lib/ctrlctx/db.go
index a76420860..2a05096ce 100644
--- a/lib/ctrlctx/db.go
+++ b/lib/ctrlctx/db.go
@@ -10,6 +10,7 @@ import (
"sync"
"git.arvados.org/arvados.git/lib/controller/api"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
"github.com/jmoiron/sqlx"
@@ -142,3 +143,33 @@ func CurrentTx(ctx context.Context) (*sqlx.Tx, error) {
})
return txn.tx, txn.err
}
+
+var errDBConnection = errors.New("database connection error")
+
+type DBConnector struct {
+ PostgreSQL arvados.PostgreSQL
+ pgdb *sqlx.DB
+ mtx sync.Mutex
+}
+
+func (dbc *DBConnector) GetDB(ctx context.Context) (*sqlx.DB, error) {
+ dbc.mtx.Lock()
+ defer dbc.mtx.Unlock()
+ if dbc.pgdb != nil {
+ return dbc.pgdb, nil
+ }
+ db, err := sqlx.Open("postgres", dbc.PostgreSQL.Connection.String())
+ if err != nil {
+ ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect failed")
+ return nil, errDBConnection
+ }
+ if p := dbc.PostgreSQL.ConnectionPool; p > 0 {
+ db.SetMaxOpenConns(p)
+ }
+ if err := db.Ping(); err != nil {
+ ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect succeeded but ping failed")
+ return nil, errDBConnection
+ }
+ dbc.pgdb = db
+ return db, nil
+}
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index ae91a710e..3403c50c9 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -15,6 +15,8 @@ import (
"time"
"git.arvados.org/arvados.git/lib/cloud"
+ "git.arvados.org/arvados.git/lib/controller/dblock"
+ "git.arvados.org/arvados.git/lib/ctrlctx"
"git.arvados.org/arvados.git/lib/dispatchcloud/container"
"git.arvados.org/arvados.git/lib/dispatchcloud/scheduler"
"git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
@@ -53,6 +55,7 @@ type dispatcher struct {
Registry *prometheus.Registry
InstanceSetID cloud.InstanceSetID
+ dbConnector ctrlctx.DBConnector
logger logrus.FieldLogger
instanceSet cloud.InstanceSet
pool pool
@@ -118,6 +121,7 @@ func (disp *dispatcher) setup() {
func (disp *dispatcher) initialize() {
disp.logger = ctxlog.FromContext(disp.Context)
+ disp.dbConnector = ctrlctx.DBConnector{PostgreSQL: disp.Cluster.PostgreSQL}
disp.ArvClient.AuthToken = disp.AuthToken
@@ -143,6 +147,7 @@ func (disp *dispatcher) initialize() {
if err != nil {
disp.logger.Fatalf("error initializing driver: %s", err)
}
+ dblock.Dispatch.Lock(disp.Context, disp.dbConnector.GetDB)
disp.instanceSet = instanceSet
disp.pool = worker.NewPool(disp.logger, disp.ArvClient, disp.Registry, disp.InstanceSetID, disp.instanceSet, disp.newExecutor, disp.sshKey.PublicKey(), disp.Cluster)
disp.queue = container.NewQueue(disp.logger, disp.Registry, disp.typeChooser, disp.ArvClient)
@@ -175,6 +180,7 @@ func (disp *dispatcher) initialize() {
}
func (disp *dispatcher) run() {
+ defer dblock.Dispatch.Unlock()
defer close(disp.stopped)
defer disp.instanceSet.Stop()
defer disp.pool.Stop()
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 829a05363..bdbdf1ef9 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -15,6 +15,7 @@ import (
"sync"
"time"
+ "git.arvados.org/arvados.git/lib/config"
"git.arvados.org/arvados.git/lib/dispatchcloud/test"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/arvadostest"
@@ -49,8 +50,16 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
MinTimeBetweenCreateCalls: time.Millisecond,
}
+ // We need the postgresql connection info from the integration
+ // test config.
+ cfg, err := config.NewLoader(nil, ctxlog.FromContext(s.ctx)).Load()
+ c.Assert(err, check.IsNil)
+ testcluster, err := cfg.GetCluster("")
+ c.Assert(err, check.IsNil)
+
s.cluster = &arvados.Cluster{
ManagementToken: "test-management-token",
+ PostgreSQL: testcluster.PostgreSQL,
Containers: arvados.ContainersConfig{
CrunchRunCommand: "crunch-run",
CrunchRunArgumentsList: []string{"--foo", "--extra='args'"},
diff --git a/lib/lsf/dispatch.go b/lib/lsf/dispatch.go
index d362f66d1..d1408d23c 100644
--- a/lib/lsf/dispatch.go
+++ b/lib/lsf/dispatch.go
@@ -18,6 +18,8 @@ import (
"time"
"git.arvados.org/arvados.git/lib/cmd"
+ "git.arvados.org/arvados.git/lib/controller/dblock"
+ "git.arvados.org/arvados.git/lib/ctrlctx"
"git.arvados.org/arvados.git/lib/dispatchcloud"
"git.arvados.org/arvados.git/lib/service"
"git.arvados.org/arvados.git/sdk/go/arvados"
@@ -58,6 +60,7 @@ type dispatcher struct {
Registry *prometheus.Registry
logger logrus.FieldLogger
+ dbConnector ctrlctx.DBConnector
lsfcli lsfcli
lsfqueue lsfqueue
arvDispatcher *dispatch.Dispatcher
@@ -73,7 +76,9 @@ type dispatcher struct {
func (disp *dispatcher) Start() {
disp.initOnce.Do(func() {
disp.init()
+ dblock.Dispatch.Lock(context.Background(), disp.dbConnector.GetDB)
go func() {
+ defer dblock.Dispatch.Unlock()
disp.checkLsfQueueForOrphans()
err := disp.arvDispatcher.Run(disp.Context)
if err != nil {
@@ -125,6 +130,7 @@ func (disp *dispatcher) init() {
lsfcli: &disp.lsfcli,
}
disp.ArvClient.AuthToken = disp.AuthToken
+ disp.dbConnector = ctrlctx.DBConnector{PostgreSQL: disp.Cluster.PostgreSQL}
disp.stop = make(chan struct{}, 1)
disp.stopped = make(chan struct{})
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
index ac394e114..1c0f6ad28 100644
--- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
+++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
@@ -19,6 +19,8 @@ import (
"time"
"git.arvados.org/arvados.git/lib/cmd"
+ "git.arvados.org/arvados.git/lib/controller/dblock"
+ "git.arvados.org/arvados.git/lib/ctrlctx"
"git.arvados.org/arvados.git/lib/dispatchcloud"
"git.arvados.org/arvados.git/lib/service"
"git.arvados.org/arvados.git/sdk/go/arvados"
@@ -55,10 +57,11 @@ const initialNiceValue int64 = 10000
type Dispatcher struct {
*dispatch.Dispatcher
- logger logrus.FieldLogger
- cluster *arvados.Cluster
- sqCheck *SqueueChecker
- slurm Slurm
+ logger logrus.FieldLogger
+ cluster *arvados.Cluster
+ sqCheck *SqueueChecker
+ slurm Slurm
+ dbConnector ctrlctx.DBConnector
done chan struct{}
err error
@@ -90,6 +93,7 @@ func (disp *Dispatcher) configure() error {
disp.Client.APIHost = disp.cluster.Services.Controller.ExternalURL.Host
disp.Client.AuthToken = disp.cluster.SystemRootToken
disp.Client.Insecure = disp.cluster.TLS.Insecure
+ disp.dbConnector = ctrlctx.DBConnector{PostgreSQL: disp.cluster.PostgreSQL}
if disp.Client.APIHost != "" || disp.Client.AuthToken != "" {
// Copy real configs into env vars so [a]
@@ -137,6 +141,8 @@ func (disp *Dispatcher) setup() {
}
func (disp *Dispatcher) run() error {
+ dblock.Dispatch.Lock(context.Background(), disp.dbConnector.GetDB)
+ defer dblock.Dispatch.Unlock()
defer disp.sqCheck.Stop()
if disp.cluster != nil && len(disp.cluster.InstanceTypes) > 0 {
commit e33a15001d7a94a805a5d0d4c77544d959974193
Author: Tom Clegg <tom at curii.com>
Date: Thu Oct 27 11:05:42 2022 -0400
18071: Log host:port of other process(es) holding lock.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
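
The new logging works by joining pg_locks against pg_stat_activity to
find the session that currently holds the advisory lock. A standalone
sketch of that lookup (lockHolder is an illustrative name; the query
matches the one added in the diff below):

    import (
        "context"
        "database/sql"
        "fmt"
        "net"
    )

    // lockHolder returns the host:port of the client session currently
    // holding the given advisory lock key.
    func lockHolder(ctx context.Context, conn *sql.Conn, key int64) (string, error) {
        var host string
        var port int
        err := conn.QueryRowContext(ctx, `SELECT client_addr, client_port FROM pg_stat_activity WHERE pid IN
            (SELECT pid FROM pg_locks WHERE locktype = $1 AND objid = $2)`,
            "advisory", key).Scan(&host, &port)
        if err != nil {
            return "", err
        }
        return net.JoinHostPort(host, fmt.Sprintf("%d", port)), nil
    }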
diff --git a/lib/controller/dblock/dblock.go b/lib/controller/dblock/dblock.go
index 472633747..6f5db1006 100644
--- a/lib/controller/dblock/dblock.go
+++ b/lib/controller/dblock/dblock.go
@@ -7,6 +7,8 @@ package dblock
import (
"context"
"database/sql"
+ "fmt"
+ "net"
"sync"
"time"
@@ -36,7 +38,8 @@ type DBLocker struct {
//
// Returns false if ctx is canceled before the lock is acquired.
func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sqlx.DB, error)) bool {
- logger := ctxlog.FromContext(ctx)
+ logger := ctxlog.FromContext(ctx).WithField("ID", dbl.key)
+ var lastHeldBy string
for ; ; time.Sleep(retryDelay) {
dbl.mtx.Lock()
if dbl.conn != nil {
@@ -54,7 +57,7 @@ func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sql
dbl.mtx.Unlock()
return false
} else if err != nil {
- logger.WithError(err).Infof("error getting database pool")
+ logger.WithError(err).Info("error getting database pool")
dbl.mtx.Unlock()
continue
}
@@ -72,17 +75,31 @@ func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sql
if err == context.Canceled {
return false
} else if err != nil {
- logger.WithError(err).Infof("error getting pg_try_advisory_lock %d", dbl.key)
+ logger.WithError(err).Info("error getting pg_try_advisory_lock")
conn.Close()
dbl.mtx.Unlock()
continue
}
if !locked {
+ var host string
+ var port int
+ err = conn.QueryRowContext(ctx, `SELECT client_addr, client_port FROM pg_stat_activity WHERE pid IN
+ (SELECT pid FROM pg_locks
+ WHERE locktype = $1 AND objid = $2)`, "advisory", dbl.key).Scan(&host, &port)
+ if err != nil {
+ logger.WithError(err).Info("error getting other client info")
+ } else {
+ heldBy := net.JoinHostPort(host, fmt.Sprintf("%d", port))
+ if lastHeldBy != heldBy {
+ logger.WithField("DBClient", heldBy).Info("waiting for other process to release lock")
+ lastHeldBy = heldBy
+ }
+ }
conn.Close()
dbl.mtx.Unlock()
continue
}
- logger.Debugf("acquired pg_advisory_lock %d", dbl.key)
+ logger.Debug("acquired pg_advisory_lock")
dbl.ctx, dbl.getdb, dbl.conn = ctx, getdb, conn
dbl.mtx.Unlock()
return true
@@ -102,7 +119,7 @@ func (dbl *DBLocker) Check() bool {
dbl.mtx.Unlock()
return false
} else if err == nil {
- ctxlog.FromContext(dbl.ctx).Debugf("pg_advisory_lock %d connection still alive", dbl.key)
+ ctxlog.FromContext(dbl.ctx).WithField("ID", dbl.key).Debug("connection still alive")
dbl.mtx.Unlock()
return true
}
@@ -120,9 +137,9 @@ func (dbl *DBLocker) Unlock() {
if dbl.conn != nil {
_, err := dbl.conn.ExecContext(context.Background(), `SELECT pg_advisory_unlock($1)`, dbl.key)
if err != nil {
- ctxlog.FromContext(dbl.ctx).WithError(err).Infof("error releasing pg_advisory_lock %d", dbl.key)
+ ctxlog.FromContext(dbl.ctx).WithError(err).WithField("ID", dbl.key).Info("error releasing pg_advisory_lock")
} else {
- ctxlog.FromContext(dbl.ctx).Debugf("released pg_advisory_lock %d", dbl.key)
+ ctxlog.FromContext(dbl.ctx).WithField("ID", dbl.key).Debug("released pg_advisory_lock")
}
dbl.conn.Close()
dbl.conn = nil
diff --git a/lib/controller/dblock/dblock_test.go b/lib/controller/dblock/dblock_test.go
new file mode 100644
index 000000000..6079df6f8
--- /dev/null
+++ b/lib/controller/dblock/dblock_test.go
@@ -0,0 +1,91 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dblock
+
+import (
+ "bytes"
+ "context"
+ "sync"
+ "testing"
+ "time"
+
+ "git.arvados.org/arvados.git/lib/config"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
+ "git.arvados.org/arvados.git/sdk/go/arvadostest"
+ "git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "github.com/jmoiron/sqlx"
+ "github.com/sirupsen/logrus"
+ check "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) {
+ check.TestingT(t)
+}
+
+var _ = check.Suite(&suite{})
+
+type suite struct {
+ cluster *arvados.Cluster
+ db *sqlx.DB
+ getdb func(context.Context) (*sqlx.DB, error)
+}
+
+var testLocker = &DBLocker{key: 999}
+
+func (s *suite) SetUpSuite(c *check.C) {
+ cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
+ c.Assert(err, check.IsNil)
+ s.cluster, err = cfg.GetCluster("")
+ c.Assert(err, check.IsNil)
+ s.db = arvadostest.DB(c, s.cluster)
+ s.getdb = func(context.Context) (*sqlx.DB, error) { return s.db, nil }
+}
+
+func (s *suite) TestLock(c *check.C) {
+ retryDelay = time.Millisecond
+
+ var logbuf bytes.Buffer
+ logger := ctxlog.New(&logbuf, "text", "debug")
+ logger.Level = logrus.DebugLevel
+ ctx := ctxlog.Context(context.Background(), logger)
+ ctx, cancel := context.WithCancel(ctx)
+ defer cancel()
+ testLocker.Lock(ctx, s.getdb)
+ testLocker.Check()
+
+ lock2 := make(chan bool)
+ var wg sync.WaitGroup
+ defer wg.Wait()
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ testLocker2 := &DBLocker{key: 999}
+ testLocker2.Lock(ctx, s.getdb)
+ close(lock2)
+ testLocker2.Check()
+ testLocker2.Unlock()
+ }()
+
+ // Second lock should wait for first to Unlock
+ select {
+ case <-time.After(time.Second / 10):
+ c.Check(logbuf.String(), check.Matches, `(?ms).*level=info.*DBClient="[^"]+:\d+".*ID=999.*`)
+ case <-lock2:
+ c.Log("double-lock")
+ c.Fail()
+ }
+
+ testLocker.Check()
+ testLocker.Unlock()
+
+ // Now the second lock should succeed within retryDelay
+ select {
+ case <-time.After(retryDelay * 2):
+ c.Log("timed out")
+ c.Fail()
+ case <-lock2:
+ }
+ c.Logf("%s", logbuf.String())
+}
commit aa9507e1633819259794ea4d6cf391dc88621dac
Author: Tom Clegg <tom at curii.com>
Date: Wed Oct 26 17:00:42 2022 -0400
18071: Use dblock to avoid concurrent keep-balance ops.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
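
This commit changes Lock() and Check() to return false on context
cancellation so callers can shut down cleanly instead of retrying
forever. The periodic-worker shape that results is sketched here,
mirroring the lib/controller/trash.go change in the diff below; run and
interval are supplied by the caller, and imports (context, time,
github.com/jmoiron/sqlx) are assumed:

    func periodicWorker(ctx context.Context, locker *DBLocker,
        getdb func(context.Context) (*sqlx.DB, error),
        interval time.Duration, run func(context.Context) error) {
        if !locker.Lock(ctx, getdb) {
            return // context canceled before the lock was acquired
        }
        defer locker.Unlock()
        for time.Sleep(interval); ctx.Err() == nil; time.Sleep(interval) {
            if !locker.Check() { // reacquires if the db session died
                return // context canceled
            }
            if err := run(ctx); err != nil {
                // log and retry on the next tick
            }
        }
    }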
diff --git a/lib/controller/dblock/dblock.go b/lib/controller/dblock/dblock.go
index a46201bb4..472633747 100644
--- a/lib/controller/dblock/dblock.go
+++ b/lib/controller/dblock/dblock.go
@@ -15,9 +15,11 @@ import (
)
var (
- TrashSweep = &DBLocker{key: 10001}
- ContainerLogSweep = &DBLocker{key: 10002}
- retryDelay = 5 * time.Second
+ TrashSweep = &DBLocker{key: 10001}
+ ContainerLogSweep = &DBLocker{key: 10002}
+ KeepBalanceService = &DBLocker{key: 10003} // keep-balance service in periodic-sweep loop
+ KeepBalanceActive = &DBLocker{key: 10004} // keep-balance sweep in progress (either -once=true or service loop)
+ retryDelay = 5 * time.Second
)
// DBLocker uses pg_advisory_lock to maintain a cluster-wide lock for
@@ -31,7 +33,9 @@ type DBLocker struct {
}
// Lock acquires the advisory lock, waiting/reconnecting if needed.
-func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sqlx.DB, error)) {
+//
+// Returns false if ctx is canceled before the lock is acquired.
+func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sqlx.DB, error)) bool {
logger := ctxlog.FromContext(ctx)
for ; ; time.Sleep(retryDelay) {
dbl.mtx.Lock()
@@ -41,21 +45,33 @@ func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sql
dbl.mtx.Unlock()
continue
}
+ if ctx.Err() != nil {
+ dbl.mtx.Unlock()
+ return false
+ }
db, err := getdb(ctx)
- if err != nil {
+ if err == context.Canceled {
+ dbl.mtx.Unlock()
+ return false
+ } else if err != nil {
logger.WithError(err).Infof("error getting database pool")
dbl.mtx.Unlock()
continue
}
conn, err := db.Conn(ctx)
- if err != nil {
+ if err == context.Canceled {
+ dbl.mtx.Unlock()
+ return false
+ } else if err != nil {
logger.WithError(err).Info("error getting database connection")
dbl.mtx.Unlock()
continue
}
var locked bool
err = conn.QueryRowContext(ctx, `SELECT pg_try_advisory_lock($1)`, dbl.key).Scan(&locked)
- if err != nil {
+ if err == context.Canceled {
+ return false
+ } else if err != nil {
logger.WithError(err).Infof("error getting pg_try_advisory_lock %d", dbl.key)
conn.Close()
dbl.mtx.Unlock()
@@ -69,27 +85,33 @@ func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sql
logger.Debugf("acquired pg_advisory_lock %d", dbl.key)
dbl.ctx, dbl.getdb, dbl.conn = ctx, getdb, conn
dbl.mtx.Unlock()
- return
+ return true
}
}
// Check confirms that the lock is still active (i.e., the session is
// still alive), and re-acquires if needed. Panics if Lock is not
// acquired first.
-func (dbl *DBLocker) Check() {
+//
+// Returns false if the context passed to Lock() is canceled before
+// the lock is confirmed or reacquired.
+func (dbl *DBLocker) Check() bool {
dbl.mtx.Lock()
err := dbl.conn.PingContext(dbl.ctx)
- if err == nil {
+ if err == context.Canceled {
+ dbl.mtx.Unlock()
+ return false
+ } else if err == nil {
ctxlog.FromContext(dbl.ctx).Debugf("pg_advisory_lock %d connection still alive", dbl.key)
dbl.mtx.Unlock()
- return
+ return true
}
ctxlog.FromContext(dbl.ctx).WithError(err).Info("database connection ping failed")
dbl.conn.Close()
dbl.conn = nil
ctx, getdb := dbl.ctx, dbl.getdb
dbl.mtx.Unlock()
- dbl.Lock(ctx, getdb)
+ return dbl.Lock(ctx, getdb)
}
func (dbl *DBLocker) Unlock() {
diff --git a/lib/controller/trash.go b/lib/controller/trash.go
index 9a7b0814c..afdf95b78 100644
--- a/lib/controller/trash.go
+++ b/lib/controller/trash.go
@@ -20,10 +20,16 @@ func (h *Handler) periodicWorker(workerName string, interval time.Duration, lock
logger.Debugf("interval is %v, not running worker", interval)
return
}
- locker.Lock(ctx, h.db)
+ if !locker.Lock(ctx, h.db) {
+ // context canceled
+ return
+ }
defer locker.Unlock()
for time.Sleep(interval); ctx.Err() == nil; time.Sleep(interval) {
- locker.Check()
+ if !locker.Check() {
+ // context canceled
+ return
+ }
err := run(ctx)
if err != nil {
logger.WithError(err).Infof("%s failed", workerName)
diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
index 1dedb409a..50c4dae18 100644
--- a/services/keep-balance/balance.go
+++ b/services/keep-balance/balance.go
@@ -23,7 +23,9 @@ import (
"syscall"
"time"
+ "git.arvados.org/arvados.git/lib/controller/dblock"
"git.arvados.org/arvados.git/sdk/go/arvados"
+ "git.arvados.org/arvados.git/sdk/go/ctxlog"
"git.arvados.org/arvados.git/sdk/go/keepclient"
"github.com/jmoiron/sqlx"
"github.com/sirupsen/logrus"
@@ -70,13 +72,20 @@ type Balancer struct {
//
// Typical usage:
//
-// runOptions, err = (&Balancer{}).Run(config, runOptions)
-func (bal *Balancer) Run(client *arvados.Client, cluster *arvados.Cluster, runOptions RunOptions) (nextRunOptions RunOptions, err error) {
+// runOptions, err = (&Balancer{}).Run(config, runOptions)
+func (bal *Balancer) Run(ctx context.Context, client *arvados.Client, cluster *arvados.Cluster, runOptions RunOptions) (nextRunOptions RunOptions, err error) {
nextRunOptions = runOptions
+ ctxlog.FromContext(ctx).Info("acquiring active lock")
+ if !dblock.KeepBalanceActive.Lock(ctx, func(context.Context) (*sqlx.DB, error) { return bal.DB, nil }) {
+ // context canceled
+ return
+ }
+ defer dblock.KeepBalanceActive.Unlock()
+
defer bal.time("sweep", "wall clock time to run one full sweep")()
- ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(cluster.Collections.BalanceTimeout.Duration()))
+ ctx, cancel := context.WithDeadline(ctx, time.Now().Add(cluster.Collections.BalanceTimeout.Duration()))
defer cancel()
var lbFile *os.File
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index 2db7bea17..4772da55a 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -6,6 +6,7 @@ package keepbalance
import (
"bytes"
+ "context"
"encoding/json"
"fmt"
"io"
@@ -372,7 +373,7 @@ func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
trashReqs := s.stub.serveKeepstoreTrash()
pullReqs := s.stub.serveKeepstorePull()
srv := s.newServer(&opts)
- _, err = srv.runOnce()
+ _, err = srv.runOnce(context.Background())
c.Check(err, check.ErrorMatches, "received zero collections")
c.Check(trashReqs.Count(), check.Equals, 4)
c.Check(pullReqs.Count(), check.Equals, 0)
@@ -391,7 +392,7 @@ func (s *runSuite) TestRefuseNonAdmin(c *check.C) {
trashReqs := s.stub.serveKeepstoreTrash()
pullReqs := s.stub.serveKeepstorePull()
srv := s.newServer(&opts)
- _, err := srv.runOnce()
+ _, err := srv.runOnce(context.Background())
c.Check(err, check.ErrorMatches, "current user .* is not .* admin user")
c.Check(trashReqs.Count(), check.Equals, 0)
c.Check(pullReqs.Count(), check.Equals, 0)
@@ -417,7 +418,7 @@ func (s *runSuite) TestRefuseSameDeviceDifferentVolumes(c *check.C) {
trashReqs := s.stub.serveKeepstoreTrash()
pullReqs := s.stub.serveKeepstorePull()
srv := s.newServer(&opts)
- _, err := srv.runOnce()
+ _, err := srv.runOnce(context.Background())
c.Check(err, check.ErrorMatches, "cannot continue with config errors.*")
c.Check(trashReqs.Count(), check.Equals, 0)
c.Check(pullReqs.Count(), check.Equals, 0)
@@ -442,7 +443,7 @@ func (s *runSuite) TestWriteLostBlocks(c *check.C) {
s.stub.serveKeepstorePull()
srv := s.newServer(&opts)
c.Assert(err, check.IsNil)
- _, err = srv.runOnce()
+ _, err = srv.runOnce(context.Background())
c.Check(err, check.IsNil)
lost, err := ioutil.ReadFile(lostf.Name())
c.Assert(err, check.IsNil)
@@ -463,7 +464,7 @@ func (s *runSuite) TestDryRun(c *check.C) {
trashReqs := s.stub.serveKeepstoreTrash()
pullReqs := s.stub.serveKeepstorePull()
srv := s.newServer(&opts)
- bal, err := srv.runOnce()
+ bal, err := srv.runOnce(context.Background())
c.Check(err, check.IsNil)
for _, req := range collReqs.reqs {
c.Check(req.Form.Get("include_trash"), check.Equals, "true")
@@ -493,7 +494,7 @@ func (s *runSuite) TestCommit(c *check.C) {
trashReqs := s.stub.serveKeepstoreTrash()
pullReqs := s.stub.serveKeepstorePull()
srv := s.newServer(&opts)
- bal, err := srv.runOnce()
+ bal, err := srv.runOnce(context.Background())
c.Check(err, check.IsNil)
c.Check(trashReqs.Count(), check.Equals, 8)
c.Check(pullReqs.Count(), check.Equals, 4)
@@ -533,13 +534,14 @@ func (s *runSuite) TestRunForever(c *check.C) {
trashReqs := s.stub.serveKeepstoreTrash()
pullReqs := s.stub.serveKeepstorePull()
- stop := make(chan interface{})
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
s.config.Collections.BalancePeriod = arvados.Duration(time.Millisecond)
srv := s.newServer(&opts)
done := make(chan bool)
go func() {
- srv.runForever(stop)
+ srv.runForever(ctx)
close(done)
}()
@@ -550,7 +552,7 @@ func (s *runSuite) TestRunForever(c *check.C) {
for t0 := time.Now(); pullReqs.Count() < 16 && time.Since(t0) < 10*time.Second; {
time.Sleep(time.Millisecond)
}
- stop <- true
+ cancel()
<-done
c.Check(pullReqs.Count() >= 16, check.Equals, true)
c.Check(trashReqs.Count(), check.Equals, pullReqs.Count()+4)
diff --git a/services/keep-balance/integration_test.go b/services/keep-balance/integration_test.go
index 3cfb5cded..42463a002 100644
--- a/services/keep-balance/integration_test.go
+++ b/services/keep-balance/integration_test.go
@@ -6,6 +6,7 @@ package keepbalance
import (
"bytes"
+ "context"
"io"
"os"
"strings"
@@ -97,7 +98,7 @@ func (s *integrationSuite) TestBalanceAPIFixtures(c *check.C) {
Logger: logger,
Metrics: newMetrics(prometheus.NewRegistry()),
}
- nextOpts, err := bal.Run(s.client, s.config, opts)
+ nextOpts, err := bal.Run(context.Background(), s.client, s.config, opts)
c.Check(err, check.IsNil)
c.Check(nextOpts.SafeRendezvousState, check.Not(check.Equals), "")
c.Check(nextOpts.CommitPulls, check.Equals, true)
diff --git a/services/keep-balance/main.go b/services/keep-balance/main.go
index f0b0df5bd..b016db22f 100644
--- a/services/keep-balance/main.go
+++ b/services/keep-balance/main.go
@@ -112,7 +112,7 @@ func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, s
Routes: health.Routes{"ping": srv.CheckHealth},
}
- go srv.run()
+ go srv.run(ctx)
return srv
}).RunCommand(prog, args, stdin, stdout, stderr)
}
diff --git a/services/keep-balance/server.go b/services/keep-balance/server.go
index e485f5b20..fd53497f7 100644
--- a/services/keep-balance/server.go
+++ b/services/keep-balance/server.go
@@ -5,12 +5,14 @@
package keepbalance
import (
+ "context"
"net/http"
"os"
"os/signal"
"syscall"
"time"
+ "git.arvados.org/arvados.git/lib/controller/dblock"
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/jmoiron/sqlx"
"github.com/sirupsen/logrus"
@@ -62,12 +64,12 @@ func (srv *Server) Done() <-chan struct{} {
return nil
}
-func (srv *Server) run() {
+func (srv *Server) run(ctx context.Context) {
var err error
if srv.RunOptions.Once {
- _, err = srv.runOnce()
+ _, err = srv.runOnce(ctx)
} else {
- err = srv.runForever(nil)
+ err = srv.runForever(ctx)
}
if err != nil {
srv.Logger.Error(err)
@@ -77,7 +79,7 @@ func (srv *Server) run() {
}
}
-func (srv *Server) runOnce() (*Balancer, error) {
+func (srv *Server) runOnce(ctx context.Context) (*Balancer, error) {
bal := &Balancer{
DB: srv.DB,
Logger: srv.Logger,
@@ -86,13 +88,12 @@ func (srv *Server) runOnce() (*Balancer, error) {
LostBlocksFile: srv.Cluster.Collections.BlobMissingReport,
}
var err error
- srv.RunOptions, err = bal.Run(srv.ArvClient, srv.Cluster, srv.RunOptions)
+ srv.RunOptions, err = bal.Run(ctx, srv.ArvClient, srv.Cluster, srv.RunOptions)
return bal, err
}
-// RunForever runs forever, or (for testing purposes) until the given
-// stop channel is ready to receive.
-func (srv *Server) runForever(stop <-chan interface{}) error {
+// RunForever runs forever, or until ctx is cancelled.
+func (srv *Server) runForever(ctx context.Context) error {
logger := srv.Logger
ticker := time.NewTicker(time.Duration(srv.Cluster.Collections.BalancePeriod))
@@ -102,6 +103,10 @@ func (srv *Server) runForever(stop <-chan interface{}) error {
sigUSR1 := make(chan os.Signal)
signal.Notify(sigUSR1, syscall.SIGUSR1)
+ logger.Info("acquiring service lock")
+ dblock.KeepBalanceService.Lock(ctx, func(context.Context) (*sqlx.DB, error) { return srv.DB, nil })
+ defer dblock.KeepBalanceService.Unlock()
+
logger.Printf("starting up: will scan every %v and on SIGUSR1", srv.Cluster.Collections.BalancePeriod)
for {
@@ -110,7 +115,11 @@ func (srv *Server) runForever(stop <-chan interface{}) error {
logger.Print("======= Consider using -commit-pulls and -commit-trash flags.")
}
- _, err := srv.runOnce()
+ if !dblock.KeepBalanceService.Check() {
+ // context canceled
+ return nil
+ }
+ _, err := srv.runOnce(ctx)
if err != nil {
logger.Print("run failed: ", err)
} else {
@@ -118,7 +127,7 @@ func (srv *Server) runForever(stop <-chan interface{}) error {
}
select {
- case <-stop:
+ case <-ctx.Done():
signal.Stop(sigUSR1)
return nil
case <-ticker.C: