[arvados] updated: 2.7.1-21-g593afd3aef

Fri Feb 2 19:49:01 UTC 2024

Summary of changes:
 doc/_includes/_hpc_max_gateway_tunnels.liquid      |  18 +
 doc/admin/scoped-tokens.html.textile.liquid        |  17 +-
 doc/admin/upgrading.html.textile.liquid            |   8 +
 doc/api/keep-webdav.html.textile.liquid            |   6 +
 doc/api/methods/groups.html.textile.liquid         |   2 +-
 .../install-dispatch.html.textile.liquid           |   2 +
 .../install-dispatch.html.textile.liquid           |   2 +
 go.mod                                             |   2 +-
 go.sum                                             |   4 +-
 lib/config/config.default.yml                      |  11 +
 lib/config/export.go                               |   1 +
 lib/controller/rpc/conn.go                         |   4 +-
 lib/crunchrun/container_gateway.go                 |   2 +-
 lib/service/cmd.go                                 | 121 +++--
 lib/service/cmd_test.go                            | 113 ++++-
 sdk/go/arvados/config.go                           |   1 +
 sdk/go/arvados/fs_base.go                          |  93 +++-
 sdk/go/arvados/fs_collection.go                    |   2 +-
 sdk/go/arvados/fs_lookup.go                        |  14 +-
 sdk/go/arvados/fs_project.go                       |   5 +-
 sdk/go/arvados/fs_project_test.go                  | 111 +++--
 sdk/go/arvados/fs_site.go                          |   8 +
 sdk/go/arvados/fs_site_test.go                     |  10 +
 sdk/go/arvadostest/fixtures.go                     |   5 +-
 sdk/go/httpserver/request_limiter.go               | 167 +++----
 sdk/go/httpserver/request_limiter_test.go          |  49 ++-
 sdk/python/arvados/__init__.py                     |  19 +-
 sdk/python/arvados/commands/keepdocker.py          |  50 ++-
 sdk/python/arvados/events.py                       | 487 +++++++++++++++------
 sdk/python/setup.py                                |  10 +-
 .../data/hello-world-ManifestV2-OCILayout.tar      | Bin 24064 -> 25600 bytes
 .../python/tests/data/hello-world-ManifestV2.tar   | Bin 24064 -> 23040 bytes
 sdk/python/tests/data/hello-world-README.txt       |  25 ++
 sdk/python/tests/test_arv_keepdocker.py            |  36 +-
 sdk/python/tests/test_events.py                    | 214 ++++++---
 services/api/app/models/collection.rb              |  12 +-
 services/api/app/models/user.rb                    |  93 +++-
 .../admin_notifier/new_inactive_user.text.erb      |   9 +-
 .../api/app/views/admin_notifier/new_user.text.erb |  16 +-
 .../views/user_notifier/account_is_setup.text.erb  |   2 +-
 services/api/test/integration/remote_user_test.rb  |  25 +-
 services/api/test/unit/user_test.rb                |  10 +-
 services/keep-balance/balance_run_test.go          |  34 ++
 services/keep-balance/metrics.go                   |  89 +++-
 services/keep-web/handler_test.go                  |  45 +-
 services/keep-web/server_test.go                   |   2 +-
 services/ws/session_v0_test.go                     |  35 ++
 tools/arvbox/bin/arvbox                            |   8 +
 ...nsure-encrypted-partitions-aws-ebs-autoscale.sh |  16 +-
 .../usr-local-bin-ensure-encrypted-partitions.sh   |  16 +-
 tools/user-activity/arvados_user_activity/main.py  |   4 +-
 51 files changed, 1510 insertions(+), 525 deletions(-)
 create mode 100644 doc/_includes/_hpc_max_gateway_tunnels.liquid
 copy lib/diagnostics/hello-world.tar => sdk/python/tests/data/hello-world-ManifestV2-OCILayout.tar (70%)
 copy lib/diagnostics/hello-world.tar => sdk/python/tests/data/hello-world-ManifestV2.tar (76%)
 create mode 100644 sdk/python/tests/data/hello-world-README.txt

       via  593afd3aefeeb77f98d716167927bb9c77d97fcd (commit)
       via  702cab80d779cd9e203ff6bf5ad0c484ef8473dc (commit)
       via  7ed9d9c491dc488ffddf6b469f0688ba44f76462 (commit)
       via  e4d27aaccd7eb3051e72be09fbadeb218191c149 (commit)
       via  03a110d5d8eb3d768de0fba396cbf4b3ab437568 (commit)
       via  36d731ce06f586448af97742e163cfdaab136a8e (commit)
       via  cb04fbce3b80919c7eed21c1341a0c27028c8b1a (commit)
       via  177608b6533c97c4d48f1512fd5b98b6612f49af (commit)
       via  3d58984b6e02230ab896e2b651b46a2d7ee47b1f (commit)
       via  e64b5d6334a3ea01abbbacc5228c3f07482ba0d9 (commit)
       via  46148bb0c54244c5e4d0b2566a5fb5c679398720 (commit)
       via  14bd55aeb477b03463fb0023e2dc6a009973edff (commit)
       via  cd1c579bcbc3569698bf93e5f2f23b5a5859eebc (commit)
       via  989e46c76c9c8949bab0b63f2a51460ddc7f1fc9 (commit)
       via  2da34092382837af4185926b96a1bc692cd731f4 (commit)
      from  bf70e3d7cd39fe1d85096d19122e7693159989db (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 593afd3aefeeb77f98d716167927bb9c77d97fcd
Author: Brett Smith <brett.smith at curii.com>
Date:   Fri Feb 2 10:30:10 2024 -0500

    Merge branch '21417-keepdocker-oci-layout'
    
    Closes #21417.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/sdk/python/arvados/commands/keepdocker.py b/sdk/python/arvados/commands/keepdocker.py
index 7b7367080f..6823ee1bea 100644
--- a/sdk/python/arvados/commands/keepdocker.py
+++ b/sdk/python/arvados/commands/keepdocker.py
@@ -2,34 +2,29 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from builtins import next
 import argparse
 import collections
 import datetime
 import errno
+import fcntl
 import json
+import logging
 import os
 import re
+import subprocess
 import sys
 import tarfile
 import tempfile
-import shutil
-import _strptime
-import fcntl
+
+import ciso8601
 from operator import itemgetter
 from stat import *
 
-import subprocess
-
 import arvados
+import arvados.config
 import arvados.util
 import arvados.commands._util as arv_cmd
 import arvados.commands.put as arv_put
-from arvados.collection import CollectionReader
-import ciso8601
-import logging
-import arvados.config
-
 from arvados._version import __version__
 
 logger = logging.getLogger('arvados.keepdocker')
@@ -356,6 +351,25 @@ def _uuid2pdh(api, uuid):
         select=['portable_data_hash'],
     ).execute()['items'][0]['portable_data_hash']
 
+def load_image_metadata(image_file):
+    """Load an image manifest and config from an archive
+
+    Given an image archive as an open binary file object, this function loads
+    the image manifest and configuration, deserializing each from JSON and
+    returning them in a 2-tuple of dicts.
+    """
+    image_file.seek(0)
+    with tarfile.open(fileobj=image_file) as image_tar:
+        with image_tar.extractfile('manifest.json') as manifest_file:
+            image_manifest_list = json.load(manifest_file)
+        # Because arv-keepdocker only saves one image, there should only be
+        # one manifest.  This extracts that from the list and raises
+        # ValueError if there's not exactly one.
+        image_manifest, = image_manifest_list
+        with image_tar.extractfile(image_manifest['Config']) as config_file:
+            image_config = json.load(config_file)
+    return image_manifest, image_config
+
 def main(arguments=None, stdout=sys.stdout, install_sig_handlers=True, api=None):
     args = arg_parser.parse_args(arguments)
     if api is None:
@@ -532,21 +546,9 @@ def main(arguments=None, stdout=sys.stdout, install_sig_handlers=True, api=None)
         # Managed properties could be already set
         coll_properties = api.collections().get(uuid=coll_uuid).execute(num_retries=args.retries).get('properties', {})
         coll_properties.update({"docker-image-repo-tag": image_repo_tag})
-
         api.collections().update(uuid=coll_uuid, body={"properties": coll_properties}).execute(num_retries=args.retries)
 
-        # Read the image metadata and make Arvados links from it.
-        image_file.seek(0)
-        image_tar = tarfile.open(fileobj=image_file)
-        image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
-        if image_hash_type:
-            json_filename = raw_image_hash + '.json'
-        else:
-            json_filename = raw_image_hash + '/json'
-        json_file = image_tar.extractfile(image_tar.getmember(json_filename))
-        image_metadata = json.loads(json_file.read().decode('utf-8'))
-        json_file.close()
-        image_tar.close()
+        _, image_metadata = load_image_metadata(image_file)
         link_base = {'head_uuid': coll_uuid, 'properties': {}}
         if 'created' in image_metadata:
             link_base['properties']['image_timestamp'] = image_metadata['created']
diff --git a/sdk/python/tests/data/hello-world-ManifestV2-OCILayout.tar b/sdk/python/tests/data/hello-world-ManifestV2-OCILayout.tar
new file mode 100644
index 0000000000..a4b3d86390
Binary files /dev/null and b/sdk/python/tests/data/hello-world-ManifestV2-OCILayout.tar differ
diff --git a/sdk/python/tests/data/hello-world-ManifestV2.tar b/sdk/python/tests/data/hello-world-ManifestV2.tar
new file mode 100644
index 0000000000..b98e7c7acd
Binary files /dev/null and b/sdk/python/tests/data/hello-world-ManifestV2.tar differ
diff --git a/sdk/python/tests/data/hello-world-README.txt b/sdk/python/tests/data/hello-world-README.txt
new file mode 100644
index 0000000000..8c6a7de31e
--- /dev/null
+++ b/sdk/python/tests/data/hello-world-README.txt
@@ -0,0 +1,25 @@
+The hello-world-*.tar files are archived from the official Docker
+hello-world:latest image available on 2024-02-01,
+sha256:d2c94e258dcb3c5ac2798d32e1249e42ef01cba4841c2234249495f87264ac5a.
+<https://github.com/docker-library/hello-world/tree/a2269bdb107d086851a5e3d448cf47770b50bff7>
+
+Copyright (c) 2014 Docker, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/sdk/python/tests/test_arv_keepdocker.py b/sdk/python/tests/test_arv_keepdocker.py
index 526fd68727..9aebc03504 100644
--- a/sdk/python/tests/test_arv_keepdocker.py
+++ b/sdk/python/tests/test_arv_keepdocker.py
@@ -2,23 +2,24 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from __future__ import absolute_import
 import arvados
 import collections
+import collections.abc
 import copy
 import hashlib
+import logging
 import mock
 import os
 import subprocess
 import sys
 import tempfile
 import unittest
-import logging
+from pathlib import Path
+
+import parameterized
 
 import arvados.commands.keepdocker as arv_keepdocker
 from . import arvados_testutil as tutil
-from . import run_test_server
-
 
 class StopTest(Exception):
     pass
@@ -226,3 +227,30 @@ class ArvKeepdockerTestCase(unittest.TestCase, tutil.VersionChecker):
         api().collections().update.assert_called_with(
             uuid=mocked_collection['uuid'],
             body={'properties': updated_properties})
+
+
+ at parameterized.parameterized_class(('filename',), [
+    ('hello-world-ManifestV2.tar',),
+    ('hello-world-ManifestV2-OCILayout.tar',),
+])
+class ImageMetadataTestCase(unittest.TestCase):
+    DATA_PATH = Path(__file__).parent / 'data'
+
+    @classmethod
+    def setUpClass(cls):
+        cls.image_file = (cls.DATA_PATH / cls.filename).open('rb')
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.image_file.close()
+
+    def setUp(self):
+        self.manifest, self.config = arv_keepdocker.load_image_metadata(self.image_file)
+
+    def test_image_manifest(self):
+        self.assertIsInstance(self.manifest, collections.abc.Mapping)
+        self.assertEqual(self.manifest.get('RepoTags'), ['hello-world:latest'])
+
+    def test_image_config(self):
+        self.assertIsInstance(self.config, collections.abc.Mapping)
+        self.assertEqual(self.config.get('created'), '2023-05-02T16:49:27Z')

commit 702cab80d779cd9e203ff6bf5ad0c484ef8473dc
Author: Brett Smith <brett.smith at curii.com>
Date:   Fri Feb 2 02:38:14 2024 -0500

    Merge branch '21429-remember-docker-state'
    
    Closes #21429.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh b/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh
index abc63a2e92..6f0970b17f 100644
--- a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh
+++ b/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh
@@ -22,9 +22,16 @@ ensure_umount() {
 # First make sure docker is not using /tmp, then unmount everything under it.
 if [ -d /etc/sv/docker.io ]
 then
+  # TODO: Actually detect Docker state with runit
+  DOCKER_ACTIVE=true
   sv stop docker.io || service stop docker.io || true
 else
-  systemctl disable --now docker.service docker.socket || true
+  if systemctl --quiet is-active docker.service docker.socket; then
+    systemctl stop docker.service docker.socket || true
+    DOCKER_ACTIVE=true
+  else
+    DOCKER_ACTIVE=false
+  fi
 fi
 
 ensure_umount "$MOUNTPATH/docker/aufs"
@@ -38,13 +45,18 @@ cat <<EOF > /etc/docker/daemon.json
 }
 EOF
 
+if ! $DOCKER_ACTIVE; then
+  # Nothing else to do
+  exit 0
+fi
+
 # restart docker
 if [ -d /etc/sv/docker.io ]
 then
   ## runit
   sv up docker.io
 else
-  systemctl enable --now docker.service docker.socket
+  systemctl start docker.service docker.socket || true
 fi
 
 end=$((SECONDS+60))
diff --git a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh b/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh
index a76dc12109..726ff0cdcd 100644
--- a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh
+++ b/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh
@@ -119,9 +119,16 @@ mkfs.xfs -f "$CRYPTPATH"
 # First make sure docker is not using /tmp, then unmount everything under it.
 if [ -d /etc/sv/docker.io ]
 then
+  # TODO: Actually detect Docker state with runit
+  DOCKER_ACTIVE=true
   sv stop docker.io || service stop docker.io || true
 else
-  systemctl disable --now docker.service docker.socket || true
+  if systemctl --quiet is-active docker.service docker.socket; then
+    systemctl stop docker.service docker.socket || true
+    DOCKER_ACTIVE=true
+  else
+    DOCKER_ACTIVE=false
+  fi
 fi
 
 ensure_umount "$MOUNTPATH/docker/aufs"
@@ -137,13 +144,18 @@ cat <<EOF > /etc/docker/daemon.json
 }
 EOF
 
+if ! $DOCKER_ACTIVE; then
+  # Nothing else to do
+  exit 0
+fi
+
 # restart docker
 if [ -d /etc/sv/docker.io ]
 then
   ## runit
   sv up docker.io
 else
-  systemctl enable --now docker.service docker.socket || true
+  systemctl start docker.service docker.socket || true
 fi
 
 end=$((SECONDS+60))

commit 7ed9d9c491dc488ffddf6b469f0688ba44f76462
Author: Tom Clegg <tom at curii.com>
Date:   Tue Jan 16 14:05:50 2024 -0500

    Merge branch '21379-user-activity-remote-collection'
    
    fixes #21379
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/tools/user-activity/arvados_user_activity/main.py b/tools/user-activity/arvados_user_activity/main.py
index ded96c3121..66d03b2041 100755
--- a/tools/user-activity/arvados_user_activity/main.py
+++ b/tools/user-activity/arvados_user_activity/main.py
@@ -96,6 +96,7 @@ collectionNameCache = {}
 def getCollectionName(arv, uuid, pdh):
     lookupField = uuid
     filters = [["uuid", "=", uuid]]
+    order = None
     cached = uuid in collectionNameCache
     # look up by uuid if it is available, fall back to look up by pdh
     if uuid is None or len(uuid) != 27:
@@ -105,10 +106,11 @@ def getCollectionName(arv, uuid, pdh):
         # name, if the uuid for the request is not known.
         lookupField = pdh
         filters = [["portable_data_hash", "=", pdh]]
+        order = "created_at"
         cached = pdh in collectionNameCache
 
     if not cached:
-        u = arv.collections().list(filters=filters, order="created_at", limit=1).execute().get("items")
+        u = arv.collections().list(filters=filters, order=order, limit=1, count="none").execute().get("items")
         if len(u) < 1:
             return "(deleted)"
         collectionNameCache[lookupField] = u[0]["name"]

commit e4d27aaccd7eb3051e72be09fbadeb218191c149
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Jan 9 16:14:14 2024 -0500

    Add arvbox check for fs.inotify.max_user_watches, refs #21349
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/tools/arvbox/bin/arvbox b/tools/arvbox/bin/arvbox
index 358e61ca57..9effbe0fb2 100755
--- a/tools/arvbox/bin/arvbox
+++ b/tools/arvbox/bin/arvbox
@@ -431,6 +431,14 @@ check() {
             exit 1
         ;;
     esac
+
+    user_watches=$(/usr/sbin/sysctl fs.inotify.max_user_watches)
+    [[ $user_watches =~ fs.inotify.max_user_watches\ =\ ([0-9]+) ]] && value=${BASH_REMATCH[1]}
+    if [[ "$value" -lt 256000 ]] ; then
+	echo "Not enough file system listeners ($value), to fix this run:"
+	echo "sudo sh -c 'echo fs.inotify.max_user_watches=524288 >> /etc/sysctl.d/local.conf && sysctl --system'"
+	exit 1
+    fi
 }
 
 subcmd="$1"

commit 03a110d5d8eb3d768de0fba396cbf4b3ab437568
Author: Tom Clegg <tom at curii.com>
Date:   Tue Jan 9 11:04:09 2024 -0500

    Merge branch '21036-keep-balance-metrics'
    
    fixes #21036
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
index b7b3fb6123..3b1a10f7aa 100644
--- a/services/keep-balance/balance_run_test.go
+++ b/services/keep-balance/balance_run_test.go
@@ -555,6 +555,10 @@ func (s *runSuite) TestDryRun(c *check.C) {
 	c.Check(bal.stats.trashesDeferred, check.Not(check.Equals), 0)
 	c.Check(bal.stats.underrep.replicas, check.Not(check.Equals), 0)
 	c.Check(bal.stats.overrep.replicas, check.Not(check.Equals), 0)
+
+	metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_trash_entries_deferred_count [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_pull_entries_deferred_count [1-9].*`)
 }
 
 func (s *runSuite) TestCommit(c *check.C) {
@@ -592,6 +596,36 @@ func (s *runSuite) TestCommit(c *check.C) {
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
 	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
+
+	for _, cat := range []string{
+		"dedup_byte_ratio", "dedup_block_ratio", "collection_bytes",
+		"referenced_bytes", "referenced_blocks", "reference_count",
+		"pull_entries_sent_count",
+		"trash_entries_sent_count",
+	} {
+		c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_`+cat+` [1-9].*`)
+	}
+
+	for _, cat := range []string{
+		"pull_entries_deferred_count",
+		"trash_entries_deferred_count",
+	} {
+		c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_`+cat+` 0\n.*`)
+	}
+
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="0"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="1"} [1-9].*`)
+	c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="9"} 0\n.*`)
+
+	for _, sub := range []string{"replicas", "blocks", "bytes"} {
+		for _, cat := range []string{"needed", "unneeded", "unachievable", "pulling"} {
+			c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_`+sub+`{status="`+cat+`",storage_class="default"} [1-9].*`)
+		}
+		for _, cat := range []string{"total", "garbage", "transient", "overreplicated", "underreplicated", "unachievable", "balanced", "desired", "lost"} {
+			c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_`+cat+`_`+sub+` [0-9].*`)
+		}
+	}
+	c.Logf("%s", metrics)
 }
 
 func (s *runSuite) TestChunkPrefix(c *check.C) {
diff --git a/services/keep-balance/metrics.go b/services/keep-balance/metrics.go
index 4683b67b98..02cee3955f 100644
--- a/services/keep-balance/metrics.go
+++ b/services/keep-balance/metrics.go
@@ -7,6 +7,7 @@ package keepbalance
 import (
 	"fmt"
 	"net/http"
+	"strconv"
 	"sync"
 
 	"github.com/prometheus/client_golang/prometheus"
@@ -17,18 +18,20 @@ type observer interface{ Observe(float64) }
 type setter interface{ Set(float64) }
 
 type metrics struct {
-	reg         *prometheus.Registry
-	statsGauges map[string]setter
-	observers   map[string]observer
-	setupOnce   sync.Once
-	mtx         sync.Mutex
+	reg            *prometheus.Registry
+	statsGauges    map[string]setter
+	statsGaugeVecs map[string]*prometheus.GaugeVec
+	observers      map[string]observer
+	setupOnce      sync.Once
+	mtx            sync.Mutex
 }
 
 func newMetrics(registry *prometheus.Registry) *metrics {
 	return &metrics{
-		reg:         registry,
-		statsGauges: map[string]setter{},
-		observers:   map[string]observer{},
+		reg:            registry,
+		statsGauges:    map[string]setter{},
+		statsGaugeVecs: map[string]*prometheus.GaugeVec{},
+		observers:      map[string]observer{},
 	}
 }
 
@@ -63,9 +66,24 @@ func (m *metrics) UpdateStats(s balancerStats) {
 		"transient":         {s.unref, "transient (unreferenced, new)"},
 		"overreplicated":    {s.overrep, "overreplicated"},
 		"underreplicated":   {s.underrep, "underreplicated"},
+		"unachievable":      {s.unachievable, "unachievable"},
+		"balanced":          {s.justright, "optimally balanced"},
+		"desired":           {s.desired, "desired"},
 		"lost":              {s.lost, "lost"},
 		"dedup_byte_ratio":  {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"},
 		"dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"},
+		"collection_bytes":  {s.collectionBytes, "total apparent size of all collections"},
+		"referenced_bytes":  {s.collectionBlockBytes, "total size of unique referenced blocks"},
+		"reference_count":   {s.collectionBlockRefs, "block references in all collections"},
+		"referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"},
+
+		"pull_entries_sent_count":      {s.pulls, "total entries sent in pull lists"},
+		"pull_entries_deferred_count":  {s.pullsDeferred, "total entries deferred (not sent) in pull lists"},
+		"trash_entries_sent_count":     {s.trashes, "total entries sent in trash lists"},
+		"trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"},
+
+		"replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"},
+		"usage":                  {s.classStats, "stored in indicated storage class"},
 	}
 	m.setupOnce.Do(func() {
 		// Register gauge(s) for each balancerStats field.
@@ -87,6 +105,29 @@ func (m *metrics) UpdateStats(s balancerStats) {
 				}
 			case int, int64, float64:
 				addGauge(name, gauge.Help)
+			case []int:
+				// replHistogram
+				gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+					Namespace: "arvados",
+					Name:      name,
+					Subsystem: "keep",
+					Help:      gauge.Help,
+				}, []string{"replicas"})
+				m.reg.MustRegister(gv)
+				m.statsGaugeVecs[name] = gv
+			case map[string]replicationStats:
+				// classStats
+				for _, sub := range []string{"blocks", "bytes", "replicas"} {
+					name := name + "_" + sub
+					gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+						Namespace: "arvados",
+						Name:      name,
+						Subsystem: "keep",
+						Help:      gauge.Help,
+					}, []string{"storage_class", "status"})
+					m.reg.MustRegister(gv)
+					m.statsGaugeVecs[name] = gv
+				}
 			default:
 				panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
 			}
@@ -105,6 +146,38 @@ func (m *metrics) UpdateStats(s balancerStats) {
 			m.statsGauges[name].Set(float64(val))
 		case float64:
 			m.statsGauges[name].Set(float64(val))
+		case []int:
+			// replHistogram
+			for r, n := range val {
+				m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n))
+			}
+			// Record zero for higher-than-max-replication
+			// metrics, so we don't incorrectly continue
+			// to report stale metrics.
+			//
+			// For example, if we previously reported n=1
+			// for repl=6, but have since restarted
+			// keep-balance and the most replicated block
+			// now has repl=5, then the repl=6 gauge will
+			// still say n=1 until we clear it explicitly
+			// here.
+			for r := len(val); r < len(val)+4 || r < len(val)*2; r++ {
+				m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0)
+			}
+		case map[string]replicationStats:
+			// classStats
+			for class, cs := range val {
+				for label, val := range map[string]blocksNBytes{
+					"needed":       cs.needed,
+					"unneeded":     cs.unneeded,
+					"pulling":      cs.pulling,
+					"unachievable": cs.unachievable,
+				} {
+					m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks))
+					m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes))
+					m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas))
+				}
+			}
 		default:
 			panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
 		}

commit 36d731ce06f586448af97742e163cfdaab136a8e
Author: Tom Clegg <tom at curii.com>
Date:   Mon Jan 8 16:39:38 2024 -0500

    Merge branch '21285-max-gw-tunnels'
    
    refs #21285
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/doc/_includes/_hpc_max_gateway_tunnels.liquid b/doc/_includes/_hpc_max_gateway_tunnels.liquid
new file mode 100644
index 0000000000..ba8769c653
--- /dev/null
+++ b/doc/_includes/_hpc_max_gateway_tunnels.liquid
@@ -0,0 +1,18 @@
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+h3(#MaxGatewayTunnels). API.MaxGatewayTunnels
+
+Each Arvados container that runs on your HPC cluster will bring up a long-lived connection to the Arvados controller and keep it open for the entire duration of the container. This connection is used to access real-time container logs from Workbench, and to enable the "container shell":{{site.baseurl}}/install/container-shell-access.html feature.
+
+Set the @MaxGatewayTunnels@ config entry high enough to accommodate the maximum number of containers you expect to run concurrently on your HPC cluster, plus incoming container shell sessions.
+
+<notextile>
+<pre>    API:
+      MaxGatewayTunnels: 2000</pre>
+</notextile>
+
+Also, configure Nginx (and any other HTTP proxies or load balancers running between the HPC and Arvados controller) to allow the expected number of connections, i.e., @MaxConcurrentRequests + MaxQueuedRequests + MaxGatewayTunnels at .
diff --git a/doc/admin/upgrading.html.textile.liquid b/doc/admin/upgrading.html.textile.liquid
index 63c124b7c9..fe15518253 100644
--- a/doc/admin/upgrading.html.textile.liquid
+++ b/doc/admin/upgrading.html.textile.liquid
@@ -28,6 +28,14 @@ TODO: extract this information based on git commit messages and generate changel
 <div class="releasenotes">
 </notextile>
 
+h2(#2_7_2). v2.7.2 (2024-02-??)
+
+"previous: Upgrading to 2.7.1":#v2_7_1
+
+h3. Check MaxGatewayTunnels config
+
+If you use the LSF or Slurm dispatcher, ensure the new @API.MaxGatewayTunnels@ config entry is high enough to support the size of your cluster. See "LSF docs":{{site.baseurl}}/install/crunch2-lsf/install-dispatch.html#MaxGatewayTunnels or "Slurm docs":{{site.baseurl}}/install/crunch2-slurm/install-dispatch.html#MaxGatewayTunnels for details.
+
 h2(#2_7_1). v2.7.1 (2023-12-12)
 
 "previous: Upgrading to 2.7.0":#v2_7_0
diff --git a/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid b/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid
index d4328d89a3..fc4393d0b6 100644
--- a/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid
+++ b/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid
@@ -40,6 +40,8 @@ Add a DispatchLSF entry to the Services section, using the hostname where @arvad
 
 Review the following configuration parameters and adjust as needed.
 
+{% include 'hpc_max_gateway_tunnels' %}
+
 h3(#BsubSudoUser). Containers.LSF.BsubSudoUser
 
 arvados-dispatch-lsf uses @sudo@ to execute @bsub@, for example @sudo -E -u crunch bsub [...]@. This means the @crunch@ account must exist on the hosts where LSF jobs run ("execution hosts"), as well as on the host where you are installing the Arvados LSF dispatcher (the "submission host"). To use a user account other than @crunch@, configure @BsubSudoUser@:
diff --git a/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid b/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid
index 554f53dd38..16af80d127 100644
--- a/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid
+++ b/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid
@@ -41,6 +41,8 @@ Add a DispatchSLURM entry to the Services section, using the hostname where @cru
 
 The following configuration parameters are optional.
 
+{% include 'hpc_max_gateway_tunnels' %}
+
 h3(#PollPeriod). Containers.PollInterval
 
 crunch-dispatch-slurm polls the API server periodically for new containers to run.  The @PollInterval@ option controls how often this poll happens.  Set this to a string of numbers suffixed with one of the time units @ns@, @us@, @ms@, @s@, @m@, or @h at .  For example:
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 844e67bfcc..9004b3f64f 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -231,6 +231,10 @@ Clusters:
       # also effectively limited by MaxConcurrentRailsRequests (see
       # below) because most controller requests proxy through to the
       # RailsAPI service.
+      #
+      # HTTP proxies and load balancers downstream of arvados services
+      # should be configured to allow at least {MaxConcurrentRequest +
+      # MaxQueuedRequests + MaxGatewayTunnels} concurrent requests.
       MaxConcurrentRequests: 64
 
       # Maximum number of concurrent requests to process concurrently
@@ -250,6 +254,13 @@ Clusters:
       # the incoming request queue before returning 503.
       MaxQueueTimeForLockRequests: 2s
 
+      # Maximum number of active gateway tunnel connections. One slot
+      # is consumed by each "container shell" connection. If using an
+      # HPC dispatcher (LSF or Slurm), one slot is consumed by each
+      # running container.  These do not count toward
+      # MaxConcurrentRequests.
+      MaxGatewayTunnels: 1000
+
       # Fraction of MaxConcurrentRequests that can be "log create"
       # messages at any given time.  This is to prevent logging
       # updates from crowding out more important requests.
diff --git a/lib/config/export.go b/lib/config/export.go
index 674b37473e..ce947893fe 100644
--- a/lib/config/export.go
+++ b/lib/config/export.go
@@ -70,6 +70,7 @@ var whitelist = map[string]bool{
 	"API.LogCreateRequestFraction":             false,
 	"API.MaxConcurrentRailsRequests":           false,
 	"API.MaxConcurrentRequests":                false,
+	"API.MaxGatewayTunnels":                    false,
 	"API.MaxIndexDatabaseRead":                 false,
 	"API.MaxItemsPerResponse":                  true,
 	"API.MaxKeepBlobBuffers":                   false,
diff --git a/lib/controller/rpc/conn.go b/lib/controller/rpc/conn.go
index a8ecc57bba..9f518d9c7a 100644
--- a/lib/controller/rpc/conn.go
+++ b/lib/controller/rpc/conn.go
@@ -482,11 +482,11 @@ func (conn *Conn) socket(ctx context.Context, u *url.URL, upgradeHeader string,
 		} else {
 			message = fmt.Sprintf("%q", body)
 		}
-		return connresp, fmt.Errorf("server did not provide a tunnel: %s: %s", resp.Status, message)
+		return connresp, httpserver.ErrorWithStatus(fmt.Errorf("server did not provide a tunnel: %s: %s", resp.Status, message), resp.StatusCode)
 	}
 	if strings.ToLower(resp.Header.Get("Upgrade")) != upgradeHeader ||
 		strings.ToLower(resp.Header.Get("Connection")) != "upgrade" {
-		return connresp, fmt.Errorf("bad response from server: Upgrade %q Connection %q", resp.Header.Get("Upgrade"), resp.Header.Get("Connection"))
+		return connresp, httpserver.ErrorWithStatus(fmt.Errorf("bad response from server: Upgrade %q Connection %q", resp.Header.Get("Upgrade"), resp.Header.Get("Connection")), http.StatusBadGateway)
 	}
 	connresp.Conn = netconn
 	connresp.Bufrw = &bufio.ReadWriter{Reader: bufr, Writer: bufw}
diff --git a/lib/crunchrun/container_gateway.go b/lib/crunchrun/container_gateway.go
index 30f8957a2d..5b68e2c50e 100644
--- a/lib/crunchrun/container_gateway.go
+++ b/lib/crunchrun/container_gateway.go
@@ -220,7 +220,7 @@ func (gw *Gateway) runTunnel(addr string) error {
 		AuthSecret: gw.AuthSecret,
 	})
 	if err != nil {
-		return fmt.Errorf("error creating gateway tunnel: %s", err)
+		return fmt.Errorf("error creating gateway tunnel: %w", err)
 	}
 	mux, err := yamux.Client(tun.Conn, nil)
 	if err != nil {
diff --git a/lib/service/cmd.go b/lib/service/cmd.go
index 725f86f3bd..82e95fe0b4 100644
--- a/lib/service/cmd.go
+++ b/lib/service/cmd.go
@@ -16,6 +16,7 @@ import (
 	_ "net/http/pprof"
 	"net/url"
 	"os"
+	"regexp"
 	"strings"
 	"time"
 
@@ -148,32 +149,13 @@ func (c *command) RunCommand(prog string, args []string, stdin io.Reader, stdout
 		return 1
 	}
 
-	maxReqs := cluster.API.MaxConcurrentRequests
-	if maxRails := cluster.API.MaxConcurrentRailsRequests; maxRails > 0 &&
-		(maxRails < maxReqs || maxReqs == 0) &&
-		strings.HasSuffix(prog, "controller") {
-		// Ideally, we would accept up to
-		// MaxConcurrentRequests, and apply the
-		// MaxConcurrentRailsRequests limit only for requests
-		// that require calling upstream to RailsAPI. But for
-		// now we make the simplifying assumption that every
-		// controller request causes an upstream RailsAPI
-		// request.
-		maxReqs = maxRails
-	}
 	instrumented := httpserver.Instrument(reg, log,
 		httpserver.HandlerWithDeadline(cluster.API.RequestTimeout.Duration(),
 			httpserver.AddRequestIDs(
 				httpserver.Inspect(reg, cluster.ManagementToken,
 					httpserver.LogRequests(
 						interceptHealthReqs(cluster.ManagementToken, handler.CheckHealth,
-							&httpserver.RequestLimiter{
-								Handler:                    handler,
-								MaxConcurrent:              maxReqs,
-								MaxQueue:                   cluster.API.MaxQueuedRequests,
-								MaxQueueTimeForMinPriority: cluster.API.MaxQueueTimeForLockRequests.Duration(),
-								Priority:                   c.requestPriority,
-								Registry:                   reg}))))))
+							c.requestLimiter(handler, cluster, reg)))))))
 	srv := &httpserver.Server{
 		Server: http.Server{
 			Handler:     ifCollectionInHost(instrumented, instrumented.ServeAPI(cluster.ManagementToken, instrumented)),
@@ -212,7 +194,7 @@ func (c *command) RunCommand(prog string, args []string, stdin io.Reader, stdout
 		<-handler.Done()
 		srv.Close()
 	}()
-	go c.requestQueueDumpCheck(cluster, maxReqs, prog, reg, &srv.Server, logger)
+	go c.requestQueueDumpCheck(cluster, prog, reg, &srv.Server, logger)
 	err = srv.Wait()
 	if err != nil {
 		return 1
@@ -221,12 +203,13 @@ func (c *command) RunCommand(prog string, args []string, stdin io.Reader, stdout
 }
 
 // If SystemLogs.RequestQueueDumpDirectory is set, monitor the
-// server's incoming HTTP request queue size. When it exceeds 90% of
-// API.MaxConcurrentRequests, write the /_inspect/requests data to a
-// JSON file in the specified directory.
-func (c *command) requestQueueDumpCheck(cluster *arvados.Cluster, maxReqs int, prog string, reg *prometheus.Registry, srv *http.Server, logger logrus.FieldLogger) {
+// server's incoming HTTP request limiters. When the number of
+// concurrent requests in any queue ("api" or "tunnel") exceeds 90% of
+// its maximum slots, write the /_inspect/requests data to a JSON file
+// in the specified directory.
+func (c *command) requestQueueDumpCheck(cluster *arvados.Cluster, prog string, reg *prometheus.Registry, srv *http.Server, logger logrus.FieldLogger) {
 	outdir := cluster.SystemLogs.RequestQueueDumpDirectory
-	if outdir == "" || cluster.ManagementToken == "" || maxReqs < 1 {
+	if outdir == "" || cluster.ManagementToken == "" {
 		return
 	}
 	logger = logger.WithField("worker", "RequestQueueDump")
@@ -237,16 +220,29 @@ func (c *command) requestQueueDumpCheck(cluster *arvados.Cluster, maxReqs int, p
 			logger.WithError(err).Warn("error getting metrics")
 			continue
 		}
-		dump := false
+		cur := map[string]int{} // queue label => current
+		max := map[string]int{} // queue label => max
 		for _, mf := range mfs {
-			if mf.Name != nil && *mf.Name == "arvados_concurrent_requests" && len(mf.Metric) == 1 {
-				n := int(mf.Metric[0].GetGauge().GetValue())
-				if n > 0 && n >= maxReqs*9/10 {
-					dump = true
-					break
+			for _, m := range mf.GetMetric() {
+				for _, ml := range m.GetLabel() {
+					if ml.GetName() == "queue" {
+						n := int(m.GetGauge().GetValue())
+						if name := mf.GetName(); name == "arvados_concurrent_requests" {
+							cur[*ml.Value] = n
+						} else if name == "arvados_max_concurrent_requests" {
+							max[*ml.Value] = n
+						}
+					}
 				}
 			}
 		}
+		dump := false
+		for queue, n := range cur {
+			if n > 0 && max[queue] > 0 && n >= max[queue]*9/10 {
+				dump = true
+				break
+			}
+		}
 		if dump {
 			req, err := http.NewRequest("GET", "/_inspect/requests", nil)
 			if err != nil {
@@ -269,6 +265,67 @@ func (c *command) requestQueueDumpCheck(cluster *arvados.Cluster, maxReqs int, p
 	}
 }
 
+// Set up a httpserver.RequestLimiter with separate queues/streams for
+// API requests (obeying MaxConcurrentRequests etc) and gateway tunnel
+// requests (obeying MaxGatewayTunnels).
+func (c *command) requestLimiter(handler http.Handler, cluster *arvados.Cluster, reg *prometheus.Registry) http.Handler {
+	maxReqs := cluster.API.MaxConcurrentRequests
+	if maxRails := cluster.API.MaxConcurrentRailsRequests; maxRails > 0 &&
+		(maxRails < maxReqs || maxReqs == 0) &&
+		c.svcName == arvados.ServiceNameController {
+		// Ideally, we would accept up to
+		// MaxConcurrentRequests, and apply the
+		// MaxConcurrentRailsRequests limit only for requests
+		// that require calling upstream to RailsAPI. But for
+		// now we make the simplifying assumption that every
+		// controller request causes an upstream RailsAPI
+		// request.
+		maxReqs = maxRails
+	}
+	rqAPI := &httpserver.RequestQueue{
+		Label:                      "api",
+		MaxConcurrent:              maxReqs,
+		MaxQueue:                   cluster.API.MaxQueuedRequests,
+		MaxQueueTimeForMinPriority: cluster.API.MaxQueueTimeForLockRequests.Duration(),
+	}
+	rqTunnel := &httpserver.RequestQueue{
+		Label:         "tunnel",
+		MaxConcurrent: cluster.API.MaxGatewayTunnels,
+		MaxQueue:      0,
+	}
+	return &httpserver.RequestLimiter{
+		Handler:  handler,
+		Priority: c.requestPriority,
+		Registry: reg,
+		Queue: func(req *http.Request) *httpserver.RequestQueue {
+			if req.Method == http.MethodPost && reTunnelPath.MatchString(req.URL.Path) {
+				return rqTunnel
+			} else {
+				return rqAPI
+			}
+		},
+	}
+}
+
+// reTunnelPath matches paths of API endpoints that go in the "tunnel"
+// queue.
+var reTunnelPath = regexp.MustCompile(func() string {
+	rePathVar := regexp.MustCompile(`{.*?}`)
+	out := ""
+	for _, endpoint := range []arvados.APIEndpoint{
+		arvados.EndpointContainerGatewayTunnel,
+		arvados.EndpointContainerGatewayTunnelCompat,
+		arvados.EndpointContainerSSH,
+		arvados.EndpointContainerSSHCompat,
+	} {
+		if out != "" {
+			out += "|"
+		}
+		out += `\Q/` + rePathVar.ReplaceAllString(endpoint.Path, `\E[^/]*\Q`) + `\E`
+	}
+	return "^(" + out + ")$"
+}())
+
 func (c *command) requestPriority(req *http.Request, queued time.Time) int64 {
 	switch {
 	case req.Method == http.MethodPost && strings.HasPrefix(req.URL.Path, "/arvados/v1/containers/") && strings.HasSuffix(req.URL.Path, "/lock"):
diff --git a/lib/service/cmd_test.go b/lib/service/cmd_test.go
index 08b3a239dc..9ead90019e 100644
--- a/lib/service/cmd_test.go
+++ b/lib/service/cmd_test.go
@@ -17,6 +17,8 @@ import (
 	"net/url"
 	"os"
 	"strings"
+	"sync"
+	"sync/atomic"
 	"testing"
 	"time"
 
@@ -198,15 +200,24 @@ func (*Suite) TestCommand(c *check.C) {
 	c.Check(stderr.String(), check.Matches, `(?ms).*"msg":"CheckHealth called".*`)
 }
 
-func (s *Suite) TestDumpRequestsKeepweb(c *check.C) {
-	s.testDumpRequests(c, arvados.ServiceNameKeepweb, "MaxConcurrentRequests")
+func (s *Suite) TestTunnelPathRegexp(c *check.C) {
+	c.Check(reTunnelPath.MatchString(`/arvados/v1/connect/zzzzz-dz642-aaaaaaaaaaaaaaa/gateway_tunnel`), check.Equals, true)
+	c.Check(reTunnelPath.MatchString(`/arvados/v1/containers/zzzzz-dz642-aaaaaaaaaaaaaaa/gateway_tunnel`), check.Equals, true)
+	c.Check(reTunnelPath.MatchString(`/arvados/v1/connect/zzzzz-dz642-aaaaaaaaaaaaaaa/ssh`), check.Equals, true)
+	c.Check(reTunnelPath.MatchString(`/arvados/v1/containers/zzzzz-dz642-aaaaaaaaaaaaaaa/ssh`), check.Equals, true)
+	c.Check(reTunnelPath.MatchString(`/blah/arvados/v1/containers/zzzzz-dz642-aaaaaaaaaaaaaaa/ssh`), check.Equals, false)
+	c.Check(reTunnelPath.MatchString(`/arvados/v1/containers/zzzzz-dz642-aaaaaaaaaaaaaaa`), check.Equals, false)
 }
 
-func (s *Suite) TestDumpRequestsController(c *check.C) {
-	s.testDumpRequests(c, arvados.ServiceNameController, "MaxConcurrentRailsRequests")
+func (s *Suite) TestRequestLimitsAndDumpRequests_Keepweb(c *check.C) {
+	s.testRequestLimitAndDumpRequests(c, arvados.ServiceNameKeepweb, "MaxConcurrentRequests")
 }
 
-func (*Suite) testDumpRequests(c *check.C, serviceName arvados.ServiceName, maxReqsConfigKey string) {
+func (s *Suite) TestRequestLimitsAndDumpRequests_Controller(c *check.C) {
+	s.testRequestLimitAndDumpRequests(c, arvados.ServiceNameController, "MaxConcurrentRailsRequests")
+}
+
+func (*Suite) testRequestLimitAndDumpRequests(c *check.C, serviceName arvados.ServiceName, maxReqsConfigKey string) {
 	defer func(orig time.Duration) { requestQueueDumpCheckInterval = orig }(requestQueueDumpCheckInterval)
 	requestQueueDumpCheckInterval = time.Second / 10
 
@@ -218,6 +229,7 @@ func (*Suite) testDumpRequests(c *check.C, serviceName arvados.ServiceName, maxR
 	defer cf.Close()
 
 	max := 24
+	maxTunnels := 30
 	fmt.Fprintf(cf, `
 Clusters:
  zzzzz:
@@ -225,7 +237,8 @@ Clusters:
   ManagementToken: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
   API:
    `+maxReqsConfigKey+`: %d
-   MaxQueuedRequests: 0
+   MaxQueuedRequests: 1
+   MaxGatewayTunnels: %d
   SystemLogs: {RequestQueueDumpDirectory: %q}
   Services:
    Controller:
@@ -234,14 +247,18 @@ Clusters:
    WebDAV:
     ExternalURL: "http://localhost:`+port+`"
     InternalURLs: {"http://localhost:`+port+`": {}}
-`, max, tmpdir)
+`, max, maxTunnels, tmpdir)
 	cf.Close()
 
 	started := make(chan bool, max+1)
 	hold := make(chan bool)
 	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		started <- true
-		<-hold
+		if strings.Contains(r.URL.Path, "/ssh") || strings.Contains(r.URL.Path, "/gateway_tunnel") {
+			<-hold
+		} else {
+			started <- true
+			<-hold
+		}
 	})
 	healthCheck := make(chan bool, 1)
 	ctx, cancel := context.WithCancel(context.Background())
@@ -267,15 +284,59 @@ Clusters:
 	}
 	client := http.Client{}
 	deadline := time.Now().Add(time.Second * 2)
+	var activeReqs sync.WaitGroup
+
+	// Start some API reqs
+	var apiResp200, apiResp503 int64
 	for i := 0; i < max+1; i++ {
+		activeReqs.Add(1)
+		go func() {
+			defer activeReqs.Done()
+			target := "http://localhost:" + port + "/testpath"
+			resp, err := client.Get(target)
+			for err != nil && strings.Contains(err.Error(), "dial tcp") && deadline.After(time.Now()) {
+				time.Sleep(time.Second / 100)
+				resp, err = client.Get(target)
+			}
+			if c.Check(err, check.IsNil) {
+				if resp.StatusCode == http.StatusOK {
+					atomic.AddInt64(&apiResp200, 1)
+				} else if resp.StatusCode == http.StatusServiceUnavailable {
+					atomic.AddInt64(&apiResp503, 1)
+				}
+			}
+		}()
+	}
+
+	// Start some gateway tunnel reqs that don't count toward our
+	// API req limit
+	extraTunnelReqs := 20
+	var tunnelResp200, tunnelResp503 int64
+	var paths = []string{
+		"/" + strings.Replace(arvados.EndpointContainerSSH.Path, "{uuid}", "z1234-dz642-abcdeabcdeabcde", -1),
+		"/" + strings.Replace(arvados.EndpointContainerSSHCompat.Path, "{uuid}", "z1234-dz642-abcdeabcdeabcde", -1),
+		"/" + strings.Replace(arvados.EndpointContainerGatewayTunnel.Path, "{uuid}", "z1234-dz642-abcdeabcdeabcde", -1),
+		"/" + strings.Replace(arvados.EndpointContainerGatewayTunnelCompat.Path, "{uuid}", "z1234-dz642-abcdeabcdeabcde", -1),
+	}
+	for i := 0; i < maxTunnels+extraTunnelReqs; i++ {
+		i := i
+		activeReqs.Add(1)
 		go func() {
-			resp, err := client.Get("http://localhost:" + port + "/testpath")
+			defer activeReqs.Done()
+			target := "http://localhost:" + port + paths[i%len(paths)]
+			resp, err := client.Post(target, "application/octet-stream", nil)
 			for err != nil && strings.Contains(err.Error(), "dial tcp") && deadline.After(time.Now()) {
 				time.Sleep(time.Second / 100)
-				resp, err = client.Get("http://localhost:" + port + "/testpath")
+				resp, err = client.Post(target, "application/octet-stream", nil)
 			}
 			if c.Check(err, check.IsNil) {
-				c.Logf("resp StatusCode %d", resp.StatusCode)
+				if resp.StatusCode == http.StatusOK {
+					atomic.AddInt64(&tunnelResp200, 1)
+				} else if resp.StatusCode == http.StatusServiceUnavailable {
+					atomic.AddInt64(&tunnelResp503, 1)
+				} else {
+					c.Errorf("tunnel response code %d", resp.StatusCode)
+				}
 			}
 		}()
 	}
@@ -284,6 +345,10 @@ Clusters:
 		case <-started:
 		case <-time.After(time.Second):
 			c.Logf("%s", stderr.String())
+			c.Logf("apiResp200 %d", apiResp200)
+			c.Logf("apiResp503 %d", apiResp503)
+			c.Logf("tunnelResp200 %d", tunnelResp200)
+			c.Logf("tunnelResp503 %d", tunnelResp503)
 			c.Fatal("timed out")
 		}
 	}
@@ -300,6 +365,20 @@ Clusters:
 		var loaded []struct{ URL string }
 		err = json.Unmarshal(j, &loaded)
 		c.Check(err, check.IsNil)
+
+		for i := 0; i < len(loaded); i++ {
+			if strings.Contains(loaded[i].URL, "/ssh") || strings.Contains(loaded[i].URL, "/gateway_tunnel") {
+				// Filter out a gateway tunnel req
+				// that doesn't count toward our API
+				// req limit
+				if i < len(loaded)-1 {
+					copy(loaded[i:], loaded[i+1:])
+					i--
+				}
+				loaded = loaded[:len(loaded)-1]
+			}
+		}
+
 		if len(loaded) < max {
 			// Dumped when #requests was >90% but <100% of
 			// limit. If we stop now, we won't be able to
@@ -309,7 +388,7 @@ Clusters:
 			c.Logf("loaded dumped requests, but len %d < max %d -- still waiting", len(loaded), max)
 			continue
 		}
-		c.Check(loaded, check.HasLen, max)
+		c.Check(loaded, check.HasLen, max+1)
 		c.Check(loaded[0].URL, check.Equals, "/testpath")
 		break
 	}
@@ -328,7 +407,8 @@ Clusters:
 		c.Check(err, check.IsNil)
 		switch path {
 		case "/metrics":
-			c.Check(string(buf), check.Matches, `(?ms).*arvados_concurrent_requests `+fmt.Sprintf("%d", max)+`\n.*`)
+			c.Check(string(buf), check.Matches, `(?ms).*arvados_concurrent_requests{queue="api"} `+fmt.Sprintf("%d", max)+`\n.*`)
+			c.Check(string(buf), check.Matches, `(?ms).*arvados_queued_requests{priority="normal",queue="api"} 1\n.*`)
 		case "/_inspect/requests":
 			c.Check(string(buf), check.Matches, `(?ms).*"URL":"/testpath".*`)
 		default:
@@ -336,6 +416,11 @@ Clusters:
 		}
 	}
 	close(hold)
+	activeReqs.Wait()
+	c.Check(int(apiResp200), check.Equals, max+1)
+	c.Check(int(apiResp503), check.Equals, 0)
+	c.Check(int(tunnelResp200), check.Equals, maxTunnels)
+	c.Check(int(tunnelResp503), check.Equals, extraTunnelReqs)
 	cancel()
 }
 
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index e2ad7b089d..e39a1ff0aa 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -102,6 +102,7 @@ type Cluster struct {
 		MaxConcurrentRailsRequests       int
 		MaxConcurrentRequests            int
 		MaxQueuedRequests                int
+		MaxGatewayTunnels                int
 		MaxQueueTimeForLockRequests      Duration
 		LogCreateRequestFraction         float64
 		MaxKeepBlobBuffers               int
diff --git a/sdk/go/httpserver/request_limiter.go b/sdk/go/httpserver/request_limiter.go
index 9d501ab0eb..1e3316ed48 100644
--- a/sdk/go/httpserver/request_limiter.go
+++ b/sdk/go/httpserver/request_limiter.go
@@ -34,13 +34,8 @@ const metricsUpdateInterval = time.Second
 type RequestLimiter struct {
 	Handler http.Handler
 
-	// Maximum number of requests being handled at once. Beyond
-	// this limit, requests will be queued.
-	MaxConcurrent int
-
-	// Maximum number of requests in the queue. Beyond this limit,
-	// the lowest priority requests will return 503.
-	MaxQueue int
+	// Queue determines which queue a request is assigned to.
+	Queue func(req *http.Request) *RequestQueue
 
 	// Priority determines queue ordering. Requests with higher
 	// priority are handled first. Requests with equal priority
@@ -48,11 +43,6 @@ type RequestLimiter struct {
 	// handled FIFO.
 	Priority func(req *http.Request, queued time.Time) int64
 
-	// Return 503 for any request for which Priority() returns
-	// MinPriority if it spends longer than this in the queue
-	// before starting processing.
-	MaxQueueTimeForMinPriority time.Duration
-
 	// "concurrent_requests", "max_concurrent_requests",
 	// "queued_requests", and "max_queued_requests" metrics are
 	// registered with Registry, if it is not nil.
@@ -63,11 +53,32 @@ type RequestLimiter struct {
 	mQueueTimeout *prometheus.SummaryVec
 	mQueueUsage   *prometheus.GaugeVec
 	mtx           sync.Mutex
-	handling      int
-	queue         queue
+	rqs           map[*RequestQueue]bool // all RequestQueues in use
+}
+
+type RequestQueue struct {
+	// Label for metrics. No two queues should have the same label.
+	Label string
+
+	// Maximum number of requests being handled at once. Beyond
+	// this limit, requests will be queued.
+	MaxConcurrent int
+
+	// Maximum number of requests in the queue. Beyond this limit,
+	// the lowest priority requests will return 503.
+	MaxQueue int
+
+	// Return 503 for any request for which Priority() returns
+	// MinPriority if it spends longer than this in the queue
+	// before starting processing.
+	MaxQueueTimeForMinPriority time.Duration
+
+	queue    queue
+	handling int
 }
 
 type qent struct {
+	rq       *RequestQueue
 	queued   time.Time
 	priority int64
 	heappos  int
@@ -121,101 +132,96 @@ func (h *queue) remove(i int) {
 
 func (rl *RequestLimiter) setup() {
 	if rl.Registry != nil {
-		rl.Registry.MustRegister(prometheus.NewGaugeFunc(
-			prometheus.GaugeOpts{
-				Namespace: "arvados",
-				Name:      "concurrent_requests",
-				Help:      "Number of requests in progress",
-			},
-			func() float64 {
-				rl.mtx.Lock()
-				defer rl.mtx.Unlock()
-				return float64(rl.handling)
-			},
-		))
-		rl.Registry.MustRegister(prometheus.NewGaugeFunc(
-			prometheus.GaugeOpts{
-				Namespace: "arvados",
-				Name:      "max_concurrent_requests",
-				Help:      "Maximum number of concurrent requests",
-			},
-			func() float64 { return float64(rl.MaxConcurrent) },
-		))
+		mCurrentReqs := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "arvados",
+			Name:      "concurrent_requests",
+			Help:      "Number of requests in progress",
+		}, []string{"queue"})
+		rl.Registry.MustRegister(mCurrentReqs)
+		mMaxReqs := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "arvados",
+			Name:      "max_concurrent_requests",
+			Help:      "Maximum number of concurrent requests",
+		}, []string{"queue"})
+		rl.Registry.MustRegister(mMaxReqs)
+		mMaxQueue := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "arvados",
+			Name:      "max_queued_requests",
+			Help:      "Maximum number of queued requests",
+		}, []string{"queue"})
+		rl.Registry.MustRegister(mMaxQueue)
 		rl.mQueueUsage = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Namespace: "arvados",
 			Name:      "queued_requests",
 			Help:      "Number of requests in queue",
-		}, []string{"priority"})
+		}, []string{"queue", "priority"})
 		rl.Registry.MustRegister(rl.mQueueUsage)
-		rl.Registry.MustRegister(prometheus.NewGaugeFunc(
-			prometheus.GaugeOpts{
-				Namespace: "arvados",
-				Name:      "max_queued_requests",
-				Help:      "Maximum number of queued requests",
-			},
-			func() float64 { return float64(rl.MaxQueue) },
-		))
 		rl.mQueueDelay = prometheus.NewSummaryVec(prometheus.SummaryOpts{
 			Namespace:  "arvados",
 			Name:       "queue_delay_seconds",
 			Help:       "Time spent in the incoming request queue before start of processing",
 			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
-		}, []string{"priority"})
+		}, []string{"queue", "priority"})
 		rl.Registry.MustRegister(rl.mQueueDelay)
 		rl.mQueueTimeout = prometheus.NewSummaryVec(prometheus.SummaryOpts{
 			Namespace:  "arvados",
 			Name:       "queue_timeout_seconds",
 			Help:       "Time spent in the incoming request queue before client timed out or disconnected",
 			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
-		}, []string{"priority"})
+		}, []string{"queue", "priority"})
 		rl.Registry.MustRegister(rl.mQueueTimeout)
 		go func() {
 			for range time.NewTicker(metricsUpdateInterval).C {
-				var low, normal, high int
 				rl.mtx.Lock()
-				for _, ent := range rl.queue {
-					switch {
-					case ent.priority < 0:
-						low++
-					case ent.priority > 0:
-						high++
-					default:
-						normal++
+				for rq := range rl.rqs {
+					var low, normal, high int
+					for _, ent := range rq.queue {
+						switch {
+						case ent.priority < 0:
+							low++
+						case ent.priority > 0:
+							high++
+						default:
+							normal++
+						}
 					}
+					mCurrentReqs.WithLabelValues(rq.Label).Set(float64(rq.handling))
+					mMaxReqs.WithLabelValues(rq.Label).Set(float64(rq.MaxConcurrent))
+					mMaxQueue.WithLabelValues(rq.Label).Set(float64(rq.MaxQueue))
+					rl.mQueueUsage.WithLabelValues(rq.Label, "low").Set(float64(low))
+					rl.mQueueUsage.WithLabelValues(rq.Label, "normal").Set(float64(normal))
+					rl.mQueueUsage.WithLabelValues(rq.Label, "high").Set(float64(high))
 				}
 				rl.mtx.Unlock()
-				rl.mQueueUsage.WithLabelValues("low").Set(float64(low))
-				rl.mQueueUsage.WithLabelValues("normal").Set(float64(normal))
-				rl.mQueueUsage.WithLabelValues("high").Set(float64(high))
 			}
 		}()
 	}
 }
 
 // caller must have lock
-func (rl *RequestLimiter) runqueue() {
+func (rq *RequestQueue) runqueue() {
 	// Handle entries from the queue as capacity permits
-	for len(rl.queue) > 0 && (rl.MaxConcurrent == 0 || rl.handling < rl.MaxConcurrent) {
-		rl.handling++
-		ent := rl.queue.removeMax()
+	for len(rq.queue) > 0 && (rq.MaxConcurrent == 0 || rq.handling < rq.MaxConcurrent) {
+		rq.handling++
+		ent := rq.queue.removeMax()
 		ent.ready <- true
 	}
 }
 
 // If the queue is too full, fail and remove the lowest-priority
 // entry. Caller must have lock. Queue must not be empty.
-func (rl *RequestLimiter) trimqueue() {
-	if len(rl.queue) <= rl.MaxQueue {
+func (rq *RequestQueue) trimqueue() {
+	if len(rq.queue) <= rq.MaxQueue {
 		return
 	}
 	min := 0
-	for i := range rl.queue {
-		if i == 0 || rl.queue.Less(min, i) {
+	for i := range rq.queue {
+		if i == 0 || rq.queue.Less(min, i) {
 			min = i
 		}
 	}
-	rl.queue[min].ready <- false
-	rl.queue.remove(min)
+	rq.queue[min].ready <- false
+	rq.queue.remove(min)
 }
 
 func (rl *RequestLimiter) enqueue(req *http.Request) *qent {
@@ -227,19 +233,24 @@ func (rl *RequestLimiter) enqueue(req *http.Request) *qent {
 		priority = rl.Priority(req, qtime)
 	}
 	ent := &qent{
+		rq:       rl.Queue(req),
 		queued:   qtime,
 		priority: priority,
 		ready:    make(chan bool, 1),
 		heappos:  -1,
 	}
-	if rl.MaxConcurrent == 0 || rl.MaxConcurrent > rl.handling {
+	if rl.rqs == nil {
+		rl.rqs = map[*RequestQueue]bool{}
+	}
+	rl.rqs[ent.rq] = true
+	if ent.rq.MaxConcurrent == 0 || ent.rq.MaxConcurrent > ent.rq.handling {
 		// fast path, skip the queue
-		rl.handling++
+		ent.rq.handling++
 		ent.ready <- true
 		return ent
 	}
-	rl.queue.add(ent)
-	rl.trimqueue()
+	ent.rq.queue.add(ent)
+	ent.rq.trimqueue()
 	return ent
 }
 
@@ -247,7 +258,7 @@ func (rl *RequestLimiter) remove(ent *qent) {
 	rl.mtx.Lock()
 	defer rl.mtx.Unlock()
 	if ent.heappos >= 0 {
-		rl.queue.remove(ent.heappos)
+		ent.rq.queue.remove(ent.heappos)
 		ent.ready <- false
 	}
 }
@@ -255,14 +266,14 @@ func (rl *RequestLimiter) remove(ent *qent) {
 func (rl *RequestLimiter) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
 	rl.setupOnce.Do(rl.setup)
 	ent := rl.enqueue(req)
-	SetResponseLogFields(req.Context(), logrus.Fields{"priority": ent.priority})
+	SetResponseLogFields(req.Context(), logrus.Fields{"priority": ent.priority, "queue": ent.rq.Label})
 	if ent.priority == MinPriority {
 		// Note that MaxQueueTime==0 does not cancel a req
 		// that skips the queue, because in that case
 		// rl.enqueue() has already fired ready<-true and
 		// rl.remove() is a no-op.
 		go func() {
-			time.Sleep(rl.MaxQueueTimeForMinPriority)
+			time.Sleep(ent.rq.MaxQueueTimeForMinPriority)
 			rl.remove(ent)
 		}()
 	}
@@ -273,7 +284,7 @@ func (rl *RequestLimiter) ServeHTTP(resp http.ResponseWriter, req *http.Request)
 		// we still need to wait for ent.ready, because
 		// sometimes runqueue() will have already decided to
 		// send true before our rl.remove() call, and in that
-		// case we'll need to decrement rl.handling below.
+		// case we'll need to decrement ent.rq.handling below.
 		ok = <-ent.ready
 	case ok = <-ent.ready:
 	}
@@ -298,7 +309,7 @@ func (rl *RequestLimiter) ServeHTTP(resp http.ResponseWriter, req *http.Request)
 		default:
 			qlabel = "normal"
 		}
-		series.WithLabelValues(qlabel).Observe(time.Now().Sub(ent.queued).Seconds())
+		series.WithLabelValues(ent.rq.Label, qlabel).Observe(time.Now().Sub(ent.queued).Seconds())
 	}
 
 	if !ok {
@@ -308,9 +319,9 @@ func (rl *RequestLimiter) ServeHTTP(resp http.ResponseWriter, req *http.Request)
 	defer func() {
 		rl.mtx.Lock()
 		defer rl.mtx.Unlock()
-		rl.handling--
+		ent.rq.handling--
 		// unblock the next waiting request
-		rl.runqueue()
+		ent.rq.runqueue()
 	}()
 	rl.Handler.ServeHTTP(resp, req)
 }
diff --git a/sdk/go/httpserver/request_limiter_test.go b/sdk/go/httpserver/request_limiter_test.go
index 55f13b4625..7366e1426b 100644
--- a/sdk/go/httpserver/request_limiter_test.go
+++ b/sdk/go/httpserver/request_limiter_test.go
@@ -34,7 +34,11 @@ func newTestHandler() *testHandler {
 
 func (s *Suite) TestRequestLimiter1(c *check.C) {
 	h := newTestHandler()
-	l := RequestLimiter{MaxConcurrent: 1, Handler: h}
+	rq := &RequestQueue{
+		MaxConcurrent: 1}
+	l := RequestLimiter{
+		Queue:   func(*http.Request) *RequestQueue { return rq },
+		Handler: h}
 	var wg sync.WaitGroup
 	resps := make([]*httptest.ResponseRecorder, 10)
 	for i := 0; i < 10; i++ {
@@ -94,7 +98,11 @@ func (s *Suite) TestRequestLimiter1(c *check.C) {
 
 func (*Suite) TestRequestLimiter10(c *check.C) {
 	h := newTestHandler()
-	l := RequestLimiter{MaxConcurrent: 10, Handler: h}
+	rq := &RequestQueue{
+		MaxConcurrent: 10}
+	l := RequestLimiter{
+		Queue:   func(*http.Request) *RequestQueue { return rq },
+		Handler: h}
 	var wg sync.WaitGroup
 	for i := 0; i < 10; i++ {
 		wg.Add(1)
@@ -114,29 +122,32 @@ func (*Suite) TestRequestLimiter10(c *check.C) {
 
 func (*Suite) TestRequestLimiterQueuePriority(c *check.C) {
 	h := newTestHandler()
-	rl := RequestLimiter{
+	rq := &RequestQueue{
 		MaxConcurrent: 1000,
 		MaxQueue:      200,
-		Handler:       h,
+	}
+	rl := RequestLimiter{
+		Handler: h,
+		Queue:   func(*http.Request) *RequestQueue { return rq },
 		Priority: func(r *http.Request, _ time.Time) int64 {
 			p, _ := strconv.ParseInt(r.Header.Get("Priority"), 10, 64)
 			return p
 		}}
 
 	c.Logf("starting initial requests")
-	for i := 0; i < rl.MaxConcurrent; i++ {
+	for i := 0; i < rq.MaxConcurrent; i++ {
 		go func() {
 			rl.ServeHTTP(httptest.NewRecorder(), &http.Request{Header: http.Header{"No-Priority": {"x"}}})
 		}()
 	}
 	c.Logf("waiting for initial requests to consume all MaxConcurrent slots")
-	for i := 0; i < rl.MaxConcurrent; i++ {
+	for i := 0; i < rq.MaxConcurrent; i++ {
 		<-h.inHandler
 	}
 
-	c.Logf("starting %d priority=MinPriority requests (should respond 503 immediately)", rl.MaxQueue)
+	c.Logf("starting %d priority=MinPriority requests (should respond 503 immediately)", rq.MaxQueue)
 	var wgX sync.WaitGroup
-	for i := 0; i < rl.MaxQueue; i++ {
+	for i := 0; i < rq.MaxQueue; i++ {
 		wgX.Add(1)
 		go func() {
 			defer wgX.Done()
@@ -147,13 +158,13 @@ func (*Suite) TestRequestLimiterQueuePriority(c *check.C) {
 	}
 	wgX.Wait()
 
-	c.Logf("starting %d priority=MinPriority requests (should respond 503 after 100 ms)", rl.MaxQueue)
+	c.Logf("starting %d priority=MinPriority requests (should respond 503 after 100 ms)", rq.MaxQueue)
 	// Usage docs say the caller isn't allowed to change fields
 	// after first use, but we secretly know it's OK to change
 	// this field on the fly as long as no requests are arriving
 	// concurrently.
-	rl.MaxQueueTimeForMinPriority = time.Millisecond * 100
-	for i := 0; i < rl.MaxQueue; i++ {
+	rq.MaxQueueTimeForMinPriority = time.Millisecond * 100
+	for i := 0; i < rq.MaxQueue; i++ {
 		wgX.Add(1)
 		go func() {
 			defer wgX.Done()
@@ -162,17 +173,17 @@ func (*Suite) TestRequestLimiterQueuePriority(c *check.C) {
 			rl.ServeHTTP(resp, &http.Request{Header: http.Header{"Priority": {fmt.Sprintf("%d", MinPriority)}}})
 			c.Check(resp.Code, check.Equals, http.StatusServiceUnavailable)
 			elapsed := time.Since(t0)
-			c.Check(elapsed > rl.MaxQueueTimeForMinPriority, check.Equals, true)
-			c.Check(elapsed < rl.MaxQueueTimeForMinPriority*10, check.Equals, true)
+			c.Check(elapsed > rq.MaxQueueTimeForMinPriority, check.Equals, true)
+			c.Check(elapsed < rq.MaxQueueTimeForMinPriority*10, check.Equals, true)
 		}()
 	}
 	wgX.Wait()
 
-	c.Logf("starting %d priority=1 and %d priority=1 requests", rl.MaxQueue, rl.MaxQueue)
+	c.Logf("starting %d priority=1 and %d priority=1 requests", rq.MaxQueue, rq.MaxQueue)
 	var wg1, wg2 sync.WaitGroup
-	wg1.Add(rl.MaxQueue)
-	wg2.Add(rl.MaxQueue)
-	for i := 0; i < rl.MaxQueue*2; i++ {
+	wg1.Add(rq.MaxQueue)
+	wg2.Add(rq.MaxQueue)
+	for i := 0; i < rq.MaxQueue*2; i++ {
 		i := i
 		go func() {
 			pri := (i & 1) + 1
@@ -192,12 +203,12 @@ func (*Suite) TestRequestLimiterQueuePriority(c *check.C) {
 	wg1.Wait()
 
 	c.Logf("allowing initial requests to proceed")
-	for i := 0; i < rl.MaxConcurrent; i++ {
+	for i := 0; i < rq.MaxConcurrent; i++ {
 		h.okToProceed <- struct{}{}
 	}
 
 	c.Logf("allowing queued priority=2 requests to proceed")
-	for i := 0; i < rl.MaxQueue; i++ {
+	for i := 0; i < rq.MaxQueue; i++ {
 		<-h.inHandler
 		h.okToProceed <- struct{}{}
 	}

commit cb04fbce3b80919c7eed21c1341a0c27028c8b1a
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Fri Jan 5 14:59:56 2024 -0500

    Merge branch '21290-sync-past-versions' refs #21290
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/services/api/app/models/collection.rb b/services/api/app/models/collection.rb
index b4660dbd35..16e85c0dd9 100644
--- a/services/api/app/models/collection.rb
+++ b/services/api/app/models/collection.rb
@@ -329,17 +329,7 @@ class Collection < ArvadosModel
   end
 
   def sync_past_versions
-    updates = self.syncable_updates
-    Collection.where('current_version_uuid = ? AND uuid != ?', self.uuid_before_last_save, self.uuid_before_last_save).each do |c|
-      c.attributes = updates
-      # Use a different validation context to skip the 'past_versions_cannot_be_updated'
-      # validator, as on this case it is legal to update some fields.
-      leave_modified_by_user_alone do
-        leave_modified_at_alone do
-          c.save(context: :update_old_versions)
-        end
-      end
-    end
+    Collection.where('current_version_uuid = ? AND uuid != ?', self.uuid_before_last_save, self.uuid_before_last_save).update_all self.syncable_updates
   end
 
   def versionable_updates?(attrs)

commit 177608b6533c97c4d48f1512fd5b98b6612f49af
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Jan 2 10:02:51 2024 -0500

    Merge branch '21059-signup-email' refs #21059
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/services/api/app/models/user.rb b/services/api/app/models/user.rb
index c73f31a99d..7def490618 100644
--- a/services/api/app/models/user.rb
+++ b/services/api/app/models/user.rb
@@ -31,8 +31,8 @@ class User < ArvadosModel
   after_update :setup_on_activate
 
   before_create :check_auto_admin
-  before_create :set_initial_username, :if => Proc.new {
-    username.nil? and email
+  before_validation :set_initial_username, :if => Proc.new {
+    new_record? && email
   }
   before_create :active_is_not_nil
   after_create :after_ownership_change
@@ -264,8 +264,7 @@ SELECT target_uuid, perm_level
   def setup(repo_name: nil, vm_uuid: nil, send_notification_email: nil)
     newly_invited = Link.where(tail_uuid: self.uuid,
                               head_uuid: all_users_group_uuid,
-                              link_class: 'permission',
-                              name: 'can_read').empty?
+                              link_class: 'permission').empty?
 
     # Add can_read link from this user to "all users" which makes this
     # user "invited", and (depending on config) a link in the opposite
@@ -387,6 +386,10 @@ SELECT target_uuid, perm_level
   end
 
   def set_initial_username(requested: false)
+    if new_record? and requested == false and self.username != nil and self.username != ""
+      requested = self.username
+    end
+
     if (!requested.is_a?(String) || requested.empty?) and email
       email_parts = email.partition("@")
       local_parts = email_parts.first.partition("+")
@@ -606,13 +609,64 @@ SELECT target_uuid, perm_level
 
   def self.update_remote_user remote_user
     remote_user = remote_user.symbolize_keys
+    remote_user_prefix = remote_user[:uuid][0..4]
+
+    # interaction between is_invited and is_active
+    #
+    # either can flag can be nil, true or false
+    #
+    # in all cases, we create the user if they don't exist.
+    #
+    # invited nil, active nil: don't call setup or unsetup.
+    #
+    # invited nil, active false: call unsetup
+    #
+    # invited nil, active true: call setup and activate them.
+    #
+    #
+    # invited false, active nil: call unsetup
+    #
+    # invited false, active false: call unsetup
+    #
+    # invited false, active true: call unsetup
+    #
+    #
+    # invited true, active nil: call setup but don't change is_active
+    #
+    # invited true, active false: call setup but don't change is_active
+    #
+    # invited true, active true: call setup and activate them.
+
+    should_setup = (remote_user_prefix == Rails.configuration.Login.LoginCluster or
+                    Rails.configuration.Users.AutoSetupNewUsers or
+                    Rails.configuration.Users.NewUsersAreActive or
+                    Rails.configuration.RemoteClusters[remote_user_prefix].andand["ActivateUsers"])
+
+    should_activate = (remote_user_prefix == Rails.configuration.Login.LoginCluster or
+                       Rails.configuration.Users.NewUsersAreActive or
+                       Rails.configuration.RemoteClusters[remote_user_prefix].andand["ActivateUsers"])
+
+    remote_should_be_unsetup = (remote_user[:is_invited] == nil && remote_user[:is_active] == false) ||
+                               (remote_user[:is_invited] == false)
+
+    remote_should_be_setup = should_setup && (
+      (remote_user[:is_invited] == nil && remote_user[:is_active] == true) ||
+      (remote_user[:is_invited] == false && remote_user[:is_active] == true) ||
+      (remote_user[:is_invited] == true))
+
+    remote_should_be_active = should_activate && remote_user[:is_invited] != false && remote_user[:is_active] == true
+
     begin
-      user = User.find_or_create_by(uuid: remote_user[:uuid])
+      user = User.create_with(email: remote_user[:email],
+                              username: remote_user[:username],
+                              first_name: remote_user[:first_name],
+                              last_name: remote_user[:last_name],
+                              is_active: remote_should_be_active
+      ).find_or_create_by(uuid: remote_user[:uuid])
     rescue ActiveRecord::RecordNotUnique
       retry
     end
 
-    remote_user_prefix = user.uuid[0..4]
     user.with_lock do
       needupdate = {}
       [:email, :username, :first_name, :last_name, :prefs].each do |k|
@@ -658,29 +712,19 @@ SELECT target_uuid, perm_level
         end
       end
 
-      if user.is_invited && remote_user[:is_invited] == false
-        # Remote user is not "invited" state, they should be unsetup, which
-        # also makes them inactive.
+      if remote_should_be_unsetup
+        # Remote user is not "invited" or "active" state on their home
+        # cluster, so they should be unsetup, which also makes them
+        # inactive.
         user.unsetup
       else
-        if !user.is_invited && remote_user[:is_invited] and
-          (remote_user_prefix == Rails.configuration.Login.LoginCluster or
-           Rails.configuration.Users.AutoSetupNewUsers or
-           Rails.configuration.Users.NewUsersAreActive or
-           Rails.configuration.RemoteClusters[remote_user_prefix].andand["ActivateUsers"])
-          # Remote user is 'invited' and should be set up
+        if !user.is_invited && remote_should_be_setup
           user.setup
         end
 
-        if !user.is_active && remote_user[:is_active] && user.is_invited and
-          (remote_user_prefix == Rails.configuration.Login.LoginCluster or
-           Rails.configuration.Users.NewUsersAreActive or
-           Rails.configuration.RemoteClusters[remote_user_prefix].andand["ActivateUsers"])
+        if !user.is_active && remote_should_be_active
           # remote user is active and invited, we need to activate them
           user.update!(is_active: true)
-        elsif user.is_active && remote_user[:is_active] == false
-          # remote user is not active, we need to de-activate them
-          user.update!(is_active: false)
         end
 
         if remote_user_prefix == Rails.configuration.Login.LoginCluster and
@@ -914,8 +958,9 @@ SELECT target_uuid, perm_level
 
   # Send admin notifications
   def send_admin_notifications
-    AdminNotifier.new_user(self).deliver_now
-    if not self.is_active then
+    if self.is_invited then
+      AdminNotifier.new_user(self).deliver_now
+    else
       AdminNotifier.new_inactive_user(self).deliver_now
     end
   end
diff --git a/services/api/app/views/admin_notifier/new_inactive_user.text.erb b/services/api/app/views/admin_notifier/new_inactive_user.text.erb
index afcf34da71..22298b1ce7 100644
--- a/services/api/app/views/admin_notifier/new_inactive_user.text.erb
+++ b/services/api/app/views/admin_notifier/new_inactive_user.text.erb
@@ -2,15 +2,16 @@
 
 SPDX-License-Identifier: AGPL-3.0 %>
 
+A new user has been created, but not set up.
 
-A new user landed on the inactive user page:
+  <%= @user.full_name %> <<%= @user.email %>> (<%= @user.username %>)
 
-  <%= @user.full_name %> <<%= @user.email %>>
+They will not be able to use Arvados unless set up by an admin.
 
 <% if Rails.configuration.Services.Workbench1.ExternalURL -%>
-Please see workbench for more information:
+Please see Workbench for more information:
 
-  <%= Rails.configuration.Services.Workbench1.ExternalURL %>
+  <%= URI::join(Rails.configuration.Services.Workbench1.ExternalURL, "user/#{@user.uuid}") %>
 
 <% end -%>
 Thanks,
diff --git a/services/api/app/views/admin_notifier/new_user.text.erb b/services/api/app/views/admin_notifier/new_user.text.erb
index 670b84b7c1..920906d833 100644
--- a/services/api/app/views/admin_notifier/new_user.text.erb
+++ b/services/api/app/views/admin_notifier/new_user.text.erb
@@ -2,22 +2,16 @@
 
 SPDX-License-Identifier: AGPL-3.0 %>
 
-<%
-  add_to_message = ''
-  if Rails.configuration.Users.AutoSetupNewUsers
-    add_to_message = @user.is_invited ? ' and setup' : ', but not setup'
-  end
-%>
-A new user has been created<%=add_to_message%>:
+A new user has been created and set up.
 
-  <%= @user.full_name %> <<%= @user.email %>>
+  <%= @user.full_name %> <<%= @user.email %>> (<%= @user.username %>)
 
-This user is <%= @user.is_active ? '' : 'NOT ' %>active.
+They are able to use Arvados.
 
 <% if Rails.configuration.Services.Workbench1.ExternalURL -%>
-Please see workbench for more information:
+Please see Workbench for more information:
 
-  <%= Rails.configuration.Services.Workbench1.ExternalURL %>
+  <%= URI::join(Rails.configuration.Services.Workbench1.ExternalURL, "user/#{@user.uuid}") %>
 
 <% end -%>
 Thanks,
diff --git a/services/api/app/views/user_notifier/account_is_setup.text.erb b/services/api/app/views/user_notifier/account_is_setup.text.erb
index 352ee7754e..3f04db8517 100644
--- a/services/api/app/views/user_notifier/account_is_setup.text.erb
+++ b/services/api/app/views/user_notifier/account_is_setup.text.erb
@@ -2,4 +2,4 @@
 
 SPDX-License-Identifier: AGPL-3.0 %>
 
-<%= ERB.new(Rails.configuration.Users.UserSetupMailText, 0, "-").result(binding) %>
+<%= ERB.new(Rails.configuration.Users.UserSetupMailText, trim_mode: "-").result(binding) %>
diff --git a/services/api/test/integration/remote_user_test.rb b/services/api/test/integration/remote_user_test.rb
index b0d1320fdf..f42fda4150 100644
--- a/services/api/test/integration/remote_user_test.rb
+++ b/services/api/test/integration/remote_user_test.rb
@@ -100,12 +100,15 @@ class RemoteUsersTest < ActionDispatch::IntegrationTest
       uuid: 'zbbbb-tpzed-000000000000001',
       email: 'foo at example.com',
       username: 'barney',
+      first_name: "Barney",
+      last_name: "Foo",
       is_admin: true,
       is_active: true,
       is_invited: true,
     }
     @stub_token_status = 200
     @stub_token_scopes = ["all"]
+    ActionMailer::Base.deliveries = []
   end
 
   teardown do
@@ -366,6 +369,12 @@ class RemoteUsersTest < ActionDispatch::IntegrationTest
 
   test 'get user from Login cluster' do
     Rails.configuration.Login.LoginCluster = 'zbbbb'
+    email_dest = ActiveSupport::OrderedOptions.new
+    email_dest[:'arvados-admin at example.com'] = ActiveSupport::OrderedOptions.new
+    Rails.configuration.Users.UserNotifierEmailBcc = email_dest
+    Rails.configuration.Users.NewUserNotificationRecipients = email_dest
+    Rails.configuration.Users.NewInactiveUserNotificationRecipients = email_dest
+
     get '/arvados/v1/users/current',
       params: {format: 'json'},
       headers: auth(remote: 'zbbbb')
@@ -375,14 +384,18 @@ class RemoteUsersTest < ActionDispatch::IntegrationTest
     assert_equal true, json_response['is_active']
     assert_equal 'foo at example.com', json_response['email']
     assert_equal 'barney', json_response['username']
+
+    assert_equal 2, ActionMailer::Base.deliveries.length
+    assert_equal "Welcome to Arvados - account enabled", ActionMailer::Base.deliveries[0].subject
+    assert_equal "[ARVADOS] New user created notification", ActionMailer::Base.deliveries[1].subject
   end
 
   [true, false].each do |trusted|
     [true, false].each do |logincluster|
-      [true, false].each do |admin|
-        [true, false].each do |active|
+      [true, false, nil].each do |admin|
+        [true, false, nil].each do |active|
           [true, false].each do |autosetup|
-            [true, false].each do |invited|
+            [true, false, nil].each do |invited|
               test "get invited=#{invited}, active=#{active}, admin=#{admin} user from #{if logincluster then "Login" else "peer" end} cluster when AutoSetupNewUsers=#{autosetup} ActivateUsers=#{trusted}" do
                 Rails.configuration.Login.LoginCluster = 'zbbbb' if logincluster
                 Rails.configuration.RemoteClusters['zbbbb'].ActivateUsers = trusted
@@ -400,9 +413,9 @@ class RemoteUsersTest < ActionDispatch::IntegrationTest
                     headers: auth(remote: 'zbbbb')
                 assert_response :success
                 assert_equal 'zbbbb-tpzed-000000000000001', json_response['uuid']
-                assert_equal (logincluster && admin && invited && active), json_response['is_admin']
-                assert_equal (invited and (logincluster || trusted || autosetup)), json_response['is_invited']
-                assert_equal (invited and (logincluster || trusted) and active), json_response['is_active']
+                assert_equal (logincluster && !!admin && (invited != false) && !!active), json_response['is_admin']
+                assert_equal ((invited == true || (invited == nil && !!active)) && (logincluster || trusted || autosetup)), json_response['is_invited']
+                assert_equal ((invited != false) && (logincluster || trusted) && !!active), json_response['is_active']
                 assert_equal 'foo at example.com', json_response['email']
                 assert_equal 'barney', json_response['username']
               end
diff --git a/services/api/test/unit/user_test.rb b/services/api/test/unit/user_test.rb
index 8a41fb3976..6f49b2b109 100644
--- a/services/api/test/unit/user_test.rb
+++ b/services/api/test/unit/user_test.rb
@@ -347,10 +347,12 @@ class UserTest < ActiveSupport::TestCase
   test "create new user with notifications" do
     set_user_from_auth :admin
 
+    Rails.configuration.Users.AutoSetupNewUsers = false
+
     create_user_and_verify_setup_and_notifications true, active_notify_list, inactive_notify_list, nil, nil
     create_user_and_verify_setup_and_notifications true, active_notify_list, empty_notify_list, nil, nil
     create_user_and_verify_setup_and_notifications true, empty_notify_list, empty_notify_list, nil, nil
-    create_user_and_verify_setup_and_notifications false, active_notify_list, inactive_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications false, empty_notify_list, inactive_notify_list, nil, nil
     create_user_and_verify_setup_and_notifications false, empty_notify_list, inactive_notify_list, nil, nil
     create_user_and_verify_setup_and_notifications false, empty_notify_list, empty_notify_list, nil, nil
   end
@@ -379,13 +381,13 @@ class UserTest < ActiveSupport::TestCase
     [false, empty_notify_list, empty_notify_list, "arvados at example.com", false, false, "arvados2"],
     [true, active_notify_list, inactive_notify_list, "arvados at example.com", false, false, "arvados2"],
     [true, active_notify_list, inactive_notify_list, "root at example.com", true, false, "root2"],
-    [false, active_notify_list, inactive_notify_list, "root at example.com", true, false, "root2"],
+    [false, active_notify_list, empty_notify_list, "root at example.com", true, false, "root2"],
     [true, active_notify_list, inactive_notify_list, "roo_t at example.com", false, true, "root2"],
     [false, empty_notify_list, empty_notify_list, "^^incorrect_format at example.com", true, true, "incorrectformat"],
     [true, active_notify_list, inactive_notify_list, "&4a_d9. at example.com", true, true, "ad9"],
     [true, active_notify_list, inactive_notify_list, "&4a_d9. at example.com", false, false, "ad9"],
-    [false, active_notify_list, inactive_notify_list, "&4a_d9. at example.com", true, true, "ad9"],
-    [false, active_notify_list, inactive_notify_list, "&4a_d9. at example.com", false, false, "ad9"],
+    [false, active_notify_list, empty_notify_list, "&4a_d9. at example.com", true, true, "ad9"],
+    [false, active_notify_list, empty_notify_list, "&4a_d9. at example.com", false, false, "ad9"],
   ].each do |active, new_user_recipients, inactive_recipients, email, auto_setup_vm, auto_setup_repo, expect_username|
     test "create new user with auto setup active=#{active} email=#{email} vm=#{auto_setup_vm} repo=#{auto_setup_repo}" do
       set_user_from_auth :admin

commit 3d58984b6e02230ab896e2b651b46a2d7ee47b1f
Author: Tom Clegg <tom at curii.com>
Date:   Tue Jan 2 09:34:21 2024 -0500

    Merge branch '21206-ws-requesting-container-uuid'
    
    closes #21206
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/ws/session_v0_test.go b/services/ws/session_v0_test.go
index 72668950a5..7d15543c05 100644
--- a/services/ws/session_v0_test.go
+++ b/services/ws/session_v0_test.go
@@ -211,6 +211,41 @@ func (s *v0Suite) TestEventTypeDelete(c *check.C) {
 	}
 }
 
+func (s *v0Suite) TestEventPropertiesFields(c *check.C) {
+	ac := arvados.NewClientFromEnv()
+	ac.AuthToken = s.token
+
+	conn, r, w, err := s.testClient()
+	c.Assert(err, check.IsNil)
+	defer conn.Close()
+
+	c.Check(w.Encode(map[string]interface{}{
+		"method":  "subscribe",
+		"filters": [][]string{{"object_uuid", "=", arvadostest.RunningContainerUUID}},
+	}), check.IsNil)
+	s.expectStatus(c, r, 200)
+
+	err = ac.RequestAndDecode(nil, "POST", "arvados/v1/logs", s.jsonBody("log", map[string]interface{}{
+		"object_uuid": arvadostest.RunningContainerUUID,
+		"event_type":  "update",
+		"properties": map[string]interface{}{
+			"new_attributes": map[string]interface{}{
+				"name":                      "namevalue",
+				"requesting_container_uuid": "uuidvalue",
+				"state":                     "statevalue",
+			},
+		},
+	}), nil)
+	c.Assert(err, check.IsNil)
+
+	lg := s.expectLog(c, r)
+	c.Check(lg.ObjectUUID, check.Equals, arvadostest.RunningContainerUUID)
+	c.Check(lg.EventType, check.Equals, "update")
+	c.Check(lg.Properties["new_attributes"].(map[string]interface{})["requesting_container_uuid"], check.Equals, "uuidvalue")
+	c.Check(lg.Properties["new_attributes"].(map[string]interface{})["name"], check.Equals, "namevalue")
+	c.Check(lg.Properties["new_attributes"].(map[string]interface{})["state"], check.Equals, "statevalue")
+}
+
 // Trashing/deleting a collection produces an "update" event with
 // properties["new_attributes"]["is_trashed"] == true.
 func (s *v0Suite) TestTrashedCollection(c *check.C) {

commit e64b5d6334a3ea01abbbacc5228c3f07482ba0d9
Author: Brett Smith <brett.smith at curii.com>
Date:   Thu Dec 28 14:38:04 2023 -0500

    Copyedit scoped token creation example
    
    * Mark up user input
    * Use zzzzz cluster id
    
    No issue #.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/doc/admin/scoped-tokens.html.textile.liquid b/doc/admin/scoped-tokens.html.textile.liquid
index 0147aa168b..415f635dcd 100644
--- a/doc/admin/scoped-tokens.html.textile.liquid
+++ b/doc/admin/scoped-tokens.html.textile.liquid
@@ -42,14 +42,14 @@ h2. Creating a scoped token
 
 A scoped token can be created at the command line:
 
-<pre>
-$ arv api_client_authorization create --api-client-authorization '{"scopes": [["GET", "/arvados/v1/collections"], ["GET", "/arvados/v1/collections/"]]}'
+<notextile>
+<pre><code>$ <span class="userinput">arv api_client_authorization create --api-client-authorization '{"scopes": [["GET", "/arvados/v1/collections"], ["GET", "/arvados/v1/collections/"]]}'</span>
 {
- "href":"/api_client_authorizations/x1u39-gj3su-bizbsw0mx5pju3w",
+ "href":"/api_client_authorizations/zzzzz-gj3su-bizbsw0mx5pju3w",
  "kind":"arvados#apiClientAuthorization",
  "etag":"9yk144t0v6cvyp0342exoh2vq",
- "uuid":"x1u39-gj3su-bizbsw0mx5pju3w",
- "owner_uuid":"x1u39-tpzed-fr97h9t4m5jffxs",
+ "uuid":"zzzzz-gj3su-bizbsw0mx5pju3w",
+ "owner_uuid":"zzzzz-tpzed-fr97h9t4m5jffxs",
  "created_at":"2020-03-12T20:36:12.517375422Z",
  "modified_by_client_uuid":null,
  "modified_by_user_uuid":null,
@@ -73,6 +73,7 @@ $ arv api_client_authorization create --api-client-authorization '{"scopes": [["
   ]
  ]
 }
-</pre>
+</code></pre>
+</notextile>
 
 The response will include @api_token@ field which is the newly issued secret token.  It can be passed directly to the API server that issued it, or can be used to construct a @v2@ token.  A @v2@ format token is required if the token will be used to access other clusters in an Arvados federation.  An Arvados @v2@ format token consists of three fields separate by slashes: the prefix @v2@, followed by the token uuid, followed by the token secret.  For example: @v2/x1u39-gj3su-bizbsw0mx5pju3w/5a74htnoqwkhtfo2upekpfbsg04hv7cy5v4nowf7dtpxer086m at .

commit 46148bb0c54244c5e4d0b2566a5fb5c679398720
Author: Brett Smith <brett.smith at curii.com>
Date:   Thu Dec 28 14:29:30 2023 -0500

    Use full paths for API documentation links
    
    Most of the cross-reference links in our documentation point to a
    specific page. These links are unusual in that they point to a
    directory. Lately the linkchecker has been intermittently complaining
    about them:
    
      URL        `file:///tmp/workspace/developer-run-tests-doc-and-sdk-R/doc/.site/api'
      Name       `API documentation'
      Parent URL file:///tmp/workspace/developer-run-tests-doc-and-sdk-R/doc/.site/admin/scoped-tokens.html, line 306, col 576
      Real URL   file:///tmp/workspace/developer-run-tests-doc-and-sdk-R/doc/.site/api/
      Warning    [file-missing-slash] Added trailing slash to
                 directory.
      Result     Valid: directory
    
    This causes a test failure. Spell out the rest of the link to prevent
    that. No issue #.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/doc/admin/scoped-tokens.html.textile.liquid b/doc/admin/scoped-tokens.html.textile.liquid
index de09b42615..0147aa168b 100644
--- a/doc/admin/scoped-tokens.html.textile.liquid
+++ b/doc/admin/scoped-tokens.html.textile.liquid
@@ -18,9 +18,9 @@ Another example is situations where admin access is required but there is risk o
 
 h2. Defining scopes
 
-A "scope" consists of a HTTP method and API path.  A token can have multiple scopes.  Token scopes act as a whitelist, and the API server checks the HTTP method and the API path of every request against the scopes of the request token.  Scopes are also described on the "API Authorization":{{site.baseurl}}/api/tokens.html#scopes page of the "API documentation":{{site.baseurl}}/api .
+A "scope" consists of a HTTP method and API path.  A token can have multiple scopes.  Token scopes act as a whitelist, and the API server checks the HTTP method and the API path of every request against the scopes of the request token.  Scopes are also described on the "API Authorization":{{site.baseurl}}/api/tokens.html#scopes page of the "API documentation":{{site.baseurl}}/api/index.html.
 
-These examples use @/arvados/v1/collections@, but can be applied to any endpoint.  Consult the "API documentation":{{site.baseurl}}/api to determine the endpoints for specific methods.
+These examples use @/arvados/v1/collections@, but can be applied to any endpoint.  Consult the "API documentation":{{site.baseurl}}/api/index.html to determine the endpoints for specific methods.
 
 The scope @["GET", "/arvados/v1/collections"]@ will allow only GET or HEAD requests for the list of collections.  Any other HTTP method or path (including requests for a specific collection record, eg a request with path @/arvados/v1/collections/zzzzz-4zz18-0123456789abcde@) will return a permission error.
 

commit 14bd55aeb477b03463fb0023e2dc6a009973edff
Author: Lucas Di Pentima <lucas.dipentima at curii.com>
Date:   Tue Dec 19 13:17:13 2023 -0300

    Merge branch '21309-x-crypto-upgrade'. Closes #21309
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <lucas.dipentima at curii.com>

diff --git a/go.mod b/go.mod
index 218e2ddde8..0011d7970f 100644
--- a/go.mod
+++ b/go.mod
@@ -37,7 +37,7 @@ require (
 	github.com/prometheus/client_model v0.3.0
 	github.com/prometheus/common v0.39.0
 	github.com/sirupsen/logrus v1.8.1
-	golang.org/x/crypto v0.16.0
+	golang.org/x/crypto v0.17.0
 	golang.org/x/net v0.19.0
 	golang.org/x/oauth2 v0.11.0
 	golang.org/x/sys v0.15.0
diff --git a/go.sum b/go.sum
index 31ddd88621..fb2fe5e3f0 100644
--- a/go.sum
+++ b/go.sum
@@ -287,8 +287,8 @@ golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPh
 golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
-golang.org/x/crypto v0.16.0 h1:mMMrFzRSCF0GvB7Ne27XVtVAaXLrPmgPC7/v0tkwHaY=
-golang.org/x/crypto v0.16.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4=
+golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k=
+golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
 golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=

commit cd1c579bcbc3569698bf93e5f2f23b5a5859eebc
Author: Tom Clegg <tom at curii.com>
Date:   Mon Dec 18 16:19:53 2023 -0500

    Merge branch '21214-dav-virtual-projects'
    
    fixes #21214
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/doc/api/keep-webdav.html.textile.liquid b/doc/api/keep-webdav.html.textile.liquid
index f068a49c2c..e95d523b9d 100644
--- a/doc/api/keep-webdav.html.textile.liquid
+++ b/doc/api/keep-webdav.html.textile.liquid
@@ -35,6 +35,12 @@ The @users@ folder will return a listing of the users for whom the client has pe
 
 In addition to the @/by_id/@ path prefix, the collection or project can be specified using a path prefix of @/c=<uuid or pdh>/@ or (if the cluster is properly configured) as a virtual host.  This is described on "Keep-web URLs":keep-web-urls.html
 
+It is possible for a project or a "filter group":methods/groups.html#filter to appear as its own descendant in the @by_id@ and @users@ tree (a filter group may match itself, its own ancestor, another filter group that matches its ancestor, etc). When this happens, the descendant appears as an empty read-only directory. For example, if filter group @f@ matches its own parent @p@:
+* @/users/example/p/f@ will show the filter group's contents (matched projects and collections).
+* @/users/example/p/f/p@ will appear as an empty directory.
+* @/by_id/uuid_of_f/p@ will show the parent project's contents, including @f at .
+* @/by_id/uuid_of_f/p/f@ will appear as an empty directory.
+
 h3(#auth). Authentication mechanisms
 
 A token can be provided in an Authorization header as a @Bearer@ token:
diff --git a/doc/api/methods/groups.html.textile.liquid b/doc/api/methods/groups.html.textile.liquid
index 02e8bfbb3f..ef8e177f1c 100644
--- a/doc/api/methods/groups.html.textile.liquid
+++ b/doc/api/methods/groups.html.textile.liquid
@@ -46,7 +46,7 @@ The @frozen_by_uuid@ attribute can be cleared by an admin user. It can also be c
 
 The optional @API.FreezeProjectRequiresDescription@ and @API.FreezeProjectRequiresProperties@ configuration settings can be used to prevent users from freezing projects that have empty @description@ and/or specified @properties@ entries.
 
-h3. Filter groups
+h3(#filter). Filter groups
 
 @filter@ groups are virtual groups; they can not own other objects. Filter groups have a special @properties@ field named @filters@, which must be an array of filter conditions. See "list method filters":{{site.baseurl}}/api/methods.html#filters for details on the syntax of valid filters, but keep in mind that the attributes must include the object type (@collections@, @container_requests@, @groups@, @workflows@), separated with a dot from the field to be filtered on.
 
diff --git a/sdk/go/arvados/fs_base.go b/sdk/go/arvados/fs_base.go
index 274d207022..430a0d4c9b 100644
--- a/sdk/go/arvados/fs_base.go
+++ b/sdk/go/arvados/fs_base.go
@@ -13,6 +13,7 @@ import (
 	"net/http"
 	"os"
 	"path"
+	"path/filepath"
 	"strings"
 	"sync"
 	"time"
@@ -387,17 +388,28 @@ func (n *treenode) Size() int64 {
 }
 
 func (n *treenode) FileInfo() os.FileInfo {
-	n.Lock()
-	defer n.Unlock()
-	n.fileinfo.size = int64(len(n.inodes))
-	return n.fileinfo
+	n.RLock()
+	defer n.RUnlock()
+	fi := n.fileinfo
+	fi.size = int64(len(n.inodes))
+	return fi
 }
 
 func (n *treenode) Readdir() (fi []os.FileInfo, err error) {
+	// We need RLock to safely read n.inodes, but we must release
+	// it before calling FileInfo() on the child nodes. Otherwise,
+	// we risk deadlock when filter groups A and B match each
+	// other, concurrent Readdir() calls try to RLock them in
+	// opposite orders, and one cannot be RLocked a second time
+	// because a third caller is waiting for a write lock.
 	n.RLock()
-	defer n.RUnlock()
-	fi = make([]os.FileInfo, 0, len(n.inodes))
+	inodes := make([]inode, 0, len(n.inodes))
 	for _, inode := range n.inodes {
+		inodes = append(inodes, inode)
+	}
+	n.RUnlock()
+	fi = make([]os.FileInfo, 0, len(inodes))
+	for _, inode := range inodes {
 		fi = append(fi, inode.FileInfo())
 	}
 	return
@@ -468,7 +480,8 @@ func (fs *fileSystem) openFile(name string, flag int, perm os.FileMode) (*fileha
 		return nil, ErrSyncNotSupported
 	}
 	dirname, name := path.Split(name)
-	parent, err := rlookup(fs.root, dirname)
+	ancestors := map[inode]bool{}
+	parent, err := rlookup(fs.root, dirname, ancestors)
 	if err != nil {
 		return nil, err
 	}
@@ -533,6 +546,24 @@ func (fs *fileSystem) openFile(name string, flag int, perm os.FileMode) (*fileha
 			return nil, err
 		}
 	}
+	// If n and one of its parents/ancestors are [hardlinks to]
+	// the same node (e.g., a filter group that matches itself),
+	// open an "empty directory" node instead, so the inner
+	// hardlink appears empty. This is needed to ensure
+	// Open("a/b/c/x/x").Readdir() appears empty, matching the
+	// behavior of rlookup("a/b/c/x/x/z") => ErrNotExist.
+	if hl, ok := n.(*hardlink); (ok && ancestors[hl.inode]) || ancestors[n] {
+		n = &treenode{
+			fs:     n.FS(),
+			parent: parent,
+			inodes: nil,
+			fileinfo: fileinfo{
+				name:    name,
+				modTime: time.Now(),
+				mode:    0555 | os.ModeDir,
+			},
+		}
+	}
 	return &filehandle{
 		inode:    n,
 		append:   flag&os.O_APPEND != 0,
@@ -551,7 +582,7 @@ func (fs *fileSystem) Create(name string) (File, error) {
 
 func (fs *fileSystem) Mkdir(name string, perm os.FileMode) error {
 	dirname, name := path.Split(name)
-	n, err := rlookup(fs.root, dirname)
+	n, err := rlookup(fs.root, dirname, nil)
 	if err != nil {
 		return err
 	}
@@ -575,7 +606,7 @@ func (fs *fileSystem) Mkdir(name string, perm os.FileMode) error {
 }
 
 func (fs *fileSystem) Stat(name string) (os.FileInfo, error) {
-	node, err := rlookup(fs.root, name)
+	node, err := rlookup(fs.root, name, nil)
 	if err != nil {
 		return nil, err
 	}
@@ -704,7 +735,7 @@ func (fs *fileSystem) remove(name string, recursive bool) error {
 	if name == "" || name == "." || name == ".." {
 		return ErrInvalidArgument
 	}
-	dir, err := rlookup(fs.root, dirname)
+	dir, err := rlookup(fs.root, dirname, nil)
 	if err != nil {
 		return err
 	}
@@ -741,9 +772,31 @@ func (fs *fileSystem) MemorySize() int64 {
 // rlookup (recursive lookup) returns the inode for the file/directory
 // with the given name (which may contain "/" separators). If no such
 // file/directory exists, the returned node is nil.
-func rlookup(start inode, path string) (node inode, err error) {
+//
+// The visited map should be either nil or empty. If non-nil, all
+// nodes and hardlink targets visited by the given path will be added
+// to it.
+//
+// If a cycle is detected, the second occurrence of the offending node
+// will be replaced by an empty directory. For example, if "x" is a
+// filter group that matches itself, then rlookup("a/b/c/x") will
+// return the filter group, and rlookup("a/b/c/x/x") will return an
+// empty directory.
+func rlookup(start inode, path string, visited map[inode]bool) (node inode, err error) {
+	if visited == nil {
+		visited = map[inode]bool{}
+	}
 	node = start
+	// Clean up ./ and ../ and double-slashes, but (unlike
+	// filepath.Clean) retain a trailing slash, because looking up
+	// ".../regularfile/" should fail.
+	trailingSlash := strings.HasSuffix(path, "/")
+	path = filepath.Clean(path)
+	if trailingSlash && path != "/" {
+		path += "/"
+	}
 	for _, name := range strings.Split(path, "/") {
+		visited[node] = true
 		if node.IsDir() {
 			if name == "." || name == "" {
 				continue
@@ -761,6 +814,24 @@ func rlookup(start inode, path string) (node inode, err error) {
 		if node == nil || err != nil {
 			break
 		}
+		checknode := node
+		if hardlinked, ok := checknode.(*hardlink); ok {
+			checknode = hardlinked.inode
+		}
+		if visited[checknode] {
+			node = &treenode{
+				fs:     node.FS(),
+				parent: node.Parent(),
+				inodes: nil,
+				fileinfo: fileinfo{
+					name:    name,
+					modTime: time.Now(),
+					mode:    0555 | os.ModeDir,
+				},
+			}
+		} else {
+			visited[checknode] = true
+		}
 	}
 	if node == nil && err == nil {
 		err = os.ErrNotExist
diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index 84ff69d6bd..052cc1aa37 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -457,7 +457,7 @@ func (fs *collectionFileSystem) Sync() error {
 }
 
 func (fs *collectionFileSystem) Flush(path string, shortBlocks bool) error {
-	node, err := rlookup(fs.fileSystem.root, path)
+	node, err := rlookup(fs.fileSystem.root, path, nil)
 	if err != nil {
 		return err
 	}
diff --git a/sdk/go/arvados/fs_lookup.go b/sdk/go/arvados/fs_lookup.go
index 2bb09995e1..7f22449318 100644
--- a/sdk/go/arvados/fs_lookup.go
+++ b/sdk/go/arvados/fs_lookup.go
@@ -48,7 +48,19 @@ func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
 			return nil, err
 		}
 		for _, child := range all {
-			_, err = ln.treenode.Child(child.FileInfo().Name(), func(inode) (inode, error) {
+			var name string
+			if hl, ok := child.(*hardlink); ok && hl.inode == ln {
+				// If child is a hardlink to its
+				// parent, FileInfo()->RLock() will
+				// deadlock, because we already have
+				// the write lock. In this situation
+				// we can safely access the hardlink's
+				// name directly.
+				name = hl.name
+			} else {
+				name = child.FileInfo().Name()
+			}
+			_, err = ln.treenode.Child(name, func(inode) (inode, error) {
 				return child, nil
 			})
 			if err != nil {
diff --git a/sdk/go/arvados/fs_project.go b/sdk/go/arvados/fs_project.go
index a68e83945e..df1d06e753 100644
--- a/sdk/go/arvados/fs_project.go
+++ b/sdk/go/arvados/fs_project.go
@@ -35,10 +35,11 @@ func (fs *customFileSystem) projectsLoadOne(parent inode, uuid, name string) (in
 		contents = CollectionList{}
 		err = fs.RequestAndDecode(&contents, "GET", "arvados/v1/groups/"+uuid+"/contents", nil, ResourceListParams{
 			Count: "none",
+			Order: "uuid",
 			Filters: []Filter{
 				{"name", "=", strings.Replace(name, subst, "/", -1)},
 				{"uuid", "is_a", []string{"arvados#collection", "arvados#group"}},
-				{"groups.group_class", "=", "project"},
+				{"groups.group_class", "in", []string{"project", "filter"}},
 			},
 			Select: []string{"uuid", "name", "modified_at", "properties"},
 		})
@@ -104,7 +105,7 @@ func (fs *customFileSystem) projectsLoadAll(parent inode, uuid string) ([]inode,
 			{"uuid", "is_a", class},
 		}
 		if class == "arvados#group" {
-			filters = append(filters, Filter{"group_class", "=", "project"})
+			filters = append(filters, Filter{"groups.group_class", "in", []string{"project", "filter"}})
 		}
 
 		params := ResourceListParams{
diff --git a/sdk/go/arvados/fs_project_test.go b/sdk/go/arvados/fs_project_test.go
index d3dac7a14f..5c2eb33d12 100644
--- a/sdk/go/arvados/fs_project_test.go
+++ b/sdk/go/arvados/fs_project_test.go
@@ -42,61 +42,94 @@ func (sc *spyingClient) RequestAndDecode(dst interface{}, method, path string, b
 func (s *SiteFSSuite) TestFilterGroup(c *check.C) {
 	// Make sure that a collection and group that match the filter are present,
 	// and that a group that does not match the filter is not present.
-	s.fs.MountProject("fg", fixtureThisFilterGroupUUID)
 
-	_, err := s.fs.OpenFile("/fg/baz_file", 0, 0)
-	c.Assert(err, check.IsNil)
+	checkOpen := func(path string, exists bool) {
+		f, err := s.fs.Open(path)
+		if exists {
+			if c.Check(err, check.IsNil) {
+				c.Check(f.Close(), check.IsNil)
+			}
+		} else {
+			c.Check(err, check.Equals, os.ErrNotExist)
+		}
+	}
 
-	_, err = s.fs.OpenFile("/fg/A Subproject", 0, 0)
-	c.Assert(err, check.IsNil)
+	checkDirContains := func(parent, child string, exists bool) {
+		f, err := s.fs.Open(parent)
+		if !c.Check(err, check.IsNil) {
+			return
+		}
+		ents, err := f.Readdir(-1)
+		if !c.Check(err, check.IsNil) {
+			return
+		}
+		for _, ent := range ents {
+			if !exists {
+				c.Check(ent.Name(), check.Not(check.Equals), child)
+				if child == "" {
+					// no children are expected
+					c.Errorf("child %q found in parent %q", child, parent)
+				}
+			} else if ent.Name() == child {
+				return
+			}
+		}
+		if exists {
+			c.Errorf("child %q not found in parent %q", child, parent)
+		}
+	}
 
-	_, err = s.fs.OpenFile("/fg/A Project", 0, 0)
-	c.Assert(err, check.Not(check.IsNil))
+	checkOpen("/users/active/This filter group/baz_file", true)
+	checkOpen("/users/active/This filter group/A Subproject", true)
+	checkOpen("/users/active/This filter group/A Project", false)
+	s.fs.MountProject("fg", fixtureThisFilterGroupUUID)
+	checkOpen("/fg/baz_file", true)
+	checkOpen("/fg/A Subproject", true)
+	checkOpen("/fg/A Project", false)
+	s.fs.MountProject("home", "")
+	checkOpen("/home/A filter group with an is_a collection filter/baz_file", true)
+	checkOpen("/home/A filter group with an is_a collection filter/baz_file/baz", true)
+	checkOpen("/home/A filter group with an is_a collection filter/A Subproject", false)
+	checkOpen("/home/A filter group with an is_a collection filter/A Project", false)
 
 	// An empty filter means everything that is visible should be returned.
+	checkOpen("/users/active/A filter group without filters/baz_file", true)
+	checkOpen("/users/active/A filter group without filters/A Subproject", true)
+	checkOpen("/users/active/A filter group without filters/A Project", true)
 	s.fs.MountProject("fg2", fixtureAFilterGroupTwoUUID)
+	checkOpen("/fg2/baz_file", true)
+	checkOpen("/fg2/A Subproject", true)
+	checkOpen("/fg2/A Project", true)
 
-	_, err = s.fs.OpenFile("/fg2/baz_file", 0, 0)
-	c.Assert(err, check.IsNil)
-
-	_, err = s.fs.OpenFile("/fg2/A Subproject", 0, 0)
-	c.Assert(err, check.IsNil)
-
-	_, err = s.fs.OpenFile("/fg2/A Project", 0, 0)
-	c.Assert(err, check.IsNil)
+	// If a filter group matches itself or one of its ancestors,
+	// the matched item appears as an empty directory.
+	checkDirContains("/users/active/A filter group without filters", "A filter group without filters", true)
+	checkOpen("/users/active/A filter group without filters/A filter group without filters", true)
+	checkOpen("/users/active/A filter group without filters/A filter group without filters/baz_file", false)
+	checkDirContains("/users/active/A filter group without filters/A filter group without filters", "", false)
 
 	// An 'is_a' 'arvados#collection' filter means only collections should be returned.
+	checkOpen("/users/active/A filter group with an is_a collection filter/baz_file", true)
+	checkOpen("/users/active/A filter group with an is_a collection filter/baz_file/baz", true)
+	checkOpen("/users/active/A filter group with an is_a collection filter/A Subproject", false)
+	checkOpen("/users/active/A filter group with an is_a collection filter/A Project", false)
 	s.fs.MountProject("fg3", fixtureAFilterGroupThreeUUID)
-
-	_, err = s.fs.OpenFile("/fg3/baz_file", 0, 0)
-	c.Assert(err, check.IsNil)
-
-	_, err = s.fs.OpenFile("/fg3/A Subproject", 0, 0)
-	c.Assert(err, check.Not(check.IsNil))
+	checkOpen("/fg3/baz_file", true)
+	checkOpen("/fg3/baz_file/baz", true)
+	checkOpen("/fg3/A Subproject", false)
 
 	// An 'exists' 'arvados#collection' filter means only collections with certain properties should be returned.
 	s.fs.MountProject("fg4", fixtureAFilterGroupFourUUID)
-
-	_, err = s.fs.Stat("/fg4/collection with list property with odd values")
-	c.Assert(err, check.IsNil)
-
-	_, err = s.fs.Stat("/fg4/collection with list property with even values")
-	c.Assert(err, check.IsNil)
+	checkOpen("/fg4/collection with list property with odd values", true)
+	checkOpen("/fg4/collection with list property with even values", true)
+	checkOpen("/fg4/baz_file", false)
 
 	// A 'contains' 'arvados#collection' filter means only collections with certain properties should be returned.
 	s.fs.MountProject("fg5", fixtureAFilterGroupFiveUUID)
-
-	_, err = s.fs.Stat("/fg5/collection with list property with odd values")
-	c.Assert(err, check.IsNil)
-
-	_, err = s.fs.Stat("/fg5/collection with list property with string value")
-	c.Assert(err, check.IsNil)
-
-	_, err = s.fs.Stat("/fg5/collection with prop2 5")
-	c.Assert(err, check.Not(check.IsNil))
-
-	_, err = s.fs.Stat("/fg5/collection with list property with even values")
-	c.Assert(err, check.Not(check.IsNil))
+	checkOpen("/fg5/collection with list property with odd values", true)
+	checkOpen("/fg5/collection with list property with string value", true)
+	checkOpen("/fg5/collection with prop2 5", false)
+	checkOpen("/fg5/collection with list property with even values", false)
 }
 
 func (s *SiteFSSuite) TestCurrentUserHome(c *check.C) {
diff --git a/sdk/go/arvados/fs_site.go b/sdk/go/arvados/fs_site.go
index a4a18837e0..d4f0241682 100644
--- a/sdk/go/arvados/fs_site.go
+++ b/sdk/go/arvados/fs_site.go
@@ -123,6 +123,10 @@ func (fs *customFileSystem) ForwardSlashNameSubstitution(repl string) {
 	fs.forwardSlashNameSubstitution = repl
 }
 
+func (fs *customFileSystem) MemorySize() int64 {
+	return fs.fileSystem.MemorySize() + fs.byIDRoot.MemorySize()
+}
+
 // SiteFileSystem returns a FileSystem that maps collections and other
 // Arvados objects onto a filesystem layout.
 //
@@ -386,3 +390,7 @@ func (hl *hardlink) FileInfo() os.FileInfo {
 	}
 	return fi
 }
+
+func (hl *hardlink) MemorySize() int64 {
+	return 64 + int64(len(hl.name))
+}
diff --git a/sdk/go/arvados/fs_site_test.go b/sdk/go/arvados/fs_site_test.go
index c7d6b2a464..2c86536b2f 100644
--- a/sdk/go/arvados/fs_site_test.go
+++ b/sdk/go/arvados/fs_site_test.go
@@ -185,6 +185,16 @@ func (s *SiteFSSuite) TestByUUIDAndPDH(c *check.C) {
 		names = append(names, fi.Name())
 	}
 	c.Check(names, check.DeepEquals, []string{"baz"})
+	f, err = s.fs.Open("/by_id/" + fixtureAProjectUUID + "/A Subproject/baz_file/baz")
+	c.Assert(err, check.IsNil)
+	err = f.Close()
+	c.Assert(err, check.IsNil)
+	_, err = s.fs.Open("/by_id/" + fixtureAProjectUUID + "/A Subproject/baz_file/baz/")
+	c.Assert(err, check.Equals, ErrNotADirectory)
+	_, err = s.fs.Open("/by_id/" + fixtureAProjectUUID + "/A Subproject/baz_file/baz/z")
+	c.Assert(err, check.Equals, ErrNotADirectory)
+	_, err = s.fs.Open("/by_id/" + fixtureAProjectUUID + "/A Subproject/baz_file/baz/..")
+	c.Assert(err, check.Equals, ErrNotADirectory)
 
 	_, err = s.fs.OpenFile("/by_id/"+fixtureNonexistentCollection, os.O_RDWR|os.O_CREATE, 0755)
 	c.Check(err, ErrorIs, ErrInvalidOperation)
diff --git a/sdk/go/arvadostest/fixtures.go b/sdk/go/arvadostest/fixtures.go
index ac12f7ae13..3b8a618fea 100644
--- a/sdk/go/arvadostest/fixtures.go
+++ b/sdk/go/arvadostest/fixtures.go
@@ -37,8 +37,9 @@ const (
 	StorageClassesDesiredArchiveConfirmedDefault = "zzzzz-4zz18-3t236wr12769qqa"
 	EmptyCollectionUUID                          = "zzzzz-4zz18-gs9ooj1h9sd5mde"
 
-	AProjectUUID    = "zzzzz-j7d0g-v955i6s2oi1cbso"
-	ASubprojectUUID = "zzzzz-j7d0g-axqo7eu9pwvna1x"
+	AProjectUUID     = "zzzzz-j7d0g-v955i6s2oi1cbso"
+	ASubprojectUUID  = "zzzzz-j7d0g-axqo7eu9pwvna1x"
+	AFilterGroupUUID = "zzzzz-j7d0g-thisfiltergroup"
 
 	FooAndBarFilesInDirUUID = "zzzzz-4zz18-foonbarfilesdir"
 	FooAndBarFilesInDirPDH  = "870369fc72738603c2fad16664e50e2d+58"
diff --git a/services/keep-web/handler_test.go b/services/keep-web/handler_test.go
index c14789889d..5a12e26e9d 100644
--- a/services/keep-web/handler_test.go
+++ b/services/keep-web/handler_test.go
@@ -1105,6 +1105,17 @@ func (s *IntegrationSuite) TestDirectoryListingWithNoAnonymousToken(c *check.C)
 }
 
 func (s *IntegrationSuite) testDirectoryListing(c *check.C) {
+	// The "ownership cycle" test fixtures are reachable from the
+	// "filter group without filters" group, causing webdav's
+	// walkfs to recurse indefinitely. Avoid that by deleting one
+	// of the bogus fixtures.
+	arv := arvados.NewClientFromEnv()
+	err := arv.RequestAndDecode(nil, "DELETE", "arvados/v1/groups/zzzzz-j7d0g-cx2al9cqkmsf1hs", nil, nil)
+	if err != nil {
+		c.Assert(err, check.FitsTypeOf, &arvados.TransactionError{})
+		c.Check(err.(*arvados.TransactionError).StatusCode, check.Equals, 404)
+	}
+
 	s.handler.Cluster.Services.WebDAVDownload.ExternalURL.Host = "download.example.com"
 	authHeader := http.Header{
 		"Authorization": {"OAuth2 " + arvadostest.ActiveToken},
@@ -1241,8 +1252,32 @@ func (s *IntegrationSuite) testDirectoryListing(c *check.C) {
 			expect:  []string{"waz"},
 			cutDirs: 2,
 		},
+		{
+			uri:     "download.example.com/users/active/This filter group/",
+			header:  authHeader,
+			expect:  []string{"A Subproject/"},
+			cutDirs: 3,
+		},
+		{
+			uri:     "download.example.com/users/active/This filter group/A Subproject",
+			header:  authHeader,
+			expect:  []string{"baz_file/"},
+			cutDirs: 4,
+		},
+		{
+			uri:     "download.example.com/by_id/" + arvadostest.AFilterGroupUUID,
+			header:  authHeader,
+			expect:  []string{"A Subproject/"},
+			cutDirs: 2,
+		},
+		{
+			uri:     "download.example.com/by_id/" + arvadostest.AFilterGroupUUID + "/A Subproject",
+			header:  authHeader,
+			expect:  []string{"baz_file/"},
+			cutDirs: 3,
+		},
 	} {
-		comment := check.Commentf("HTML: %q => %q", trial.uri, trial.expect)
+		comment := check.Commentf("HTML: %q redir %q => %q", trial.uri, trial.redirect, trial.expect)
 		resp := httptest.NewRecorder()
 		u := mustParseURL("//" + trial.uri)
 		req := &http.Request{
@@ -1278,6 +1313,7 @@ func (s *IntegrationSuite) testDirectoryListing(c *check.C) {
 		} else {
 			c.Check(resp.Code, check.Equals, http.StatusOK, comment)
 			for _, e := range trial.expect {
+				e = strings.Replace(e, " ", "%20", -1)
 				c.Check(resp.Body.String(), check.Matches, `(?ms).*href="./`+e+`".*`, comment)
 			}
 			c.Check(resp.Body.String(), check.Matches, `(?ms).*--cut-dirs=`+fmt.Sprintf("%d", trial.cutDirs)+` .*`, comment)
@@ -1310,6 +1346,12 @@ func (s *IntegrationSuite) testDirectoryListing(c *check.C) {
 		}
 		resp = httptest.NewRecorder()
 		s.handler.ServeHTTP(resp, req)
+		// This check avoids logging a big XML document in the
+		// event webdav throws a 500 error after sending
+		// headers for a 207.
+		if !c.Check(strings.HasSuffix(resp.Body.String(), "Internal Server Error"), check.Equals, false) {
+			continue
+		}
 		if trial.expect == nil {
 			c.Check(resp.Code, check.Equals, http.StatusUnauthorized, comment)
 		} else {
@@ -1320,6 +1362,7 @@ func (s *IntegrationSuite) testDirectoryListing(c *check.C) {
 				} else {
 					e = filepath.Join(u.Path, e)
 				}
+				e = strings.Replace(e, " ", "%20", -1)
 				c.Check(resp.Body.String(), check.Matches, `(?ms).*<D:href>`+e+`</D:href>.*`, comment)
 			}
 		}
diff --git a/services/keep-web/server_test.go b/services/keep-web/server_test.go
index b3d0b9b418..dd29c40082 100644
--- a/services/keep-web/server_test.go
+++ b/services/keep-web/server_test.go
@@ -476,7 +476,7 @@ func (s *IntegrationSuite) TestMetrics(c *check.C) {
 	c.Check(summaries["request_duration_seconds/get/200"].SampleCount, check.Equals, "3")
 	c.Check(summaries["request_duration_seconds/get/404"].SampleCount, check.Equals, "1")
 	c.Check(summaries["time_to_status_seconds/get/404"].SampleCount, check.Equals, "1")
-	c.Check(gauges["arvados_keepweb_sessions_cached_session_bytes//"].Value, check.Equals, float64(469))
+	c.Check(gauges["arvados_keepweb_sessions_cached_session_bytes//"].Value, check.Equals, float64(624))
 
 	// If the Host header indicates a collection, /metrics.json
 	// refers to a file in the collection -- the metrics handler

commit 989e46c76c9c8949bab0b63f2a51460ddc7f1fc9
Author: Brett Smith <brett.smith at curii.com>
Date:   Mon Dec 18 13:48:50 2023 -0500

    Merge branch '21283-callable-api-module'
    
    Closes #21283.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/sdk/python/arvados/__init__.py b/sdk/python/arvados/__init__.py
index e90f381298..83f658201c 100644
--- a/sdk/python/arvados/__init__.py
+++ b/sdk/python/arvados/__init__.py
@@ -6,8 +6,8 @@
 This module provides the entire Python SDK for Arvados. The most useful modules
 include:
 
-* arvados.api - After you `import arvados`, you can call `arvados.api.api` as
-  `arvados.api` to construct a client object.
+* arvados.api - After you `import arvados`, you can call `arvados.api` as a
+  shortcut to the client constructor function `arvados.api.api`.
 
 * arvados.collection - The `arvados.collection.Collection` class provides a
   high-level interface to read and write collections. It coordinates sending
@@ -26,15 +26,24 @@ import types
 
 from collections import UserDict
 
-from .api import api, api_from_config, http_cache
+from . import api, errors, util
+from .api import api_from_config, http_cache
 from .collection import CollectionReader, CollectionWriter, ResumableCollectionWriter
 from arvados.keep import *
 from arvados.stream import *
 from .arvfile import StreamFileReader
 from .logging import log_format, log_date_format, log_handler
 from .retry import RetryLoop
-import arvados.errors as errors
-import arvados.util as util
+
+# Previous versions of the PySDK used to say `from .api import api`.  This
+# made it convenient to call the API client constructor, but difficult to
+# access the rest of the `arvados.api` module. The magic below fixes that
+# bug while retaining backwards compatibility: `arvados.api` is now the
+# module and you can import it normally, but we make that module callable so
+# all the existing code that says `arvados.api('v1', ...)` still works.
+class _CallableAPIModule(api.__class__):
+    __call__ = staticmethod(api.api)
+api.__class__ = _CallableAPIModule
 
 # Override logging module pulled in via `from ... import *`
 # so users can `import arvados.logging`.

commit 2da34092382837af4185926b96a1bc692cd731f4
Author: Brett Smith <brett.smith at curii.com>
Date:   Sat Dec 16 15:05:50 2023 -0500

    Merge branch '21146-pysdk-new-websockets'
    
    Closes #21146, #19825.
    
    Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith at curii.com>

diff --git a/sdk/python/arvados/events.py b/sdk/python/arvados/events.py
index e53e4980a8..917c876706 100644
--- a/sdk/python/arvados/events.py
+++ b/sdk/python/arvados/events.py
@@ -1,155 +1,322 @@
 # Copyright (C) The Arvados Authors. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
+"""Follow events on an Arvados cluster
 
-from __future__ import absolute_import
-from future import standard_library
-standard_library.install_aliases()
-from builtins import str
-from builtins import object
-import arvados
-from . import config
-from . import errors
-from .retry import RetryLoop
+This module provides different ways to get notified about events that happen
+on an Arvados cluster. You indicate which events you want updates about, and
+provide a function that is called any time one of those events is received
+from the server.
 
-import logging
+`subscribe` is the main entry point. It helps you construct one of the two
+API-compatible client classes: `EventClient` (which uses WebSockets) or
+`PollClient` (which periodically queries the logs list methods).
+"""
+
+import enum
 import json
-import _thread
-import threading
-import time
+import logging
 import os
 import re
 import ssl
-from ws4py.client.threadedclient import WebSocketClient
+import sys
+import _thread
+import threading
+import time
+
+import websockets.exceptions as ws_exc
+import websockets.sync.client as ws_client
+
+from . import config
+from . import errors
+from . import util
+from .retry import RetryLoop
+from ._version import __version__
+
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Union,
+)
+
+EventCallback = Callable[[Dict[str, Any]], object]
+"""Type signature for an event handler callback"""
+FilterCondition = List[Union[None, str, 'Filter']]
+"""Type signature for a single filter condition"""
+Filter = List[FilterCondition]
+"""Type signature for an entire filter"""
 
 _logger = logging.getLogger('arvados.events')
 
+class WSMethod(enum.Enum):
+    """Arvados WebSocket methods
 
-class _EventClient(WebSocketClient):
-    def __init__(self, url, filters, on_event, last_log_id, on_closed):
-        ssl_options = {'ca_certs': arvados.util.ca_certs_path()}
-        if config.flag_is_true('ARVADOS_API_HOST_INSECURE'):
-            ssl_options['cert_reqs'] = ssl.CERT_NONE
-        else:
-            ssl_options['cert_reqs'] = ssl.CERT_REQUIRED
+    This enum represents valid values for the `method` field in messages
+    sent to an Arvados WebSocket server.
+    """
+    SUBSCRIBE = 'subscribe'
+    SUB = SUBSCRIBE
+    UNSUBSCRIBE = 'unsubscribe'
+    UNSUB = UNSUBSCRIBE
 
-        # Warning: If the host part of url resolves to both IPv6 and
-        # IPv4 addresses (common with "localhost"), only one of them
-        # will be attempted -- and it might not be the right one. See
-        # ws4py's WebSocketBaseClient.__init__.
-        super(_EventClient, self).__init__(url, ssl_options=ssl_options)
 
-        self.filters = filters
-        self.on_event = on_event
+class EventClient(threading.Thread):
+    """Follow Arvados events via WebSocket
+
+    EventClient follows events on Arvados cluster published by the WebSocket
+    server. Users can select the events they want to follow and run their own
+    callback function on each.
+    """
+    _USER_AGENT = 'Python/{}.{}.{} arvados.events/{}'.format(
+        *sys.version_info[:3],
+        __version__,
+    )
+
+    def __init__(
+            self,
+            url: str,
+            filters: Optional[Filter],
+            on_event_cb: EventCallback,
+            last_log_id: Optional[int]=None,
+            *,
+            insecure: Optional[bool]=None,
+    ) -> None:
+        """Initialize a WebSocket client
+
+        Constructor arguments:
+
+        * url: str --- The `wss` URL for an Arvados WebSocket server.
+
+        * filters: arvados.events.Filter | None --- One event filter to
+          subscribe to after connecting to the WebSocket server. If not
+          specified, the client will subscribe to all events.
+
+        * on_event_cb: arvados.events.EventCallback --- When the client
+          receives an event from the WebSocket server, it calls this
+          function with the event object.
+
+        * last_log_id: int | None --- If specified, this will be used as the
+          value for the `last_log_id` field in subscribe messages sent by
+          the client.
+
+        Constructor keyword arguments:
+
+        * insecure: bool | None --- If `True`, the client will not check the
+          validity of the server's TLS certificate. If not specified, uses
+          the value from the user's `ARVADOS_API_HOST_INSECURE` setting.
+        """
+        self.url = url
+        self.filters = [filters or []]
+        self.on_event_cb = on_event_cb
         self.last_log_id = last_log_id
-        self._closing_lock = threading.RLock()
-        self._closing = False
-        self._closed = threading.Event()
-        self.on_closed = on_closed
+        self.is_closed = threading.Event()
+        self._ssl_ctx = ssl.create_default_context(
+            purpose=ssl.Purpose.SERVER_AUTH,
+            cafile=util.ca_certs_path(),
+        )
+        if insecure is None:
+            insecure = config.flag_is_true('ARVADOS_API_HOST_INSECURE')
+        if insecure:
+            self._ssl_ctx.check_hostname = False
+            self._ssl_ctx.verify_mode = ssl.CERT_NONE
+        self._subscribe_lock = threading.Lock()
+        self._connect()
+        super().__init__(daemon=True)
+        self.start()
+
+    def _connect(self) -> None:
+        # There are no locks protecting this method. After the thread starts,
+        # it should only be called from inside.
+        self._client = ws_client.connect(
+            self.url,
+            logger=_logger,
+            ssl_context=self._ssl_ctx,
+            user_agent_header=self._USER_AGENT,
+        )
+        self._client_ok = True
+
+    def _subscribe(self, f: Filter, last_log_id: Optional[int]) -> None:
+        extra = {}
+        if last_log_id is not None:
+            extra['last_log_id'] = last_log_id
+        return self._update_sub(WSMethod.SUBSCRIBE, f, **extra)
 
-    def opened(self):
-        for f in self.filters:
-            self.subscribe(f, self.last_log_id)
+    def _update_sub(self, method: WSMethod, f: Filter, **extra: Any) -> None:
+        msg = json.dumps({
+            'method': method.value,
+            'filters': f,
+            **extra,
+        })
+        self._client.send(msg)
 
-    def closed(self, code, reason=None):
-        self._closed.set()
-        self.on_closed()
+    def close(self, code: int=1000, reason: str='', timeout: float=0) -> None:
+        """Close the WebSocket connection and stop processing events
 
-    def received_message(self, m):
-        with self._closing_lock:
-            if not self._closing:
-                self.on_event(json.loads(str(m)))
+        Arguments:
 
-    def close(self, code=1000, reason='', timeout=0):
-        """Close event client and optionally wait for it to finish.
+        * code: int --- The WebSocket close code sent to the server when
+          disconnecting. Default 1000.
 
-        :timeout: is the number of seconds to wait for ws4py to
-        indicate that the connection has closed.
+        * reason: str --- The WebSocket close reason sent to the server when
+          disconnecting. Default is the empty string.
+
+        * timeout: float --- How long to wait for the WebSocket server to
+          acknowledge the disconnection, in seconds. Default 0, which means
+          no timeout.
         """
-        super(_EventClient, self).close(code, reason)
-        with self._closing_lock:
-            # make sure we don't process any more messages.
-            self._closing = True
-        # wait for ws4py to tell us the connection is closed.
-        self._closed.wait(timeout=timeout)
+        self.is_closed.set()
+        self._client.close_timeout = timeout or None
+        self._client.close(code, reason)
 
-    def subscribe(self, f, last_log_id=None):
-        m = {"method": "subscribe", "filters": f}
-        if last_log_id is not None:
-            m["last_log_id"] = last_log_id
-        self.send(json.dumps(m))
+    def run_forever(self) -> None:
+        """Run the WebSocket client indefinitely
 
-    def unsubscribe(self, f):
-        self.send(json.dumps({"method": "unsubscribe", "filters": f}))
+        This method blocks until the `close` method is called (e.g., from
+        another thread) or the client permanently loses its connection.
+        """
+        # Have to poll here to let KeyboardInterrupt get raised.
+        while not self.is_closed.wait(1):
+            pass
 
+    def subscribe(self, f: Filter, last_log_id: Optional[int]=None) -> None:
+        """Subscribe to another set of events from the server
 
-class EventClient(object):
-    def __init__(self, url, filters, on_event_cb, last_log_id):
-        self.url = url
-        if filters:
-            self.filters = [filters]
-        else:
-            self.filters = [[]]
-        self.on_event_cb = on_event_cb
-        self.last_log_id = last_log_id
-        self.is_closed = threading.Event()
-        self._setup_event_client()
+        Arguments:
 
-    def _setup_event_client(self):
-        self.ec = _EventClient(self.url, self.filters, self.on_event,
-                               self.last_log_id, self.on_closed)
-        self.ec.daemon = True
-        try:
-            self.ec.connect()
-        except Exception:
-            self.ec.close_connection()
-            raise
+        * f: arvados.events.Filter | None --- One filter to subscribe to
+          events for.
 
-    def subscribe(self, f, last_log_id=None):
-        self.filters.append(f)
-        self.ec.subscribe(f, last_log_id)
+        * last_log_id: int | None --- If specified, request events starting
+          from this id. If not specified, the server will only send events
+          that occur after processing the subscription.
+        """
+        with self._subscribe_lock:
+            self._subscribe(f, last_log_id)
+            self.filters.append(f)
 
-    def unsubscribe(self, f):
-        del self.filters[self.filters.index(f)]
-        self.ec.unsubscribe(f)
+    def unsubscribe(self, f: Filter) -> None:
+        """Unsubscribe from an event stream
 
-    def close(self, code=1000, reason='', timeout=0):
-        self.is_closed.set()
-        self.ec.close(code, reason, timeout)
+        Arguments:
+
+        * f: arvados.events.Filter | None --- One event filter to stop
+        receiving events for.
+        """
+        with self._subscribe_lock:
+            try:
+                index = self.filters.index(f)
+            except ValueError:
+                raise ValueError(f"filter not subscribed: {f!r}") from None
+            self._update_sub(WSMethod.UNSUBSCRIBE, f)
+            del self.filters[index]
+
+    def on_closed(self) -> None:
+        """Handle disconnection from the WebSocket server
+
+        This method is called when the client loses its connection from
+        receiving events. This implementation tries to establish a new
+        connection if it was not closed client-side.
+        """
+        if self.is_closed.is_set():
+            return
+        _logger.warning("Unexpected close. Reconnecting.")
+        for _ in RetryLoop(num_retries=25, backoff_start=.1, max_wait=15):
+            try:
+                self._connect()
+            except Exception as e:
+                _logger.warning("Error '%s' during websocket reconnect.", e)
+            else:
+                _logger.warning("Reconnect successful.")
+                break
+        else:
+            _logger.error("EventClient thread could not contact websocket server.")
+            self.is_closed.set()
+            _thread.interrupt_main()
+
+    def on_event(self, m: Dict[str, Any]) -> None:
+        """Handle an event from the WebSocket server
 
-    def on_event(self, m):
-        if m.get('id') != None:
-            self.last_log_id = m.get('id')
+        This method is called whenever the client receives an event from the
+        server. This implementation records the `id` field internally, then
+        calls the callback function provided at initialization time.
+
+        Arguments:
+
+        * m: Dict[str, Any] --- The event object, deserialized from JSON.
+        """
+        try:
+            self.last_log_id = m['id']
+        except KeyError:
+            pass
         try:
             self.on_event_cb(m)
-        except Exception as e:
+        except Exception:
             _logger.exception("Unexpected exception from event callback.")
             _thread.interrupt_main()
 
-    def on_closed(self):
-        if not self.is_closed.is_set():
-            _logger.warning("Unexpected close. Reconnecting.")
-            for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=15):
-                try:
-                    self._setup_event_client()
-                    _logger.warning("Reconnect successful.")
-                    break
-                except Exception as e:
-                    _logger.warning("Error '%s' during websocket reconnect.", e)
-            if tries_left == 0:
-                _logger.exception("EventClient thread could not contact websocket server.")
-                self.is_closed.set()
-                _thread.interrupt_main()
-                return
+    def run(self) -> None:
+        """Run the client loop
 
-    def run_forever(self):
-        # Have to poll here to let KeyboardInterrupt get raised.
-        while not self.is_closed.wait(1):
-            pass
+        This method runs in a separate thread to receive and process events
+        from the server.
+        """
+        self.setName(f'ArvadosWebsockets-{self.ident}')
+        while self._client_ok and not self.is_closed.is_set():
+            try:
+                with self._subscribe_lock:
+                    for f in self.filters:
+                        self._subscribe(f, self.last_log_id)
+                for msg_s in self._client:
+                    if not self.is_closed.is_set():
+                        msg = json.loads(msg_s)
+                        self.on_event(msg)
+            except ws_exc.ConnectionClosed:
+                self._client_ok = False
+                self.on_closed()
 
 
 class PollClient(threading.Thread):
-    def __init__(self, api, filters, on_event, poll_time, last_log_id):
+    """Follow Arvados events via polling logs
+
+    PollClient follows events on Arvados cluster by periodically running
+    logs list API calls. Users can select the events they want to follow and
+    run their own callback function on each.
+    """
+    def __init__(
+            self,
+            api: 'arvados.api_resources.ArvadosAPIClient',
+            filters: Optional[Filter],
+            on_event: EventCallback,
+            poll_time: float=15,
+            last_log_id: Optional[int]=None,
+    ) -> None:
+        """Initialize a polling client
+
+        Constructor arguments:
+
+        * api: arvados.api_resources.ArvadosAPIClient --- The Arvados API
+          client used to query logs. It will be used in a separate thread,
+          so if it is not an instance of `arvados.safeapi.ThreadSafeApiCache`
+          it should not be reused after the thread is started.
+
+        * filters: arvados.events.Filter | None --- One event filter to
+          subscribe to after connecting to the WebSocket server. If not
+          specified, the client will subscribe to all events.
+
+        * on_event: arvados.events.EventCallback --- When the client
+          receives an event from the WebSocket server, it calls this
+          function with the event object.
+
+        * poll_time: float --- The number of seconds to wait between querying
+          logs. Default 15.
+
+        * last_log_id: int | None --- If specified, queries will include a
+          filter for logs with an `id` at least this value.
+        """
         super(PollClient, self).__init__()
         self.api = api
         if filters:
@@ -174,6 +341,11 @@ class PollClient(threading.Thread):
             self._skip_old_events = False
 
     def run(self):
+        """Run the client loop
+
+        This method runs in a separate thread to poll and process events
+        from the server.
+        """
         self.on_event({'status': 200})
 
         while not self._closing.is_set():
@@ -262,23 +434,29 @@ class PollClient(threading.Thread):
                 self._closing.wait(self.poll_time)
 
     def run_forever(self):
+        """Run the polling client indefinitely
+
+        This method blocks until the `close` method is called (e.g., from
+        another thread) or the client permanently loses its connection.
+        """
         # Have to poll here, otherwise KeyboardInterrupt will never get processed.
         while not self._closing.is_set():
             self._closing.wait(1)
 
-    def close(self, code=None, reason=None, timeout=0):
-        """Close poll client and optionally wait for it to finish.
+    def close(self, code: Optional[int]=None, reason: Optional[str]=None, timeout: float=0) -> None:
+        """Stop polling and processing events
 
-        If an :on_event: handler is running in a different thread,
-        first wait (indefinitely) for it to return.
+        Arguments:
 
-        After closing, wait up to :timeout: seconds for the thread to
-        finish the poll request in progress (if any).
+        * code: Optional[int] --- Ignored; this argument exists for API
+          compatibility with `EventClient.close`.
 
-        :code: and :reason: are ignored. They are present for
-        interface compatibility with EventClient.
-        """
+        * reason: Optional[str] --- Ignored; this argument exists for API
+          compatibility with `EventClient.close`.
 
+        * timeout: float --- How long to wait for the client thread to finish
+          processing events. Default 0, which means no timeout.
+        """
         with self._closing_lock:
             self._closing.set()
         try:
@@ -290,11 +468,27 @@ class PollClient(threading.Thread):
             # to do so raises the same exception."
             pass
 
-    def subscribe(self, f):
+    def subscribe(self, f: Filter, last_log_id: Optional[int]=None) -> None:
+        """Subscribe to another set of events from the server
+
+        Arguments:
+
+        * f: arvados.events.Filter | None --- One filter to subscribe to.
+
+        * last_log_id: Optional[int] --- Ignored; this argument exists for
+          API compatibility with `EventClient.subscribe`.
+        """
         self.on_event({'status': 200})
         self.filters.append(f)
 
     def unsubscribe(self, f):
+        """Unsubscribe from an event stream
+
+        Arguments:
+
+        * f: arvados.events.Filter | None --- One event filter to stop
+        receiving events for.
+        """
         del self.filters[self.filters.index(f)]
 
 
@@ -312,21 +506,42 @@ def _subscribe_websocket(api, filters, on_event, last_log_id=None):
     else:
         return client
 
-
-def subscribe(api, filters, on_event, poll_fallback=15, last_log_id=None):
+def subscribe(
+        api: 'arvados.api_resources.ArvadosAPIClient',
+        filters: Optional[Filter],
+        on_event: EventCallback,
+        poll_fallback: float=15,
+        last_log_id: Optional[int]=None,
+) -> Union[EventClient, PollClient]:
+    """Start a thread to monitor events
+
+    This method tries to construct an `EventClient` to process Arvados
+    events via WebSockets. If that fails, or the
+    `ARVADOS_DISABLE_WEBSOCKETS` flag is set in user configuration, it falls
+    back to constructing a `PollClient` to process the events via API
+    polling.
+
+    Arguments:
+
+    * api: arvados.api_resources.ArvadosAPIClient --- The Arvados API
+      client used to query logs. It may be used in a separate thread,
+      so if it is not an instance of `arvados.safeapi.ThreadSafeApiCache`
+      it should not be reused after this method returns.
+
+    * filters: arvados.events.Filter | None --- One event filter to
+      subscribe to after initializing the client. If not specified, the
+      client will subscribe to all events.
+
+    * on_event: arvados.events.EventCallback --- When the client receives an
+      event, it calls this function with the event object.
+
+    * poll_time: float --- The number of seconds to wait between querying
+      logs. If 0, this function will refuse to construct a `PollClient`.
+      Default 15.
+
+    * last_log_id: int | None --- If specified, start processing events with
+      at least this `id` value.
     """
-    :api:
-      a client object retrieved from arvados.api(). The caller should not use this client object for anything else after calling subscribe().
-    :filters:
-      Initial subscription filters.
-    :on_event:
-      The callback when a message is received.
-    :poll_fallback:
-      If websockets are not available, fall back to polling every N seconds.  If poll_fallback=False, this will return None if websockets are not available.
-    :last_log_id:
-      Log rows that are newer than the log id
-    """
-
     if not poll_fallback:
         return _subscribe_websocket(api, filters, on_event, last_log_id)
 
diff --git a/sdk/python/setup.py b/sdk/python/setup.py
index 9ba9629bca..284a460f1a 100644
--- a/sdk/python/setup.py
+++ b/sdk/python/setup.py
@@ -116,21 +116,21 @@ setup(name='arvados-python-client',
       ],
       install_requires=[
           'ciso8601 >=2.0.0',
+          'dataclasses; python_version<"3.7"',
           'future',
           'google-api-core <2.11.0', # 2.11.0rc1 is incompatible with google-auth<2
           'google-api-python-client >=2.1.0',
           'google-auth <2',
           'httplib2 >=0.9.2, <0.20.2',
+          'protobuf <4.0.0dev',
           'pycurl >=7.19.5.1, <7.45.0',
+          'pyparsing <3',
           'ruamel.yaml >=0.15.54, <0.17.22',
           'setuptools >=40.3.0',
           # As of 4.8.0rc1, typing_extensions does not parse in Python 3.7
           'typing_extensions >=3.7.4, <4.8; python_version<"3.8"',
-          'ws4py >=0.4.2',
-          'protobuf <4.0.0dev',
-          'pyparsing <3',
-          'setuptools >=40.3.0',
-          'dataclasses; python_version<"3.7"',
+          'websockets >=11.0',
+          'websockets ~=11.0; python_version<"3.8"',
       ],
       classifiers=[
           'Programming Language :: Python :: 3',
diff --git a/sdk/python/tests/test_events.py b/sdk/python/tests/test_events.py
index f5192160f3..b4e6a0b1cd 100644
--- a/sdk/python/tests/test_events.py
+++ b/sdk/python/tests/test_events.py
@@ -2,13 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
-from future import standard_library
-standard_library.install_aliases()
-from builtins import range
-from builtins import object
+import json
 import logging
 import mock
 import queue
@@ -17,10 +11,62 @@ import threading
 import time
 import unittest
 
+import websockets.exceptions as ws_exc
+
 import arvados
 from . import arvados_testutil as tutil
 from . import run_test_server
 
+class FakeWebsocketClient:
+    """Fake self-contained version of websockets.sync.client.ClientConnection
+
+    This provides enough of the API to test EventClient. It loosely mimics
+    the Arvados WebSocket API by acknowledging subscribe messages. You can use
+    `mock_wrapper` to test calls. You can set `_check_lock` to test that the
+    given lock is acquired before `send` is called.
+    """
+
+    def __init__(self):
+        self._check_lock = None
+        self._closed = threading.Event()
+        self._messages = queue.Queue()
+
+    def mock_wrapper(self):
+        wrapper = mock.Mock(wraps=self)
+        wrapper.__iter__ = lambda _: self.__iter__()
+        return wrapper
+
+    def __iter__(self):
+        while True:
+            msg = self._messages.get()
+            self._messages.task_done()
+            if isinstance(msg, Exception):
+                raise msg
+            else:
+                yield msg
+
+    def close(self, code=1000, reason=''):
+        if not self._closed.is_set():
+            self._closed.set()
+            self.force_disconnect()
+
+    def force_disconnect(self):
+        self._messages.put(ws_exc.ConnectionClosed(None, None))
+
+    def send(self, msg):
+        if self._check_lock is not None and self._check_lock.acquire(blocking=False):
+            self._check_lock.release()
+            raise AssertionError(f"called ws_client.send() without lock")
+        elif self._closed.is_set():
+            raise ws_exc.ConnectionClosed(None, None)
+        try:
+            msg = json.loads(msg)
+        except ValueError:
+            status = 400
+        else:
+            status = 200
+        self._messages.put(json.dumps({'status': status}))
+
 
 class WebsocketTest(run_test_server.TestCaseWithServers):
     MAIN_SERVER = {}
@@ -201,7 +247,7 @@ class WebsocketTest(run_test_server.TestCaseWithServers):
 
         # close (im)properly
         if close_unexpected:
-            self.ws.ec.close_connection()
+            self.ws._client.close()
         else:
             self.ws.close()
 
@@ -240,69 +286,115 @@ class WebsocketTest(run_test_server.TestCaseWithServers):
         self._test_websocket_reconnect(False)
 
     # Test websocket reconnection retry
-    @mock.patch('arvados.events._EventClient.connect')
-    def test_websocket_reconnect_retry(self, event_client_connect):
-        event_client_connect.side_effect = [None, Exception('EventClient.connect error'), None]
-
+    @mock.patch('arvados.events.ws_client.connect')
+    def test_websocket_reconnect_retry(self, ws_conn):
         logstream = tutil.StringIO()
         rootLogger = logging.getLogger()
         streamHandler = logging.StreamHandler(logstream)
         rootLogger.addHandler(streamHandler)
-
-        run_test_server.authorize_with('active')
-        events = queue.Queue(100)
-
-        filters = [['object_uuid', 'is_a', 'arvados#human']]
-        self.ws = arvados.events.subscribe(
-            arvados.api('v1'), filters,
-            events.put_nowait,
-            poll_fallback=False,
-            last_log_id=None)
-        self.assertIsInstance(self.ws, arvados.events.EventClient)
-
-        # simulate improper close
-        self.ws.on_closed()
-
-        # verify log messages to ensure retry happened
-        log_messages = logstream.getvalue()
-        found = log_messages.find("Error 'EventClient.connect error' during websocket reconnect.")
-        self.assertNotEqual(found, -1)
-        rootLogger.removeHandler(streamHandler)
-
-    @mock.patch('arvados.events._EventClient')
-    def test_subscribe_method(self, websocket_client):
-        filters = [['object_uuid', 'is_a', 'arvados#human']]
-        client = arvados.events.EventClient(
-            self.MOCK_WS_URL, [], lambda event: None, None)
-        client.subscribe(filters[:], 99)
-        websocket_client().subscribe.assert_called_with(filters, 99)
-
-    @mock.patch('arvados.events._EventClient')
-    def test_unsubscribe(self, websocket_client):
-        filters = [['object_uuid', 'is_a', 'arvados#human']]
-        client = arvados.events.EventClient(
-            self.MOCK_WS_URL, filters[:], lambda event: None, None)
-        client.unsubscribe(filters[:])
-        websocket_client().unsubscribe.assert_called_with(filters)
-
-    @mock.patch('arvados.events._EventClient')
+        try:
+            msg_event, wss_client, self.ws = self.fake_client(ws_conn)
+            self.assertTrue(msg_event.wait(timeout=1), "timed out waiting for setup callback")
+            msg_event.clear()
+            ws_conn.side_effect = [Exception('EventClient.connect error'), wss_client]
+            wss_client.force_disconnect()
+            self.assertTrue(msg_event.wait(timeout=1), "timed out waiting for reconnect callback")
+            # verify log messages to ensure retry happened
+            self.assertIn("Error 'EventClient.connect error' during websocket reconnect.", logstream.getvalue())
+            self.assertEqual(ws_conn.call_count, 3)
+        finally:
+            rootLogger.removeHandler(streamHandler)
+
+    @mock.patch('arvados.events.ws_client.connect')
     def test_run_forever_survives_reconnects(self, websocket_client):
-        connected = threading.Event()
-        websocket_client().connect.side_effect = connected.set
         client = arvados.events.EventClient(
             self.MOCK_WS_URL, [], lambda event: None, None)
         forever_thread = threading.Thread(target=client.run_forever)
         forever_thread.start()
         # Simulate an unexpected disconnect, and wait for reconnect.
-        close_thread = threading.Thread(target=client.on_closed)
-        close_thread.start()
-        self.assertTrue(connected.wait(timeout=self.TEST_TIMEOUT))
-        close_thread.join()
-        run_forever_alive = forever_thread.is_alive()
-        client.close()
-        forever_thread.join()
-        self.assertTrue(run_forever_alive)
-        self.assertEqual(2, websocket_client().connect.call_count)
+        try:
+            client.on_closed()
+            self.assertTrue(forever_thread.is_alive())
+            self.assertEqual(2, websocket_client.call_count)
+        finally:
+            client.close()
+            forever_thread.join()
+
+    @staticmethod
+    def fake_client(conn_patch, filters=None, url=MOCK_WS_URL):
+        """Set up EventClient test infrastructure
+
+        Given a patch of `arvados.events.ws_client.connect`,
+        this returns a 3-tuple:
+
+        * `msg_event` is a `threading.Event` that is set as the test client
+          event callback. You can wait for this event to confirm that a
+          sent message has been acknowledged and processed.
+
+        * `mock_client` is a `mock.Mock` wrapper around `FakeWebsocketClient`.
+          Use this to assert `EventClient` calls the right methods. It tests
+          that `EventClient` acquires a lock before calling `send`.
+
+        * `client` is the `EventClient` that uses `mock_client` under the hood
+          that you exercise methods of.
+
+        Other arguments are passed to initialize `EventClient`.
+        """
+        msg_event = threading.Event()
+        fake_client = FakeWebsocketClient()
+        mock_client = fake_client.mock_wrapper()
+        conn_patch.return_value = mock_client
+        client = arvados.events.EventClient(url, filters, lambda _: msg_event.set())
+        fake_client._check_lock = client._subscribe_lock
+        return msg_event, mock_client, client
+
+    @mock.patch('arvados.events.ws_client.connect')
+    def test_subscribe_locking(self, ws_conn):
+        f = [['created_at', '>=', '2023-12-01T00:00:00.000Z']]
+        msg_event, wss_client, self.ws = self.fake_client(ws_conn)
+        self.assertTrue(msg_event.wait(timeout=1), "timed out waiting for setup callback")
+        msg_event.clear()
+        wss_client.send.reset_mock()
+        self.ws.subscribe(f)
+        self.assertTrue(msg_event.wait(timeout=1), "timed out waiting for subscribe callback")
+        wss_client.send.assert_called()
+        (msg,), _ = wss_client.send.call_args
+        self.assertEqual(
+            json.loads(msg),
+            {'method': 'subscribe', 'filters': f},
+        )
+
+    @mock.patch('arvados.events.ws_client.connect')
+    def test_unsubscribe_locking(self, ws_conn):
+        f = [['created_at', '>=', '2023-12-01T01:00:00.000Z']]
+        msg_event, wss_client, self.ws = self.fake_client(ws_conn, f)
+        self.assertTrue(msg_event.wait(timeout=1), "timed out waiting for setup callback")
+        msg_event.clear()
+        wss_client.send.reset_mock()
+        self.ws.unsubscribe(f)
+        self.assertTrue(msg_event.wait(timeout=1), "timed out waiting for unsubscribe callback")
+        wss_client.send.assert_called()
+        (msg,), _ = wss_client.send.call_args
+        self.assertEqual(
+            json.loads(msg),
+            {'method': 'unsubscribe', 'filters': f},
+        )
+
+    @mock.patch('arvados.events.ws_client.connect')
+    def test_resubscribe_locking(self, ws_conn):
+        f = [['created_at', '>=', '2023-12-01T02:00:00.000Z']]
+        msg_event, wss_client, self.ws = self.fake_client(ws_conn, f)
+        self.assertTrue(msg_event.wait(timeout=1), "timed out waiting for setup callback")
+        msg_event.clear()
+        wss_client.send.reset_mock()
+        wss_client.force_disconnect()
+        self.assertTrue(msg_event.wait(timeout=1), "timed out waiting for resubscribe callback")
+        wss_client.send.assert_called()
+        (msg,), _ = wss_client.send.call_args
+        self.assertEqual(
+            json.loads(msg),
+            {'method': 'subscribe', 'filters': f},
+        )
 
 
 class PollClientTestCase(unittest.TestCase):

-----------------------------------------------------------------------


hooks/post-receive
--