[arvados] created: 2.5.0-56-gb4b146dd2
git repository hosting <git at public.arvados.org>
Wed Feb 1 20:00:57 UTC 2023
at b4b146dd2d1e8c816def4216951654f60fd8349f (commit)
commit b4b146dd2d1e8c816def4216951654f60fd8349f
Author: Tom Clegg <tom at curii.com>
Date: Wed Feb 1 13:51:35 2023 -0500
18075: Note obsolete/deprecated config.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/doc/admin/upgrading.html.textile.liquid b/doc/admin/upgrading.html.textile.liquid
index 324658d67..13c91803b 100644
--- a/doc/admin/upgrading.html.textile.liquid
+++ b/doc/admin/upgrading.html.textile.liquid
@@ -36,6 +36,8 @@ h3. Default limit for cloud VM instances
There is a new configuration entry @CloudVMs.MaxInstances@ (default 64) that limits the number of VMs the cloud dispatcher will run at a time. This may need to be adjusted to suit your anticipated workload.
+Using the obsolete configuration entry @MaxComputeVMs@, which was previously accepted in config files but not obeyed, will now result in a deprecation warning.
+
h3. Slow migration on upgrade
This upgrade includes a database schema update (changing an integer column in each table from 32-bit to 64-bit) that may be slow on a large installation. Expect the arvados-api-server package upgrade to take longer than usual.
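For reference, the new limit lives at Containers.CloudVMs.MaxInstances in the cluster config (see the config.default.yml and sdk/go/arvados hunks later in this mail). A minimal sketch of reading it through the Go SDK, assuming the usual git.arvados.org/arvados.git/sdk/go/arvados import path, might look like this:

package main

import (
	"fmt"

	"git.arvados.org/arvados.git/sdk/go/arvados"
)

// maxInstances returns the configured cap on concurrent cloud VM
// instances; 0 means unlimited.
func maxInstances(cluster *arvados.Cluster) int {
	return cluster.Containers.CloudVMs.MaxInstances
}

func main() {
	var cluster arvados.Cluster
	cluster.Containers.CloudVMs.MaxInstances = 64 // the new default
	fmt.Println(maxInstances(&cluster))           // 64
}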
commit 1ebe303758179153f464ce1d5909d7949b738169
Author: Tom Clegg <tom at curii.com>
Date: Wed Feb 1 13:46:40 2023 -0500
18075: Update AtQuota comment, check quota in Create, log at limit.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index e3a7b553c..3abcba6c7 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -304,10 +304,10 @@ func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
// pool. The worker is added immediately; instance creation runs in
// the background.
//
-// Create returns false if a pre-existing error state prevents it from
-// even attempting to create a new instance. Those errors are logged
-// by the Pool, so the caller does not need to log anything in such
-// cases.
+// Create returns false if a pre-existing error or a configuration
+// setting prevents it from even attempting to create a new
+// instance. Those errors are logged by the Pool, so the caller does
+// not need to log anything in such cases.
func (wp *Pool) Create(it arvados.InstanceType) bool {
logger := wp.logger.WithField("InstanceType", it.Name)
wp.setupOnce.Do(wp.setup)
@@ -317,7 +317,9 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
}
wp.mtx.Lock()
defer wp.mtx.Unlock()
- if time.Now().Before(wp.atQuotaUntil) || wp.instanceSet.throttleCreate.Error() != nil {
+ if time.Now().Before(wp.atQuotaUntil) ||
+ wp.instanceSet.throttleCreate.Error() != nil ||
+ (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating)) {
return false
}
// The maxConcurrentInstanceCreateOps knob throttles the number of node create
@@ -363,11 +365,15 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
}
wp.updateWorker(inst, it)
}()
+ if len(wp.creating)+len(wp.workers) == wp.maxInstances {
+ logger.Infof("now at MaxInstances limit of %d instances", wp.maxInstances)
+ }
return true
}
// AtQuota returns true if Create is not expected to work at the
-// moment.
+// moment (e.g., cloud provider has reported quota errors, or we are
+// already at our own configured quota).
func (wp *Pool) AtQuota() bool {
wp.mtx.Lock()
defer wp.mtx.Unlock()
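A simplified, self-contained sketch of the limit check added above (a stand-in, not the real worker.Pool, which also holds its mutex and consults the cloud provider's quota/throttle state):

package main

import "fmt"

// pool keeps only the counts that matter for the MaxInstances check.
type pool struct {
	maxInstances int // Containers.CloudVMs.MaxInstances; 0 = unlimited
	workers      int // instances already running
	creating     int // instance create calls still in flight
}

// atMaxInstances mirrors the condition added to Create and AtQuota:
// refuse new instances once running + in-flight reaches the limit.
func (p *pool) atMaxInstances() bool {
	return p.maxInstances > 0 && p.maxInstances <= p.workers+p.creating
}

func main() {
	p := &pool{maxInstances: 3, workers: 2, creating: 1}
	fmt.Println(p.atMaxInstances()) // true: Create would return false here
}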
commit c11aeb6b640c78feec50ebcacc52718faf5643a7
Author: Tom Clegg <tom at curii.com>
Date: Mon Jan 30 10:23:21 2023 -0500
18075: Remove tests for crunch1 Node.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/services/api/test/unit/node_test.rb b/services/api/test/unit/node_test.rb
deleted file mode 100644
index c3d070886..000000000
--- a/services/api/test/unit/node_test.rb
+++ /dev/null
@@ -1,214 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-require 'test_helper'
-require 'tmpdir'
-require 'tempfile'
-
-class NodeTest < ActiveSupport::TestCase
- def ping_node(node_name, ping_data)
- set_user_from_auth :admin
- node = nodes(node_name)
- node.ping({ping_secret: node.info['ping_secret'],
- ip: node.ip_address}.merge(ping_data))
- node
- end
-
- test "pinging a node can add and update stats" do
- node = ping_node(:idle, {total_cpu_cores: '12', total_ram_mb: '512'})
- assert_equal(12, node.properties['total_cpu_cores'])
- assert_equal(512, node.properties['total_ram_mb'])
- end
-
- test "stats disappear if not in a ping" do
- node = ping_node(:idle, {total_ram_mb: '256'})
- refute_includes(node.properties, 'total_cpu_cores')
- assert_equal(256, node.properties['total_ram_mb'])
- end
-
- test "worker state is down for node with no slot" do
- node = nodes(:was_idle_now_down)
- assert_nil node.slot_number, "fixture is not what I expected"
- assert_equal 'down', node.crunch_worker_state, "wrong worker state"
- end
-
- test "dns_server_conf_template" do
- Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = Rails.root.join 'tmp'
- Rails.configuration.Containers.SLURM.Managed.DNSServerConfTemplate = Rails.root.join 'config', 'unbound.template'
- conffile = Rails.root.join 'tmp', 'compute65535.conf'
- File.unlink conffile rescue nil
- assert Node.dns_server_update 'compute65535', '127.0.0.1'
- assert_match(/\"1\.0\.0\.127\.in-addr\.arpa\. IN PTR compute65535\.zzzzz\.arvadosapi\.com\"/, IO.read(conffile))
- File.unlink conffile
- end
-
- test "dns_server_restart_command" do
- Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = Rails.root.join 'tmp'
- Rails.configuration.Containers.SLURM.Managed.DNSServerReloadCommand = 'foobar'
- restartfile = Rails.root.join 'tmp', 'restart.txt'
- File.unlink restartfile rescue nil
- assert Node.dns_server_update 'compute65535', '127.0.0.127'
- assert_equal "foobar\n", IO.read(restartfile)
- File.unlink restartfile
- end
-
- test "dns_server_restart_command fail" do
- Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = Rails.root.join 'tmp', 'bogusdir'
- Rails.configuration.Containers.SLURM.Managed.DNSServerReloadCommand = 'foobar'
- refute Node.dns_server_update 'compute65535', '127.0.0.127'
- end
-
- test "dns_server_update_command with valid command" do
- testfile = Rails.root.join('tmp', 'node_test_dns_server_update_command.txt')
- Rails.configuration.Containers.SLURM.Managed.DNSServerUpdateCommand =
- ('echo -n "%{hostname} == %{ip_address}" >' +
- testfile.to_s.shellescape)
- assert Node.dns_server_update 'compute65535', '127.0.0.1'
- assert_equal 'compute65535 == 127.0.0.1', IO.read(testfile)
- File.unlink testfile
- end
-
- test "dns_server_update_command with failing command" do
- Rails.configuration.Containers.SLURM.Managed.DNSServerUpdateCommand = 'false %{hostname}'
- refute Node.dns_server_update 'compute65535', '127.0.0.1'
- end
-
- test "dns update with no commands/dirs configured" do
- Rails.configuration.Containers.SLURM.Managed.DNSServerUpdateCommand = ""
- Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = ""
- Rails.configuration.Containers.SLURM.Managed.DNSServerConfTemplate = 'ignored!'
- Rails.configuration.Containers.SLURM.Managed.DNSServerReloadCommand = 'ignored!'
- assert Node.dns_server_update 'compute65535', '127.0.0.127'
- end
-
- test "don't leave temp files behind if there's an error writing them" do
- Rails.configuration.Containers.SLURM.Managed.DNSServerConfTemplate = Rails.root.join 'config', 'unbound.template'
- Tempfile.any_instance.stubs(:puts).raises(IOError)
- Dir.mktmpdir do |tmpdir|
- Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = tmpdir
- refute Node.dns_server_update 'compute65535', '127.0.0.127'
- assert_empty Dir.entries(tmpdir).select{|f| File.file? f}
- end
- end
-
- test "ping new node with no hostname and default config" do
- node = ping_node(:new_with_no_hostname, {})
- slot_number = node.slot_number
- refute_nil slot_number
- assert_equal("compute#{slot_number}", node.hostname)
- end
-
- test "ping new node with no hostname and no config" do
- Rails.configuration.Containers.SLURM.Managed.AssignNodeHostname = false
- node = ping_node(:new_with_no_hostname, {})
- refute_nil node.slot_number
- assert_nil node.hostname
- end
-
- test "ping new node with zero padding config" do
- Rails.configuration.Containers.SLURM.Managed.AssignNodeHostname = 'compute%<slot_number>04d'
- node = ping_node(:new_with_no_hostname, {})
- slot_number = node.slot_number
- refute_nil slot_number
- assert_equal("compute000#{slot_number}", node.hostname)
- end
-
- test "ping node with hostname and config and expect hostname unchanged" do
- node = ping_node(:new_with_custom_hostname, {})
- assert_equal(23, node.slot_number)
- assert_equal("custom1", node.hostname)
- end
-
- test "ping node with hostname and no config and expect hostname unchanged" do
- Rails.configuration.Containers.SLURM.Managed.AssignNodeHostname = false
- node = ping_node(:new_with_custom_hostname, {})
- assert_equal(23, node.slot_number)
- assert_equal("custom1", node.hostname)
- end
-
- # Ping two nodes: one without a hostname and the other with a hostname.
- # Verify that the first one gets a hostname and second one is unchanged.
- test "ping two nodes one with no hostname and one with hostname and check hostnames" do
- # ping node with no hostname and expect it set with config format
- node = ping_node(:new_with_no_hostname, {})
- refute_nil node.slot_number
- assert_equal "compute#{node.slot_number}", node.hostname
-
- # ping node with a hostname and expect it to be unchanged
- node2 = ping_node(:new_with_custom_hostname, {})
- refute_nil node2.slot_number
- assert_equal "custom1", node2.hostname
- end
-
- test "update dns when hostname and ip_address are cleared" do
- act_as_system_user do
- node = ping_node(:new_with_custom_hostname, {})
- Node.expects(:dns_server_update).with(node.hostname, Node::UNUSED_NODE_IP)
- node.update_attributes(hostname: nil, ip_address: nil)
- end
- end
-
- test "update dns when hostname changes" do
- act_as_system_user do
- node = ping_node(:new_with_custom_hostname, {})
-
- Node.expects(:dns_server_update).with(node.hostname, Node::UNUSED_NODE_IP)
- Node.expects(:dns_server_update).with('foo0', node.ip_address)
- node.update_attributes!(hostname: 'foo0')
-
- Node.expects(:dns_server_update).with('foo0', Node::UNUSED_NODE_IP)
- node.update_attributes!(hostname: nil, ip_address: nil)
-
- Node.expects(:dns_server_update).with('foo0', '10.11.12.13')
- node.update_attributes!(hostname: 'foo0', ip_address: '10.11.12.13')
-
- Node.expects(:dns_server_update).with('foo0', '10.11.12.14')
- node.update_attributes!(hostname: 'foo0', ip_address: '10.11.12.14')
- end
- end
-
- test 'newest ping wins IP address conflict' do
- act_as_system_user do
- n1, n2 = Node.create!, Node.create!
-
- n1.ping(ip: '10.5.5.5', ping_secret: n1.info['ping_secret'])
- n1.reload
-
- Node.expects(:dns_server_update).with(n1.hostname, Node::UNUSED_NODE_IP)
- Node.expects(:dns_server_update).with(Not(equals(n1.hostname)), '10.5.5.5')
- n2.ping(ip: '10.5.5.5', ping_secret: n2.info['ping_secret'])
-
- n1.reload
- n2.reload
- assert_nil n1.ip_address
- assert_equal '10.5.5.5', n2.ip_address
-
- Node.expects(:dns_server_update).with(n2.hostname, Node::UNUSED_NODE_IP)
- Node.expects(:dns_server_update).with(n1.hostname, '10.5.5.5')
- n1.ping(ip: '10.5.5.5', ping_secret: n1.info['ping_secret'])
-
- n1.reload
- n2.reload
- assert_nil n2.ip_address
- assert_equal '10.5.5.5', n1.ip_address
- end
- end
-
- test 'run out of slots' do
- act_as_system_user do
- Node.destroy_all
- (1..4).each do |i|
- n = Node.create!
- args = { ip: "10.0.0.#{i}", ping_secret: n.info['ping_secret'] }
- if i <= 3 # MAX_VMS
- n.ping(args)
- else
- assert_raises do
- n.ping(args)
- end
- end
- end
- end
- end
-end
commit 508a25a9f3677c913b081e79350bc76d25c76698
Author: Tom Clegg <tom at curii.com>
Date: Fri Jan 27 16:33:30 2023 -0500
18075: Kill off MaxComputeVMs properly.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/services/api/app/models/node.rb b/services/api/app/models/node.rb
index c8b463696..c8a606e2b 100644
--- a/services/api/app/models/node.rb
+++ b/services/api/app/models/node.rb
@@ -24,6 +24,7 @@ class Node < ArvadosModel
attr_accessor :job_readable
UNUSED_NODE_IP = '127.40.4.0'
+ MAX_VMS = 3
api_accessible :user, :extend => :common do |t|
t.add :hostname
@@ -159,7 +160,7 @@ class Node < ArvadosModel
# query label:
'Node.available_slot_number',
# [col_id, val] for $1 vars:
- [[nil, Rails.configuration.Containers.MaxComputeVMs]],
+ [[nil, MAX_VMS]],
).rows.first.andand.first
end
@@ -267,7 +268,7 @@ class Node < ArvadosModel
!Rails.configuration.Containers.SLURM.Managed.DNSServerConfTemplate.to_s.empty? and
!Rails.configuration.Containers.SLURM.Managed.AssignNodeHostname.empty?)
- (0..Rails.configuration.Containers.MaxComputeVMs-1).each do |slot_number|
+ (0..MAX_VMS-1).each do |slot_number|
hostname = hostname_for_slot(slot_number)
hostfile = File.join Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir, "#{hostname}.conf"
if !File.exist? hostfile
diff --git a/services/api/config/arvados_config.rb b/services/api/config/arvados_config.rb
index c47eeb551..d928d592c 100644
--- a/services/api/config/arvados_config.rb
+++ b/services/api/config/arvados_config.rb
@@ -132,7 +132,6 @@ arvcfg.declare_config "Containers.DefaultKeepCacheRAM", Integer, :container_defa
arvcfg.declare_config "Containers.MaxDispatchAttempts", Integer, :max_container_dispatch_attempts
arvcfg.declare_config "Containers.MaxRetryAttempts", Integer, :container_count_max
arvcfg.declare_config "Containers.AlwaysUsePreemptibleInstances", Boolean, :preemptible_instances
-arvcfg.declare_config "Containers.MaxComputeVMs", Integer, :max_compute_nodes
arvcfg.declare_config "Containers.Logging.LogBytesPerEvent", Integer, :crunch_log_bytes_per_event
arvcfg.declare_config "Containers.Logging.LogSecondsBetweenEvents", ActiveSupport::Duration, :crunch_log_seconds_between_events
arvcfg.declare_config "Containers.Logging.LogThrottlePeriod", ActiveSupport::Duration, :crunch_log_throttle_period
diff --git a/services/api/test/unit/node_test.rb b/services/api/test/unit/node_test.rb
index 9fa3febe1..c3d070886 100644
--- a/services/api/test/unit/node_test.rb
+++ b/services/api/test/unit/node_test.rb
@@ -196,13 +196,12 @@ class NodeTest < ActiveSupport::TestCase
end
test 'run out of slots' do
- Rails.configuration.Containers.MaxComputeVMs = 3
act_as_system_user do
Node.destroy_all
(1..4).each do |i|
n = Node.create!
args = { ip: "10.0.0.#{i}", ping_secret: n.info['ping_secret'] }
- if i <= Rails.configuration.Containers.MaxComputeVMs
+ if i <= 3 # MAX_VMS
n.ping(args)
else
assert_raises do
commit be5771dd0fe41d5108d8fd72e62223b37948cf2d
Author: Tom Clegg <tom at curii.com>
Date: Fri Jan 27 14:54:47 2023 -0500
18075: Add CloudVMs.MaxInstances config, retire MaxComputeVMs.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>
diff --git a/doc/admin/upgrading.html.textile.liquid b/doc/admin/upgrading.html.textile.liquid
index 4da168293..324658d67 100644
--- a/doc/admin/upgrading.html.textile.liquid
+++ b/doc/admin/upgrading.html.textile.liquid
@@ -28,10 +28,14 @@ TODO: extract this information based on git commit messages and generate changel
<div class="releasenotes">
</notextile>
-h2(#main). development main (as of 2023-01-16)
+h2(#main). development main (as of 2023-01-27)
"previous: Upgrading to 2.5.0":#v2_5_0
+h3. Default limit for cloud VM instances
+
+There is a new configuration entry @CloudVMs.MaxInstances@ (default 64) that limits the number of VMs the cloud dispatcher will run at a time. This may need to be adjusted to suit your anticipated workload.
+
h3. Slow migration on upgrade
This upgrade includes a database schema update (changing an integer column in each table from 32-bit to 64-bit) that may be slow on a large installation. Expect the arvados-api-server package upgrade to take longer than usual.
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 29a4a640b..26ada44d6 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -1005,13 +1005,6 @@ Clusters:
# with the cancelled container.
MaxRetryAttempts: 3
- # The maximum number of compute nodes that can be in use simultaneously
- # If this limit is reduced, any existing nodes with slot number >= new limit
- # will not be counted against the new limit. In other words, the new limit
- # won't be strictly enforced until those nodes with higher slot numbers
- # go down.
- MaxComputeVMs: 64
-
# Schedule all child containers on preemptible instances (e.g. AWS
# Spot Instances) even if not requested by the submitter.
#
@@ -1327,6 +1320,15 @@ Clusters:
# providers too, if desired.
MaxConcurrentInstanceCreateOps: 1
+ # The maximum number of instances to run at a time, or 0 for
+ # unlimited.
+ #
+ # If more instances than this are already running and busy
+ # when the dispatcher starts up, the running containers will
+ # be allowed to finish before the excess instances are shut
+ # down.
+ MaxInstances: 64
+
# Interval between cloud provider syncs/updates ("list all
# instances").
SyncInterval: 1m
diff --git a/lib/config/export.go b/lib/config/export.go
index bc7864486..f9699c6ed 100644
--- a/lib/config/export.go
+++ b/lib/config/export.go
@@ -131,7 +131,6 @@ var whitelist = map[string]bool{
"Containers.Logging": false,
"Containers.LogReuseDecisions": false,
"Containers.LSF": false,
- "Containers.MaxComputeVMs": false,
"Containers.MaxDispatchAttempts": false,
"Containers.MaxRetryAttempts": true,
"Containers.MinRetryPeriod": true,
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 66e0bfee9..e3a7b553c 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -111,6 +111,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
instanceTypes: cluster.InstanceTypes,
maxProbesPerSecond: cluster.Containers.CloudVMs.MaxProbesPerSecond,
maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
+ maxInstances: cluster.Containers.CloudVMs.MaxInstances,
probeInterval: duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
syncInterval: duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
timeoutIdle: duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
@@ -155,6 +156,7 @@ type Pool struct {
probeInterval time.Duration
maxProbesPerSecond int
maxConcurrentInstanceCreateOps int
+ maxInstances int
timeoutIdle time.Duration
timeoutBooting time.Duration
timeoutProbe time.Duration
@@ -369,7 +371,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
func (wp *Pool) AtQuota() bool {
wp.mtx.Lock()
defer wp.mtx.Unlock()
- return time.Now().Before(wp.atQuotaUntil)
+ return time.Now().Before(wp.atQuotaUntil) || (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating))
}
// SetIdleBehavior determines how the indicated instance will behave
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 76ed7cefb..677706c08 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -498,7 +498,6 @@ type ContainersConfig struct {
DefaultKeepCacheRAM ByteSize
DispatchPrivateKey string
LogReuseDecisions bool
- MaxComputeVMs int
MaxDispatchAttempts int
MaxRetryAttempts int
MinRetryPeriod Duration
@@ -562,6 +561,7 @@ type CloudVMsConfig struct {
MaxCloudOpsPerSecond int
MaxProbesPerSecond int
MaxConcurrentInstanceCreateOps int
+ MaxInstances int
PollInterval Duration
ProbeInterval Duration
SSHPort string
-----------------------------------------------------------------------
hooks/post-receive
--