[arvados] created: 2.5.0-56-gb4b146dd2

git repository hosting git at public.arvados.org
Wed Feb 1 20:00:57 UTC 2023


        at  b4b146dd2d1e8c816def4216951654f60fd8349f (commit)


commit b4b146dd2d1e8c816def4216951654f60fd8349f
Author: Tom Clegg <tom at curii.com>
Date:   Wed Feb 1 13:51:35 2023 -0500

    18075: Note obsolete/deprecated config.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/doc/admin/upgrading.html.textile.liquid b/doc/admin/upgrading.html.textile.liquid
index 324658d67..13c91803b 100644
--- a/doc/admin/upgrading.html.textile.liquid
+++ b/doc/admin/upgrading.html.textile.liquid
@@ -36,6 +36,8 @@ h3. Default limit for cloud VM instances
 
 There is a new configuration entry @CloudVMs.MaxInstances@ (default 64) that limits the number of VMs the cloud dispatcher will run at a time. This may need to be adjusted to suit your anticipated workload.
 
+Using the obsolete configuration entry @MaxComputeVMs@, which was previously accepted in config files but not obeyed, will now result in a deprecation warning.
+
 h3. Slow migration on upgrade
 
 This upgrade includes a database schema update (changing an integer column in each table from 32-bit to 64-bit) that may be slow on a large installation. Expect the arvados-api-server package upgrade to take longer than usual.
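
For operators, the practical effect of the line added above is that the old @Containers.MaxComputeVMs@ key now only produces a warning, while the enforced limit is @Containers.CloudVMs.MaxInstances@. The Go sketch below illustrates that warn-on-obsolete-key behavior in miniature; it is a hypothetical stand-in, not the actual Arvados config loader, and the map literal merely represents a parsed cluster config.

    package main

    import "log"

    func main() {
        // Stand-in for a parsed Containers section of an operator's cluster config.
        containers := map[string]interface{}{
            "MaxComputeVMs": 64, // obsolete: accepted, warned about, otherwise ignored
            "CloudVMs": map[string]interface{}{
                "MaxInstances": 64, // the entry that is actually enforced
            },
        }
        if _, ok := containers["MaxComputeVMs"]; ok {
            log.Print("deprecation warning: Containers.MaxComputeVMs is obsolete; use Containers.CloudVMs.MaxInstances")
        }
    }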

commit 1ebe303758179153f464ce1d5909d7949b738169
Author: Tom Clegg <tom at curii.com>
Date:   Wed Feb 1 13:46:40 2023 -0500

    18075: Update AtQuota comment, check quota in Create, log at limit.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index e3a7b553c..3abcba6c7 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -304,10 +304,10 @@ func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
 // pool. The worker is added immediately; instance creation runs in
 // the background.
 //
-// Create returns false if a pre-existing error state prevents it from
-// even attempting to create a new instance. Those errors are logged
-// by the Pool, so the caller does not need to log anything in such
-// cases.
+// Create returns false if a pre-existing error or a configuration
+// setting prevents it from even attempting to create a new
+// instance. Those errors are logged by the Pool, so the caller does
+// not need to log anything in such cases.
 func (wp *Pool) Create(it arvados.InstanceType) bool {
 	logger := wp.logger.WithField("InstanceType", it.Name)
 	wp.setupOnce.Do(wp.setup)
@@ -317,7 +317,9 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
 	}
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
-	if time.Now().Before(wp.atQuotaUntil) || wp.instanceSet.throttleCreate.Error() != nil {
+	if time.Now().Before(wp.atQuotaUntil) ||
+		wp.instanceSet.throttleCreate.Error() != nil ||
+		(wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating)) {
 		return false
 	}
 	// The maxConcurrentInstanceCreateOps knob throttles the number of node create
@@ -363,11 +365,15 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
 		}
 		wp.updateWorker(inst, it)
 	}()
+	if len(wp.creating)+len(wp.workers) == wp.maxInstances {
+		logger.Infof("now at MaxInstances limit of %d instances", wp.maxInstances)
+	}
 	return true
 }
 
 // AtQuota returns true if Create is not expected to work at the
-// moment.
+// moment (e.g., cloud provider has reported quota errors, or we are
+// already at our own configured quota).
 func (wp *Pool) AtQuota() bool {
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
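
The check added above is small but easy to misread in diff form. Below is a self-contained Go sketch of the same arithmetic; the type, field, and method names are illustrative stand-ins for the Pool internals (wp.workers, wp.creating, wp.atQuotaUntil), and a MaxInstances of 0 keeps the old unlimited behavior.

    package main

    import (
        "fmt"
        "time"
    )

    type poolSketch struct {
        maxInstances int       // Containers.CloudVMs.MaxInstances; 0 means unlimited
        workers      int       // instances the pool already knows about (len(wp.workers))
        creating     int       // instance-create calls still in flight (len(wp.creating))
        atQuotaUntil time.Time // set when the cloud provider reports a quota error
    }

    // atCapacity mirrors the new condition in Create: the configured limit only
    // applies when it is greater than zero, and it counts both existing workers
    // and creations that have not finished yet.
    func (p poolSketch) atCapacity() bool {
        return p.maxInstances > 0 && p.maxInstances <= p.workers+p.creating
    }

    // atQuota mirrors the updated AtQuota: a provider-reported quota error and
    // the locally configured limit are reported the same way to callers.
    func (p poolSketch) atQuota() bool {
        return time.Now().Before(p.atQuotaUntil) || p.atCapacity()
    }

    func main() {
        p := poolSketch{maxInstances: 64, workers: 62, creating: 1}
        fmt.Println(p.atCapacity(), p.atQuota()) // false false: one slot still free
        p.creating++
        fmt.Println(p.atCapacity(), p.atQuota()) // true true: 62+2 reaches the limit
    }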

commit c11aeb6b640c78feec50ebcacc52718faf5643a7
Author: Tom Clegg <tom at curii.com>
Date:   Mon Jan 30 10:23:21 2023 -0500

    18075: Remove tests for crunch1 Node.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/api/test/unit/node_test.rb b/services/api/test/unit/node_test.rb
deleted file mode 100644
index c3d070886..000000000
--- a/services/api/test/unit/node_test.rb
+++ /dev/null
@@ -1,214 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-require 'test_helper'
-require 'tmpdir'
-require 'tempfile'
-
-class NodeTest < ActiveSupport::TestCase
-  def ping_node(node_name, ping_data)
-    set_user_from_auth :admin
-    node = nodes(node_name)
-    node.ping({ping_secret: node.info['ping_secret'],
-                ip: node.ip_address}.merge(ping_data))
-    node
-  end
-
-  test "pinging a node can add and update stats" do
-    node = ping_node(:idle, {total_cpu_cores: '12', total_ram_mb: '512'})
-    assert_equal(12, node.properties['total_cpu_cores'])
-    assert_equal(512, node.properties['total_ram_mb'])
-  end
-
-  test "stats disappear if not in a ping" do
-    node = ping_node(:idle, {total_ram_mb: '256'})
-    refute_includes(node.properties, 'total_cpu_cores')
-    assert_equal(256, node.properties['total_ram_mb'])
-  end
-
-  test "worker state is down for node with no slot" do
-    node = nodes(:was_idle_now_down)
-    assert_nil node.slot_number, "fixture is not what I expected"
-    assert_equal 'down', node.crunch_worker_state, "wrong worker state"
-  end
-
-  test "dns_server_conf_template" do
-    Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = Rails.root.join 'tmp'
-    Rails.configuration.Containers.SLURM.Managed.DNSServerConfTemplate = Rails.root.join 'config', 'unbound.template'
-    conffile = Rails.root.join 'tmp', 'compute65535.conf'
-    File.unlink conffile rescue nil
-    assert Node.dns_server_update 'compute65535', '127.0.0.1'
-    assert_match(/\"1\.0\.0\.127\.in-addr\.arpa\. IN PTR compute65535\.zzzzz\.arvadosapi\.com\"/, IO.read(conffile))
-    File.unlink conffile
-  end
-
-  test "dns_server_restart_command" do
-    Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = Rails.root.join 'tmp'
-    Rails.configuration.Containers.SLURM.Managed.DNSServerReloadCommand = 'foobar'
-    restartfile = Rails.root.join 'tmp', 'restart.txt'
-    File.unlink restartfile rescue nil
-    assert Node.dns_server_update 'compute65535', '127.0.0.127'
-    assert_equal "foobar\n", IO.read(restartfile)
-    File.unlink restartfile
-  end
-
-  test "dns_server_restart_command fail" do
-    Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = Rails.root.join 'tmp', 'bogusdir'
-    Rails.configuration.Containers.SLURM.Managed.DNSServerReloadCommand = 'foobar'
-    refute Node.dns_server_update 'compute65535', '127.0.0.127'
-  end
-
-  test "dns_server_update_command with valid command" do
-    testfile = Rails.root.join('tmp', 'node_test_dns_server_update_command.txt')
-    Rails.configuration.Containers.SLURM.Managed.DNSServerUpdateCommand =
-      ('echo -n "%{hostname} == %{ip_address}" >' +
-       testfile.to_s.shellescape)
-    assert Node.dns_server_update 'compute65535', '127.0.0.1'
-    assert_equal 'compute65535 == 127.0.0.1', IO.read(testfile)
-    File.unlink testfile
-  end
-
-  test "dns_server_update_command with failing command" do
-    Rails.configuration.Containers.SLURM.Managed.DNSServerUpdateCommand = 'false %{hostname}'
-    refute Node.dns_server_update 'compute65535', '127.0.0.1'
-  end
-
-  test "dns update with no commands/dirs configured" do
-    Rails.configuration.Containers.SLURM.Managed.DNSServerUpdateCommand = ""
-    Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = ""
-    Rails.configuration.Containers.SLURM.Managed.DNSServerConfTemplate = 'ignored!'
-    Rails.configuration.Containers.SLURM.Managed.DNSServerReloadCommand = 'ignored!'
-    assert Node.dns_server_update 'compute65535', '127.0.0.127'
-  end
-
-  test "don't leave temp files behind if there's an error writing them" do
-    Rails.configuration.Containers.SLURM.Managed.DNSServerConfTemplate = Rails.root.join 'config', 'unbound.template'
-    Tempfile.any_instance.stubs(:puts).raises(IOError)
-    Dir.mktmpdir do |tmpdir|
-      Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir = tmpdir
-      refute Node.dns_server_update 'compute65535', '127.0.0.127'
-      assert_empty Dir.entries(tmpdir).select{|f| File.file? f}
-    end
-  end
-
-  test "ping new node with no hostname and default config" do
-    node = ping_node(:new_with_no_hostname, {})
-    slot_number = node.slot_number
-    refute_nil slot_number
-    assert_equal("compute#{slot_number}", node.hostname)
-  end
-
-  test "ping new node with no hostname and no config" do
-    Rails.configuration.Containers.SLURM.Managed.AssignNodeHostname = false
-    node = ping_node(:new_with_no_hostname, {})
-    refute_nil node.slot_number
-    assert_nil node.hostname
-  end
-
-  test "ping new node with zero padding config" do
-    Rails.configuration.Containers.SLURM.Managed.AssignNodeHostname = 'compute%<slot_number>04d'
-    node = ping_node(:new_with_no_hostname, {})
-    slot_number = node.slot_number
-    refute_nil slot_number
-    assert_equal("compute000#{slot_number}", node.hostname)
-  end
-
-  test "ping node with hostname and config and expect hostname unchanged" do
-    node = ping_node(:new_with_custom_hostname, {})
-    assert_equal(23, node.slot_number)
-    assert_equal("custom1", node.hostname)
-  end
-
-  test "ping node with hostname and no config and expect hostname unchanged" do
-    Rails.configuration.Containers.SLURM.Managed.AssignNodeHostname = false
-    node = ping_node(:new_with_custom_hostname, {})
-    assert_equal(23, node.slot_number)
-    assert_equal("custom1", node.hostname)
-  end
-
-  # Ping two nodes: one without a hostname and the other with a hostname.
-  # Verify that the first one gets a hostname and second one is unchanged.
-  test "ping two nodes one with no hostname and one with hostname and check hostnames" do
-    # ping node with no hostname and expect it set with config format
-    node = ping_node(:new_with_no_hostname, {})
-    refute_nil node.slot_number
-    assert_equal "compute#{node.slot_number}", node.hostname
-
-    # ping node with a hostname and expect it to be unchanged
-    node2 = ping_node(:new_with_custom_hostname, {})
-    refute_nil node2.slot_number
-    assert_equal "custom1", node2.hostname
-  end
-
-  test "update dns when hostname and ip_address are cleared" do
-    act_as_system_user do
-      node = ping_node(:new_with_custom_hostname, {})
-      Node.expects(:dns_server_update).with(node.hostname, Node::UNUSED_NODE_IP)
-      node.update_attributes(hostname: nil, ip_address: nil)
-    end
-  end
-
-  test "update dns when hostname changes" do
-    act_as_system_user do
-      node = ping_node(:new_with_custom_hostname, {})
-
-      Node.expects(:dns_server_update).with(node.hostname, Node::UNUSED_NODE_IP)
-      Node.expects(:dns_server_update).with('foo0', node.ip_address)
-      node.update_attributes!(hostname: 'foo0')
-
-      Node.expects(:dns_server_update).with('foo0', Node::UNUSED_NODE_IP)
-      node.update_attributes!(hostname: nil, ip_address: nil)
-
-      Node.expects(:dns_server_update).with('foo0', '10.11.12.13')
-      node.update_attributes!(hostname: 'foo0', ip_address: '10.11.12.13')
-
-      Node.expects(:dns_server_update).with('foo0', '10.11.12.14')
-      node.update_attributes!(hostname: 'foo0', ip_address: '10.11.12.14')
-    end
-  end
-
-  test 'newest ping wins IP address conflict' do
-    act_as_system_user do
-      n1, n2 = Node.create!, Node.create!
-
-      n1.ping(ip: '10.5.5.5', ping_secret: n1.info['ping_secret'])
-      n1.reload
-
-      Node.expects(:dns_server_update).with(n1.hostname, Node::UNUSED_NODE_IP)
-      Node.expects(:dns_server_update).with(Not(equals(n1.hostname)), '10.5.5.5')
-      n2.ping(ip: '10.5.5.5', ping_secret: n2.info['ping_secret'])
-
-      n1.reload
-      n2.reload
-      assert_nil n1.ip_address
-      assert_equal '10.5.5.5', n2.ip_address
-
-      Node.expects(:dns_server_update).with(n2.hostname, Node::UNUSED_NODE_IP)
-      Node.expects(:dns_server_update).with(n1.hostname, '10.5.5.5')
-      n1.ping(ip: '10.5.5.5', ping_secret: n1.info['ping_secret'])
-
-      n1.reload
-      n2.reload
-      assert_nil n2.ip_address
-      assert_equal '10.5.5.5', n1.ip_address
-    end
-  end
-
-  test 'run out of slots' do
-    act_as_system_user do
-      Node.destroy_all
-      (1..4).each do |i|
-        n = Node.create!
-        args = { ip: "10.0.0.#{i}", ping_secret: n.info['ping_secret'] }
-        if i <= 3 # MAX_VMS
-          n.ping(args)
-        else
-          assert_raises do
-            n.ping(args)
-          end
-        end
-      end
-    end
-  end
-end

commit 508a25a9f3677c913b081e79350bc76d25c76698
Author: Tom Clegg <tom at curii.com>
Date:   Fri Jan 27 16:33:30 2023 -0500

    18075: Kill off MaxComputeVMs properly.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/services/api/app/models/node.rb b/services/api/app/models/node.rb
index c8b463696..c8a606e2b 100644
--- a/services/api/app/models/node.rb
+++ b/services/api/app/models/node.rb
@@ -24,6 +24,7 @@ class Node < ArvadosModel
   attr_accessor :job_readable
 
   UNUSED_NODE_IP = '127.40.4.0'
+  MAX_VMS = 3
 
   api_accessible :user, :extend => :common do |t|
     t.add :hostname
@@ -159,7 +160,7 @@ class Node < ArvadosModel
                           # query label:
                           'Node.available_slot_number',
                           # [col_id, val] for $1 vars:
-                          [[nil, Rails.configuration.Containers.MaxComputeVMs]],
+                          [[nil, MAX_VMS]],
                          ).rows.first.andand.first
   end
 
@@ -267,7 +268,7 @@ class Node < ArvadosModel
       !Rails.configuration.Containers.SLURM.Managed.DNSServerConfTemplate.to_s.empty? and
       !Rails.configuration.Containers.SLURM.Managed.AssignNodeHostname.empty?)
 
-    (0..Rails.configuration.Containers.MaxComputeVMs-1).each do |slot_number|
+    (0..MAX_VMS-1).each do |slot_number|
       hostname = hostname_for_slot(slot_number)
       hostfile = File.join Rails.configuration.Containers.SLURM.Managed.DNSServerConfDir, "#{hostname}.conf"
       if !File.exist? hostfile
diff --git a/services/api/config/arvados_config.rb b/services/api/config/arvados_config.rb
index c47eeb551..d928d592c 100644
--- a/services/api/config/arvados_config.rb
+++ b/services/api/config/arvados_config.rb
@@ -132,7 +132,6 @@ arvcfg.declare_config "Containers.DefaultKeepCacheRAM", Integer, :container_defa
 arvcfg.declare_config "Containers.MaxDispatchAttempts", Integer, :max_container_dispatch_attempts
 arvcfg.declare_config "Containers.MaxRetryAttempts", Integer, :container_count_max
 arvcfg.declare_config "Containers.AlwaysUsePreemptibleInstances", Boolean, :preemptible_instances
-arvcfg.declare_config "Containers.MaxComputeVMs", Integer, :max_compute_nodes
 arvcfg.declare_config "Containers.Logging.LogBytesPerEvent", Integer, :crunch_log_bytes_per_event
 arvcfg.declare_config "Containers.Logging.LogSecondsBetweenEvents", ActiveSupport::Duration, :crunch_log_seconds_between_events
 arvcfg.declare_config "Containers.Logging.LogThrottlePeriod", ActiveSupport::Duration, :crunch_log_throttle_period
diff --git a/services/api/test/unit/node_test.rb b/services/api/test/unit/node_test.rb
index 9fa3febe1..c3d070886 100644
--- a/services/api/test/unit/node_test.rb
+++ b/services/api/test/unit/node_test.rb
@@ -196,13 +196,12 @@ class NodeTest < ActiveSupport::TestCase
   end
 
   test 'run out of slots' do
-    Rails.configuration.Containers.MaxComputeVMs = 3
     act_as_system_user do
       Node.destroy_all
       (1..4).each do |i|
         n = Node.create!
         args = { ip: "10.0.0.#{i}", ping_secret: n.info['ping_secret'] }
-        if i <= Rails.configuration.Containers.MaxComputeVMs
+        if i <= 3 # MAX_VMS
           n.ping(args)
         else
           assert_raises do

commit be5771dd0fe41d5108d8fd72e62223b37948cf2d
Author: Tom Clegg <tom at curii.com>
Date:   Fri Jan 27 14:54:47 2023 -0500

    18075: Add CloudVMs.MaxInstances config, retire MaxComputeVMs.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom at curii.com>

diff --git a/doc/admin/upgrading.html.textile.liquid b/doc/admin/upgrading.html.textile.liquid
index 4da168293..324658d67 100644
--- a/doc/admin/upgrading.html.textile.liquid
+++ b/doc/admin/upgrading.html.textile.liquid
@@ -28,10 +28,14 @@ TODO: extract this information based on git commit messages and generate changel
 <div class="releasenotes">
 </notextile>
 
-h2(#main). development main (as of 2023-01-16)
+h2(#main). development main (as of 2023-01-27)
 
 "previous: Upgrading to 2.5.0":#v2_5_0
 
+h3. Default limit for cloud VM instances
+
+There is a new configuration entry @CloudVMs.MaxInstances@ (default 64) that limits the number of VMs the cloud dispatcher will run at a time. This may need to be adjusted to suit your anticipated workload.
+
 h3. Slow migration on upgrade
 
 This upgrade includes a database schema update (changing an integer column in each table from 32-bit to 64-bit) that may be slow on a large installation. Expect the arvados-api-server package upgrade to take longer than usual.
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 29a4a640b..26ada44d6 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -1005,13 +1005,6 @@ Clusters:
       # with the cancelled container.
       MaxRetryAttempts: 3
 
-      # The maximum number of compute nodes that can be in use simultaneously
-      # If this limit is reduced, any existing nodes with slot number >= new limit
-      # will not be counted against the new limit. In other words, the new limit
-      # won't be strictly enforced until those nodes with higher slot numbers
-      # go down.
-      MaxComputeVMs: 64
-
       # Schedule all child containers on preemptible instances (e.g. AWS
       # Spot Instances) even if not requested by the submitter.
       #
@@ -1327,6 +1320,15 @@ Clusters:
         # providers too, if desired.
         MaxConcurrentInstanceCreateOps: 1
 
+        # The maximum number of instances to run at a time, or 0 for
+        # unlimited.
+        #
+        # If more instances than this are already running and busy
+        # when the dispatcher starts up, the running containers will
+        # be allowed to finish before the excess instances are shut
+        # down.
+        MaxInstances: 64
+
         # Interval between cloud provider syncs/updates ("list all
         # instances").
         SyncInterval: 1m
diff --git a/lib/config/export.go b/lib/config/export.go
index bc7864486..f9699c6ed 100644
--- a/lib/config/export.go
+++ b/lib/config/export.go
@@ -131,7 +131,6 @@ var whitelist = map[string]bool{
 	"Containers.Logging":                       false,
 	"Containers.LogReuseDecisions":             false,
 	"Containers.LSF":                           false,
-	"Containers.MaxComputeVMs":                 false,
 	"Containers.MaxDispatchAttempts":           false,
 	"Containers.MaxRetryAttempts":              true,
 	"Containers.MinRetryPeriod":                true,
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 66e0bfee9..e3a7b553c 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -111,6 +111,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 		instanceTypes:                  cluster.InstanceTypes,
 		maxProbesPerSecond:             cluster.Containers.CloudVMs.MaxProbesPerSecond,
 		maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
+		maxInstances:                   cluster.Containers.CloudVMs.MaxInstances,
 		probeInterval:                  duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
 		syncInterval:                   duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
 		timeoutIdle:                    duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
@@ -155,6 +156,7 @@ type Pool struct {
 	probeInterval                  time.Duration
 	maxProbesPerSecond             int
 	maxConcurrentInstanceCreateOps int
+	maxInstances                   int
 	timeoutIdle                    time.Duration
 	timeoutBooting                 time.Duration
 	timeoutProbe                   time.Duration
@@ -369,7 +371,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
 func (wp *Pool) AtQuota() bool {
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
-	return time.Now().Before(wp.atQuotaUntil)
+	return time.Now().Before(wp.atQuotaUntil) || (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating))
 }
 
 // SetIdleBehavior determines how the indicated instance will behave
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 76ed7cefb..677706c08 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -498,7 +498,6 @@ type ContainersConfig struct {
 	DefaultKeepCacheRAM           ByteSize
 	DispatchPrivateKey            string
 	LogReuseDecisions             bool
-	MaxComputeVMs                 int
 	MaxDispatchAttempts           int
 	MaxRetryAttempts              int
 	MinRetryPeriod                Duration
@@ -562,6 +561,7 @@ type CloudVMsConfig struct {
 	MaxCloudOpsPerSecond           int
 	MaxProbesPerSecond             int
 	MaxConcurrentInstanceCreateOps int
+	MaxInstances                   int
 	PollInterval                   Duration
 	ProbeInterval                  Duration
 	SSHPort                        string
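
Taken together, the new @CloudVMs.MaxInstances@ field and the updated @AtQuota@ give callers a single question to ask before attempting @Create@. The sketch below shows that calling pattern; the scheduler loop and the fake pool are hypothetical, and only the two method shapes (AtQuota() bool, Create(type) bool) are taken from the patch.

    package main

    import "fmt"

    type instanceType struct{ Name string }

    // pool captures just the two methods a caller needs for this decision.
    type pool interface {
        AtQuota() bool
        Create(it instanceType) bool
    }

    // fakePool admits a fixed number of Create calls, standing in for a pool
    // whose MaxInstances limit is nearly reached.
    type fakePool struct{ slots int }

    func (p *fakePool) AtQuota() bool { return p.slots <= 0 }

    func (p *fakePool) Create(it instanceType) bool {
        if p.AtQuota() {
            return false // the real Create logs the reason itself, per its doc comment
        }
        p.slots--
        return true
    }

    // startMissing tries to start one instance per queued request and stops as
    // soon as the pool reports it is at quota, whether that quota is imposed by
    // the cloud provider or by the MaxInstances configuration.
    func startMissing(p pool, queued []instanceType) int {
        started := 0
        for _, it := range queued {
            if p.AtQuota() || !p.Create(it) {
                break
            }
            started++
        }
        return started
    }

    func main() {
        queued := []instanceType{{"small"}, {"small"}, {"large"}}
        fmt.Println(startMissing(&fakePool{slots: 2}, queued)) // 2
    }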

-----------------------------------------------------------------------


hooks/post-receive