[arvados] updated: 2.5.0-212-g4eb591b3d

git repository hosting git at public.arvados.org
Fri Mar 3 15:48:56 UTC 2023


Summary of changes:
 lib/config/config.default.yml                      | 25 +++++++++++++---------
 lib/config/export.go                               |  1 -
 lib/dispatchcloud/dispatcher.go                    |  6 +++++-
 lib/dispatchcloud/scheduler/run_queue.go           |  6 +-----
 lib/dispatchcloud/scheduler/run_queue_test.go      | 11 ++++++----
 sdk/go/arvados/config.go                           |  2 +-
 services/api/app/models/container.rb               |  5 +++++
 services/api/app/models/container_request.rb       |  4 ++++
 .../v1/container_requests_controller_test.rb       | 19 ++++++++++++++--
 9 files changed, 55 insertions(+), 24 deletions(-)

       via  4eb591b3d541cc5ac035fb0eed7c39dc8e81dd02 (commit)
      from  53a0cb93d935f02248f4e347ea0689fccf5e818e (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 4eb591b3d541cc5ac035fb0eed7c39dc8e81dd02
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Fri Mar 3 10:44:30 2023 -0500

    20182: API server sets "supervisor" flag now
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index 882ee62c3..527439571 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -1055,16 +1055,6 @@ Clusters:
       # Container runtime: "docker" (default) or "singularity"
       RuntimeEngine: docker
 
-      # Number of "supervisor" containers eligible to run at any given
-      # time expressed as a fraction of CloudVMs.MaxInstances. A
-      # supervisor is a container who's purpose is to submit other
-      # containers, such as a container running arvados-cwl-runner.
-      # If there is a hard limit on the amount of concurrent
-      # containers that the cluster can run, it is important to avoid
-      # crowding out the containers doing useful work with containers
-      # who just create more work.
-      SupervisorFraction: 0.3
-
       # When running a container, run a dedicated keepstore process,
       # using the specified number of 64 MiB memory buffers per
       # allocated CPU core (VCPUs in the container's runtime
@@ -1339,6 +1329,21 @@ Clusters:
         # down.
         MaxInstances: 64
 
+        # Maximum fraction of CloudVMs.MaxInstances allowed to run
+        # "supervisor" containers at any given time. A supervisor is a
+        # container whose purpose is mainly to submit and manage other
+        # containers, such as arvados-cwl-runner workflow runner.
+        #
+        # If there is a hard limit on the amount of concurrent
+        # containers that the cluster can run, it is important to
+        # avoid crowding out the containers doing useful work with
+        # containers who just create more work.
+        #
+        # For example, with the default MaxInstances of 64, it will
+        # schedule at most floor(64*0.30) = 19 concurrent workflows,
+        # ensuring 45 slots are available for work.
+        SupervisorFraction: 0.30
+
         # Interval between cloud provider syncs/updates ("list all
         # instances").
         SyncInterval: 1m
diff --git a/lib/config/export.go b/lib/config/export.go
index fc688d68d..a8a535eeb 100644
--- a/lib/config/export.go
+++ b/lib/config/export.go
@@ -142,7 +142,6 @@ var whitelist = map[string]bool{
 	"Containers.ShellAccess.User":              true,
 	"Containers.SLURM":                         false,
 	"Containers.StaleLockTimeout":              false,
-	"Containers.SupervisorFraction":            false,
 	"Containers.SupportedDockerImageFormats":   true,
 	"Containers.SupportedDockerImageFormats.*": true,
 	"Git":                                  false,
diff --git a/lib/dispatchcloud/dispatcher.go b/lib/dispatchcloud/dispatcher.go
index 1f8272afd..06a558d5f 100644
--- a/lib/dispatchcloud/dispatcher.go
+++ b/lib/dispatchcloud/dispatcher.go
@@ -193,7 +193,11 @@ func (disp *dispatcher) run() {
 	if pollInterval <= 0 {
 		pollInterval = defaultPollInterval
 	}
-	sched := scheduler.New(disp.Context, disp.ArvClient, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval, int(float64(disp.Cluster.Containers.CloudVMs.MaxInstances)*disp.Cluster.Containers.SupervisorFraction))
+	maxSupervisors := int(float64(disp.Cluster.Containers.CloudVMs.MaxInstances) * disp.Cluster.Containers.CloudVMs.SupervisorFraction)
+	if maxSupervisors == 0 && disp.Cluster.Containers.CloudVMs.SupervisorFraction > 0 {
+		maxSupervisors = 1
+	}
+	sched := scheduler.New(disp.Context, disp.ArvClient, disp.queue, disp.pool, disp.Registry, staleLockTimeout, pollInterval, maxSupervisors)
 	sched.Start()
 	defer sched.Stop()
 
diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index 65ca904be..b8158579a 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -15,10 +15,6 @@ import (
 
 var quietAfter503 = time.Minute
 
-func isSupervisor(ctr arvados.Container) bool {
-	return (len(ctr.Command) > 0 && ctr.Command[0] == "arvados-cwl-runner") || ctr.SchedulingParameters.Supervisor
-}
-
 func (sch *Scheduler) runQueue() {
 	unsorted, _ := sch.queue.Entries()
 	sorted := make([]container.QueueEnt, 0, len(unsorted))
@@ -97,7 +93,7 @@ tryrun:
 			"ContainerUUID": ctr.UUID,
 			"InstanceType":  it.Name,
 		})
-		if isSupervisor(ctr) {
+		if ctr.SchedulingParameters.Supervisor {
 			supervisors += 1
 			if sch.maxSupervisors > 0 && supervisors > sch.maxSupervisors {
 				continue
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index db161b7d7..3278c7de6 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -549,8 +549,7 @@ func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
 	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
 }
 
-// Assign priority=4 container to idle node. Create new instances for
-// the priority=3 and 1 containers.  Ignore the supervisor at priority 2.
+// Assign priority=4, 3 and 1 containers to idle nodes. Ignore the supervisor at priority 2.
 func (*SchedulerSuite) TestSkipSupervisors(c *check.C) {
 	ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
 	queue := test.Queue{
@@ -573,7 +572,9 @@ func (*SchedulerSuite) TestSkipSupervisors(c *check.C) {
 					VCPUs: 1,
 					RAM:   1 << 30,
 				},
-				Command: []string{"arvados-cwl-runner"},
+				SchedulingParameters: arvados.SchedulingParameters{
+					Supervisor: true,
+				},
 			},
 			{
 				UUID:     test.ContainerUUID(3),
@@ -595,7 +596,9 @@ func (*SchedulerSuite) TestSkipSupervisors(c *check.C) {
 					VCPUs: 1,
 					RAM:   1 << 30,
 				},
-				Command: []string{"arvados-cwl-runner"},
+				SchedulingParameters: arvados.SchedulingParameters{
+					Supervisor: true,
+				},
 			},
 		},
 	}
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index d167addd3..bf70a7603 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -506,7 +506,6 @@ type ContainersConfig struct {
 	SupportedDockerImageFormats   StringSet
 	AlwaysUsePreemptibleInstances bool
 	PreemptiblePriceFactor        float64
-	SupervisorFraction            float64
 	RuntimeEngine                 string
 	LocalKeepBlobBuffersPerVCPU   int
 	LocalKeepLogsToContainerLog   string
@@ -563,6 +562,7 @@ type CloudVMsConfig struct {
 	MaxProbesPerSecond             int
 	MaxConcurrentInstanceCreateOps int
 	MaxInstances                   int
+	SupervisorFraction             float64
 	PollInterval                   Duration
 	ProbeInterval                  Duration
 	SSHPort                        string
diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 42d0ed49b..fa6b6f296 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -729,6 +729,11 @@ class Container < ArvadosModel
                                .map { |req| req.scheduling_parameters["preemptible"] }
                                .all?,
 
+              # supervisor: true if all any true, else false
+              "supervisor": retryable_requests
+                               .map { |req| req.scheduling_parameters["supervisor"] }
+                               .any?,
+
               # max_run_time: 0 if any are 0 (unlimited), else the maximum
               "max_run_time": retryable_requests
                                 .map { |req| req.scheduling_parameters["max_run_time"] || 0 }
diff --git a/services/api/app/models/container_request.rb b/services/api/app/models/container_request.rb
index 47e2769a8..e4da8399f 100644
--- a/services/api/app/models/container_request.rb
+++ b/services/api/app/models/container_request.rb
@@ -320,6 +320,10 @@ class ContainerRequest < ArvadosModel
       return false
     end
     if state_changed? and state == Committed and container_uuid.nil?
+      if self.command.length > 0 and self.command[0] == "arvados-cwl-runner"
+        # Special case, arvados-cwl-runner processes are always considered "supervisors"
+        self.scheduling_parameters['supervisor'] = true
+      end
       while true
         c = Container.resolve(self)
         c.lock!
diff --git a/services/api/test/functional/arvados/v1/container_requests_controller_test.rb b/services/api/test/functional/arvados/v1/container_requests_controller_test.rb
index e99af39c9..f287a11fa 100644
--- a/services/api/test/functional/arvados/v1/container_requests_controller_test.rb
+++ b/services/api/test/functional/arvados/v1/container_requests_controller_test.rb
@@ -8,8 +8,9 @@ class Arvados::V1::ContainerRequestsControllerTest < ActionController::TestCase
   def minimal_cr
     {
       command: ['echo', 'hello'],
-      container_image: 'test',
+      container_image: 'arvados/apitestfixture:latest',
       output_path: 'test',
+      runtime_constraints: {vcpus: 1, ram: 1}
     }
   end
 
@@ -18,7 +19,7 @@ class Arvados::V1::ContainerRequestsControllerTest < ActionController::TestCase
 
     sp = {'partitions' => ['test1', 'test2']}
     post :create, params: {
-           container_request: minimal_cr.merge(scheduling_parameters: sp.dup)
+           container_request: minimal_cr.merge(scheduling_parameters: sp.dup, state: "Committed")
          }
     assert_response :success
 
@@ -26,6 +27,20 @@ class Arvados::V1::ContainerRequestsControllerTest < ActionController::TestCase
     assert_not_nil cr, 'Expected container request'
     assert_equal sp['partitions'], cr['scheduling_parameters']['partitions']
     assert_equal false, cr['scheduling_parameters']['preemptible']
+    assert_equal false, cr['scheduling_parameters']['supervisor']
+  end
+
+  test 'create a-c-r should be supervisor' do
+    authorize_with :active
+
+    post :create, params: {
+           container_request: minimal_cr.merge(command: ["arvados-cwl-runner", "my-workflow.cwl"], state: "Committed")
+         }
+    assert_response :success
+
+    cr = JSON.parse(@response.body)
+    assert_not_nil cr, 'Expected container request'
+    assert_equal true, cr['scheduling_parameters']['supervisor']
   end
 
   test "secret_mounts not in #create responses" do

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list