[ARVADOS] updated: 1352b00b037db2ce5de291127b4f1e845bc6b73b

git at public.curoverse.com git at public.curoverse.com
Fri Aug 28 11:37:32 EDT 2015


Summary of changes:
 .../arvnodeman/computenode/driver/__init__.py      |  73 ++++++++++++---
 .../arvnodeman/computenode/driver/azure.py         |  71 ++++++++++++++
 .../arvnodeman/computenode/driver/ec2.py           |   5 -
 .../doc/{ec2.example.cfg => azure.example.cfg}     | 102 ++++++++++++---------
 services/nodemanager/setup.py                      |   5 +-
 .../tests/test_computenode_driver_azure.py         |  89 ++++++++++++++++++
 6 files changed, 284 insertions(+), 61 deletions(-)
 create mode 100644 services/nodemanager/arvnodeman/computenode/driver/azure.py
 copy services/nodemanager/doc/{ec2.example.cfg => azure.example.cfg} (55%)
 create mode 100644 services/nodemanager/tests/test_computenode_driver_azure.py

       via  1352b00b037db2ce5de291127b4f1e845bc6b73b (commit)
       via  29c5c69ef3c9ceadbea3085f2268bb5ed8496c04 (commit)
       via  10b25cacfc521b3dc74c2204fc2b29aca8ad2631 (commit)
       via  9b914107504ece419ee2f7d72be7d6262037ff52 (commit)
       via  03980b49e2fb3cac357e417acea64cd342d1065e (commit)
       via  19a2e9a97939126293ce33d72f576f6f54da574f (commit)
       via  b95518608653185f96f378ea3df4cf1ad7b05817 (commit)
       via  44494089c502572ee231bb421da70889b68fae4f (commit)
       via  5467329d6822455de4644a277f741068cf5f1ec9 (commit)
       via  0600e45775658866f624b87efeef6a1067db5c39 (commit)
       via  8552d32092e45a1f6ee1424e92882ec84b51cb8a (commit)
       via  0fde046a6c68909ae25af809557fcd64eb7264d7 (commit)
       via  45a172b5f59ea7464b7241212464bf9113a18f36 (commit)
       via  c46ab7c622127315b4e90c98e859588a3403267a (commit)
       via  d5341150545efd0960acf34186ca18b98a1b1860 (commit)
       via  ca9cbfd8d91e1ba968832ba98c656e32f59fc393 (commit)
      from  59b414ff7a625ad3d6e92659b20bdabc6d89e7d4 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 1352b00b037db2ce5de291127b4f1e845bc6b73b
Merge: 59b414f 29c5c69
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Aug 28 11:38:37 2015 -0400

    Merge branch '6507-node-manager-azure' closes #6507

diff --cc services/nodemanager/doc/azure.example.cfg
index 0000000,e92431f..ceeff5a
mode 000000,100644..100644
--- a/services/nodemanager/doc/azure.example.cfg
+++ b/services/nodemanager/doc/azure.example.cfg
@@@ -1,0 -1,151 +1,152 @@@
+ # Azure configuration for Arvados Node Manager.
+ # All times are in seconds unless specified otherwise.
+ 
+ [Daemon]
+ # The dispatcher can customize the start and stop procedure for
+ # cloud nodes.  For example, the SLURM dispatcher drains nodes
+ # through SLURM before shutting them down.
+ #dispatcher = slurm
+ 
+ # Node Manager will ensure that there are at least this many nodes
+ # running at all times.
+ min_nodes = 0
+ 
+ # Node Manager will not start any compute nodes when at least this
+ # many are running.
+ max_nodes = 8
+ 
+ # Poll Azure nodes and Arvados for new information every N seconds.
+ poll_time = 60
+ 
+ # Polls have exponential backoff when services fail to respond.
+ # This is the longest time to wait between polls.
+ max_poll_time = 300
+ 
+ # If Node Manager can't successfully poll a service for this long,
+ # it will never start or stop compute nodes, on the assumption that its
+ # information is too outdated.
+ poll_stale_after = 600
+ 
+ # If Node Manager boots a cloud node, and it does not pair with an Arvados
+ # node before this long, assume that there was a cloud bootstrap failure and
+ # shut it down.  Note that normal shutdown windows apply (see the Cloud
+ # section), so this should be shorter than the first shutdown window value.
+ boot_fail_after = 1800
+ 
+ # "Node stale time" affects two related behaviors.
+ # 1. If a compute node has been running for at least this long, but it
+ # isn't paired with an Arvados node, do not shut it down, but leave it alone.
+ # This prevents the node manager from shutting down a node that might
+ # actually be doing work, but is having temporary trouble contacting the
+ # API server.
+ # 2. When the Node Manager starts a new compute node, it will try to reuse
+ # an Arvados node that hasn't been updated for this long.
+ node_stale_after = 14400
+ 
+ # File path for Certificate Authorities
+ certs_file = /etc/ssl/certs/ca-certificates.crt
+ 
+ [Logging]
+ # Log file path
+ file = /var/log/arvados/node-manager.log
+ 
+ # Log level for most Node Manager messages.
+ # Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+ # WARNING lets you know when polling a service fails.
+ # INFO additionally lets you know when a compute node is started or stopped.
+ level = INFO
+ 
+ # You can also set different log levels for specific libraries.
+ # Pykka is the Node Manager's actor library.
+ # Setting this to DEBUG will display tracebacks for uncaught
+ # exceptions in the actors, but it's also very chatty.
+ pykka = WARNING
+ 
+ # Setting apiclient to INFO will log the URL of every Arvados API request.
+ apiclient = WARNING
+ 
+ [Arvados]
+ host = zyxwv.arvadosapi.com
+ token = ARVADOS_TOKEN
+ timeout = 15
+ 
+ # Accept an untrusted SSL certificate from the API server?
+ insecure = no
+ 
+ [Cloud]
+ provider = azure
+ 
+ # Shutdown windows define periods of time when a node may and may not be shut
+ # down.  These are windows in full minutes, separated by commas.  Counting from
+ # the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+ # it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+ # and so on.  For example, "20, 999999" means the node may shut down between
+ # the 20th and 999999th minutes of uptime.
+ # Azure bills by the minute, so it makes sense to aggressively shut down idle
+ # nodes.  Specify at least two windows.  You can add as many as you need beyond
+ # that.
+ shutdown_windows = 20, 999999
+ 
+ [Cloud Credentials]
+ # Use "azure account list" with the azure CLI to get these values.
+ tenant_id = 00000000-0000-0000-0000-000000000000
+ subscription_id = 00000000-0000-0000-0000-000000000000
+ 
+ # The following directions are based on
+ # https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
+ #
+ # azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YourApplicationUri>" --password <Your_Password>
+ # azure ad sp create "<Application_Id>"
+ # azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/{subscriptionId}/
+ #
+ # Use <Application_Id> for "key" and the <Your_Password> for "secret"
+ #
+ key = 00000000-0000-0000-0000-000000000000
+ secret = PASSWORD
+ timeout = 60
+ region = East US
+ 
+ [Cloud List]
+ # The resource group in which the compute node virtual machines will be created
+ # and listed.
+ ex_resource_group = ArvadosResourceGroup
+ 
+ [Cloud Create]
+ # The image id, in the form "Publisher:Offer:SKU:Version"
+ image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+ 
+ # Path to a local ssh key file that will be used to provision new nodes.
+ ssh_key = /home/arvadosuser/.ssh/id_rsa.pub
+ 
+ # The account name for the admin user that will be provisioned on new nodes.
+ ex_user_name = arvadosuser
+ 
+ # The Azure storage account that will be used to store the node OS disk images.
+ ex_storage_account = arvadosstorage
+ 
+ # The virtual network the VMs will be associated with.
+ ex_network = ArvadosNetwork
+ 
+ # Optional subnet of the virtual network.
+ #ex_subnet = default
+ 
+ # Node tags
+ tag_arvados-class = dynamic-compute
+ tag_cluster = zyxwv
+ 
+ # the API server to ping
+ ping_host = hostname:port
+ 
 -[Size Standard_A2]
++[Size Standard_D3]
+ # You can define any number of Size sections to list Azure sizes you're
+ # willing to use.  The Node Manager should boot the cheapest size(s) that
+ # can run jobs in the queue (N.B.: defining more than one size has not been
+ # tested yet).
+ # Each size section MUST define the number of cores available in this
+ # size class (since libcloud does not provide any consistent API for exposing
+ # this setting).
+ # You may also want to define the amount of scratch space (expressed
+ # in GB) for Crunch jobs.  You can also override Microsoft's provided
+ # data fields by setting the same names here.
+ cores = 4
++scratch = 200

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list