[ARVADOS] created: 1.3.0-1462-g30e065c34
Git user <git@public.curoverse.com>
Thu Aug 8 14:05:07 UTC 2019
at 30e065c34db0ab9a0e824a77b1ac0a46412598e0 (commit)
commit 30e065c34db0ab9a0e824a77b1ac0a46412598e0
Author: Peter Amstutz <pamstutz@veritasgenetics.com>
Date: Thu Aug 8 10:04:43 2019 -0400
15133: Delete crunch-job & arv-run-pipeline-instance
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz@veritasgenetics.com>
diff --git a/sdk/cli/bin/arv-crunch-job b/sdk/cli/bin/arv-crunch-job
deleted file mode 100755
index 6e4b5e0b1..000000000
--- a/sdk/cli/bin/arv-crunch-job
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env ruby
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-exec File.join(File.dirname(File.realpath(__FILE__)), 'crunch-job'), *ARGV
diff --git a/sdk/cli/bin/arv-run-pipeline-instance b/sdk/cli/bin/arv-run-pipeline-instance
deleted file mode 100755
index 336b1a2c7..000000000
--- a/sdk/cli/bin/arv-run-pipeline-instance
+++ /dev/null
@@ -1,781 +0,0 @@
-#!/usr/bin/env ruby
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-class WhRunPipelineInstance
-end
-
-if RUBY_VERSION < '1.9.3' then
- abort <<-EOS
-#{$0.gsub(/^\.\//,'')} requires Ruby version 1.9.3 or higher.
- EOS
-end
-
-begin
- require 'arvados'
- require 'rubygems'
- require 'json'
- require 'pp'
- require 'optimist'
- require 'google/api_client'
-rescue LoadError => l
- $stderr.puts $:
- abort <<-EOS
-#{$0}: fatal: #{l.message}
-Some runtime dependencies may be missing.
-Try: gem install arvados pp google-api-client json optimist
- EOS
-end
-
-def debuglog(message, verbosity=1)
- $stderr.puts "#{File.split($0).last} #{$$}: #{message}" if $debuglevel >= verbosity
-end
-
-# Parse command line options (the kind that control the behavior of
-# this program, that is, not the pipeline component parameters).
-
-p = Optimist::Parser.new do
- version __FILE__
- banner(<<EOF)
-
-Usage:
- arv-run-pipeline-instance --template TEMPLATE_UUID [options] [--] [parameters]
- arv-run-pipeline-instance --instance INSTANCE_UUID [options] [--] [parameters]
-
-Parameters:
- param_name=param_value
- param_name param_value
- Set (or override) the default value for every
- pipeline component parameter with the given
- name.
-
- component_name::param_name=param_value
- component_name::param_name param_value
- --component_name::param_name=param_value
- --component_name::param_name param_value
- Set the value of a parameter for a single
- pipeline component.
-
-Options:
-EOF
- opt(:dry_run,
- "Do not start any new jobs or wait for existing jobs to finish. Just find out whether jobs are finished, queued, or running for each component.",
- :type => :boolean,
- :short => :n)
- opt(:status_text,
- "Store plain text status in given file.",
- :short => :none,
- :type => :string,
- :default => '/dev/stdout')
- opt(:status_json,
- "Store json-formatted pipeline in given file.",
- :short => :none,
- :type => :string,
- :default => '/dev/null')
- opt(:no_wait,
- "Do not wait for jobs to finish. Just look up status, submit new jobs if needed, and exit.",
- :short => :none,
- :type => :boolean)
- opt(:no_reuse,
- "Do not reuse existing jobs to satisfy pipeline components. Submit a new job for every component.",
- :short => :none,
- :type => :boolean)
- opt(:debug,
- "Print extra debugging information on stderr.",
- :type => :boolean)
- opt(:debug_level,
- "Set debug verbosity level.",
- :short => :none,
- :type => :integer)
- opt(:template,
- "UUID of pipeline template, or path to local pipeline template file.",
- :short => :none,
- :type => :string)
- opt(:instance,
- "UUID of pipeline instance.",
- :short => :none,
- :type => :string)
- opt(:submit,
- "Submit the pipeline instance to the server, and exit. Let the Crunch dispatch service satisfy the components by finding/running jobs.",
- :short => :none,
- :type => :boolean)
- opt(:run_pipeline_here,
- "Manage the pipeline instance in-process. Submit jobs to Crunch as needed. Do not exit until the pipeline finishes (or fails).",
- :short => :none,
- :type => :boolean)
- opt(:run_jobs_here,
- "Run jobs in the local terminal session instead of submitting them to Crunch. Implies --run-pipeline-here. Note: this results in a significantly different job execution environment, and some Crunch features are not supported. It can be necessary to modify a pipeline in order to make it run this way.",
- :short => :none,
- :type => :boolean)
- opt(:run_here,
- "Synonym for --run-jobs-here.",
- :short => :none,
- :type => :boolean)
- opt(:description,
- "Description for the pipeline instance.",
- :short => :none,
- :type => :string)
- opt(:project_uuid,
- "UUID of the project for the pipeline instance.",
- short: :none,
- type: :string)
- stop_on [:'--']
-end
-$options = Optimist::with_standard_exception_handling p do
- p.parse ARGV
-end
-$debuglevel = $options[:debug_level] || ($options[:debug] && 1) || 0
-
-$options[:run_jobs_here] ||= $options[:run_here] # old flag name
-$options[:run_pipeline_here] ||= $options[:run_jobs_here] # B requires A
-
-if $options[:instance]
- if $options[:template] or $options[:submit]
- abort "#{$0}: syntax error: --instance cannot be combined with --template or --submit."
- end
-elsif not $options[:template]
- $stderr.puts "error: you must supply a --template or --instance."
- p.educate
- abort
-end
-
-if $options[:run_pipeline_here] == $options[:submit]
- abort "#{$0}: error: you must supply --run-pipeline-here, --run-jobs-here, or --submit."
-end
-
-# Set up the API client.
-
-$arv = Arvados.new api_version: 'v1'
-$client = $arv.client
-$arvados = $arv.arvados_api
-
-class PipelineInstance
- def self.find(uuid)
- result = $client.execute(:api_method => $arvados.pipeline_instances.get,
- :parameters => {
- :uuid => uuid
- },
- :authenticated => false,
- :headers => {
- authorization: 'OAuth2 '+$arv.config['ARVADOS_API_TOKEN']
- })
- j = JSON.parse result.body, :symbolize_names => true
- unless j.is_a? Hash and j[:uuid]
- debuglog "Failed to get pipeline_instance: #{j[:errors] rescue nil}", 0
- nil
- else
- debuglog "Retrieved pipeline_instance #{j[:uuid]}"
- self.new(j)
- end
- end
- def self.create(attributes)
- result = $client.execute(:api_method => $arvados.pipeline_instances.create,
- :body_object => {
- :pipeline_instance => attributes
- },
- :authenticated => false,
- :headers => {
- authorization: 'OAuth2 '+$arv.config['ARVADOS_API_TOKEN']
- })
- j = JSON.parse result.body, :symbolize_names => true
- unless j.is_a? Hash and j[:uuid]
- abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nFailed to create pipeline_instance: #{j[:errors] rescue nil} #{j.inspect}"
- end
- debuglog "Created pipeline instance: #{j[:uuid]}"
- self.new(j)
- end
- def save
- result = $client.execute(:api_method => $arvados.pipeline_instances.update,
- :parameters => {
- :uuid => @pi[:uuid]
- },
- :body_object => {
- :pipeline_instance => @attributes_to_update
- },
- :authenticated => false,
- :headers => {
- authorization: 'OAuth2 '+$arv.config['ARVADOS_API_TOKEN']
- })
- j = JSON.parse result.body, :symbolize_names => true
- unless j.is_a? Hash and j[:uuid]
- debuglog "Failed to save pipeline_instance: #{j[:errors] rescue nil}", 0
- nil
- else
- @attributes_to_update = {}
- @pi = j
- end
- end
- def []=(x,y)
- @attributes_to_update[x] = y
- @pi[x] = y
- end
- def [](x)
- @pi[x]
- end
-
- def log_stderr(msg)
- $arv.log.create log: {
- event_type: 'stderr',
- object_uuid: self[:uuid],
- owner_uuid: self[:owner_uuid],
- properties: {"text" => msg},
- }
- end
-
- protected
- def initialize(j)
- @attributes_to_update = {}
- @pi = j
- end
-end
-
-class JobCache
- def self.get(uuid)
- @cache ||= {}
- result = $client.execute(:api_method => $arvados.jobs.get,
- :parameters => {
- :uuid => uuid
- },
- :authenticated => false,
- :headers => {
- authorization: 'OAuth2 '+$arv.config['ARVADOS_API_TOKEN']
- })
- @cache[uuid] = JSON.parse result.body, :symbolize_names => true
- end
- def self.where(conditions)
- result = $client.execute(:api_method => $arvados.jobs.list,
- :parameters => {
- :limit => 10000,
- :where => conditions.to_json
- },
- :authenticated => false,
- :headers => {
- authorization: 'OAuth2 '+$arv.config['ARVADOS_API_TOKEN']
- })
- list = JSON.parse result.body, :symbolize_names => true
- if list and list[:items].is_a? Array
- list[:items]
- else
- []
- end
- end
-
- # create() returns [job, exception]. If both job and exception are
- # nil, there was a non-retryable error and the call should not be
- # attempted again.
- def self.create(pipeline, component, job, create_params)
- @cache ||= {}
-
- body = {job: no_nil_values(job)}.merge(no_nil_values(create_params))
-
- result = nil
- begin
- result = $client.execute(
- :api_method => $arvados.jobs.create,
- :body_object => body,
- :authenticated => false,
- :headers => {
- authorization: 'OAuth2 '+$arv.config['ARVADOS_API_TOKEN']
- })
- if result.status == 429 || result.status >= 500
- raise Exception.new("HTTP status #{result.status}")
- end
- rescue Exception => e
- return nil, e
- end
- j = JSON.parse(result.body, :symbolize_names => true) rescue nil
- if result.status == 200 && j.is_a?(Hash) && j[:uuid]
- @cache[j[:uuid]] = j
- return j, nil
- else
- errors = j[:errors] rescue []
- debuglog "create job: [#{result.status}] #{errors.inspect} with attributes #{body}", 0
-
- msg = ""
- errors.each do |err|
- msg += "Error creating job for component #{component}: #{err}\n"
- end
- msg += "Job submission was: #{body.to_json}"
-
- pipeline.log_stderr(msg)
- return nil, nil
- end
- end
-
- protected
-
- def self.no_nil_values(hash)
- hash.reject { |key, value| value.nil? }
- end
-end
-
-class WhRunPipelineInstance
- attr_reader :instance
-
- def initialize(_options)
- @options = _options
- end
-
- def fetch_template(template)
- if template.match /[^-0-9a-z]/
- # Doesn't look like a uuid -- use it as a filename.
- @template = JSON.parse File.read(template), :symbolize_names => true
- else
- result = $client.execute(:api_method => $arvados.pipeline_templates.get,
- :parameters => {
- :uuid => template
- },
- :authenticated => false,
- :headers => {
- authorization: 'OAuth2 '+$arv.config['ARVADOS_API_TOKEN']
- })
- @template = JSON.parse result.body, :symbolize_names => true
- if !@template[:uuid]
- abort "#{$0}: fatal: failed to retrieve pipeline template #{template} #{@template[:errors].inspect rescue nil}"
- end
- end
- self
- end
-
- def fetch_instance(instance_uuid)
- @instance = PipelineInstance.find(instance_uuid)
- @template = @instance
- self
- end
-
- def apply_parameters(params_args)
- params_args.shift if params_args[0] == '--'
- params = {}
- while !params_args.empty?
- if (re = params_args[0].match /^(--)?([^-].*?)=(.+)/)
- params[re[2]] = re[3]
- params_args.shift
- elsif params_args.size > 1
- param = params_args.shift.sub /^--/, ''
- params[param] = params_args.shift
- else
- abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nSyntax error: I do not know what to do with arg \"#{params_args[0]}\""
- end
- end
-
- if not @template[:components].is_a?(Hash)
- abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nSyntax error: Template missing \"components\" hash"
- end
- @components = @template[:components].dup
-
- bad_components = @components.each_pair.select do |cname, cspec|
- not cspec.is_a?(Hash)
- end
- if bad_components.any?
- abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nSyntax error: Components not specified with hashes: #{bad_components.map(&:first).join(', ')}"
- end
-
- bad_components = @components.each_pair.select do |cname, cspec|
- not cspec[:script_parameters].is_a?(Hash)
- end
- if bad_components.any?
- abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nSyntax error: Components missing \"script_parameters\" hashes: #{bad_components.map(&:first).join(', ')}"
- end
-
- errors = []
- @components.each do |componentname, component|
- component[:script_parameters].each do |parametername, parameter|
- parameter = { :value => parameter } unless parameter.is_a? Hash
- if params.has_key?("#{componentname}::#{parametername}")
- value = params["#{componentname}::#{parametername}"]
- elsif parameter.has_key?(:value)
- value = parameter[:value]
- elsif parameter.has_key?(:output_of)
- if !@components[parameter[:output_of].intern]
- errors << [componentname, parametername, "output_of refers to nonexistent component '#{parameter[:output_of]}'"]
- else
- # value will be filled in later when the upstream
- # component's output becomes known
- end
- next
- elsif params.has_key?(parametername.to_s)
- value = params[parametername.to_s]
- elsif parameter.has_key?(:default)
- value = parameter[:default]
- elsif [false, 'false', 0, '0'].index(parameter[:required])
- value = nil
- else
- errors << [componentname, parametername, "required parameter is missing"]
- next
- end
- debuglog "parameter #{componentname}::#{parametername} == #{value}"
-
- component[:script_parameters][parametername] =
- parameter.dup.merge(value: value)
- end
- end
- if !errors.empty?
- all_errors = errors.collect do |c,p,e|
- "#{c}::#{p} - #{e}\n"
- end.join("")
- abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nErrors:\n#{all_errors}"
- end
- debuglog "options=" + @options.pretty_inspect
- self
- end
-
- def setup_instance
- if @instance
- @instance[:properties][:run_options] ||= {}
- if @options[:no_reuse]
- # override properties of existing instance
- @instance[:properties][:run_options][:enable_job_reuse] = false
- else
- # Default to "enable reuse" if not specified. (This code path
- # can go away when old clients go away.)
- if @instance[:properties][:run_options][:enable_job_reuse].nil?
- @instance[:properties][:run_options][:enable_job_reuse] = true
- end
- end
- else
- description = $options[:description] ||
- ("Created at #{Time.now.localtime}" + (@template[:name].andand.size.andand>0 ? " using the pipeline template *#{@template[:name]}*" : ""))
- instance_body = {
- components: @components,
- properties: {
- run_options: {
- enable_job_reuse: !@options[:no_reuse]
- }
- },
- pipeline_template_uuid: @template[:uuid],
- description: description,
- state: ($options[:submit] ? 'RunningOnServer' : 'RunningOnClient')
- }
- if @options[:project_uuid]
- instance_body[:owner_uuid] = @options[:project_uuid]
- end
- @instance = PipelineInstance.create(instance_body)
- end
- self
- end
-
- def run
- moretodo = true
- interrupted = false
-
- if @instance[:started_at].nil?
- @instance[:started_at] = Time.now
- end
-
- job_creation_failed = 0
- while moretodo
- moretodo = false
- @components.each do |cname, c|
- job = nil
- owner_uuid = @instance[:owner_uuid]
- # Is the job satisfying this component already known to be
- # finished? (Already meaning "before we query API server about
- # the job's current state")
- c_already_finished = (c[:job] &&
- c[:job][:uuid] &&
- ["Complete", "Failed", "Cancelled"].include?(c[:job][:state]))
- if !c[:job] and
- c[:script_parameters].select { |pname, p| p.is_a? Hash and p[:output_of]}.empty?
- # No job yet associated with this component and its component inputs
- # are fully specified (any output_of script_parameters are resolved
- # to real values)
- my_submit_id = "instance #{@instance[:uuid]} rand #{rand(2**64).to_s(36)}"
- job, err = JobCache.create(@instance, cname, {
- :script => c[:script],
- :script_parameters => Hash[c[:script_parameters].map do |key, spec|
- [key, spec[:value]]
- end],
- :script_version => c[:script_version],
- :repository => c[:repository],
- :nondeterministic => c[:nondeterministic],
- :runtime_constraints => c[:runtime_constraints],
- :owner_uuid => owner_uuid,
- :is_locked_by_uuid => (@options[:run_jobs_here] ? owner_uuid : nil),
- :submit_id => my_submit_id,
- :state => (if @options[:run_jobs_here] then "Running" else "Queued" end)
- }, {
- # This is the right place to put these attributes when
- # dealing with new API servers.
- :minimum_script_version => c[:minimum_script_version],
- :exclude_script_versions => c[:exclude_minimum_script_versions],
- :find_or_create => (@instance[:properties][:run_options].andand[:enable_job_reuse] &&
- !c[:nondeterministic]),
- :filters => c[:filters]
- })
- if job
- debuglog "component #{cname} new job #{job[:uuid]}"
- c[:job] = job
- c[:run_in_process] = (@options[:run_jobs_here] and
- job[:submit_id] == my_submit_id)
- elsif err.nil?
- debuglog "component #{cname} new job failed", 0
- job_creation_failed += 1
- else
- debuglog "component #{cname} new job failed, err=#{err}", 0
- end
- end
-
- if c[:job] and c[:run_in_process] and not ["Complete", "Failed", "Cancelled"].include? c[:job][:state]
- report_status
- begin
- require 'open3'
- Open3.popen3("arv-crunch-job", "--force-unlock",
- "--job", c[:job][:uuid]) do |stdin, stdout, stderr, wait_thr|
- debuglog "arv-crunch-job pid #{wait_thr.pid} started", 0
- stdin.close
- while true
- rready, wready, = IO.select([stdout, stderr], [])
- break if !rready[0]
- begin
- buf = rready[0].read_nonblock(2**20)
- rescue EOFError
- break
- end
- (rready[0] == stdout ? $stdout : $stderr).write(buf)
- end
- stdout.close
- stderr.close
- debuglog "arv-crunch-job pid #{wait_thr.pid} exit #{wait_thr.value.to_i}", 0
- end
- if not $arv.job.get(uuid: c[:job][:uuid])[:finished_at]
- raise Exception.new("arv-crunch-job did not set finished_at.")
- end
- rescue Exception => e
- debuglog "Interrupted (#{e}). Failing job.", 0
- $arv.job.update(uuid: c[:job][:uuid],
- job: {
- state: "Failed"
- })
- end
- end
-
- if c[:job] and c[:job][:uuid]
- if ["Running", "Queued"].include?(c[:job][:state])
- # Job is running (or may be soon) so update copy of job record
- c[:job] = JobCache.get(c[:job][:uuid])
- end
-
- if c[:job][:state] == "Complete"
- # Populate script_parameters of other components waiting for
- # this job
- @components.each do |c2name, c2|
- c2[:script_parameters].each do |pname, p|
- if p.is_a? Hash and p[:output_of] == cname.to_s
- debuglog "parameter #{c2name}::#{pname} == #{c[:job][:output]}"
- c2[:script_parameters][pname] = {value: c[:job][:output]}
- moretodo = true
- end
- end
- end
- unless c_already_finished
- # This is my first time discovering that the job
- # succeeded. (At the top of this loop, I was still
- # waiting for it to finish.)
-
- if @instance[:name].andand.length.andand > 0
- pipeline_name = @instance[:name]
- elsif @template.andand[:name].andand.length.andand > 0
- pipeline_name = @template[:name]
- else
- pipeline_name = @instance[:uuid]
- end
- if c[:output_name] != false
- # Create a collection located in the same project as the pipeline with the contents of the output.
- portable_data_hash = c[:job][:output]
- collections = $arv.collection.list(limit: 1,
- filters: [['portable_data_hash', '=', portable_data_hash]],
- select: ["portable_data_hash", "manifest_text"]
- )[:items]
- if collections.any?
- name = c[:output_name] || "Output #{portable_data_hash[0..7]} of #{cname} of #{pipeline_name}"
-
- # check if there is a name collision.
- name_collisions = $arv.collection.list(filters: [["owner_uuid", "=", owner_uuid],
- ["name", "=", name]])[:items]
-
- newcollection_actual = nil
- if name_collisions.any? and name_collisions.first[:portable_data_hash] == portable_data_hash
- # There is already a collection with the same name and the
- # same contents, so just point to that.
- newcollection_actual = name_collisions.first
- end
-
- if newcollection_actual.nil?
- # Did not find a collection with the same name (or the
- # collection has a different portable data hash) so create
- # a new collection with ensure_unique_name: true.
- newcollection = {
- owner_uuid: owner_uuid,
- name: name,
- portable_data_hash: collections.first[:portable_data_hash],
- manifest_text: collections.first[:manifest_text]
- }
- debuglog "Creating collection #{newcollection}", 0
- newcollection_actual = $arv.collection.create collection: newcollection, ensure_unique_name: true
- end
-
- c[:output_uuid] = newcollection_actual[:uuid]
- else
- debuglog "Could not find a collection with portable data hash #{portable_data_hash}", 0
- end
- end
- end
- elsif ["Queued", "Running"].include? c[:job][:state]
- # Job is running or queued to run, so indicate that pipeline
- # should continue to run
- moretodo = true
- elsif c[:job][:state] == "Cancelled"
- debuglog "component #{cname} job #{c[:job][:uuid]} cancelled."
- moretodo = false
- elsif c[:job][:state] == "Failed"
- moretodo = false
- end
- end
- end
- @instance[:components] = @components
- report_status
-
- if @options[:no_wait]
- moretodo = false
- end
-
- # If job creation fails, just give up on this pipeline instance.
- if job_creation_failed > 0
- moretodo = false
- end
-
- if moretodo
- begin
- sleep 10
- rescue Interrupt
- debuglog "interrupt", 0
- interrupted = true
- break
- end
- end
- end
-
- c_in_state = @components.values.group_by { |c|
- c[:job] and c[:job][:state]
- }
- succeeded = c_in_state["Complete"].andand.count || 0
- failed = (c_in_state["Failed"].andand.count || 0) + (c_in_state["Cancelled"].andand.count || 0)
- ended = succeeded + failed
-
- success = (succeeded == @components.length)
-
- # A job create call failed. Just give up.
- if job_creation_failed > 0
- debuglog "job creation failed - giving up on this pipeline instance", 0
- success = false
- failed += 1
- end
-
- if interrupted
- if success
- @instance[:state] = 'Complete'
- else
- @instance[:state] = 'Paused'
- end
- else
- if ended == @components.length or failed > 0
- @instance[:state] = success ? 'Complete' : 'Failed'
- end
- end
-
- if @instance[:finished_at].nil? and ['Complete', 'Failed'].include? @instance[:state]
- @instance[:finished_at] = Time.now
- end
-
- debuglog "pipeline instance state is #{@instance[:state]}"
-
- # set components_summary
- components_summary = {"todo" => @components.length - ended, "done" => succeeded, "failed" => failed}
- @instance[:components_summary] = components_summary
-
- @instance.save
- end
-
- def cleanup
- if @instance and @instance[:state] == 'RunningOnClient'
- @instance[:state] = 'Paused'
- @instance.save
- end
- end
-
- def uuid
- @instance[:uuid]
- end
-
- protected
-
- def report_status
- @instance.save
-
- if @options[:status_json] != '/dev/null'
- File.open(@options[:status_json], 'w') do |f|
- f.puts @components.pretty_inspect
- end
- end
-
- if @options[:status_text] != '/dev/null'
- File.open(@options[:status_text], 'w') do |f|
- f.puts ""
- f.puts "#{Time.now} -- pipeline_instance #{@instance[:uuid]}"
- namewidth = @components.collect { |cname, c| cname.size }.max
- @components.each do |cname, c|
- jstatus = if !c[:job]
- "-"
- else case c[:job][:state]
- when "Running"
- "#{c[:job][:tasks_summary].inspect}"
- when "Complete"
- c[:job][:output]
- when "Cancelled"
- "cancelled #{c[:job][:cancelled_at]}"
- when "Failed"
- "failed #{c[:job][:finished_at]}"
- when "Queued"
- "queued #{c[:job][:created_at]}"
- end
- end
- f.puts "#{cname.to_s.ljust namewidth} #{c[:job] ? c[:job][:uuid] : '-'.ljust(27)} #{jstatus}"
- end
- end
- end
- end
-
- def abort(msg)
- if @instance
- if ["New", "Ready", "RunningOnClient",
- "RunningOnServer"].include?(@instance[:state])
- @instance[:state] = "Failed"
- @instance[:finished_at] = Time.now
- @instance.save
- end
- @instance.log_stderr(msg)
- end
- Kernel::abort(msg)
- end
-end
-
-runner = WhRunPipelineInstance.new($options)
-begin
- if $options[:template]
- runner.fetch_template($options[:template])
- else
- runner.fetch_instance($options[:instance])
- end
- runner.apply_parameters(p.leftovers)
- runner.setup_instance
- if $options[:submit]
- runner.instance.save
- puts runner.instance[:uuid]
- else
- runner.run
- end
-rescue Exception => e
- runner.cleanup
- raise e
-end
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
deleted file mode 100755
index 242dff708..000000000
--- a/sdk/cli/bin/crunch-job
+++ /dev/null
@@ -1,2577 +0,0 @@
-#!/usr/bin/env perl
-# -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-=head1 NAME
-
-crunch-job: Execute job steps, save snapshots as requested, collate output.
-
-=head1 SYNOPSIS
-
-Obtain job details from Arvados, run tasks on compute nodes (typically
-invoked by scheduler on controller):
-
- crunch-job --job x-y-z --git-dir /path/to/repo/.git
-
-Obtain job details from command line, run tasks on local machine
-(typically invoked by application or developer on VM):
-
- crunch-job --job '{"script_version":"/path/to/working/tree","script":"scriptname",...}'
-
- crunch-job --job '{"repository":"https://github.com/curoverse/arvados.git","script_version":"master","script":"scriptname",...}'
-
-=head1 OPTIONS
-
-=over
-
-=item --force-unlock
-
-If the job is already locked, steal the lock and run it anyway.
-
-=item --git-dir
-
-Path to a .git directory (or a git URL) where the commit given in the
-job's C<script_version> attribute is to be found. If this is I<not>
-given, the job's C<repository> attribute will be used.
-
-=item --job-api-token
-
-Arvados API authorization token to use during the course of the job.
-
-=item --no-clear-tmp
-
-Do not clear per-job/task temporary directories during initial job
-setup. This can speed up development and debugging when running jobs
-locally.
-
-=item --job
-
-UUID of the job to run, or a JSON-encoded job resource without a
-UUID. If the latter is given, a new job object will be created.
-
-=back
-
-=head1 RUNNING JOBS LOCALLY
-
-crunch-job's log messages appear on stderr along with the job tasks'
-stderr streams. The log is saved in Keep at each checkpoint and when
-the job finishes.
-
-If the job succeeds, the job's output locator is printed on stdout.
-
-While the job is running, the following signals are accepted:
-
-=over
-
-=item control-C, SIGINT, SIGQUIT
-
-Save a checkpoint, terminate any job tasks that are running, and stop.
-
-=item SIGALRM
-
-Save a checkpoint and continue.
-
-=item SIGHUP
-
-Refresh node allocation (i.e., check whether any nodes have been added
-or unallocated) and attributes of the Job record that should affect
-behavior (e.g., cancel job if cancelled_at becomes non-nil).
-
-=back
-
-=cut
-
-
-use strict;
-use POSIX ':sys_wait_h';
-use POSIX qw(strftime);
-use Fcntl qw(F_GETFL F_SETFL O_NONBLOCK);
-use Arvados;
-use Cwd qw(realpath);
-use Data::Dumper;
-use Digest::MD5 qw(md5_hex);
-use Getopt::Long;
-use IPC::Open2;
-use IO::Select;
-use File::Temp;
-use Fcntl ':flock';
-use File::Path qw( make_path remove_tree );
-
-use constant TASK_TEMPFAIL => 111;
-use constant EX_TEMPFAIL => 75;
-use constant EX_RETRY_UNLOCKED => 93;
-
-$ENV{"TMPDIR"} ||= "/tmp";
-unless (defined $ENV{"CRUNCH_TMP"}) {
- $ENV{"CRUNCH_TMP"} = $ENV{"TMPDIR"} . "/crunch-job";
- if ($ENV{"USER"} ne "crunch" && $< != 0) {
- # use a tmp dir unique for my uid
- $ENV{"CRUNCH_TMP"} .= "-$<";
- }
-}
-
-# Create the tmp directory if it does not exist
-if ( ! -d $ENV{"CRUNCH_TMP"} ) {
- make_path $ENV{"CRUNCH_TMP"} or die "Failed to create temporary working directory: " . $ENV{"CRUNCH_TMP"};
-}
-
-$ENV{"JOB_WORK"} = $ENV{"CRUNCH_TMP"} . "/work";
-$ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt";
-$ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
-mkdir ($ENV{"JOB_WORK"});
-
-my %proc;
-my $force_unlock;
-my $git_dir;
-my $jobspec;
-my $job_api_token;
-my $no_clear_tmp;
-my $resume_stash;
-my $cgroup_root = "/sys/fs/cgroup";
-my $docker_bin = "docker.io";
-my $docker_run_args = "";
-my $srun_sync_timeout = 15*60;
-GetOptions('force-unlock' => \$force_unlock,
- 'git-dir=s' => \$git_dir,
- 'job=s' => \$jobspec,
- 'job-api-token=s' => \$job_api_token,
- 'no-clear-tmp' => \$no_clear_tmp,
- 'resume-stash=s' => \$resume_stash,
- 'cgroup-root=s' => \$cgroup_root,
- 'docker-bin=s' => \$docker_bin,
- 'docker-run-args=s' => \$docker_run_args,
- 'srun-sync-timeout=i' => \$srun_sync_timeout,
- );
-
-if (defined $job_api_token) {
- $ENV{ARVADOS_API_TOKEN} = $job_api_token;
-}
-
-my $have_slurm = exists $ENV{SLURM_JOB_ID} && exists $ENV{SLURM_NODELIST};
-
-
-$SIG{'USR1'} = sub
-{
- $main::ENV{CRUNCH_DEBUG} = 1;
-};
-$SIG{'USR2'} = sub
-{
- $main::ENV{CRUNCH_DEBUG} = 0;
-};
-
-my $arv = Arvados->new('apiVersion' => 'v1');
-
-my $Job;
-my $job_id;
-my $dbh;
-my $sth;
-my @jobstep;
-
-my $local_job;
-if ($jobspec =~ /^[-a-z\d]+$/)
-{
- # $jobspec is an Arvados UUID, not a JSON job specification
- $Job = api_call("jobs/get", uuid => $jobspec);
- $local_job = 0;
-}
-else
-{
- $local_job = JSON::decode_json($jobspec);
-}
-
-
-# Make sure our workers (our slurm nodes, localhost, or whatever) are
-# at least able to run basic commands: they aren't down or severely
-# misconfigured.
-my $cmd = ['true'];
-if (($Job || $local_job)->{docker_image_locator}) {
- $cmd = [$docker_bin, 'ps', '-q'];
-}
-Log(undef, "Sanity check is `@$cmd`");
-my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
- ["srun", "--nodes=\Q$ENV{SLURM_NNODES}\E", "--ntasks-per-node=1"],
- $cmd,
- {label => "sanity check"});
-if ($exited != 0) {
- Log(undef, "Sanity check failed: ".exit_status_s($exited));
- exit EX_TEMPFAIL;
-}
-Log(undef, "Sanity check OK");
-
-
-my $User = api_call("users/current");
-
-if (!$local_job) {
- if (!$force_unlock) {
- # Claim this job, and make sure nobody else does
- eval { api_call("jobs/lock", uuid => $Job->{uuid}); };
- if ($@) {
- Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
- exit EX_TEMPFAIL;
- };
- }
-}
-else
-{
- if (!$resume_stash)
- {
- map { croak ("No $_ specified") unless $local_job->{$_} }
- qw(script script_version script_parameters);
- }
-
- $local_job->{'is_locked_by_uuid'} = $User->{'uuid'};
- $local_job->{'started_at'} = gmtime;
- $local_job->{'state'} = 'Running';
-
- $Job = api_call("jobs/create", job => $local_job);
-}
-$job_id = $Job->{'uuid'};
-
-my $keep_logfile = $job_id . '.log.txt';
-log_writer_start($keep_logfile);
-
-$Job->{'runtime_constraints'} ||= {};
-$Job->{'runtime_constraints'}->{'max_tasks_per_node'} ||= 0;
-my $max_ncpus = $Job->{'runtime_constraints'}->{'max_tasks_per_node'};
-
-my $gem_versions = `gem list --quiet arvados-cli 2>/dev/null`;
-if ($? == 0) {
- $gem_versions =~ s/^arvados-cli \(/ with arvados-cli Gem version(s) /;
- chomp($gem_versions);
- chop($gem_versions); # Closing parentheses
-} else {
- $gem_versions = "";
-}
-Log(undef,
- "running from " . ((-e $0) ? realpath($0) : "stdin") . $gem_versions);
-
-Log (undef, "check slurm allocation");
-my @slot;
-my @node;
-# Should use $ENV{SLURM_TASKS_PER_NODE} instead of sinfo? (eg. "4(x3),2,4(x2)")
-my @sinfo;
-if (!$have_slurm)
-{
- my $localcpus = 0 + `grep -cw ^processor /proc/cpuinfo` || 1;
- push @sinfo, "$localcpus localhost";
-}
-if (exists $ENV{SLURM_NODELIST})
-{
- push @sinfo, `sinfo -h --format='%c %N' --nodes=\Q$ENV{SLURM_NODELIST}\E`;
-}
-foreach (@sinfo)
-{
- my ($ncpus, $slurm_nodelist) = split;
- $ncpus = $max_ncpus if $max_ncpus && $ncpus > $max_ncpus;
-
- my @nodelist;
- while ($slurm_nodelist =~ s/^([^\[,]+?(\[.*?\])?)(,|$)//)
- {
- my $nodelist = $1;
- if ($nodelist =~ /\[((\d+)(-(\d+))?(,(\d+)(-(\d+))?)*)\]/)
- {
- my $ranges = $1;
- foreach (split (",", $ranges))
- {
- my ($a, $b);
- if (/(\d+)-(\d+)/)
- {
- $a = $1;
- $b = $2;
- }
- else
- {
- $a = $_;
- $b = $_;
- }
- push @nodelist, map {
- my $n = $nodelist;
- $n =~ s/\[[-,\d]+\]/$_/;
- $n;
- } ($a..$b);
- }
- }
- else
- {
- push @nodelist, $nodelist;
- }
- }
- foreach my $nodename (@nodelist)
- {
- Log (undef, "node $nodename - $ncpus slots");
- my $node = { name => $nodename,
- ncpus => $ncpus,
- # The number of consecutive times a task has been dispatched
- # to this node and failed.
- losing_streak => 0,
- # The number of consecutive times that SLURM has reported
- # a node failure since the last successful task.
- fail_count => 0,
- # Don't dispatch work to this node until this time
- # (in seconds since the epoch) has passed.
- hold_until => 0 };
- foreach my $cpu (1..$ncpus)
- {
- push @slot, { node => $node,
- cpu => $cpu };
- }
- }
- push @node, @nodelist;
-}
-
-
-
-# Ensure that we get one jobstep running on each allocated node before
-# we start overloading nodes with concurrent steps
-
-@slot = sort { $a->{cpu} <=> $b->{cpu} } @slot;
-
-
-$Job->update_attributes(
- 'tasks_summary' => { 'failed' => 0,
- 'todo' => 1,
- 'running' => 0,
- 'done' => 0 });
-
-Log (undef, "start");
-$SIG{'INT'} = sub { $main::please_freeze = 1; };
-$SIG{'QUIT'} = sub { $main::please_freeze = 1; };
-$SIG{'TERM'} = \&croak;
-$SIG{'TSTP'} = sub { $main::please_freeze = 1; };
-$SIG{'ALRM'} = sub { $main::please_info = 1; };
-$SIG{'CONT'} = sub { $main::please_continue = 1; };
-$SIG{'HUP'} = sub { $main::please_refresh = 1; };
-
-$main::please_freeze = 0;
-$main::please_info = 0;
-$main::please_continue = 0;
-$main::please_refresh = 0;
-my $jobsteps_must_output_keys = 0; # becomes 1 when any task outputs a key
-
-grep { $ENV{$1} = $2 if /^(NOCACHE.*?)=(.*)/ } split ("\n", $$Job{knobs});
-$ENV{"CRUNCH_JOB_UUID"} = $job_id;
-$ENV{"JOB_UUID"} = $job_id;
-
-
-my @jobstep_todo = ();
-my @jobstep_done = ();
-my @jobstep_tomerge = ();
-my $jobstep_tomerge_level = 0;
-my $squeue_checked = 0;
-my $sinfo_checked = 0;
-my $latest_refresh = scalar time;
-
-
-
-if (defined $Job->{thawedfromkey})
-{
- thaw ($Job->{thawedfromkey});
-}
-else
-{
- my $first_task = api_call("job_tasks/create", job_task => {
- 'job_uuid' => $Job->{'uuid'},
- 'sequence' => 0,
- 'qsequence' => 0,
- 'parameters' => {},
- });
- push @jobstep, { 'level' => 0,
- 'failures' => 0,
- 'arvados_task' => $first_task,
- };
- push @jobstep_todo, 0;
-}
-
-
-if (!$have_slurm)
-{
- must_lock_now("$ENV{CRUNCH_TMP}/.lock", "a job is already running here.");
-}
-
-my $build_script = handle_readall(\*DATA);
-my $nodelist = join(",", @node);
-my $git_tar_count = 0;
-
-if (!defined $no_clear_tmp) {
- # Find FUSE mounts under $CRUNCH_TMP and unmount them. Then clean
- # up work directories crunch_tmp/work, crunch_tmp/opt,
- # crunch_tmp/src*.
- my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
- ["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
- ['bash', '-ec', q{
-arv-mount --unmount-timeout 10 --unmount-all ${CRUNCH_TMP}
-rm -rf ${JOB_WORK} ${CRUNCH_INSTALL} ${CRUNCH_TMP}/task ${CRUNCH_TMP}/src* ${CRUNCH_TMP}/*.cid
- }],
- {label => "clean work dirs"});
- if ($exited != 0) {
- exit_retry_unlocked();
- }
-}
-
-# If this job requires a Docker image, install that.
-my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg);
-if ($docker_locator = $Job->{docker_image_locator}) {
- Log (undef, "Install docker image $docker_locator");
- ($docker_stream, $docker_hash) = find_docker_image($docker_locator);
- if (!$docker_hash)
- {
- croak("No Docker image hash found from locator $docker_locator");
- }
- Log (undef, "docker image hash is $docker_hash");
- $docker_stream =~ s/^\.//;
- my $docker_install_script = qq{
-loaded() {
- id=\$($docker_bin inspect --format="{{.ID}}" \Q$docker_hash\E) || return 1
- echo "image ID is \$id"
- [[ \${id} = \Q$docker_hash\E ]]
-}
-if loaded >&2 2>/dev/null; then
- echo >&2 "image is already present"
- exit 0
-fi
-echo >&2 "docker image is not present; loading"
-arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
-if ! loaded >&2; then
- echo >&2 "`docker load` exited 0, but image is not found (!)"
- exit 1
-fi
-echo >&2 "image loaded successfully"
-};
-
- my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
- ["srun", "--nodelist=" . join(',', @node)],
- ["/bin/bash", "-o", "pipefail", "-ec", $docker_install_script],
- {label => "load docker image"});
- if ($exited != 0)
- {
- exit_retry_unlocked();
- }
-
- # Determine whether this version of Docker supports memory+swap limits.
- ($exited, $stdout, $stderr, $tempfail) = srun_sync(
- ["srun", "--nodes=1"],
- [$docker_bin, 'run', '--help'],
- {label => "check --memory-swap feature"});
- if ($tempfail) {
- exit_retry_unlocked();
- }
- $docker_limitmem = ($stdout =~ /--memory-swap/);
-
- # Find a non-root Docker user to use.
- # Tries the default user for the container, then 'crunch', then 'nobody',
- # testing for whether the actual user id is non-zero. This defends against
- # mistakes but not malice, but we intend to harden the security in the future
- # so we don't want anyone getting used to their jobs running as root in their
- # Docker containers.
- my @tryusers = ("", "crunch", "nobody");
- foreach my $try_user (@tryusers) {
- my $label;
- my $try_user_arg;
- if ($try_user eq "") {
- $label = "check whether default user is UID 0";
- $try_user_arg = "";
- } else {
- $label = "check whether user '$try_user' is UID 0";
- $try_user_arg = "--user=$try_user";
- }
- my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
- ["srun", "--nodes=1"],
- ["/bin/sh", "-ec",
- "$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"],
- {label => $label});
- chomp($stdout);
- if ($exited == 0 && $stdout =~ /^\d+$/ && $stdout > 0) {
- $dockeruserarg = $try_user_arg;
- if ($try_user eq "") {
- Log(undef, "Container will run with default user");
- } else {
- Log(undef, "Container will run with $dockeruserarg");
- }
- last;
- } elsif ($tempfail) {
- exit_retry_unlocked();
- }
- }
-
- if (!defined $dockeruserarg) {
- croak("Could not find a user in container that is not UID 0 (tried default user, @tryusers) or there was a problem running 'id' in the container.");
- }
-
- if ($Job->{arvados_sdk_version}) {
- # The job also specifies an Arvados SDK version. Add the SDKs to the
- # tar file for the build script to install.
- Log(undef, sprintf("Packing Arvados SDK version %s for installation",
- $Job->{arvados_sdk_version}));
- add_git_archive("git", "--git-dir=$git_dir", "archive",
- "--prefix=.arvados.sdk/",
- $Job->{arvados_sdk_version}, "sdk");
- }
-}
-
-if (!defined $git_dir && $Job->{'script_version'} =~ m{^/}) {
- # If script_version looks like an absolute path, *and* the --git-dir
- # argument was not given -- which implies we were not invoked by
- # crunch-dispatch -- we will use the given path as a working
- # directory instead of resolving script_version to a git commit (or
- # doing anything else with git).
- $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{'script_version'};
- $ENV{"CRUNCH_SRC"} = $Job->{'script_version'};
-}
-else {
- # Resolve the given script_version to a git commit sha1. Also, if
- # the repository is remote, clone it into our local filesystem: this
- # ensures "git archive" will work, and is necessary to reliably
- # resolve a symbolic script_version like "master^".
- $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
-
- Log (undef, "Looking for version ".$Job->{script_version}." from repository ".$Job->{repository});
-
- $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};
-
- # If we're running under crunch-dispatch, it will have already
- # pulled the appropriate source tree into its own repository, and
- # given us that repo's path as $git_dir.
- #
- # If we're running a "local" job, we might have to fetch content
- # from a remote repository.
- #
- # (Currently crunch-dispatch gives a local path with --git-dir, but
- # we might as well accept URLs there too in case it changes its
- # mind.)
- my $repo = $git_dir || $Job->{'repository'};
-
- # Repository can be remote or local. If remote, we'll need to fetch it
- # to a local dir before doing `git log` et al.
- my $repo_location;
-
- if ($repo =~ m{://|^[^/]*:}) {
- # $repo is a git url we can clone, like git:// or https:// or
- # file:/// or [user@]host:repo.git. Note "user/name@host:foo" is
- # not recognized here because distinguishing that from a local
- # path is too fragile. If you really need something strange here,
- # use the ssh:// form.
- $repo_location = 'remote';
- } elsif ($repo =~ m{^\.*/}) {
- # $repo is a local path to a git index. We'll also resolve ../foo
- # to ../foo/.git if the latter is a directory. To help
- # disambiguate local paths from named hosted repositories, this
- # form must be given as ./ or ../ if it's a relative path.
- if (-d "$repo/.git") {
- $repo = "$repo/.git";
- }
- $repo_location = 'local';
- } else {
- # $repo is none of the above. It must be the name of a hosted
- # repository.
- my $arv_repo_list = api_call("repositories/list",
- 'filters' => [['name','=',$repo]]);
- my @repos_found = @{$arv_repo_list->{'items'}};
- my $n_found = $arv_repo_list->{'serverResponse'}->{'items_available'};
- if ($n_found > 0) {
- Log(undef, "Repository '$repo' -> "
- . join(", ", map { $_->{'uuid'} } @repos_found));
- }
- if ($n_found != 1) {
- croak("Error: Found $n_found repositories with name '$repo'.");
- }
- $repo = $repos_found[0]->{'fetch_url'};
- $repo_location = 'remote';
- }
- Log(undef, "Using $repo_location repository '$repo'");
- $ENV{"CRUNCH_SRC_URL"} = $repo;
-
- # Resolve given script_version (we'll call that $treeish here) to a
- # commit sha1 ($commit).
- my $treeish = $Job->{'script_version'};
- my $commit;
- if ($repo_location eq 'remote') {
- # We minimize excess object-fetching by re-using the same bare
- # repository in CRUNCH_TMP/.git for multiple crunch-jobs -- we
- # just keep adding remotes to it as needed.
- my $local_repo = $ENV{'CRUNCH_TMP'}."/.git";
- my $gitcmd = "git --git-dir=\Q$local_repo\E";
-
- # Set up our local repo for caching remote objects, making
- # archives, etc.
- if (!-d $local_repo) {
- make_path($local_repo) or croak("Error: could not create $local_repo");
- }
- # This works (exits 0 and doesn't delete fetched objects) even
- # if $local_repo is already initialized:
- `$gitcmd init --bare`;
- if ($?) {
- croak("Error: $gitcmd init --bare exited ".exit_status_s($?));
- }
-
- # If $treeish looks like a hash (or abbrev hash) we look it up in
- # our local cache first, since that's cheaper. (We don't want to
- # do that with tags/branches though -- those change over time, so
- # they should always be resolved by the remote repo.)
- if ($treeish =~ /^[0-9a-f]{7,40}$/s) {
- # Hide stderr because it's normal for this to fail:
- my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E 2>/dev/null`;
- if ($? == 0 &&
- # Careful not to resolve a branch named abcdeff to commit 1234567:
- $sha1 =~ /^$treeish/ &&
- $sha1 =~ /^([0-9a-f]{40})$/s) {
- $commit = $1;
- Log(undef, "Commit $commit already present in $local_repo");
- }
- }
-
- if (!defined $commit) {
- # If $treeish isn't just a hash or abbrev hash, or isn't here
- # yet, we need to fetch the remote to resolve it correctly.
-
- # First, remove all local heads. This prevents a name that does
- # not exist on the remote from resolving to (or colliding with)
- # a previously fetched branch or tag (possibly from a different
- # remote).
- remove_tree("$local_repo/refs/heads", {keep_root => 1});
-
- Log(undef, "Fetching objects from $repo to $local_repo");
- `$gitcmd fetch --no-progress --tags ''\Q$repo\E \Q+refs/heads/*:refs/heads/*\E`;
- if ($?) {
- croak("Error: `$gitcmd fetch` exited ".exit_status_s($?));
- }
- }
-
- # Now that the data is all here, we will use our local repo for
- # the rest of our git activities.
- $repo = $local_repo;
- }
-
- my $gitcmd = "git --git-dir=\Q$repo\E";
- my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E`;
- unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) {
- croak("`$gitcmd rev-list` exited "
- .exit_status_s($?)
- .", '$treeish' not found, giving up");
- }
- $commit = $1;
- Log(undef, "Version $treeish is commit $commit");
-
- if ($commit ne $Job->{'script_version'}) {
- # Record the real commit id in the database, frozentokey, logs,
- # etc. -- instead of an abbreviation or a branch name which can
- # become ambiguous or point to a different commit in the future.
- if (!$Job->update_attributes('script_version' => $commit)) {
- croak("Error: failed to update job's script_version attribute");
- }
- }
-
- $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
- add_git_archive("$gitcmd archive ''\Q$commit\E");
-}
-
-my $git_archive = combined_git_archive();
-if (!defined $git_archive) {
- Log(undef, "Skip install phase (no git archive)");
- if ($have_slurm) {
- Log(undef, "Warning: This probably means workers have no source tree!");
- }
-}
-else {
- my $exited;
- my $install_script_tries_left = 3;
- for (my $attempts = 0; $attempts < 3; $attempts++) {
- my @srunargs = ("srun",
- "--nodelist=$nodelist",
- "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
- my @execargs = ("sh", "-c",
- "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
-
- $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
- my ($stdout, $stderr, $tempfail);
- ($exited, $stdout, $stderr, $tempfail) = srun_sync(
- \@srunargs, \@execargs,
- {label => "run install script on all workers"},
- $build_script . $git_archive);
- if ($tempfail) {
- exit_retry_unlocked();
- }
-
- my $stderr_anything_from_script = 0;
- for my $line (split(/\n/, $stderr)) {
- if ($line !~ /^(srun: error: |starting: \[)/) {
- $stderr_anything_from_script = 1;
- }
- }
-
- last if $exited == 0 || $main::please_freeze;
-
- # If the install script fails but doesn't print an error message,
- # the next thing anyone is likely to do is just run it again in
- # case it was a transient problem like "slurm communication fails
- # because the network isn't reliable enough". So we'll just do
- # that ourselves (up to 3 attempts in total). OTOH, if there is an
- # error message, the problem is more likely to have a real fix and
- # we should fail the job so the fixing process can start, instead
- # of doing 2 more attempts.
- last if $stderr_anything_from_script;
- }
-
- foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) {
- unlink($tar_filename);
- }
-
- if ($exited != 0) {
- croak("Giving up");
- }
-}
-
-foreach (qw (script script_version script_parameters runtime_constraints))
-{
- Log (undef,
- "$_ " .
- (ref($Job->{$_}) ? JSON::encode_json($Job->{$_}) : $Job->{$_}));
-}
-foreach (split (/\n/, $Job->{knobs}))
-{
- Log (undef, "knob " . $_);
-}
-my $resp = api_call(
- 'nodes/list',
- 'filters' => [['hostname', 'in', \@node]],
- 'order' => 'hostname',
- 'limit' => scalar(@node),
- );
-for my $n (@{$resp->{items}}) {
- Log(undef, "$n->{hostname} $n->{uuid} ".JSON::encode_json($n->{properties}));
-}
-
-
-
-$main::success = undef;
-
-
-
-ONELEVEL:
-
-my $thisround_succeeded = 0;
-my $thisround_failed = 0;
-my $thisround_failed_multiple = 0;
-my $working_slot_count = scalar(@slot);
-
-@jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
- or $a <=> $b } @jobstep_todo;
-my $level = $jobstep[$jobstep_todo[0]]->{level};
-
-my $initial_tasks_this_level = 0;
-foreach my $id (@jobstep_todo) {
- $initial_tasks_this_level++ if ($jobstep[$id]->{level} == $level);
-}
-
-# If the number of tasks scheduled at this level #T is smaller than the number
-# of slots available #S, only use the first #T slots, or the first slot on
-# each node, whichever number is greater.
-#
-# When we dispatch tasks later, we'll allocate whole-node resources like RAM
-# based on these numbers. Using fewer slots makes more resources available
-# to each individual task, which should normally be a better strategy when
-# there are fewer of them running with less parallelism.
-#
-# Note that this calculation is not redone if the initial tasks at
-# this level queue more tasks at the same level. This may harm
-# overall task throughput for that level.
-my @freeslot;
-if ($initial_tasks_this_level < @node) {
- @freeslot = (0..$#node);
-} elsif ($initial_tasks_this_level < @slot) {
- @freeslot = (0..$initial_tasks_this_level - 1);
-} else {
- @freeslot = (0..$#slot);
-}
-my $round_num_freeslots = scalar(@freeslot);
-print STDERR "crunch-job have ${round_num_freeslots} free slots for ${initial_tasks_this_level} initial tasks at this level, ".scalar(@node)." nodes, and ".scalar(@slot)." slots\n";
-
-my %round_max_slots = ();
-for (my $ii = $#freeslot; $ii >= 0; $ii--) {
- my $this_slot = $slot[$freeslot[$ii]];
- my $node_name = $this_slot->{node}->{name};
- $round_max_slots{$node_name} ||= $this_slot->{cpu};
- last if (scalar(keys(%round_max_slots)) >= @node);
-}
-
-Log(undef, "start level $level with $round_num_freeslots slots");
-my @holdslot;
-my %reader;
-my $progress_is_dirty = 1;
-my $progress_stats_updated = 0;
-
-update_progress_stats();
-
-
-THISROUND:
-for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
-{
- # Don't create new tasks if we already know the job's final result.
- last if defined($main::success);
-
- my $id = $jobstep_todo[$todo_ptr];
- my $Jobstep = $jobstep[$id];
- if ($Jobstep->{level} != $level)
- {
- next;
- }
-
- pipe $reader{$id}, "writer" or croak("pipe() failed: $!");
- set_nonblocking($reader{$id});
-
- my $childslot = $freeslot[0];
- my $childnode = $slot[$childslot]->{node};
- my $childslotname = join (".",
- $slot[$childslot]->{node}->{name},
- $slot[$childslot]->{cpu});
-
- my $childpid = fork();
- if ($childpid == 0)
- {
- $SIG{'INT'} = 'DEFAULT';
- $SIG{'QUIT'} = 'DEFAULT';
- $SIG{'TERM'} = 'DEFAULT';
-
- foreach (values (%reader))
- {
- close($_);
- }
- fcntl ("writer", F_SETFL, 0) or croak ($!); # no close-on-exec
- open(STDOUT,">&writer") or croak ($!);
- open(STDERR,">&writer") or croak ($!);
-
- undef $dbh;
- undef $sth;
-
- delete $ENV{"GNUPGHOME"};
- $ENV{"TASK_UUID"} = $Jobstep->{'arvados_task'}->{'uuid'};
- $ENV{"TASK_QSEQUENCE"} = $id;
- $ENV{"TASK_SEQUENCE"} = $level;
- $ENV{"JOB_SCRIPT"} = $Job->{script};
- while (my ($param, $value) = each %{$Job->{script_parameters}}) {
- $param =~ tr/a-z/A-Z/;
- $ENV{"JOB_PARAMETER_$param"} = $value;
- }
- $ENV{"TASK_SLOT_NODE"} = $slot[$childslot]->{node}->{name};
- $ENV{"TASK_SLOT_NUMBER"} = $slot[$childslot]->{cpu};
- $ENV{"TASK_WORK"} = $ENV{"CRUNCH_TMP"}."/task/$childslotname";
- $ENV{"HOME"} = $ENV{"TASK_WORK"};
- $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
- $ENV{"CRUNCH_NODE_SLOTS"} = $round_max_slots{$ENV{TASK_SLOT_NODE}};
- $ENV{"PATH"} = $ENV{"CRUNCH_INSTALL"} . "/bin:" . $ENV{"PATH"};
-
- my $keep_mnt = $ENV{"TASK_WORK"}.".keep";
-
- $ENV{"GZIP"} = "-n";
-
- my @srunargs = (
- "srun",
- "--nodelist=".$childnode->{name},
- qw(-n1 -c1 -N1 -D), $ENV{'TMPDIR'},
- "--job-name=$job_id.$id.$$",
- );
-
- my $stdbuf = " stdbuf --output=0 --error=0 ";
-
- my $arv_file_cache = "";
- if (defined($Job->{'runtime_constraints'}->{'keep_cache_mb_per_task'})) {
- $arv_file_cache = "--file-cache=" . ($Job->{'runtime_constraints'}->{'keep_cache_mb_per_task'} * 1024 * 1024);
- }
-
- my $command =
- "if [ -e \Q$ENV{TASK_WORK}\E ]; then rm -rf \Q$ENV{TASK_WORK}\E; fi; "
- ."mkdir -p \Q$ENV{CRUNCH_TMP}\E \Q$ENV{JOB_WORK}\E \Q$ENV{TASK_WORK}\E \Q$keep_mnt\E "
- ."&& cd \Q$ENV{CRUNCH_TMP}\E "
- # These environment variables get used explicitly later in
- # $command. No tool is expected to read these values directly.
- .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
- .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
- ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
- ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
- .q{&& declare -a VOLUMES=() }
- .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner:ro") ; fi }
- .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt:ro") ; }
- .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt:ro") ; fi };
-
- $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
- $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
- $ENV{TASK_KEEPMOUNT_TMP} = "$keep_mnt/tmp";
-
- if ($docker_hash)
- {
- my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
- my $cidfile = "$ENV{CRUNCH_TMP}/$containername.cid";
- $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
- $command .= "$docker_bin run $docker_run_args --name=$containername --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
- # We only set memory limits if Docker lets us limit both memory and swap.
- # Memory limits alone have been supported longer, but subprocesses tend
- # to get SIGKILL if they exceed that without any swap limit set.
- # See #5642 for additional background.
- if ($docker_limitmem) {
- $command .= "--memory=\${MEMLIMIT}k --memory-swap=\${SWAPLIMIT}k ";
- }
-
- # The source tree and $destdir directory (which we have
- # installed on the worker host) are available in the container,
- # under the same path.
- $command .= "--volume=\Q$ENV{CRUNCH_SRC}:$ENV{CRUNCH_SRC}:ro\E ";
- $command .= "--volume=\Q$ENV{CRUNCH_INSTALL}:$ENV{CRUNCH_INSTALL}:ro\E ";
-
- # Currently, we make the "by_pdh" directory in arv-mount's mount
- # point appear at /keep inside the container (instead of using
- # the same path as the host like we do with CRUNCH_SRC and
- # CRUNCH_INSTALL). However, crunch scripts and utilities must
- # not rely on this. They must use $TASK_KEEPMOUNT.
- $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT}:/keep:ro\E ";
- $ENV{TASK_KEEPMOUNT} = "/keep";
-
- # Ditto TASK_KEEPMOUNT_TMP, as /keep_tmp.
- $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT_TMP}:/keep_tmp\E ";
- $ENV{TASK_KEEPMOUNT_TMP} = "/keep_tmp";
-
- # TASK_WORK is almost exactly like a docker data volume: it
- # starts out empty, is writable, and persists until no
- # containers use it any more. We don't use --volumes-from to
- # share it with other containers: it is only accessible to this
- # task, and it goes away when this task stops.
- #
- # However, a docker data volume is writable only by root unless
- # the mount point already happens to exist in the container with
- # different permissions. Therefore, we [1] assume /tmp already
- # exists in the image and is writable by the crunch user; [2]
- # avoid putting TASK_WORK inside CRUNCH_TMP (which won't be
- # writable if they are created by docker while setting up the
- # other --volumes); and [3] create $TASK_WORK inside the
- # container using $build_script.
- $command .= "--volume=/tmp ";
- $ENV{"TASK_WORK"} = "/tmp/crunch-job-task-work/$childslotname";
- $ENV{"HOME"} = $ENV{"TASK_WORK"};
- $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
-
- # TODO: Share a single JOB_WORK volume across all task
- # containers on a given worker node, and delete it when the job
- # ends (and, in case that doesn't work, when the next job
- # starts).
- #
- # For now, use the same approach as TASK_WORK above.
- $ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
-
- # Bind mount the crunchrunner binary and host TLS certificates file into
- # the container.
- $command .= '"${VOLUMES[@]}" ';
-
- while (my ($env_key, $env_val) = each %ENV)
- {
- if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
- $command .= "--env=\Q$env_key=$env_val\E ";
- }
- }
- $command .= "--env=\QHOME=$ENV{HOME}\E ";
- $command .= "\Q$docker_hash\E ";
-
- if ($Job->{arvados_sdk_version}) {
- $command .= $stdbuf;
- $command .= "perl - \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E";
- } else {
- $command .= "/bin/sh -c \'python -c " .
- '"from pkg_resources import get_distribution as get; print \"Using Arvados SDK version\", get(\"arvados-python-client\").version"' .
- ">&2 2>/dev/null; " .
- "mkdir -p \"$ENV{JOB_WORK}\" \"$ENV{TASK_WORK}\" && " .
- "if which stdbuf >/dev/null ; then " .
- " exec $stdbuf \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" .
- " else " .
- " exec \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" .
- " fi\'";
- }
- } else {
- # Non-docker run
- $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -poll=10000 ";
- $command .= $stdbuf;
- $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
- }
-
- my @execargs = ('bash', '-c', $command);
- srun (\@srunargs, \@execargs, undef, $build_script);
- # exec() failed, we assume nothing happened.
- die "srun() failed on build script\n";
- }
- close("writer");
- if (!defined $childpid)
- {
- close $reader{$id};
- delete $reader{$id};
- next;
- }
- shift @freeslot;
- $proc{$childpid} = {
- jobstepidx => $id,
- time => time,
- slot => $childslot,
- jobstepname => "$job_id.$id.$childpid",
- };
- croak ("assert failed: \$slot[$childslot]->{'pid'} exists") if exists $slot[$childslot]->{pid};
- $slot[$childslot]->{pid} = $childpid;
-
- Log ($id, "job_task ".$Jobstep->{'arvados_task'}->{'uuid'});
- Log ($id, "child $childpid started on $childslotname");
- $Jobstep->{starttime} = time;
- $Jobstep->{node} = $childnode->{name};
- $Jobstep->{slotindex} = $childslot;
- delete $Jobstep->{stderr};
- delete $Jobstep->{finishtime};
- delete $Jobstep->{tempfail};
-
- $Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
- retry_op(sub { $Jobstep->{'arvados_task'}->save; }, "job_tasks.update API");
-
- splice @jobstep_todo, $todo_ptr, 1;
- --$todo_ptr;
-
- $progress_is_dirty = 1;
-
-  while (!@freeslot ||
-         ($round_num_freeslots > @freeslot && $todo_ptr+1 > $#jobstep_todo))
- {
- last THISROUND if $main::please_freeze;
- if ($main::please_info)
- {
- $main::please_info = 0;
- freeze();
- create_output_collection();
- save_meta(1);
- update_progress_stats();
- }
- my $gotsome
- = readfrompipes ()
- + reapchildren ();
- if (!$gotsome || ($latest_refresh + 2 < scalar time))
- {
- check_refresh_wanted();
- check_squeue();
- update_progress_stats();
- }
- elsif (time - $progress_stats_updated >= 30 || $progress_is_dirty)
- {
- update_progress_stats();
- }
- if (!$gotsome) {
- select (undef, undef, undef, 0.1);
- }
- $working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
- $_->{node}->{hold_count} < 4 } @slot);
- if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
- ($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
- {
- my $message = "Repeated failure rate too high ($thisround_failed_multiple/"
- .($thisround_failed+$thisround_succeeded)
- .") -- giving up on this round";
- Log (undef, $message);
- last THISROUND;
- }
-
- # move slots from freeslot to holdslot (or back to freeslot) if necessary
- for (my $i=$#freeslot; $i>=0; $i--) {
- if ($slot[$freeslot[$i]]->{node}->{hold_until} > scalar time) {
- push @holdslot, (splice @freeslot, $i, 1);
- }
- }
- for (my $i=$#holdslot; $i>=0; $i--) {
- if ($slot[$holdslot[$i]]->{node}->{hold_until} <= scalar time) {
- push @freeslot, (splice @holdslot, $i, 1);
- }
- }
-
- # give up if no nodes are succeeding
- if ($working_slot_count < 1) {
- Log(undef, "Every node has failed -- giving up");
- last THISROUND;
- }
- }
-}
-
-
-push @freeslot, splice @holdslot;
-map { $slot[$freeslot[$_]]->{node}->{losing_streak} = 0 } (0..$#freeslot);
-
-
-Log (undef, "wait for last ".(scalar keys %proc)." children to finish");
-while (%proc)
-{
- if ($main::please_continue) {
- $main::please_continue = 0;
- goto THISROUND;
- }
- $main::please_info = 0, freeze(), create_output_collection(), save_meta(1) if $main::please_info;
- readfrompipes ();
- if (!reapchildren())
- {
- check_refresh_wanted();
- check_squeue();
- update_progress_stats();
- select (undef, undef, undef, 0.1);
- killem (keys %proc) if $main::please_freeze;
- }
-}
-
-update_progress_stats();
-freeze_if_want_freeze();
-
-
-if (!defined $main::success)
-{
- if (!@jobstep_todo) {
- $main::success = 1;
- } elsif ($working_slot_count < 1) {
- save_output_collection();
- save_meta();
- exit_retry_unlocked();
- } elsif ($thisround_succeeded == 0 &&
- ($thisround_failed == 0 || $thisround_failed > 4)) {
- my $message = "stop because $thisround_failed tasks failed and none succeeded";
- Log (undef, $message);
- $main::success = 0;
- }
-}
-
-goto ONELEVEL if !defined $main::success;
-
-
-release_allocation();
-freeze();
-my $collated_output = save_output_collection();
-Log (undef, "finish");
-
-my $final_log = save_meta();
-
-my $final_state;
-if ($collated_output && $final_log && $main::success) {
- $final_state = 'Complete';
-} else {
- $final_state = 'Failed';
-}
-$Job->update_attributes('state' => $final_state);
-
-exit (($final_state eq 'Complete') ? 0 : 1);
-
-
-
-sub update_progress_stats
-{
- $progress_stats_updated = time;
- return if !$progress_is_dirty;
- my ($todo, $done, $running) = (scalar @jobstep_todo,
- scalar @jobstep_done,
- scalar keys(%proc));
- $Job->{'tasks_summary'} ||= {};
- $Job->{'tasks_summary'}->{'todo'} = $todo;
- $Job->{'tasks_summary'}->{'done'} = $done;
- $Job->{'tasks_summary'}->{'running'} = $running;
- $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'});
- Log (undef, "status: $done done, $running running, $todo todo");
- $progress_is_dirty = 0;
-}
-
-
-
-sub reapchildren
-{
- my $children_reaped = 0;
- my @successful_task_uuids = ();
-
- while((my $pid = waitpid (-1, WNOHANG)) > 0)
- {
- my $childstatus = $?;
-
- my $whatslot = ($slot[$proc{$pid}->{slot}]->{node}->{name}
- . "."
- . $slot[$proc{$pid}->{slot}]->{cpu});
- my $jobstepidx = $proc{$pid}->{jobstepidx};
-
- readfrompipes_after_exit ($jobstepidx);
-
- $children_reaped++;
- my $elapsed = time - $proc{$pid}->{time};
- my $Jobstep = $jobstep[$jobstepidx];
-
- my $exitvalue = $childstatus >> 8;
- my $exitinfo = "exit ".exit_status_s($childstatus);
- $Jobstep->{'arvados_task'}->reload;
- my $task_success = $Jobstep->{'arvados_task'}->{success};
-
- Log ($jobstepidx, "child $pid on $whatslot $exitinfo success=$task_success");
-
- if (!defined $task_success) {
- # task did not indicate one way or the other --> fail
- Log($jobstepidx, sprintf(
- "ERROR: Task process exited %s, but never updated its task record to indicate success and record its output.",
- exit_status_s($childstatus)));
- $Jobstep->{'arvados_task'}->{success} = 0;
- retry_op(sub { $Jobstep->{'arvados_task'}->save; }, "job_tasks.update API");
- $task_success = 0;
- }
-
- if (!$task_success)
- {
- my $temporary_fail;
- $temporary_fail ||= $Jobstep->{tempfail};
- $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
-
- ++$thisround_failed;
- ++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;
-
- # Check for signs of a failed or misconfigured node
- if (++$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} >=
- 2+$slot[$proc{$pid}->{slot}]->{node}->{ncpus}) {
- # Don't count this against jobstep failure thresholds if this
- # node is already suspected faulty and srun exited quickly
- if ($slot[$proc{$pid}->{slot}]->{node}->{hold_until} &&
- $elapsed < 5) {
- Log ($jobstepidx, "blaming failure on suspect node " .
- $slot[$proc{$pid}->{slot}]->{node}->{name});
- $temporary_fail ||= 1;
- }
- ban_node_by_slot($proc{$pid}->{slot});
- }
-
- Log ($jobstepidx, sprintf('failure (#%d, %s) after %d seconds',
- ++$Jobstep->{'failures'},
- $temporary_fail ? 'temporary' : 'permanent',
- $elapsed));
-
- if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
- # Give up on this task, and the whole job
- $main::success = 0;
- }
- # Put this task back on the todo queue
- push @jobstep_todo, $jobstepidx;
- $Job->{'tasks_summary'}->{'failed'}++;
- }
- else # task_success
- {
- push @successful_task_uuids, $Jobstep->{'arvados_task'}->{uuid};
- ++$thisround_succeeded;
- $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
- $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
- $slot[$proc{$pid}->{slot}]->{node}->{fail_count} = 0;
- push @jobstep_done, $jobstepidx;
- Log ($jobstepidx, "success in $elapsed seconds");
- }
- $Jobstep->{exitcode} = $childstatus;
- $Jobstep->{finishtime} = time;
- $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
- retry_op(sub { $Jobstep->{'arvados_task'}->save; }, "job_tasks.update API");
- Log ($jobstepidx, sprintf("task output (%d bytes): %s",
- length($Jobstep->{'arvados_task'}->{output}),
- $Jobstep->{'arvados_task'}->{output}));
-
- close $reader{$jobstepidx};
- delete $reader{$jobstepidx};
- delete $slot[$proc{$pid}->{slot}]->{pid};
- push @freeslot, $proc{$pid}->{slot};
- delete $proc{$pid};
-
- $progress_is_dirty = 1;
- }
-
- if (scalar(@successful_task_uuids) > 0)
- {
- Log (undef, sprintf("%d tasks exited (%d succeeded), checking for new tasks from API server.", $children_reaped, scalar(@successful_task_uuids)));
- # Load new tasks
- my $newtask_list = [];
- my $newtask_results;
- do {
- $newtask_results = api_call(
- "job_tasks/list",
- 'filters' => [["created_by_job_task_uuid","in",\@successful_task_uuids]],
- 'order' => 'qsequence',
- 'offset' => scalar(@$newtask_list),
- );
- push(@$newtask_list, @{$newtask_results->{items}});
- } while (@{$newtask_results->{items}});
- Log (undef, sprintf("Got %d new tasks from API server.", scalar(@$newtask_list)));
- foreach my $arvados_task (@$newtask_list) {
- my $jobstep = {
- 'level' => $arvados_task->{'sequence'},
- 'failures' => 0,
- 'arvados_task' => $arvados_task
- };
- push @jobstep, $jobstep;
- push @jobstep_todo, $#jobstep;
- }
- }
-
- return $children_reaped;
-}
-
-sub check_refresh_wanted
-{
- my @stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
- if (@stat &&
- $stat[9] > $latest_refresh &&
- # ...and we have actually locked the job record...
- $job_id eq $Job->{'uuid'}) {
- $latest_refresh = scalar time;
- my $Job2 = api_call("jobs/get", uuid => $jobspec);
- for my $attr ('cancelled_at',
- 'cancelled_by_user_uuid',
- 'cancelled_by_client_uuid',
- 'state') {
- $Job->{$attr} = $Job2->{$attr};
- }
- if ($Job->{'state'} ne "Running") {
- if ($Job->{'state'} eq "Cancelled") {
- Log (undef, "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'});
- } else {
- Log (undef, "Job state unexpectedly changed to " . $Job->{'state'});
- }
- $main::success = 0;
- $main::please_freeze = 1;
- }
- }
-}
-
-sub check_squeue
-{
- my $last_squeue_check = $squeue_checked;
-
- # Do not call `squeue` or check the kill list more than once every
- # 15 seconds.
- return if $last_squeue_check > time - 15;
- $squeue_checked = time;
-
- # Look for children from which we haven't received stderr data since
- # the last squeue check. If no such children exist, all procs are
- # alive and there's no need to even look at squeue.
- #
- # As long as the crunchstat poll interval (10s) is shorter than the
- # squeue check interval (15s) this should make the squeue check an
- # infrequent event.
- my $silent_procs = 0;
- for my $js (map {$jobstep[$_->{jobstepidx}]} values %proc)
- {
- if (!exists($js->{stderr_at}))
- {
- $js->{stderr_at} = 0;
- }
- if ($js->{stderr_at} < $last_squeue_check)
- {
- $silent_procs++;
- }
- }
- return if $silent_procs == 0;
-
- # use killem() on procs whose killtime is reached
- while (my ($pid, $procinfo) = each %proc)
- {
- my $js = $jobstep[$procinfo->{jobstepidx}];
- if (exists $procinfo->{killtime}
- && $procinfo->{killtime} <= time
- && $js->{stderr_at} < $last_squeue_check)
- {
- my $sincewhen = "";
- if ($js->{stderr_at}) {
- $sincewhen = " in last " . (time - $js->{stderr_at}) . "s";
- }
- Log($procinfo->{jobstepidx}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
- killem ($pid);
- }
- }
-
- if (!$have_slurm)
- {
- # here is an opportunity to check for mysterious problems with local procs
- return;
- }
-
- # Get a list of steps still running. Note: squeue(1) says --steps
- # selects a format (which we override anyway) and allows us to
- # specify which steps we're interested in (which we don't).
- # Importantly, it also changes the meaning of %j from "job name" to
- # "step name" and (although this isn't mentioned explicitly in the
- # docs) switches from "one line per job" mode to "one line per step"
- # mode. Without it, we'd just get a list of one job, instead of a
- # list of N steps.
- my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOB_ID}\E --steps --format='%j' --noheader`;
- if ($? != 0)
- {
- Log(undef, "warning: squeue exit status $? ($!)");
- return;
- }
- chop @squeue;
-
- # which of my jobsteps are running, according to squeue?
- my %ok;
- for my $jobstepname (@squeue)
- {
- $ok{$jobstepname} = 1;
- }
-
- # Check for child procs >60s old and not mentioned by squeue.
- while (my ($pid, $procinfo) = each %proc)
- {
- if ($procinfo->{time} < time - 60
- && $procinfo->{jobstepname}
- && !exists $ok{$procinfo->{jobstepname}}
- && !exists $procinfo->{killtime})
- {
- # According to slurm, this task has ended (successfully or not)
- # -- but our srun child hasn't exited. First we must wait (30
- # seconds) in case this is just a race between communication
- # channels. Then, if our srun child process still hasn't
- # terminated, we'll conclude some slurm communication
- # error/delay has caused the task to die without notifying srun,
- # and we'll kill srun ourselves.
- $procinfo->{killtime} = time + 30;
- Log($procinfo->{jobstepidx}, "notice: task is not in slurm queue but srun process $pid has not exited");
- }
- }
-}
-
-sub check_sinfo
-{
- # If a node fails in a multi-node "srun" call during job setup, the call
- # may hang instead of exiting with a nonzero code. This function checks
- # "sinfo" for the health of the nodes that were allocated and ensures that
- # they are all still in the "alloc" state. If a node that is allocated to
- # this job is not in "alloc" state, then set please_freeze.
- #
- # This is only called from srun_sync() for node configuration. If a
- # node fails doing actual work, there are other recovery mechanisms.
-
- # Do not call `sinfo` more than once every 15 seconds.
- return if $sinfo_checked > time - 15;
- $sinfo_checked = time;
-
- # The output format "%t" means output node states.
- my @sinfo = `sinfo --nodes=\Q$ENV{SLURM_NODELIST}\E --noheader -o "%t"`;
- if ($? != 0)
- {
- Log(undef, "warning: sinfo exit status $? ($!)");
- return;
- }
- chop @sinfo;
-
- foreach (@sinfo)
- {
-    if ($_ ne "alloc" && $_ ne "alloc*") {
- $main::please_freeze = 1;
- }
- }
-}
-
-sub release_allocation
-{
- if ($have_slurm)
- {
- Log (undef, "release job allocation");
- system "scancel $ENV{SLURM_JOB_ID}";
- }
-}
-
-
-sub readfrompipes
-{
- my $gotsome = 0;
- my %fd_job;
- my $sel = IO::Select->new();
- foreach my $jobstepidx (keys %reader)
- {
- my $fd = $reader{$jobstepidx};
- $sel->add($fd);
- $fd_job{$fd} = $jobstepidx;
-
- if (my $stdout_fd = $jobstep[$jobstepidx]->{stdout_r}) {
- $sel->add($stdout_fd);
- $fd_job{$stdout_fd} = $jobstepidx;
- }
- }
- # select on all reader fds with 0.1s timeout
- my @ready_fds = $sel->can_read(0.1);
- foreach my $fd (@ready_fds)
- {
- my $buf;
- if (0 < sysread ($fd, $buf, 65536))
- {
- $gotsome = 1;
- print STDERR $buf if $ENV{CRUNCH_DEBUG};
-
- my $jobstepidx = $fd_job{$fd};
- if ($jobstep[$jobstepidx]->{stdout_r} == $fd) {
- $jobstep[$jobstepidx]->{stdout_captured} .= $buf;
- next;
- }
-
- $jobstep[$jobstepidx]->{stderr_at} = time;
- $jobstep[$jobstepidx]->{stderr} .= $buf;
-
- # Consume everything up to the last \n
- preprocess_stderr ($jobstepidx);
-
- if (length ($jobstep[$jobstepidx]->{stderr}) > 16384)
- {
- # If we get a lot of stderr without a newline, chop off the
- # front to avoid letting our buffer grow indefinitely.
- substr ($jobstep[$jobstepidx]->{stderr},
- 0, length($jobstep[$jobstepidx]->{stderr}) - 8192) = "";
- }
- }
- }
- return $gotsome;
-}
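
The loop above multiplexes the stderr (and optional stdout) pipes of many child processes through a single IO::Select. A standalone sketch of that pattern, illustrative only and not from the original script, using two hypothetical children in place of srun tasks:

    use strict;
    use warnings;
    use IO::Select;

    # Two hypothetical children, each writing one line to its own pipe.
    my %reader;
    for my $name ('taskA', 'taskB') {
        pipe(my $r, my $w) or die "pipe: $!";
        my $pid = fork();
        die "fork: $!" unless defined $pid;
        if ($pid == 0) {
            close $r;
            print $w "$name: hello from pid $$\n";
            close $w;
            exit 0;
        }
        close $w;
        $reader{$name} = $r;
    }

    my $sel = IO::Select->new(values %reader);
    my %buf;
    while ($sel->count) {
        for my $fd ($sel->can_read(0.1)) {
            my ($name) = grep { $reader{$_} == $fd } keys %reader;
            if (sysread($fd, my $chunk, 65536)) {
                $buf{$name} .= $chunk;
            } else {
                $sel->remove($fd);    # EOF: this child is done writing
                close $fd;
            }
        }
    }
    1 while wait() != -1;             # reap both children
    print "$_ captured: $buf{$_}" for sort keys %buf;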
-
-
-# Consume all full lines of stderr for a jobstep. Everything after the
-# last newline will remain in $jobstep[$jobstepidx]->{stderr} after
-# returning.
-sub preprocess_stderr
-{
- my $jobstepidx = shift;
- # slotindex is only defined for children running Arvados job tasks.
- # Be prepared to handle the undef case (for setup srun calls, etc.).
- my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
-
- while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
- my $line = $1;
- substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), "";
- Log ($jobstepidx, "stderr $line");
- if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/i) {
- # If the allocation is revoked, we can't possibly continue, so mark all
- # nodes as failed. This will cause the overall exit code to be
- # EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run
- # this job.
- $main::please_freeze = 1;
- foreach my $st (@slot) {
- $st->{node}->{fail_count}++;
- }
- }
- elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b|cannot communicate with node .* aborting job)/i) {
- $jobstep[$jobstepidx]->{tempfail} = 1;
- if (defined($job_slot_index)) {
- $slot[$job_slot_index]->{node}->{fail_count}++;
- ban_node_by_slot($job_slot_index);
- }
- }
- elsif ($line =~ /srun: error: (Unable to create job step|.*?: Communication connection failure)/i) {
- $jobstep[$jobstepidx]->{tempfail} = 1;
- ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
- }
- elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
- $jobstep[$jobstepidx]->{tempfail} = 1;
- }
- }
-}
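
Put differently: the regex loop peels off every complete line and leaves any unterminated tail in the buffer for the next read. A tiny standalone illustration with a hypothetical buffer (not part of the original script):

    use strict;
    use warnings;

    # Hypothetical buffer: two complete lines plus a partial third.
    my $stderr_buf = "srun: error: Node failure on compute3\nanother full line\npartial li";

    while ($stderr_buf =~ /^(.*?)\n/) {
        my $line = $1;
        substr $stderr_buf, 0, 1 + length($line), "";   # drop the line and its newline
        print "consumed: $line\n";
    }
    print "left for next read: $stderr_buf\n";           # "partial li"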
-
-
-# Read whatever is still available on its stderr+stdout pipes after
-# the given child process has exited.
-sub readfrompipes_after_exit
-{
- my $jobstepidx = shift;
-
- # The fact that the child has exited allows some convenient
- # simplifications: (1) all data must have already been written, so
- # there's no need to wait for more once sysread returns 0; (2) the
- # total amount of data available is bounded by the pipe buffer size,
- # so it's safe to read everything into one string.
- my $buf;
- while (0 < sysread ($reader{$jobstepidx}, $buf, 65536)) {
- $jobstep[$jobstepidx]->{stderr_at} = time;
- $jobstep[$jobstepidx]->{stderr} .= $buf;
- }
- if ($jobstep[$jobstepidx]->{stdout_r}) {
- while (0 < sysread ($jobstep[$jobstepidx]->{stdout_r}, $buf, 65536)) {
- $jobstep[$jobstepidx]->{stdout_captured} .= $buf;
- }
- }
- preprocess_stderr ($jobstepidx);
-
- map {
- Log ($jobstepidx, "stderr $_");
- } split ("\n", $jobstep[$jobstepidx]->{stderr});
- $jobstep[$jobstepidx]->{stderr} = '';
-}
-
-sub fetch_block
-{
- my $hash = shift;
- my $keep;
- if (!open($keep, "-|", "arv-get", "--retries", retry_count(), $hash)) {
- Log(undef, "fetch_block run error from arv-get $hash: $!");
- return undef;
- }
- my $output_block = "";
- while (1) {
- my $buf;
- my $bytes = sysread($keep, $buf, 1024 * 1024);
- if (!defined $bytes) {
- Log(undef, "fetch_block read error from arv-get: $!");
- $output_block = undef;
- last;
- } elsif ($bytes == 0) {
- # sysread returns 0 at the end of the pipe.
- last;
- } else {
- # some bytes were read into buf.
- $output_block .= $buf;
- }
- }
- close $keep;
- if ($?) {
- Log(undef, "fetch_block arv-get exited " . exit_status_s($?));
- $output_block = undef;
- }
- return $output_block;
-}
-
-# Create a collection by concatenating the output of all tasks (each
-# task's output is either a manifest fragment, a locator for a
-# manifest fragment stored in Keep, or nothing at all). Return the
-# portable_data_hash of the new collection.
-sub create_output_collection
-{
- Log (undef, "collate");
-
- my ($child_out, $child_in);
- # This depends on the python-arvados-python-client package, which needs to be installed
- # on the machine running crunch-dispatch (typically, the API server).
- my $pid = open2($child_out, $child_in, '/usr/share/python2.7/dist/python-arvados-python-client/bin/python', '-c', q{
-import arvados
-import sys
-print (arvados.api("v1").collections().
- create(body={"manifest_text": sys.stdin.read(),
- "owner_uuid": sys.argv[2]}).
- execute(num_retries=int(sys.argv[1]))["portable_data_hash"])
-}, retry_count(), $Job->{owner_uuid});
-
- my $task_idx = -1;
- my $manifest_size = 0;
- for (@jobstep)
- {
- ++$task_idx;
- my $output = $_->{'arvados_task'}->{output};
- next if (!defined($output));
- my $next_write;
- if ($output =~ /^[0-9a-f]{32}(\+\S+)*$/) {
- $next_write = fetch_block($output);
- } else {
- $next_write = $output;
- }
- if (defined($next_write)) {
- if (!defined(syswrite($child_in, $next_write))) {
- # There's been an error writing. Stop the loop.
- # We'll log details about the exit code later.
- last;
- } else {
- $manifest_size += length($next_write);
- }
- } else {
- my $uuid = $_->{'arvados_task'}->{'uuid'};
- Log (undef, "Error retrieving '$output' output by task $task_idx ($uuid)");
- $main::success = 0;
- }
- }
- close($child_in);
- Log(undef, "collated output manifest text to send to API server is $manifest_size bytes with access tokens");
-
- my $joboutput;
- my $s = IO::Select->new($child_out);
- if ($s->can_read(120)) {
- sysread($child_out, $joboutput, 1024 * 1024);
- waitpid($pid, 0);
- if ($?) {
- Log(undef, "output collection creation exited " . exit_status_s($?));
- $joboutput = undef;
- } else {
- chomp($joboutput);
- }
- } else {
- Log (undef, "timed out while creating output collection");
- foreach my $signal (2, 2, 2, 15, 15, 9) {
- kill($signal, $pid);
- last if waitpid($pid, WNOHANG) == -1;
- sleep(1);
- }
- }
- close($child_out);
-
- return $joboutput;
-}
-
-# Calls create_output_collection, logs the result, and returns it.
-# If that was successful, save that as the output in the job record.
-sub save_output_collection {
- my $collated_output = create_output_collection();
-
- if (!$collated_output) {
- Log(undef, "Failed to write output collection");
- }
- else {
- Log(undef, "job output $collated_output");
- $Job->update_attributes('output' => $collated_output);
- }
- return $collated_output;
-}
-
-sub killem
-{
- foreach (@_)
- {
- my $sig = 2; # SIGINT first
- if (exists $proc{$_}->{"sent_$sig"} &&
- time - $proc{$_}->{"sent_$sig"} > 4)
- {
- $sig = 15; # SIGTERM if SIGINT doesn't work
- }
- if (exists $proc{$_}->{"sent_$sig"} &&
- time - $proc{$_}->{"sent_$sig"} > 4)
- {
- $sig = 9; # SIGKILL if SIGTERM doesn't work
- }
- if (!exists $proc{$_}->{"sent_$sig"})
- {
- Log ($proc{$_}->{jobstepidx}, "sending 2x signal $sig to pid $_");
- kill $sig, $_;
- select (undef, undef, undef, 0.1);
- if ($sig == 2)
- {
- kill $sig, $_; # srun wants two SIGINT to really interrupt
- }
- $proc{$_}->{"sent_$sig"} = time;
- $proc{$_}->{"killedafter"} = time - $proc{$_}->{"time"};
- }
- }
-}
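
killem() escalates one step per call: SIGINT first (sent twice, since srun needs two interrupts), then SIGTERM, then SIGKILL, each only after the previous signal has had about four seconds to work. A standalone sketch of that escalation, illustrative only, against a single hypothetical stubborn child:

    use strict;
    use warnings;
    use POSIX ":sys_wait_h";

    # Hypothetical child that ignores the polite signals.
    my $pid = fork();
    die "fork: $!" unless defined $pid;
    if ($pid == 0) {
        $SIG{INT} = $SIG{TERM} = 'IGNORE';
        sleep 3600;
        exit 0;
    }

    my $reaped = 0;
    SIGNAL: for my $sig (2, 15, 9) {          # SIGINT, then SIGTERM, then SIGKILL
        kill $sig, $pid;
        for (1..40) {                         # give it ~4 seconds before escalating
            if (waitpid($pid, WNOHANG) == $pid) {
                $reaped = 1;
                last SIGNAL;
            }
            select(undef, undef, undef, 0.1);
        }
    }
    waitpid($pid, 0) unless $reaped;
    print "child $pid is gone\n";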
-
-
-sub fhbits
-{
- my($bits);
- for (@_) {
- vec($bits,fileno($_),1) = 1;
- }
- $bits;
-}
-
-
-# Send log output to Keep via arv-put.
-#
-# $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
-# $log_pipe_out_buf is a string containing all output read from arv-put so far.
-# $log_pipe_out_select is an IO::Select object around $log_pipe_out.
-# $log_pipe_pid is the pid of the arv-put subprocess.
-#
-# The only functions that should access these variables directly are:
-#
-# log_writer_start($logfilename)
-# Starts an arv-put pipe, reading data on stdin and writing it to
-# a $logfilename file in an output collection.
-#
-# log_writer_read_output([$timeout])
-# Read output from $log_pipe_out and append it to $log_pipe_out_buf.
-# Passes $timeout to the select() call, with a default of 0.01.
-# Returns the result of the last read() call on $log_pipe_out, or
-# -1 if read() wasn't called because select() timed out.
-# Only other log_writer_* functions should need to call this.
-#
-# log_writer_send($txt)
-# Writes $txt to the output log collection.
-#
-# log_writer_finish()
-# Closes the arv-put pipe and returns the output that it produces.
-#
-# log_writer_is_active()
-# Returns a true value if there is currently a live arv-put
-# process, false otherwise.
-#
-my ($log_pipe_in, $log_pipe_out, $log_pipe_out_buf, $log_pipe_out_select,
- $log_pipe_pid);
-
-sub log_writer_start($)
-{
- my $logfilename = shift;
- $log_pipe_pid = open2($log_pipe_out, $log_pipe_in,
- 'arv-put',
- '--stream',
- '--retries', '6',
- '--filename', $logfilename,
- '-');
- $log_pipe_out_buf = "";
- $log_pipe_out_select = IO::Select->new($log_pipe_out);
-}
-
-sub log_writer_read_output {
- my $timeout = shift || 0.01;
- my $read = -1;
- while ($read && $log_pipe_out_select->can_read($timeout)) {
- $read = read($log_pipe_out, $log_pipe_out_buf, 65536,
- length($log_pipe_out_buf));
- }
- if (!defined($read)) {
- Log(undef, "error reading log manifest from arv-put: $!");
- }
- return $read;
-}
-
-sub log_writer_send($)
-{
- my $txt = shift;
- print $log_pipe_in $txt;
- log_writer_read_output();
-}
-
-sub log_writer_finish()
-{
- return unless $log_pipe_pid;
-
- close($log_pipe_in);
-
- my $logger_failed = 0;
- my $read_result = log_writer_read_output(600);
- if ($read_result == -1) {
- $logger_failed = -1;
- Log (undef, "timed out reading from 'arv-put'");
- } elsif ($read_result != 0) {
- $logger_failed = -2;
- Log(undef, "failed to read arv-put log manifest to EOF");
- }
-
- waitpid($log_pipe_pid, 0);
- if ($?) {
- $logger_failed ||= $?;
- Log(undef, "log_writer_finish: arv-put exited " . exit_status_s($?))
- }
-
- close($log_pipe_out);
- my $arv_put_output = $logger_failed ? undef : $log_pipe_out_buf;
- $log_pipe_pid = $log_pipe_in = $log_pipe_out = $log_pipe_out_buf =
- $log_pipe_out_select = undef;
-
- return $arv_put_output;
-}
-
-sub log_writer_is_active() {
- return $log_pipe_pid;
-}
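
A standalone sketch of the plumbing these helpers rely on: IPC::Open2 gives a write end and a read end for one child process, and IO::Select lets the writer poll for output without blocking. Here `cat` stands in for arv-put purely to keep the illustration self-contained; it is not from the original script.

    use strict;
    use warnings;
    use IPC::Open2;
    use IO::Select;

    # `cat` echoes stdin back, standing in for the real uploader.
    my ($pipe_out, $pipe_in);
    my $pid = open2($pipe_out, $pipe_in, 'cat');

    my $collected = "";
    my $sel = IO::Select->new($pipe_out);

    sub drain {
        my $timeout = shift || 0.01;
        while ($sel->can_read($timeout)) {
            last unless read($pipe_out, $collected, 65536, length($collected));
        }
    }

    print $pipe_in "2019-08-08_14:05:07 example log line\n";
    drain();
    close($pipe_in);       # EOF tells the child to flush and exit
    drain(60);             # collect whatever it wrote on the way out
    waitpid($pid, 0);
    print "child returned: $collected";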
-
-sub Log # ($jobstepidx, $logmessage)
-{
- my ($jobstepidx, $logmessage) = @_;
- if ($logmessage =~ /\n/) {
- for my $line (split (/\n/, $_[1])) {
- Log ($jobstepidx, $line);
- }
- return;
- }
- my $fh = select STDERR; $|=1; select $fh;
- my $task_qseq = '';
- if (defined($jobstepidx) && exists($jobstep[$jobstepidx]->{arvados_task})) {
- $task_qseq = $jobstepidx;
- }
- my $message = sprintf ("%s %d %s %s", $job_id, $$, $task_qseq, $logmessage);
- $message =~ s{([^ -\176])}{"\\" . sprintf ("%03o", ord($1))}ge;
- $message .= "\n";
- my $datetime;
- if (log_writer_is_active() || -t STDERR) {
- my @gmtime = gmtime;
- $datetime = sprintf ("%04d-%02d-%02d_%02d:%02d:%02d",
- $gmtime[5]+1900, $gmtime[4]+1, @gmtime[3,2,1,0]);
- }
- print STDERR ((-t STDERR) ? ($datetime." ".$message) : $message);
-
- if (log_writer_is_active()) {
- log_writer_send($datetime . " " . $message);
- }
-}
-
-
-sub croak
-{
- my ($package, $file, $line) = caller;
- my $message = "@_ at $file line $line\n";
- Log (undef, $message);
- release_allocation();
- freeze() if @jobstep_todo;
- create_output_collection() if @jobstep_todo;
- cleanup();
- save_meta();
- die;
-}
-
-
-sub cleanup
-{
- return unless $Job;
- if ($Job->{'state'} eq 'Cancelled') {
- $Job->update_attributes('finished_at' => scalar gmtime);
- } else {
- $Job->update_attributes('state' => 'Failed');
- }
-}
-
-
-sub save_meta
-{
- my $justcheckpoint = shift; # false if this will be the last meta saved
- return if $justcheckpoint; # checkpointing is not relevant post-Warehouse.pm
- return unless log_writer_is_active();
- my $log_manifest = log_writer_finish();
- return unless defined($log_manifest);
-
- if ($Job->{log}) {
- my $prev_log_coll = api_call("collections/get", uuid => $Job->{log});
- $log_manifest = $prev_log_coll->{manifest_text} . $log_manifest;
- }
-
- my $log_coll = api_call(
- "collections/create", ensure_unique_name => 1, collection => {
- manifest_text => $log_manifest,
- owner_uuid => $Job->{owner_uuid},
- name => sprintf("Log from %s job %s", $Job->{script}, $Job->{uuid}),
- });
- Log(undef, "log collection is " . $log_coll->{portable_data_hash});
- $Job->update_attributes('log' => $log_coll->{portable_data_hash});
-
- return $log_coll->{portable_data_hash};
-}
-
-
-sub freeze_if_want_freeze
-{
- if ($main::please_freeze)
- {
- release_allocation();
- if (@_)
- {
- # kill some srun procs before freeze+stop
- map { $proc{$_} = {} } @_;
- while (%proc)
- {
- killem (keys %proc);
- select (undef, undef, undef, 0.1);
- my $died;
- while (($died = waitpid (-1, WNOHANG)) > 0)
- {
- delete $proc{$died};
- }
- }
- }
- freeze();
- create_output_collection();
- cleanup();
- save_meta();
- exit 1;
- }
-}
-
-
-sub freeze
-{
- Log (undef, "Freeze not implemented");
- return;
-}
-
-
-sub thaw
-{
- croak ("Thaw not implemented");
-}
-
-
-sub freezequote
-{
- my $s = shift;
- $s =~ s/\\/\\\\/g;
- $s =~ s/\n/\\n/g;
- return $s;
-}
-
-
-sub freezeunquote
-{
- my $s = shift;
- $s =~ s{\\(.)}{$1 eq "n" ? "\n" : $1}ge;
- return $s;
-}
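
The two helpers above are exact inverses: freezequote() escapes backslashes and newlines so a multi-line value fits on a single line, and freezeunquote() undoes it. A quick standalone round-trip check on a hypothetical input (the subs are restated verbatim so the snippet runs on its own):

    use strict;
    use warnings;

    sub freezequote   { my $s = shift; $s =~ s/\\/\\\\/g; $s =~ s/\n/\\n/g; return $s; }
    sub freezeunquote { my $s = shift; $s =~ s{\\(.)}{$1 eq "n" ? "\n" : $1}ge; return $s; }

    my $original = "line one\nline two with a literal \\n sequence";
    my $frozen   = freezequote($original);
    print "frozen (one physical line): $frozen\n";
    print "round trip ok\n" if freezeunquote($frozen) eq $original;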
-
-sub srun_sync
-{
- my $srunargs = shift;
- my $execargs = shift;
- my $opts = shift || {};
- my $stdin = shift;
-
- my $label = exists $opts->{label} ? $opts->{label} : "@$execargs";
- Log (undef, "$label: start");
-
- my ($stderr_r, $stderr_w);
- pipe $stderr_r, $stderr_w or croak("pipe() failed: $!");
-
- my ($stdout_r, $stdout_w);
- pipe $stdout_r, $stdout_w or croak("pipe() failed: $!");
-
- my $started_srun = scalar time;
-
- my $srunpid = fork();
- if ($srunpid == 0)
- {
- close($stderr_r);
- close($stdout_r);
- fcntl($stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
- fcntl($stdout_w, F_SETFL, 0) or croak($!);
- open(STDERR, ">&", $stderr_w) or croak ($!);
- open(STDOUT, ">&", $stdout_w) or croak ($!);
- srun ($srunargs, $execargs, $opts, $stdin);
- exit (1);
- }
- close($stderr_w);
- close($stdout_w);
-
- set_nonblocking($stderr_r);
- set_nonblocking($stdout_r);
-
- # Add entries to @jobstep and %proc so check_squeue() and
- # freeze_if_want_freeze() can treat it like a job task process.
- push @jobstep, {
- stderr => '',
- stderr_at => 0,
- stderr_captured => '',
- stdout_r => $stdout_r,
- stdout_captured => '',
- };
- my $jobstepidx = $#jobstep;
- $proc{$srunpid} = {
- jobstepidx => $jobstepidx,
- };
- $reader{$jobstepidx} = $stderr_r;
-
- while ($srunpid != waitpid ($srunpid, WNOHANG)) {
- my $busy = readfrompipes();
- if (!$busy || ($latest_refresh + 2 < scalar time)) {
- check_refresh_wanted();
- check_squeue();
- check_sinfo();
- }
- if (!$busy) {
- select(undef, undef, undef, 0.1);
- }
- if (($started_srun + $srun_sync_timeout) < scalar time) {
- # Exceeded general timeout for "srun_sync" operations, likely
- # means something got stuck on the remote node.
- Log(undef, "srun_sync exceeded timeout, will fail.");
- $main::please_freeze = 1;
- }
- killem(keys %proc) if $main::please_freeze;
- }
- my $exited = $?;
-
- readfrompipes_after_exit ($jobstepidx);
-
- Log (undef, "$label: exit ".exit_status_s($exited));
-
- close($stdout_r);
- close($stderr_r);
- delete $proc{$srunpid};
- delete $reader{$jobstepidx};
-
- my $j = pop @jobstep;
- # If the srun showed signs of tempfail, ensure the caller treats that as a
- # failure case.
- if ($main::please_freeze || $j->{tempfail}) {
- $exited ||= 255;
- }
- return ($exited, $j->{stdout_captured}, $j->{stderr_captured}, $j->{tempfail});
-}
-
-
-sub srun
-{
- my $srunargs = shift;
- my $execargs = shift;
- my $opts = shift || {};
- my $stdin = shift;
- my $args = $have_slurm ? [@$srunargs, @$execargs] : $execargs;
-
- $Data::Dumper::Terse = 1;
- $Data::Dumper::Indent = 0;
- my $show_cmd = Dumper($args);
- $show_cmd =~ s/(TOKEN\\*=)[^\s\']+/${1}[...]/g;
- $show_cmd =~ s/\n/ /g;
- if ($opts->{fork}) {
- Log(undef, "starting: $show_cmd");
- } else {
- # This is a child process: parent is in charge of reading our
- # stderr and copying it to Log() if needed.
- warn "starting: $show_cmd\n";
- }
-
- if (defined $stdin) {
- my $child = open STDIN, "-|";
- defined $child or die "no fork: $!";
- if ($child == 0) {
- print $stdin or die $!;
- close STDOUT or die $!;
- exit 0;
- }
- }
-
- return system (@$args) if $opts->{fork};
-
- exec @$args;
- warn "ENV size is ".length(join(" ",%ENV));
- die "exec failed: $!: @$args";
-}
-
-
-sub ban_node_by_slot {
- # Don't start any new jobsteps on this node for 60 seconds
- my $slotid = shift;
- $slot[$slotid]->{node}->{hold_until} = 60 + scalar time;
- $slot[$slotid]->{node}->{hold_count}++;
- Log (undef, "backing off node " . $slot[$slotid]->{node}->{name} . " for 60 seconds");
-}
-
-sub must_lock_now
-{
- my ($lockfile, $error_message) = @_;
- open L, ">", $lockfile or croak("$lockfile: $!");
- if (!flock L, LOCK_EX|LOCK_NB) {
- croak("Can't lock $lockfile: $error_message\n");
- }
-}
-
-sub find_docker_image {
- # Given a Keep locator, check to see if it contains a Docker image.
- # If so, return its stream name and Docker hash.
- # If not, return undef for both values.
- my $locator = shift;
- my ($streamname, $filename);
- my $image = api_call("collections/get", uuid => $locator);
- if ($image) {
- foreach my $line (split(/\n/, $image->{manifest_text})) {
- my @tokens = split(/\s+/, $line);
- next if (!@tokens);
- $streamname = shift(@tokens);
- foreach my $filedata (grep(/^\d+:\d+:/, @tokens)) {
- if (defined($filename)) {
- return (undef, undef); # More than one file in the Collection.
- } else {
- $filename = (split(/:/, $filedata, 3))[2];
- $filename =~ s/\\([0-3][0-7][0-7])/chr(oct($1))/ge;
- }
- }
- }
- }
- if (defined($filename) and ($filename =~ /^((?:sha256:)?[0-9A-Fa-f]{64})\.tar$/)) {
- return ($streamname, $1);
- } else {
- return (undef, undef);
- }
-}
-
-sub exit_retry_unlocked {
- Log(undef, "Transient failure with lock acquired; asking for re-dispatch by exiting ".EX_RETRY_UNLOCKED);
- exit(EX_RETRY_UNLOCKED);
-}
-
-sub retry_count {
- # Calculate the number of times an operation should be retried,
- # assuming exponential backoff, and that we're willing to retry as
- # long as tasks have been running. Enforce a minimum of 3 retries.
- my ($starttime, $endtime, $timediff, $retries);
- if (@jobstep) {
- $starttime = $jobstep[0]->{starttime};
- $endtime = $jobstep[-1]->{finishtime};
- }
- if (!defined($starttime)) {
- $timediff = 0;
- } elsif (!defined($endtime)) {
- $timediff = time - $starttime;
- } else {
- $timediff = ($endtime - $starttime) - (time - $endtime);
- }
- if ($timediff > 0) {
- $retries = int(log($timediff) / log(2));
- } else {
- $retries = 1; # Use the minimum.
- }
- return ($retries > 3) ? $retries : 3;
-}
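
So the retry budget grows roughly as log2 of how long the job's tasks have been running, and never drops below three. A standalone sketch of that arithmetic for a few hypothetical elapsed times (ignoring the completed-job adjustment above; not part of the original script):

    use strict;
    use warnings;

    sub retries_for {
        my $elapsed = shift;                       # seconds of task runtime
        return 3 if $elapsed <= 0;
        my $retries = int(log($elapsed) / log(2)); # floor of log2
        return $retries > 3 ? $retries : 3;
    }

    # 10s -> 3 (the floor), 60s -> 5, 1 hour -> 11, 10 hours -> 15.
    printf "%6ds -> %2d retries\n", $_, retries_for($_) for (10, 60, 3600, 36000);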
-
-sub retry_op {
-  # Pass in a code reference and a short description of the operation.
-  # The code reference will be called with the remaining arguments.
-  # If it dies, retry it with exponential backoff until it succeeds,
-  # or until the current retry_count is exhausted. Each retryable
-  # failure is logged along with the time of the next attempt.
- my $operation = shift;
- my $op_text = shift;
- my $retries = retry_count();
- my $retry_callback = sub {
- my ($try_count, $next_try_at, $errmsg) = @_;
- $errmsg =~ s/\s*\bat \Q$0\E line \d+\.?\s*//;
- $errmsg =~ s/\s/ /g;
- $errmsg =~ s/\s+$//;
- my $retry_msg;
- if ($next_try_at < time) {
- $retry_msg = "Retrying.";
- } else {
- my $next_try_fmt = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($next_try_at);
- $retry_msg = "Retrying at $next_try_fmt.";
- }
- Log(undef, "$op_text failed: $errmsg. $retry_msg");
- };
- foreach my $try_count (0..$retries) {
- my $next_try = time + (2 ** $try_count);
- my $result = eval { $operation->(@_); };
- if (!$@) {
- return $result;
- } elsif ($try_count < $retries) {
- $retry_callback->($try_count, $next_try, $@);
- my $sleep_time = $next_try - time;
- sleep($sleep_time) if ($sleep_time > 0);
- }
- }
- # Ensure the error message ends in a newline, so Perl doesn't add
- # retry_op's line number to it.
- chomp($@);
- die($@ . "\n");
-}
-
-sub api_call {
- # Pass in a /-separated API method name, and arguments for it.
- # This function will call that method, retrying as needed until
- # the current retry_count is exhausted, with a log on the first failure.
- my $method_name = shift;
- my $method = $arv;
- foreach my $key (split(/\//, $method_name)) {
- $method = $method->{$key};
- }
- return retry_op(sub { $method->execute(@_); }, "API method $method_name", @_);
-}
-
-sub exit_status_s {
- # Given a $?, return a human-readable exit code string like "0" or
- # "1" or "0 with signal 1" or "1 with signal 11".
- my $exitcode = shift;
- my $s = $exitcode >> 8;
- if ($exitcode & 0x7f) {
- $s .= " with signal " . ($exitcode & 0x7f);
- }
- if ($exitcode & 0x80) {
- $s .= " with core dump";
- }
- return $s;
-}
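
The argument is a raw wait status ($?): the exit code lives in the high byte, the terminating signal in the low seven bits, and 0x80 is set when a core was dumped. A quick standalone check with a few hypothetical status values (the sub is restated so the snippet runs on its own):

    use strict;
    use warnings;

    sub exit_status_s {
        my $exitcode = shift;
        my $s = $exitcode >> 8;
        $s .= " with signal " . ($exitcode & 0x7f) if $exitcode & 0x7f;
        $s .= " with core dump" if $exitcode & 0x80;
        return $s;
    }

    print exit_status_s(0),         "\n";   # "0"  (clean exit)
    print exit_status_s(1 << 8),    "\n";   # "1"  (exit code 1)
    print exit_status_s(11),        "\n";   # "0 with signal 11" (SIGSEGV)
    print exit_status_s(11 | 0x80), "\n";   # "0 with signal 11 with core dump"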
-
-sub handle_readall {
- # Pass in a glob reference to a file handle.
- # Read all its contents and return them as a string.
- my $fh_glob_ref = shift;
- local $/ = undef;
- return <$fh_glob_ref>;
-}
-
-sub tar_filename_n {
- my $n = shift;
- return sprintf("%s/git.%s.%d.tar", $ENV{CRUNCH_TMP}, $job_id, $n);
-}
-
-sub add_git_archive {
- # Pass in a git archive command as a string or list, a la system().
- # This method will save its output to be included in the archive sent to the
- # build script.
- my $git_input;
- $git_tar_count++;
- if (!open(GIT_ARCHIVE, ">", tar_filename_n($git_tar_count))) {
- croak("Failed to save git archive: $!");
- }
- my $git_pid = open2(">&GIT_ARCHIVE", $git_input, @_);
- close($git_input);
- waitpid($git_pid, 0);
- close(GIT_ARCHIVE);
- if ($?) {
- croak("Failed to save git archive: git exited " . exit_status_s($?));
- }
-}
-
-sub combined_git_archive {
- # Combine all saved tar archives into a single archive, then return its
- # contents in a string. Return undef if no archives have been saved.
- if ($git_tar_count < 1) {
- return undef;
- }
- my $base_tar_name = tar_filename_n(1);
- foreach my $tar_to_append (map { tar_filename_n($_); } (2..$git_tar_count)) {
- my $tar_exit = system("tar", "-Af", $base_tar_name, $tar_to_append);
- if ($tar_exit != 0) {
- croak("Error preparing build archive: tar -A exited " .
- exit_status_s($tar_exit));
- }
- }
- if (!open(GIT_TAR, "<", $base_tar_name)) {
- croak("Could not open build archive: $!");
- }
- my $tar_contents = handle_readall(\*GIT_TAR);
- close(GIT_TAR);
- return $tar_contents;
-}
-
-sub set_nonblocking {
- my $fh = shift;
- my $flags = fcntl ($fh, F_GETFL, 0) or croak ($!);
- fcntl ($fh, F_SETFL, $flags | O_NONBLOCK) or croak ($!);
-}
-
-__DATA__
-#!/usr/bin/env perl
-#
-# This is crunch-job's internal dispatch script. crunch-job running on the API
-# server invokes this script on individual compute nodes, or localhost if we're
-# running a job locally. It gets called in two modes:
-#
-# * No arguments: Installation mode. Read a tar archive from the DATA
-# file handle; it includes the Crunch script's source code, and
-# maybe SDKs as well. Those should be installed in the proper
-# locations. This runs outside of any Docker container, so don't try to
-# introspect Crunch's runtime environment.
-#
-# * With arguments: Crunch script run mode. This script should set up the
-#   environment, then run the command specified in the arguments. If the job
-#   uses a Docker image, this runs inside that container.
-
-use Fcntl ':flock';
-use File::Path qw( make_path remove_tree );
-use POSIX qw(getcwd);
-
-use constant TASK_TEMPFAIL => 111;
-
-# Map SDK subdirectories to the path environments they belong to.
-my %SDK_ENVVARS = ("perl/lib" => "PERLLIB", "ruby/lib" => "RUBYLIB");
-
-my $destdir = $ENV{"CRUNCH_SRC"};
-my $archive_hash = $ENV{"CRUNCH_GIT_ARCHIVE_HASH"};
-my $repo = $ENV{"CRUNCH_SRC_URL"};
-my $install_dir = $ENV{"CRUNCH_INSTALL"} || (getcwd() . "/opt");
-my $job_work = $ENV{"JOB_WORK"};
-my $task_work = $ENV{"TASK_WORK"};
-
-open(STDOUT_ORIG, ">&", STDOUT);
-open(STDERR_ORIG, ">&", STDERR);
-
-for my $dir ($destdir, $job_work, $task_work) {
- if ($dir) {
- make_path $dir;
- -e $dir or die "Failed to create temporary directory ($dir): $!";
- }
-}
-
-if ($task_work) {
- remove_tree($task_work, {keep_root => 1});
-}
-
-### Crunch script run mode
-if (@ARGV) {
- # We want to do routine logging during task 0 only. This gives the user
- # the information they need, but avoids repeating the information for every
- # task.
- my $Log;
- if ($ENV{TASK_SEQUENCE} eq "0") {
- $Log = sub {
- my $msg = shift;
- printf STDERR_ORIG "[Crunch] $msg\n", @_;
- };
- } else {
- $Log = sub { };
- }
-
- my $python_src = "$install_dir/python";
- my $venv_dir = "$job_work/.arvados.venv";
- my $venv_built = -e "$venv_dir/bin/activate";
- if ((!$venv_built) and (-d $python_src) and can_run("virtualenv")) {
- shell_or_die(undef, "virtualenv", "--quiet", "--system-site-packages",
- "--python=python2.7", $venv_dir);
- shell_or_die(TASK_TEMPFAIL, "$venv_dir/bin/pip", "--quiet", "install", "-I", $python_src);
- $venv_built = 1;
- $Log->("Built Python SDK virtualenv");
- }
-
- my @pysdk_version_cmd = ("python", "-c",
- "from pkg_resources import get_distribution as get; print get('arvados-python-client').version");
- if ($venv_built) {
- $Log->("Running in Python SDK virtualenv");
- @pysdk_version_cmd = ();
- my $orig_argv = join(" ", map { quotemeta($_); } @ARGV);
- @ARGV = ("/bin/sh", "-ec",
- ". \Q$venv_dir/bin/activate\E; exec $orig_argv");
- } elsif (-d $python_src) {
- $Log->("Warning: virtualenv not found inside Docker container default " .
- "\$PATH. Can't install Python SDK.");
- }
-
- if (@pysdk_version_cmd) {
- open(my $pysdk_version_pipe, "-|", @pysdk_version_cmd);
- my $pysdk_version = <$pysdk_version_pipe>;
- close($pysdk_version_pipe);
- if ($? == 0) {
- chomp($pysdk_version);
- $Log->("Using Arvados SDK version $pysdk_version");
- } else {
- # A lot could've gone wrong here, but pretty much all of it means that
- # Python won't be able to load the Arvados SDK.
- $Log->("Warning: Arvados SDK not found");
- }
- }
-
- while (my ($sdk_dir, $sdk_envkey) = each(%SDK_ENVVARS)) {
- my $sdk_path = "$install_dir/$sdk_dir";
- if (-d $sdk_path) {
- if ($ENV{$sdk_envkey}) {
- $ENV{$sdk_envkey} = "$sdk_path:" . $ENV{$sdk_envkey};
- } else {
- $ENV{$sdk_envkey} = $sdk_path;
- }
- $Log->("Arvados SDK added to %s", $sdk_envkey);
- }
- }
-
- exec(@ARGV);
- die "Cannot exec `@ARGV`: $!";
-}
-
-### Installation mode
-open L, ">", "$destdir.lock" or die "$destdir.lock: $!";
-flock L, LOCK_EX;
-if (readlink ("$destdir.archive_hash") eq $archive_hash && -d $destdir) {
- # This exact git archive (source + arvados sdk) is already installed
- # here, so there's no need to reinstall it.
-
- # We must consume our DATA section, though: otherwise the process
- # feeding it to us will get SIGPIPE.
- my $buf;
- while (read(DATA, $buf, 65536)) { }
-
- exit(0);
-}
-
-unlink "$destdir.archive_hash";
-mkdir $destdir;
-
-do {
- # Ignore SIGPIPE: we check retval of close() instead. See perlipc(1).
- local $SIG{PIPE} = "IGNORE";
- warn "Extracting archive: $archive_hash\n";
- # --ignore-zeros is necessary sometimes: depending on how much NUL
- # padding tar -A put on our combined archive (which in turn depends
- # on the length of the component archives) tar without
- # --ignore-zeros will exit before consuming stdin and cause close()
- # to fail on the resulting SIGPIPE.
- if (!open(TARX, "|-", "tar", "--ignore-zeros", "-xC", $destdir)) {
- die "Error launching 'tar -xC $destdir': $!";
- }
- # If we send too much data to tar in one write (> 4-5 MiB), it stops, and we
- # get SIGPIPE. We must feed it data incrementally.
- my $tar_input;
- while (read(DATA, $tar_input, 65536)) {
- print TARX $tar_input;
- }
- if(!close(TARX)) {
- die "'tar -xC $destdir' exited $?: $!";
- }
-};
-
-mkdir $install_dir;
-
-my $sdk_root = "$destdir/.arvados.sdk/sdk";
-if (-d $sdk_root) {
- foreach my $sdk_lang (("python",
- map { (split /\//, $_, 2)[0]; } keys(%SDK_ENVVARS))) {
- if (-d "$sdk_root/$sdk_lang") {
- if (!rename("$sdk_root/$sdk_lang", "$install_dir/$sdk_lang")) {
- die "Failed to install $sdk_lang SDK: $!";
- }
- }
- }
-}
-
-my $python_dir = "$install_dir/python";
-if ((-d $python_dir) and can_run("python2.7")) {
- open(my $egg_info_pipe, "-|",
- "python2.7 \Q$python_dir/setup.py\E egg_info 2>&1 >/dev/null");
- my @egg_info_errors = <$egg_info_pipe>;
- close($egg_info_pipe);
-
- if ($?) {
- if (@egg_info_errors and (($egg_info_errors[-1] =~ /\bgit\b/) or ($egg_info_errors[-1] =~ /\[Errno 2\]/))) {
- # egg_info apparently failed because it couldn't ask git for a build tag.
- # Specify no build tag.
- open(my $pysdk_cfg, ">>", "$python_dir/setup.cfg");
- print $pysdk_cfg "\n[egg_info]\ntag_build =\n";
- close($pysdk_cfg);
- } else {
- my $egg_info_exit = $? >> 8;
- foreach my $errline (@egg_info_errors) {
- warn $errline;
- }
- warn "python setup.py egg_info failed: exit $egg_info_exit";
- exit ($egg_info_exit || 1);
- }
- }
-}
-
-# Hide messages from the install script (unless it fails: shell_or_die
-# will show $destdir.log in that case).
-open(STDOUT, ">>", "$destdir.log") or die ($!);
-open(STDERR, ">&", STDOUT) or die ($!);
-
-if (-e "$destdir/crunch_scripts/install") {
- shell_or_die (undef, "$destdir/crunch_scripts/install", $install_dir);
-} elsif (!-e "./install.sh" && -e "./tests/autotests.sh") {
- # Old version
- shell_or_die (undef, "./tests/autotests.sh", $install_dir);
-} elsif (-e "./install.sh") {
- shell_or_die (undef, "./install.sh", $install_dir);
-}
-
-if ($archive_hash) {
- unlink "$destdir.archive_hash.new";
- symlink ($archive_hash, "$destdir.archive_hash.new") or die "$destdir.archive_hash.new: $!";
- rename ("$destdir.archive_hash.new", "$destdir.archive_hash") or die "$destdir.archive_hash: $!";
-}
-
-close L;
-
-sub can_run {
- my $command_name = shift;
- open(my $which, "-|", "which", $command_name) or die ($!);
- while (<$which>) { }
- close($which);
- return ($? == 0);
-}
-
-sub shell_or_die
-{
- my $exitcode = shift;
-
- if ($ENV{"DEBUG"}) {
- print STDERR "@_\n";
- }
- if (system (@_) != 0) {
- my $err = $!;
- my $code = $?;
- my $exitstatus = sprintf("exit %d signal %d", $code >> 8, $code & 0x7f);
- open STDERR, ">&STDERR_ORIG";
- system ("cat $destdir.log >&2");
- warn "@_ failed ($err): $exitstatus";
- if (defined($exitcode)) {
- exit $exitcode;
- }
- else {
- exit (($code >> 8) || 1);
- }
- }
-}
-
-__DATA__
commit bc1947e4aef52fe5f3aebc10dc2ea74cad86672d
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date: Wed Aug 7 16:39:55 2019 -0400
15133: API tests passing
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>
diff --git a/services/api/app/controllers/arvados/v1/jobs_controller.rb b/services/api/app/controllers/arvados/v1/jobs_controller.rb
index f6308c528..58a3fd168 100644
--- a/services/api/app/controllers/arvados/v1/jobs_controller.rb
+++ b/services/api/app/controllers/arvados/v1/jobs_controller.rb
@@ -28,13 +28,12 @@ class Arvados::V1::JobsController < ApplicationController
end
def queue
- return send_error("Unsupported legacy jobs API",
- status: 400)
+ @objects = []
+ index
end
def queue_size
- return send_error("Unsupported legacy jobs API",
- status: 400)
+ render :json => {:queue_size => 0}
end
def self._create_requires_parameters
diff --git a/services/api/test/functional/arvados/v1/job_reuse_controller_test.rb b/services/api/test/functional/arvados/v1/job_reuse_controller_test.rb
index d10ab6a71..02c5c6892 100644
--- a/services/api/test/functional/arvados/v1/job_reuse_controller_test.rb
+++ b/services/api/test/functional/arvados/v1/job_reuse_controller_test.rb
@@ -8,370 +8,11 @@ require 'helpers/git_test_helper'
class Arvados::V1::JobReuseControllerTest < ActionController::TestCase
fixtures :repositories, :users, :jobs, :links, :collections
- # See git_setup.rb for the commit log for test.git.tar
- include GitTestHelper
-
setup do
@controller = Arvados::V1::JobsController.new
authorize_with :active
end
- test "reuse job with no_reuse=false" do
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- an_integer: '1',
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "reuse job with find_or_create=true" do
- post :create, params: {
- job: {
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- },
- find_or_create: true
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "no reuse job with null log" do
- post :create, params: {
- job: {
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '3'
- }
- },
- find_or_create: true
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqq3', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "reuse job with symbolic script_version" do
- post :create, params: {
- job: {
- script: "hash",
- script_version: "tag1",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- },
- find_or_create: true
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "do not reuse job because no_reuse=true" do
- post :create, params: {
- job: {
- no_reuse: true,
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- [false, "false"].each do |whichfalse|
- test "do not reuse job because find_or_create=#{whichfalse.inspect}" do
- post :create, params: {
- job: {
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- },
- find_or_create: whichfalse
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
- end
-
- test "do not reuse job because output is not readable by user" do
- authorize_with :job_reader
- post :create, params: {
- job: {
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- },
- find_or_create: true
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "test_cannot_reuse_job_no_output" do
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '2'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykppp', new_job['uuid']
- end
-
- test "test_reuse_job_range" do
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash",
- minimum_script_version: "tag1",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "cannot_reuse_job_no_minimum_given_so_must_use_specified_commit" do
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '077ba2ad3ea24a929091a9e6ce545c93199b8e57', new_job['script_version']
- end
-
- test "test_cannot_reuse_job_different_input" do
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '2'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "test_cannot_reuse_job_different_version" do
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '2'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '077ba2ad3ea24a929091a9e6ce545c93199b8e57', new_job['script_version']
- end
-
- test "test_can_reuse_job_submitted_nondeterministic" do
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- },
- nondeterministic: true
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "test_cannot_reuse_job_past_nondeterministic" do
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash2",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykyyy', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "test_cannot_reuse_job_no_permission" do
- authorize_with :spectator
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "test_cannot_reuse_job_excluded" do
- post :create, params: {
- job: {
- no_reuse: false,
- script: "hash",
- minimum_script_version: "31ce37fe365b3dc204300a3e4c396ad333ed0556",
- script_version: "master",
- repository: "active/foo",
- exclude_script_versions: ["tag1"],
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_not_equal('4fe459abe02d9b365932b8f5dc419439ab4e2577',
- new_job['script_version'])
- end
-
- test "cannot reuse job with find_or_create but excluded version" do
- post :create, params: {
- job: {
- script: "hash",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- },
- find_or_create: true,
- minimum_script_version: "31ce37fe365b3dc204300a3e4c396ad333ed0556",
- exclude_script_versions: ["tag1"],
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_not_equal('4fe459abe02d9b365932b8f5dc419439ab4e2577',
- new_job['script_version'])
- end
-
- test "cannot reuse job when hash-like branch includes newer commit" do
- check_new_job_created_from({job: {script_version: "738783"}},
- :previous_job_run_superseded_by_hash_branch)
- end
-
BASE_FILTERS = {
'repository' => ['=', 'active/foo'],
'script' => ['=', 'hash'],
@@ -384,217 +25,6 @@ class Arvados::V1::JobReuseControllerTest < ActionController::TestCase
hash.each_pair.map { |name, filter| [name] + filter }
end
- test "can reuse a Job based on filters" do
- filters_hash = BASE_FILTERS.
- merge('script_version' => ['in git', 'tag1'])
- post(:create, params: {
- job: {
- script: "hash",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- },
- filters: filters_from_hash(filters_hash),
- find_or_create: true,
- })
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "can not reuse a Job based on filters" do
- filters = filters_from_hash(BASE_FILTERS
- .reject { |k| k == 'script_version' })
- filters += [["script_version", "in git",
- "31ce37fe365b3dc204300a3e4c396ad333ed0556"],
- ["script_version", "not in git", ["tag1"]]]
- post(:create, params: {
- job: {
- script: "hash",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- },
- filters: filters,
- find_or_create: true,
- })
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '077ba2ad3ea24a929091a9e6ce545c93199b8e57', new_job['script_version']
- end
-
- test "can not reuse a Job based on arbitrary filters" do
- filters_hash = BASE_FILTERS.
- merge("created_at" => ["<", "2010-01-01T00:00:00Z"])
- post(:create, params: {
- job: {
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- },
- filters: filters_from_hash(filters_hash),
- find_or_create: true,
- })
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_equal 'zzzzz-8i9sb-cjs4pklxxjykqqq', new_job['uuid']
- assert_equal '4fe459abe02d9b365932b8f5dc419439ab4e2577', new_job['script_version']
- end
-
- test "can reuse a Job with a Docker image" do
- post(:create, params: {
- job: {
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- },
- runtime_constraints: {
- docker_image: 'arvados/apitestfixture',
- }
- },
- find_or_create: true,
- })
- assert_response :success
- new_job = assigns(:object)
- assert_not_nil new_job
- target_job = jobs(:previous_docker_job_run)
- [:uuid, :script_version, :docker_image_locator].each do |attr|
- assert_equal(target_job.send(attr), new_job.send(attr))
- end
- end
-
- test "can reuse a Job with a Docker image hash filter" do
- filters_hash = BASE_FILTERS.
- merge("script_version" =>
- ["=", "4fe459abe02d9b365932b8f5dc419439ab4e2577"],
- "docker_image_locator" =>
- ["in docker", links(:docker_image_collection_hash).name])
- post(:create, params: {
- job: {
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- },
- },
- filters: filters_from_hash(filters_hash),
- find_or_create: true,
- })
- assert_response :success
- new_job = assigns(:object)
- assert_not_nil new_job
- target_job = jobs(:previous_docker_job_run)
- [:uuid, :script_version, :docker_image_locator].each do |attr|
- assert_equal(target_job.send(attr), new_job.send(attr))
- end
- end
-
- test "reuse Job with Docker image repo+tag" do
- filters_hash = BASE_FILTERS.
- merge("script_version" =>
- ["=", "4fe459abe02d9b365932b8f5dc419439ab4e2577"],
- "docker_image_locator" =>
- ["in docker", links(:docker_image_collection_tag2).name])
- post(:create, params: {
- job: {
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- },
- },
- filters: filters_from_hash(filters_hash),
- find_or_create: true,
- })
- assert_response :success
- new_job = assigns(:object)
- assert_not_nil new_job
- target_job = jobs(:previous_docker_job_run)
- [:uuid, :script_version, :docker_image_locator].each do |attr|
- assert_equal(target_job.send(attr), new_job.send(attr))
- end
- end
-
- test "new job with unknown Docker image filter" do
- filters_hash = BASE_FILTERS.
- merge("docker_image_locator" => ["in docker", "_nonesuchname_"])
- post(:create, params: {
- job: {
- script: "hash",
- script_version: "4fe459abe02d9b365932b8f5dc419439ab4e2577",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- },
- },
- filters: filters_from_hash(filters_hash),
- find_or_create: true,
- })
- assert_response :success
- new_job = assigns(:object)
- assert_not_nil new_job
- assert_not_equal(jobs(:previous_docker_job_run).uuid, new_job.uuid)
- end
-
- test "don't reuse job using older Docker image of same name" do
- jobspec = {runtime_constraints: {
- docker_image: "arvados/apitestfixture",
- }}
- check_new_job_created_from({job: jobspec},
- :previous_ancient_docker_image_job_run)
- end
-
- test "reuse job with Docker image that has hash name" do
- jobspec = {runtime_constraints: {
- docker_image: "a" * 64,
- }}
- check_job_reused_from(jobspec, :previous_docker_job_run)
- end
-
- ["repository", "script"].each do |skip_key|
- test "missing #{skip_key} filter raises an error" do
- filters = filters_from_hash(BASE_FILTERS.reject { |k| k == skip_key })
- post(:create, params: {
- job: {
- script: "hash",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- },
- filters: filters,
- find_or_create: true,
- })
- assert_includes(405..599, @response.code.to_i,
- "bad status code with missing #{skip_key} filter")
- end
- end
-
test "find Job with script version range" do
get :index, params: {
filters: [["repository", "=", "active/foo"],
@@ -673,136 +103,4 @@ class Arvados::V1::JobReuseControllerTest < ActionController::TestCase
jobs(:previous_docker_job_run).uuid)
end
- JOB_SUBMIT_KEYS = [:script, :script_parameters, :script_version, :repository]
- DEFAULT_START_JOB = :previous_job_run
-
- def create_job_params(params, start_from=DEFAULT_START_JOB)
- if not params.has_key?(:find_or_create)
- params[:find_or_create] = true
- end
- job_attrs = params.delete(:job) || {}
- start_job = jobs(start_from)
- params[:job] = Hash[JOB_SUBMIT_KEYS.map do |key|
- [key, start_job.send(key)]
- end]
- params[:job][:runtime_constraints] =
- job_attrs.delete(:runtime_constraints) || {}
- { arvados_sdk_version: :arvados_sdk_version,
- docker_image_locator: :docker_image }.each do |method, constraint_key|
- if constraint_value = start_job.send(method)
- params[:job][:runtime_constraints][constraint_key] ||= constraint_value
- end
- end
- params[:job].merge!(job_attrs)
- params
- end
-
- def create_job_from(params, start_from)
- post(:create, params: create_job_params(params, start_from))
- assert_response :success
- new_job = assigns(:object)
- assert_not_nil new_job
- new_job
- end
-
- def check_new_job_created_from(params, start_from=DEFAULT_START_JOB)
- start_time = Time.now
- new_job = create_job_from(params, start_from)
- assert_operator(start_time, :<=, new_job.created_at)
- new_job
- end
-
- def check_job_reused_from(params, start_from)
- new_job = create_job_from(params, start_from)
- assert_equal(jobs(start_from).uuid, new_job.uuid)
- end
-
- def check_errors_from(params, start_from=DEFAULT_START_JOB)
- post(:create, params: create_job_params(params, start_from))
- assert_includes(405..499, @response.code.to_i)
- errors = json_response.fetch("errors", [])
- assert(errors.any?, "no errors assigned from #{params}")
- refute(errors.any? { |msg| msg =~ /^#<[A-Za-z]+: / },
- "errors include raw exception: #{errors.inspect}")
- errors
- end
-
- # 1de84a8 is on the b1 branch, after master's tip.
- test "new job created from unsatisfiable minimum version filter" do
- filters_hash = BASE_FILTERS.merge("script_version" => ["in git", "1de84a8"])
- check_new_job_created_from(filters: filters_from_hash(filters_hash))
- end
-
- test "new job created from unsatisfiable minimum version parameter" do
- check_new_job_created_from(minimum_script_version: "1de84a8")
- end
-
- test "new job created from unsatisfiable minimum version attribute" do
- check_new_job_created_from(job: {minimum_script_version: "1de84a8"})
- end
-
- test "graceful error from nonexistent minimum version filter" do
- filters_hash = BASE_FILTERS.merge("script_version" =>
- ["in git", "__nosuchbranch__"])
- errors = check_errors_from(filters: filters_from_hash(filters_hash))
- assert(errors.any? { |msg| msg.include? "__nosuchbranch__" },
- "bad refspec not mentioned in error message")
- end
-
- test "graceful error from nonexistent minimum version parameter" do
- errors = check_errors_from(minimum_script_version: "__nosuchbranch__")
- assert(errors.any? { |msg| msg.include? "__nosuchbranch__" },
- "bad refspec not mentioned in error message")
- end
-
- test "graceful error from nonexistent minimum version attribute" do
- errors = check_errors_from(job: {minimum_script_version: "__nosuchbranch__"})
- assert(errors.any? { |msg| msg.include? "__nosuchbranch__" },
- "bad refspec not mentioned in error message")
- end
-
- test "don't reuse job with older Arvados SDK version specified by branch" do
- jobspec = {runtime_constraints: {
- arvados_sdk_version: "master",
- }}
- check_new_job_created_from({job: jobspec},
- :previous_job_run_with_arvados_sdk_version)
- end
-
- test "don't reuse job with older Arvados SDK version specified by commit" do
- jobspec = {runtime_constraints: {
- arvados_sdk_version: "ca68b24e51992e790f29df5cc4bc54ce1da4a1c2",
- }}
- check_new_job_created_from({job: jobspec},
- :previous_job_run_with_arvados_sdk_version)
- end
-
- test "don't reuse job with newer Arvados SDK version specified by commit" do
- jobspec = {runtime_constraints: {
- arvados_sdk_version: "436637c87a1d2bdbf4b624008304064b6cf0e30c",
- }}
- check_new_job_created_from({job: jobspec},
- :previous_job_run_with_arvados_sdk_version)
- end
-
- test "reuse job from arvados_sdk_version git filters" do
- prev_job = jobs(:previous_job_run_with_arvados_sdk_version)
- filters_hash = BASE_FILTERS.
- merge("arvados_sdk_version" => ["in git", "commit2"],
- "docker_image_locator" => ["=", prev_job.docker_image_locator])
- filters_hash.delete("script_version")
- params = create_job_params(filters: filters_from_hash(filters_hash))
- post(:create, params: params)
- assert_response :success
- assert_equal(prev_job.uuid, assigns(:object).uuid)
- end
-
- test "create new job because of arvados_sdk_version 'not in git' filters" do
- filters_hash = BASE_FILTERS.reject { |k| k == "script_version" }
- filters = filters_from_hash(filters_hash)
- # Allow anything from the root commit, but before commit 2.
- filters += [["arvados_sdk_version", "in git", "436637c8"],
- ["arvados_sdk_version", "not in git", "00634b2b"]]
- check_new_job_created_from(filters: filters)
- end
end
diff --git a/services/api/test/functional/arvados/v1/jobs_controller_test.rb b/services/api/test/functional/arvados/v1/jobs_controller_test.rb
index 3803a0dc4..9298f23d5 100644
--- a/services/api/test/functional/arvados/v1/jobs_controller_test.rb
+++ b/services/api/test/functional/arvados/v1/jobs_controller_test.rb
@@ -7,172 +7,6 @@ require 'helpers/git_test_helper'
class Arvados::V1::JobsControllerTest < ActionController::TestCase
- include GitTestHelper
-
- test "submit a job" do
- authorize_with :active
- post :create, params: {
- job: {
- script: "hash",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {}
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = JSON.parse(@response.body)
- assert_not_nil new_job['uuid']
- assert_not_nil new_job['script_version'].match(/^[0-9a-f]{40}$/)
- assert_equal 0, new_job['priority']
- end
-
- test "normalize output and log uuids when creating job" do
- authorize_with :active
- post :create, params: {
- job: {
- script: "hash",
- script_version: "master",
- script_parameters: {},
- repository: "active/foo",
- started_at: Time.now,
- finished_at: Time.now,
- running: false,
- success: true,
- output: 'd41d8cd98f00b204e9800998ecf8427e+0+K@xyzzy',
- log: 'd41d8cd98f00b204e9800998ecf8427e+0+K@xyzzy'
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- new_job = assigns(:object)
- assert_equal 'd41d8cd98f00b204e9800998ecf8427e+0', new_job['log']
- assert_equal 'd41d8cd98f00b204e9800998ecf8427e+0', new_job['output']
- version = new_job['script_version']
-
- # Make sure version doesn't get mangled by normalize
- assert_not_nil version.match(/^[0-9a-f]{40}$/)
- assert_equal 'master', json_response['supplied_script_version']
- end
-
- test "normalize output and log uuids when updating job" do
- authorize_with :active
-
- foobar_job = jobs(:foobar)
-
- new_output = 'd41d8cd98f00b204e9800998ecf8427e+0+K@xyzzy'
- new_log = 'd41d8cd98f00b204e9800998ecf8427e+0+K@xyzzy'
- put :update, params: {
- id: foobar_job['uuid'],
- job: {
- output: new_output,
- log: new_log
- }
- }
-
- updated_job = json_response
- assert_not_equal foobar_job['log'], updated_job['log']
- assert_not_equal new_log, updated_job['log'] # normalized during update
- assert_equal new_log[0,new_log.rindex('+')], updated_job['log']
- assert_not_equal foobar_job['output'], updated_job['output']
- assert_not_equal new_output, updated_job['output'] # normalized during update
- assert_equal new_output[0,new_output.rindex('+')], updated_job['output']
- end
-
- test "cancel a running job" do
- # We need to verify that "cancel" creates a trigger file, so first
- # let's make sure there is no stale trigger file.
- begin
- File.unlink(Rails.configuration.Containers.JobsAPI.CrunchRefreshTrigger)
- rescue Errno::ENOENT
- end
-
- authorize_with :active
- put :update, params: {
- id: jobs(:running).uuid,
- job: {
- cancelled_at: 4.day.ago
- }
- }
- assert_response :success
- assert_not_nil assigns(:object)
- job = JSON.parse(@response.body)
- assert_not_nil job['uuid']
- assert_not_nil job['cancelled_at']
- assert_not_nil job['cancelled_by_user_uuid']
- assert_not_nil job['cancelled_by_client_uuid']
- assert_equal(true, Time.parse(job['cancelled_at']) > 1.minute.ago,
- 'server should correct bogus cancelled_at ' +
- job['cancelled_at'])
- assert_equal(true,
- File.exist?(Rails.configuration.Containers.JobsAPI.CrunchRefreshTrigger),
- 'trigger file should be created when job is cancelled')
- end
-
- [
- [:put, :update, {job:{cancelled_at: Time.now}}, :success],
- [:put, :update, {job:{cancelled_at: nil}}, :unprocessable_entity],
- [:put, :update, {job:{state: 'Cancelled'}}, :success],
- [:put, :update, {job:{state: 'Queued'}}, :unprocessable_entity],
- [:put, :update, {job:{state: 'Running'}}, :unprocessable_entity],
- [:put, :update, {job:{state: 'Failed'}}, :unprocessable_entity],
- [:put, :update, {job:{state: 'Complete'}}, :unprocessable_entity],
- [:post, :cancel, {}, :success],
- ].each do |http_method, action, params, expected_response|
- test "cancelled job stays cancelled after #{[http_method, action, params].inspect}" do
- # We need to verify that "cancel" creates a trigger file, so first
- # let's make sure there is no stale trigger file.
- begin
- File.unlink(Rails.configuration.Containers.JobsAPI.CrunchRefreshTrigger)
- rescue Errno::ENOENT
- end
-
- authorize_with :active
- self.send http_method, action, params: { id: jobs(:cancelled).uuid }.merge(params)
- assert_response expected_response
- if expected_response == :success
- job = json_response
- assert_not_nil job['cancelled_at'], 'job cancelled again using #{attribute}=#{value} did not have cancelled_at value'
- assert_equal job['state'], 'Cancelled', 'cancelled again job state changed when updated using using #{attribute}=#{value}'
- end
- # Verify database record still says Cancelled
- assert_equal 'Cancelled', Job.find(jobs(:cancelled).id).state, 'job was un-cancelled'
- end
- end
-
- test "cancelled job updated to any other state change results in error" do
- # We need to verify that "cancel" creates a trigger file, so first
- # let's make sure there is no stale trigger file.
- begin
- File.unlink(Rails.configuration.Containers.JobsAPI.CrunchRefreshTrigger)
- rescue Errno::ENOENT
- end
-
- authorize_with :active
- put :update, params: {
- id: jobs(:running_cancelled).uuid,
- job: {
- cancelled_at: nil
- }
- }
- assert_response 422
- end
-
- ['abc.py', 'hash.py'].each do |script|
- test "update job script attribute to #{script} without failing script_version check" do
- authorize_with :admin
- put :update, params: {
- id: jobs(:uses_nonexistent_script_version).uuid,
- job: {
- script: script
- }
- }
- assert_response :success
- resp = assigns(:object)
- assert_equal jobs(:uses_nonexistent_script_version).script_version, resp['script_version']
- end
- end
-
test "search jobs by uuid with >= query" do
authorize_with :active
get :index, params: {
@@ -331,52 +165,12 @@ class Arvados::V1::JobsControllerTest < ActionController::TestCase
assert_response 422
end
- test "finish a job" do
- authorize_with :active
- put :update, params: {
- id: jobs(:nearly_finished_job).uuid,
- job: {
- output: '551392cc37a317abf865b95f66f4ef94+101',
- log: '9215de2a951a721f5f156bc08cf63ad7+93',
- tasks_summary: {done: 1, running: 0, todo: 0, failed: 0},
- success: true,
- running: false,
- finished_at: Time.now.to_s
- }
- }
- assert_response :success
- end
-
[:spectator, :admin].each_with_index do |which_token, i|
test "get job queue as #{which_token} user" do
authorize_with which_token
get :queue
assert_response :success
- assert_equal i, assigns(:objects).count
- end
- end
-
- test "get job queue as with a = filter" do
- authorize_with :admin
- get :queue, params: { filters: [['script','=','foo']] }
- assert_response :success
- assert_equal ['foo'], assigns(:objects).collect(&:script).uniq
- assert_equal 0, assigns(:objects)[0].queue_position
- end
-
- test "get job queue as with a != filter" do
- authorize_with :admin
- get :queue, params: { filters: [['script','!=','foo']] }
- assert_response :success
- assert_equal 0, assigns(:objects).count
- end
-
- [:spectator, :admin].each do |which_token|
- test "get queue_size as #{which_token} user" do
- authorize_with which_token
- get :queue_size
- assert_response :success
- assert_equal 1, JSON.parse(@response.body)["queue_size"]
+ assert_equal 0, assigns(:objects).count
end
end
@@ -387,67 +181,6 @@ class Arvados::V1::JobsControllerTest < ActionController::TestCase
assert_equal([nodes(:busy).uuid], json_response["node_uuids"])
end
- test "job lock success" do
- authorize_with :active
- post :lock, params: {id: jobs(:queued).uuid}
- assert_response :success
- job = Job.where(uuid: jobs(:queued).uuid).first
- assert_equal "Running", job.state
- end
-
- test "job lock conflict" do
- authorize_with :active
- post :lock, params: {id: jobs(:running).uuid}
- assert_response 422 # invalid state transition
- end
-
- test 'reject invalid commit in remote repository' do
- authorize_with :active
- url = "http://localhost:1/fake/fake.git"
- fetch_remote_from_local_repo url, :foo
- post :create, params: {
- job: {
- script: "hash",
- script_version: "abc123",
- repository: url,
- script_parameters: {}
- }
- }
- assert_response 422
- end
-
- test 'tag remote commit in internal repository' do
- authorize_with :active
- url = "http://localhost:1/fake/fake.git"
- fetch_remote_from_local_repo url, :foo
- post :create, params: {
- job: {
- script: "hash",
- script_version: "master",
- repository: url,
- script_parameters: {}
- }
- }
- assert_response :success
- assert_equal('077ba2ad3ea24a929091a9e6ce545c93199b8e57',
- internal_tag(json_response['uuid']))
- end
-
- test 'tag local commit in internal repository' do
- authorize_with :active
- post :create, params: {
- job: {
- script: "hash",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {}
- }
- }
- assert_response :success
- assert_equal('077ba2ad3ea24a929091a9e6ce545c93199b8e57',
- internal_tag(json_response['uuid']))
- end
-
test 'get job with components' do
authorize_with :active
get :show, params: {id: jobs(:running_job_with_components).uuid}
@@ -455,42 +188,4 @@ class Arvados::V1::JobsControllerTest < ActionController::TestCase
assert_not_nil json_response["components"]
assert_equal ["component1", "component2"], json_response["components"].keys
end
-
- [
- [:active, :success],
- [:system_user, :success],
- [:admin, 403],
- ].each do |user, expected|
- test "add components to job locked by active user as #{user} user and expect #{expected}" do
- authorize_with user
- put :update, params: {
- id: jobs(:running).uuid,
- job: {
- components: {"component1" => "value1", "component2" => "value2"}
- }
- }
- assert_response expected
- if expected == :success
- assert_not_nil json_response["components"]
- keys = json_response["components"].keys
- assert_equal ["component1", "component2"], keys
- assert_equal "value1", json_response["components"][keys[0]]
- end
- end
- end
-
- test 'jobs.create disabled in config' do
- Rails.configuration.API.DisabledAPIs = {"jobs.create"=>{},
- "pipeline_instances.create"=>{}}
- authorize_with :active
- post :create, params: {
- job: {
- script: "hash",
- script_version: "master",
- repository: "active/foo",
- script_parameters: {}
- }
- }
- assert_response 404
- end
end
diff --git a/services/api/test/functional/arvados/v1/pipeline_instances_controller_test.rb b/services/api/test/functional/arvados/v1/pipeline_instances_controller_test.rb
index a76151150..e455354c1 100644
--- a/services/api/test/functional/arvados/v1/pipeline_instances_controller_test.rb
+++ b/services/api/test/functional/arvados/v1/pipeline_instances_controller_test.rb
@@ -5,48 +5,4 @@
require 'test_helper'
class Arvados::V1::PipelineInstancesControllerTest < ActionController::TestCase
-
- test 'create pipeline with components copied from template' do
- authorize_with :active
- post :create, params: {
- pipeline_instance: {
- pipeline_template_uuid: pipeline_templates(:two_part).uuid
- }
- }
- assert_response :success
- assert_equal(pipeline_templates(:two_part).components.to_json,
- assigns(:object).components.to_json)
- end
-
- test 'create pipeline with no template' do
- authorize_with :active
- post :create, params: {
- pipeline_instance: {
- components: {}
- }
- }
- assert_response :success
- assert_equal({}, assigns(:object).components)
- end
-
- [
- true,
- false
- ].each do |cascade|
- test "cancel a pipeline instance with cascade=#{cascade}" do
- authorize_with :active
- pi_uuid = pipeline_instances(:job_child_pipeline_with_components_at_level_2).uuid
-
- post :cancel, params: {id: pi_uuid, cascade: cascade}
- assert_response :success
-
- pi = PipelineInstance.where(uuid: pi_uuid).first
- assert_equal "Paused", pi.state
-
- children = Job.where(uuid: ['zzzzz-8i9sb-job1atlevel3noc', 'zzzzz-8i9sb-job2atlevel3noc'])
- children.each do |child|
- assert_equal ("Cancelled" == child.state), cascade
- end
- end
- end
end
diff --git a/services/api/test/integration/crunch_dispatch_test.rb b/services/api/test/integration/crunch_dispatch_test.rb
deleted file mode 100644
index 6ac127087..000000000
--- a/services/api/test/integration/crunch_dispatch_test.rb
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-require 'test_helper'
-require 'helpers/git_test_helper'
-
-class CrunchDispatchIntegrationTest < ActionDispatch::IntegrationTest
- include GitTestHelper
-
- fixtures :all
-
- @@crunch_dispatch_pid = nil
-
- def launch_crunch_dispatch
- @@crunch_dispatch_pid = Process.fork {
- ENV['PATH'] = ENV['HOME'] + '/arvados/services/crunch:' + ENV['PATH']
- exec(ENV['HOME'] + '/arvados/services/api/script/crunch-dispatch.rb')
- }
- end
-
- teardown do
- if @@crunch_dispatch_pid
- Process.kill "TERM", @@crunch_dispatch_pid
- Process.wait
- @@crunch_dispatch_pid = nil
- end
- end
-
- test "job runs" do
- post "/arvados/v1/jobs",
- params: {
- format: "json",
- job: {
- script: "log",
- repository: "active/crunchdispatchtest",
- script_version: "f35f99b7d32bac257f5989df02b9f12ee1a9b0d6",
- script_parameters: {
- input: 'fa7aeb5140e2848d39b416daeef4ffc5+45',
- an_integer: '1'
- }
- }
- },
- headers: auth(:admin)
- assert_response :success
- end
-end
diff --git a/services/api/test/integration/jobs_api_test.rb b/services/api/test/integration/jobs_api_test.rb
index f5fb920b4..76d4fff59 100644
--- a/services/api/test/integration/jobs_api_test.rb
+++ b/services/api/test/integration/jobs_api_test.rb
@@ -5,87 +5,4 @@
require 'test_helper'
class JobsApiTest < ActionDispatch::IntegrationTest
- fixtures :all
-
- test "cancel job" do
- post "/arvados/v1/jobs/#{jobs(:running).uuid}/cancel",
- params: {:format => :json},
- headers: {'HTTP_AUTHORIZATION' => "OAuth2 #{api_client_authorizations(:active).api_token}"}
- assert_response :success
- assert_equal "arvados#job", json_response['kind']
- assert_not_nil json_response['cancelled_at']
- end
-
- test "cancel someone else's visible job" do
- post "/arvados/v1/jobs/#{jobs(:runningbarbaz).uuid}/cancel",
- params: {:format => :json},
- headers: {'HTTP_AUTHORIZATION' => "OAuth2 #{api_client_authorizations(:spectator).api_token}"}
- assert_response 403
- end
-
- test "cancel someone else's invisible job" do
- post "/arvados/v1/jobs/#{jobs(:running).uuid}/cancel",
- params: {:format => :json},
- headers: {'HTTP_AUTHORIZATION' => "OAuth2 #{api_client_authorizations(:spectator).api_token}"}
- assert_response 404
- end
-
- test "task qsequence values automatically increase monotonically" do
- post_args = ["/arvados/v1/job_tasks",
- params: {job_task: {
- job_uuid: jobs(:running).uuid,
- sequence: 1,
- }},
- headers: auth(:active)]
- last_qsequence = -1
- (1..3).each do |task_num|
- @response = nil
- post(*post_args)
- assert_response :success
- qsequence = json_response["qsequence"]
- assert_not_nil(qsequence, "task not assigned qsequence")
- assert_operator(qsequence, :>, last_qsequence,
- "qsequence did not increase between tasks")
- last_qsequence = qsequence
- end
- end
-
- test 'get_delete components_get again for job with components' do
- authorize_with :active
- get "/arvados/v1/jobs/#{jobs(:running_job_with_components).uuid}",
- headers: auth(:active)
- assert_response 200
- assert_not_nil json_response["components"]
- assert_equal ["component1", "component2"], json_response["components"].keys
-
- # delete second component
- put "/arvados/v1/jobs/#{jobs(:running_job_with_components).uuid}", params: {
- job: {
- components: {"component1" => "zzzzz-8i9sb-jobuuid00000001"}
- },
- limit: 1000
- }, headers: auth(:active)
- assert_response 200
-
- get "/arvados/v1/jobs/#{jobs(:running_job_with_components).uuid}",
- headers: auth(:active)
- assert_response 200
- assert_not_nil json_response["components"]
- assert_equal ["component1"], json_response["components"].keys
-
- # delete all components
- put "/arvados/v1/jobs/#{jobs(:running_job_with_components).uuid}", params: {
- job: {
- components: nil
- },
- limit: 1000
- }, headers: auth(:active)
- assert_response 200
-
- get "/arvados/v1/jobs/#{jobs(:running_job_with_components).uuid}",
- headers: auth(:active)
- assert_response 200
- assert_not_nil json_response["components"]
- assert_equal [], json_response["components"].keys
- end
end
diff --git a/services/api/test/integration/pipeline_test.rb b/services/api/test/integration/pipeline_test.rb
index d4f7eba30..4d8f88248 100644
--- a/services/api/test/integration/pipeline_test.rb
+++ b/services/api/test/integration/pipeline_test.rb
@@ -5,40 +5,4 @@
require 'test_helper'
class PipelineIntegrationTest < ActionDispatch::IntegrationTest
- # These tests simulate the workflow of arv-run-pipeline-instance
- # and other pipeline-running code.
-
- def check_component_match(comp_key, comp_hash)
- assert_response :success
- built_json = json_response
- built_component = built_json["components"][comp_key]
- comp_hash.each_pair do |key, expected|
- assert_equal(expected, built_component[key.to_s],
- "component's #{key} field changed")
- end
- end
-
- test "creating a pipeline instance preserves required component parameters" do
- comp_name = "test_component"
- component = {
- repository: "test_repo",
- script: "test_script",
- script_version: "test_refspec",
- script_parameters: {},
- }
-
- post("/arvados/v1/pipeline_instances",
- params: {
- pipeline_instance: {
- components: {comp_name => component}
- }.to_json
- },
- headers: auth(:active))
- check_component_match(comp_name, component)
- pi_uuid = json_response["uuid"]
-
- @response = nil
- get("/arvados/v1/pipeline_instances/#{pi_uuid}", params: {}, headers: auth(:active))
- check_component_match(comp_name, component)
- end
end
diff --git a/services/api/test/integration/serialized_encoding_test.rb b/services/api/test/integration/serialized_encoding_test.rb
index 16d43e6f3..f41c033b3 100644
--- a/services/api/test/integration/serialized_encoding_test.rb
+++ b/services/api/test/integration/serialized_encoding_test.rb
@@ -15,31 +15,10 @@ class SerializedEncodingTest < ActionDispatch::IntegrationTest
human: {properties: {eye_color: 'gray'}},
- job: {
- repository: 'active/foo',
- runtime_constraints: {docker_image: 'arvados/apitestfixture'},
- script: 'hash',
- script_version: 'master',
- script_parameters: {pattern: 'foobar'},
- tasks_summary: {todo: 0},
- },
-
- job_task: {parameters: {pattern: 'foo'}},
-
link: {link_class: 'test', name: 'test', properties: {foo: :bar}},
node: {info: {uptime: 1234}},
- pipeline_instance: {
- components: {"job1" => {parameters: {pattern: "xyzzy"}}},
- components_summary: {todo: 0},
- properties: {test: true},
- },
-
- pipeline_template: {
- components: {"job1" => {parameters: {pattern: "xyzzy"}}},
- },
-
specimen: {properties: {eye_color: 'meringue'}},
trait: {properties: {eye_color: 'brown'}},
diff --git a/services/api/test/unit/crunch_dispatch_test.rb b/services/api/test/unit/crunch_dispatch_test.rb
deleted file mode 100644
index 3a8f90a66..000000000
--- a/services/api/test/unit/crunch_dispatch_test.rb
+++ /dev/null
@@ -1,218 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-require 'test_helper'
-require 'crunch_dispatch'
-require 'helpers/git_test_helper'
-
-class CrunchDispatchTest < ActiveSupport::TestCase
- include GitTestHelper
-
- test 'choose cheaper nodes first' do
- act_as_system_user do
- # Replace test fixtures with a set suitable for testing dispatch
- Node.destroy_all
-
- # Idle nodes with different prices
- [['compute1', 3.20, 32],
- ['compute2', 1.60, 16],
- ['compute3', 0.80, 8]].each do |hostname, price, cores|
- Node.create!(hostname: hostname,
- info: {
- 'slurm_state' => 'idle',
- },
- properties: {
- 'cloud_node' => {
- 'price' => price,
- },
- 'total_cpu_cores' => cores,
- 'total_ram_mb' => cores*1024,
- 'total_scratch_mb' => cores*10000,
- })
- end
-
- # Node with no price information
- Node.create!(hostname: 'compute4',
- info: {
- 'slurm_state' => 'idle',
- },
- properties: {
- 'total_cpu_cores' => 8,
- 'total_ram_mb' => 8192,
- 'total_scratch_mb' => 80000,
- })
-
- # Cheap but busy node
- Node.create!(hostname: 'compute5',
- info: {
- 'slurm_state' => 'alloc',
- },
- properties: {
- 'cloud_node' => {
- 'price' => 0.10,
- },
- 'total_cpu_cores' => 32,
- 'total_ram_mb' => 32768,
- 'total_scratch_mb' => 320000,
- })
- end
-
- dispatch = CrunchDispatch.new
- [[1, 16384, ['compute2']],
- [2, 16384, ['compute2', 'compute1']],
- [2, 8000, ['compute4', 'compute3']],
- ].each do |min_nodes, min_ram, expect_nodes|
- job = Job.new(uuid: 'zzzzz-8i9sb-382lhiizavzhqlp',
- runtime_constraints: {
- 'min_nodes' => min_nodes,
- 'min_ram_mb_per_node' => min_ram,
- })
- nodes = dispatch.nodes_available_for_job_now job
- assert_equal expect_nodes, nodes
- end
- end
-
- test 'respond to TERM' do
- lockfile = Rails.root.join 'tmp', 'dispatch.lock'
- ENV['CRUNCH_DISPATCH_LOCKFILE'] = lockfile.to_s
- begin
- pid = Process.fork do
- begin
- dispatch = CrunchDispatch.new
- dispatch.stubs(:did_recently).returns true
- dispatch.run []
- ensure
- Process.exit!
- end
- end
- assert_with_timeout 5, "Dispatch did not lock #{lockfile}" do
- !can_lock(lockfile)
- end
- ensure
- Process.kill("TERM", pid)
- end
- assert_with_timeout 20, "Dispatch did not unlock #{lockfile}" do
- can_lock(lockfile)
- end
- end
-
- test 'override --cgroup-root with CRUNCH_CGROUP_ROOT' do
- ENV['CRUNCH_CGROUP_ROOT'] = '/path/to/cgroup'
- Rails.configuration.Containers.JobsAPI.CrunchJobWrapper = "none"
- act_as_system_user do
- j = Job.create(repository: 'active/foo',
- script: 'hash',
- script_version: '4fe459abe02d9b365932b8f5dc419439ab4e2577',
- script_parameters: {})
- ok = false
- Open3.expects(:popen3).at_least_once.with do |*args|
- if args.index(j.uuid)
- ok = ((i = args.index '--cgroup-root') and
- (args[i+1] == '/path/to/cgroup'))
- end
- true
- end.raises(StandardError.new('all is well'))
- dispatch = CrunchDispatch.new
- dispatch.parse_argv ['--jobs']
- dispatch.refresh_todo
- dispatch.start_jobs
- assert ok
- end
- end
-
- def assert_with_timeout timeout, message
- t = 0
- while (t += 0.1) < timeout
- if yield
- return
- end
- sleep 0.1
- end
- assert false, message + " (waited #{timeout} seconds)"
- end
-
- def can_lock lockfile
- lockfile.open(File::RDWR|File::CREAT, 0644) do |f|
- return f.flock(File::LOCK_EX|File::LOCK_NB)
- end
- end
-
- test 'rate limit of partial line segments' do
- act_as_system_user do
- Rails.configuration.Containers.Logging.LogPartialLineThrottlePeriod = 1
-
- job = {}
- job[:bytes_logged] = 0
- job[:log_throttle_bytes_so_far] = 0
- job[:log_throttle_lines_so_far] = 0
- job[:log_throttle_bytes_skipped] = 0
- job[:log_throttle_is_open] = true
- job[:log_throttle_partial_line_last_at] = Time.new(0)
- job[:log_throttle_first_partial_line] = true
-
- dispatch = CrunchDispatch.new
-
- line = "first log line"
- limit = dispatch.rate_limit(job, line)
- assert_equal true, limit
- assert_equal "first log line", line
- assert_equal 1, job[:log_throttle_lines_so_far]
-
- # first partial line segment is skipped and counted towards skipped lines
- now = Time.now.strftime('%Y-%m-%d-%H:%M:%S')
- line = "#{now} localhost 100 0 stderr [...] this is first partial line segment [...]"
- limit = dispatch.rate_limit(job, line)
- assert_equal true, limit
- assert_includes line, "Rate-limiting partial segments of long lines", line
- assert_equal 2, job[:log_throttle_lines_so_far]
-
- # next partial line segment within throttle interval is skipped but not counted towards skipped lines
- line = "#{now} localhost 100 0 stderr [...] second partial line segment within the interval [...]"
- limit = dispatch.rate_limit(job, line)
- assert_equal false, limit
- assert_equal 2, job[:log_throttle_lines_so_far]
-
- # next partial line after interval is counted towards skipped lines
- sleep(1)
- line = "#{now} localhost 100 0 stderr [...] third partial line segment after the interval [...]"
- limit = dispatch.rate_limit(job, line)
- assert_equal false, limit
- assert_equal 3, job[:log_throttle_lines_so_far]
-
- # this is not a valid line segment
- line = "#{now} localhost 100 0 stderr [...] does not end with [...] and is not a partial segment"
- limit = dispatch.rate_limit(job, line)
- assert_equal true, limit
- assert_equal "#{now} localhost 100 0 stderr [...] does not end with [...] and is not a partial segment", line
- assert_equal 4, job[:log_throttle_lines_so_far]
-
- # this also is not a valid line segment
- line = "#{now} localhost 100 0 stderr does not start correctly but ends with [...]"
- limit = dispatch.rate_limit(job, line)
- assert_equal true, limit
- assert_equal "#{now} localhost 100 0 stderr does not start correctly but ends with [...]", line
- assert_equal 5, job[:log_throttle_lines_so_far]
- end
- end
-
- test 'scancel orphaned job nodes' do
- Rails.configuration.Containers.JobsAPI.CrunchJobWrapper = "slurm_immediate"
- act_as_system_user do
- dispatch = CrunchDispatch.new
-
- squeue_resp = IO.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\necho zzzzz-dz642-o04e3r651turtdr\n")
- scancel_resp = IO.popen("true")
-
- IO.expects(:popen).
- with(['squeue', '-a', '-h', '-o', '%j']).
- returns(squeue_resp)
-
- IO.expects(:popen).
- with(dispatch.sudo_preface + ['scancel', '-n', 'zzzzz-8i9sb-4cf0abc123e809j']).
- returns(scancel_resp)
-
- dispatch.check_orphaned_slurm_jobs
- end
- end
-end
diff --git a/services/api/test/unit/fail_jobs_test.rb b/services/api/test/unit/fail_jobs_test.rb
deleted file mode 100644
index 304335c6f..000000000
--- a/services/api/test/unit/fail_jobs_test.rb
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-require 'test_helper'
-require 'crunch_dispatch'
-
-class FailJobsTest < ActiveSupport::TestCase
- include DbCurrentTime
-
- BOOT_TIME = 1448378837
-
- setup do
- @job = {}
- act_as_user users(:admin) do
- @job[:before_reboot] = Job.create!(state: 'Running',
- running: true,
- started_at: Time.at(BOOT_TIME - 300))
- @job[:after_reboot] = Job.create!(state: 'Running',
- running: true,
- started_at: Time.at(BOOT_TIME + 300))
- @job[:complete] = Job.create!(state: 'Running',
- running: true,
- started_at: Time.at(BOOT_TIME - 300))
- @job[:complete].update_attributes(state: 'Complete')
- @job[:complete].update_attributes(finished_at: Time.at(BOOT_TIME + 100))
- @job[:queued] = jobs(:queued)
-
- @job.values.each do |job|
- # backdate timestamps
- Job.where(uuid: job.uuid).
- update_all(created_at: Time.at(BOOT_TIME - 330),
- modified_at: (job.finished_at ||
- job.started_at ||
- Time.at(BOOT_TIME - 300)))
- end
- end
- @dispatch = CrunchDispatch.new
- @test_start_time = db_current_time
- end
-
- test 'cancel slurm jobs' do
- Rails.configuration.Containers.JobsAPI.CrunchJobWrapper = "slurm_immediate"
- Rails.configuration.Containers.JobsAPI.CrunchJobUser = 'foobar'
- fake_squeue = IO.popen("echo #{@job[:before_reboot].uuid}")
- fake_scancel = IO.popen("true")
- IO.expects(:popen).
- with(['squeue', '-a', '-h', '-o', '%j']).
- returns(fake_squeue)
- IO.expects(:popen).
- with(includes('sudo', '-u', 'foobar', 'scancel', '-n', @job[:before_reboot].uuid)).
- returns(fake_scancel)
- @dispatch.fail_jobs(before: Time.at(BOOT_TIME).to_s)
- assert_end_states
- end
-
- test 'use reboot time' do
- Rails.configuration.Containers.JobsAPI.CrunchJobWrapper = nil
- @dispatch.expects(:open).once.with('/proc/stat').
- returns open(Rails.root.join('test/fixtures/files/proc_stat'))
- @dispatch.fail_jobs(before: 'reboot')
- assert_end_states
- end
-
- test 'command line help' do
- cmd = Rails.root.join('script/fail-jobs.rb').to_s
- assert_match(/Options:.*--before=/m, File.popen([cmd, '--help']).read)
- end
-
- protected
-
- def assert_end_states
- @job.values.map(&:reload)
- assert_equal 'Failed', @job[:before_reboot].state
- assert_equal false, @job[:before_reboot].running
- assert_equal false, @job[:before_reboot].success
- assert_operator @job[:before_reboot].finished_at, :>=, @test_start_time
- assert_operator @job[:before_reboot].finished_at, :<=, db_current_time
- assert_equal 'Running', @job[:after_reboot].state
- assert_equal 'Complete', @job[:complete].state
- assert_equal 'Queued', @job[:queued].state
- end
-end
commit 4d56f9b913fcf41fbf89bf5016463b5353fa3a9f
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date: Tue Aug 6 15:20:47 2019 -0400
15133: Delete crunch_scripts, start clearing out API server
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>
diff --git a/crunch_scripts/GATK2-VariantFiltration b/crunch_scripts/GATK2-VariantFiltration
deleted file mode 100755
index 0ef4a7473..000000000
--- a/crunch_scripts/GATK2-VariantFiltration
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import os
-import re
-
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-gatk_path = arvados.util.tarball_extract(
- tarball = this_job['script_parameters']['gatk_binary_tarball'],
- path = 'gatk')
-bundle_path = arvados.util.collection_extract(
- collection = this_job['script_parameters']['gatk_bundle'],
- path = 'gatk-bundle',
- files = ['human_g1k_v37.dict', 'human_g1k_v37.fasta', 'human_g1k_v37.fasta.fai'])
-this_task_input = this_task['parameters']['input']
-
-input_file = list(arvados.CollectionReader(this_task_input).all_files())[0]
-
-# choose vcf temporary file names
-vcf_in = os.path.join(arvados.current_task().tmpdir,
- os.path.basename(input_file.name()))
-vcf_out = re.sub('(.*)\\.vcf', '\\1-filtered.vcf', vcf_in)
-
-# fetch the unfiltered data
-vcf_in_file = open(vcf_in, 'w')
-for buf in input_file.readall():
- vcf_in_file.write(buf)
-vcf_in_file.close()
-
-stdoutdata, stderrdata = arvados.util.run_command(
- ['java', '-Xmx1g',
- '-jar', os.path.join(gatk_path,'GenomeAnalysisTK.jar'),
- '-T', 'VariantFiltration', '--variant', vcf_in,
- '--out', vcf_out,
- '--filterExpression', 'QD < 2.0',
- '--filterName', 'GATK_QD',
- '--filterExpression', 'MQ < 40.0',
- '--filterName', 'GATK_MQ',
- '--filterExpression', 'FS > 60.0',
- '--filterName', 'GATK_FS',
- '--filterExpression', 'MQRankSum < -12.5',
- '--filterName', 'GATK_MQRankSum',
- '--filterExpression', 'ReadPosRankSum < -8.0',
- '--filterName', 'GATK_ReadPosRankSum',
- '-R', os.path.join(bundle_path, 'human_g1k_v37.fasta')],
- cwd=arvados.current_task().tmpdir)
-
-# store the filtered data
-with open(vcf_out, 'rb') as f:
- out = arvados.CollectionWriter()
- while True:
- buf = f.read()
- if len(buf) == 0:
- break
- out.write(buf)
-out.set_current_file_name(os.path.basename(vcf_out))
-
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/GATK2-bqsr b/crunch_scripts/GATK2-bqsr
deleted file mode 100755
index ab7822602..000000000
--- a/crunch_scripts/GATK2-bqsr
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import re
-import arvados
-import arvados_gatk2
-import arvados_samtools
-from arvados_ipc import *
-
-class InvalidArgumentError(Exception):
- pass
-
-arvados_samtools.one_task_per_bam_file(if_sequence=0, and_end_task=True)
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-tmpdir = arvados.current_task().tmpdir
-arvados.util.clear_tmpdir()
-
-known_sites_files = arvados.getjobparam(
- 'known_sites',
- ['dbsnp_137.b37.vcf',
- 'Mills_and_1000G_gold_standard.indels.b37.vcf',
- ])
-bundle_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['gatk_bundle'],
- files = [
- 'human_g1k_v37.dict',
- 'human_g1k_v37.fasta',
- 'human_g1k_v37.fasta.fai'
- ] + known_sites_files + [v + '.idx' for v in known_sites_files],
- path = 'gatk_bundle')
-ref_fasta_files = [os.path.join(bundle_dir, f)
- for f in os.listdir(bundle_dir)
- if re.search(r'\.fasta(\.gz)?$', f)]
-
-input_collection = this_task['parameters']['input']
-input_dir = arvados.util.collection_extract(
- collection = input_collection,
- path = os.path.join(this_task.tmpdir, 'input'))
-input_bam_files = []
-for f in arvados.util.listdir_recursive(input_dir):
- if re.search(r'\.bam$', f):
- input_stream_name, input_file_name = os.path.split(f)
- input_bam_files += [os.path.join(input_dir, f)]
-if len(input_bam_files) != 1:
- raise InvalidArgumentError("Expected exactly one bam file per task.")
-
-known_sites_args = []
-for f in known_sites_files:
- known_sites_args += ['-knownSites', os.path.join(bundle_dir, f)]
-
-recal_file = os.path.join(tmpdir, 'recal.csv')
-
-children = {}
-pipes = {}
-
-arvados_gatk2.run(
- args=[
- '-nct', arvados_gatk2.cpus_on_this_node(),
- '-T', 'BaseRecalibrator',
- '-R', ref_fasta_files[0],
- '-I', input_bam_files[0],
- '-o', recal_file,
- ] + known_sites_args)
-
-pipe_setup(pipes, 'BQSR')
-if 0 == named_fork(children, 'BQSR'):
- pipe_closeallbut(pipes, ('BQSR', 'w'))
- arvados_gatk2.run(
- args=[
- '-T', 'PrintReads',
- '-R', ref_fasta_files[0],
- '-I', input_bam_files[0],
- '-o', '/dev/fd/' + str(pipes['BQSR','w']),
- '-BQSR', recal_file,
- '--disable_bam_indexing',
- ],
- close_fds=False)
- os._exit(0)
-os.close(pipes.pop(('BQSR','w'), None))
-
-out = arvados.CollectionWriter()
-out.start_new_stream(input_stream_name)
-
-out.start_new_file(input_file_name + '.recal.csv')
-out.write(open(recal_file, 'rb'))
-
-out.start_new_file(input_file_name)
-while True:
- buf = os.read(pipes['BQSR','r'], 2**20)
- if len(buf) == 0:
- break
- out.write(buf)
-pipe_closeallbut(pipes)
-
-if waitpid_and_check_children(children):
- this_task.set_output(out.finish())
-else:
- sys.exit(1)
diff --git a/crunch_scripts/GATK2-merge-call b/crunch_scripts/GATK2-merge-call
deleted file mode 100755
index 6d175172e..000000000
--- a/crunch_scripts/GATK2-merge-call
+++ /dev/null
@@ -1,242 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import re
-import string
-import threading
-import arvados
-import arvados_gatk2
-import arvados_picard
-from arvados_ipc import *
-
-class InvalidArgumentError(Exception):
- pass
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-tmpdir = arvados.current_task().tmpdir
-arvados.util.clear_tmpdir()
-
-bundle_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['gatk_bundle'],
- files = [
- 'human_g1k_v37.dict',
- 'human_g1k_v37.fasta',
- 'human_g1k_v37.fasta.fai',
- 'dbsnp_137.b37.vcf',
- 'dbsnp_137.b37.vcf.idx',
- ],
- path = 'gatk_bundle')
-ref_fasta_files = [os.path.join(bundle_dir, f)
- for f in os.listdir(bundle_dir)
- if re.search(r'\.fasta(\.gz)?$', f)]
-regions_args = []
-if 'regions' in this_job['script_parameters']:
- regions_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['regions'],
- path = 'regions')
- region_padding = int(this_job['script_parameters']['region_padding'])
- for f in os.listdir(regions_dir):
- if re.search(r'\.bed$', f):
- regions_args += [
- '--intervals', os.path.join(regions_dir, f),
- '--interval_padding', str(region_padding)
- ]
-
-
-# Start a child process for each input file, feeding data to picard.
-
-input_child_names = []
-children = {}
-pipes = {}
-
-input_collection = this_job['script_parameters']['input']
-input_index = 0
-for s in arvados.CollectionReader(input_collection).all_streams():
- for f in s.all_files():
- if not re.search(r'\.bam$', f.name()):
- continue
- input_index += 1
- childname = 'input-' + str(input_index)
- input_child_names += [childname]
- pipe_setup(pipes, childname)
- childpid = named_fork(children, childname)
- if childpid == 0:
- pipe_closeallbut(pipes, (childname, 'w'))
- for s in f.readall():
- os.write(pipes[childname, 'w'], s)
- os.close(pipes[childname, 'w'])
- os._exit(0)
- sys.stderr.write("pid %d writing %s to fd %d->%d\n" %
- (childpid,
- s.name()+'/'+f.name(),
- pipes[childname, 'w'],
- pipes[childname, 'r']))
- pipe_closeallbut(pipes, *[(childname, 'r')
- for childname in input_child_names])
-
-
-# Merge-sort the input files to merge.bam
-
-arvados_picard.run(
- 'MergeSamFiles',
- args=[
- 'I=/dev/fd/' + str(pipes[childname, 'r'])
- for childname in input_child_names
- ],
- params={
- 'o': 'merge.bam',
- 'quiet': 'true',
- 'so': 'coordinate',
- 'use_threading': 'true',
- 'create_index': 'true',
- 'validation_stringency': 'LENIENT',
- },
- close_fds=False,
- )
-pipe_closeallbut(pipes)
-
-
-# Run CoverageBySample on merge.bam
-
-pipe_setup(pipes, 'stats_log')
-pipe_setup(pipes, 'stats_out')
-if 0 == named_fork(children, 'GATK'):
- pipe_closeallbut(pipes,
- ('stats_log', 'w'),
- ('stats_out', 'w'))
- arvados_gatk2.run(
- args=[
- '-T', 'CoverageBySample',
- '-R', ref_fasta_files[0],
- '-I', 'merge.bam',
- '-o', '/dev/fd/' + str(pipes['stats_out', 'w']),
- '--log_to_file', '/dev/fd/' + str(pipes['stats_log', 'w']),
- ]
- + regions_args,
- close_fds=False)
- pipe_closeallbut(pipes)
- os._exit(0)
-pipe_closeallbut(pipes, ('stats_log', 'r'), ('stats_out', 'r'))
-
-
-# Start two threads to read from CoverageBySample pipes
-
-class ExceptionPropagatingThread(threading.Thread):
- """
- If a subclassed thread calls _raise(e) in run(), running join() on
- the thread will raise e in the thread that calls join().
- """
- def __init__(self, *args, **kwargs):
- super(ExceptionPropagatingThread, self).__init__(*args, **kwargs)
- self.__exception = None
- def join(self, *args, **kwargs):
- ret = super(ExceptionPropagatingThread, self).join(*args, **kwargs)
- if self.__exception:
- raise self.__exception
- return ret
- def _raise(self, exception):
- self.__exception = exception
-
-class StatsLogReader(ExceptionPropagatingThread):
- def __init__(self, **kwargs):
- super(StatsLogReader, self).__init__()
- self.args = kwargs
- def run(self):
- try:
- for logline in self.args['infile']:
- x = re.search('Processing (\d+) bp from intervals', logline)
- if x:
- self._total_bp = int(x.group(1))
- except Exception as e:
- self._raise(e)
- def total_bp(self):
- self.join()
- return self._total_bp
-stats_log_thr = StatsLogReader(infile=os.fdopen(pipes.pop(('stats_log', 'r'))))
-stats_log_thr.start()
-
-class StatsOutReader(ExceptionPropagatingThread):
- """
- Read output of CoverageBySample and collect a histogram of
- coverage (last column) -> number of loci (number of rows).
- """
- def __init__(self, **kwargs):
- super(StatsOutReader, self).__init__()
- self.args = kwargs
- def run(self):
- try:
- hist = [0]
- histtot = 0
- for line in self.args['infile']:
- try:
- i = int(string.split(line)[-1])
- except ValueError:
- continue
- if i >= 1:
- if len(hist) <= i:
- hist.extend([0 for x in range(1+i-len(hist))])
- hist[i] += 1
- histtot += 1
- hist[0] = stats_log_thr.total_bp() - histtot
- self._histogram = hist
- except Exception as e:
- self._raise(e)
- def histogram(self):
- self.join()
- return self._histogram
-stats_out_thr = StatsOutReader(infile=os.fdopen(pipes.pop(('stats_out', 'r'))))
-stats_out_thr.start()
-
-
-# Run UnifiedGenotyper on merge.bam
-
-arvados_gatk2.run(
- args=[
- '-nt', arvados_gatk2.cpus_on_this_node(),
- '-T', 'UnifiedGenotyper',
- '-R', ref_fasta_files[0],
- '-I', 'merge.bam',
- '-o', os.path.join(tmpdir, 'out.vcf'),
- '--dbsnp', os.path.join(bundle_dir, 'dbsnp_137.b37.vcf'),
- '-metrics', 'UniGenMetrics',
- '-A', 'DepthOfCoverage',
- '-A', 'AlleleBalance',
- '-A', 'QualByDepth',
- '-A', 'HaplotypeScore',
- '-A', 'MappingQualityRankSumTest',
- '-A', 'ReadPosRankSumTest',
- '-A', 'FisherStrand',
- '-glm', 'both',
- ]
- + regions_args
- + arvados.getjobparam('GATK2_UnifiedGenotyper_args',[]))
-
-# Copy the output VCF file to Keep
-
-out = arvados.CollectionWriter()
-out.start_new_stream()
-out.start_new_file('out.vcf')
-out.write(open(os.path.join(tmpdir, 'out.vcf'), 'rb'))
-
-
-# Write statistics to Keep
-
-out.start_new_file('mincoverage_nlocus.csv')
-sofar = 0
-hist = stats_out_thr.histogram()
-total_bp = stats_log_thr.total_bp()
-for i in range(len(hist)):
- out.write("%d,%d,%f\n" %
- (i,
- total_bp - sofar,
- 100.0 * (total_bp - sofar) / total_bp))
- sofar += hist[i]
-
-if waitpid_and_check_children(children):
- this_task.set_output(out.finish())
-else:
- sys.exit(1)
diff --git a/crunch_scripts/GATK2-realign b/crunch_scripts/GATK2-realign
deleted file mode 100755
index 2787dffd5..000000000
--- a/crunch_scripts/GATK2-realign
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import re
-import arvados
-import arvados_gatk2
-import arvados_picard
-import arvados_samtools
-from arvados_ipc import *
-
-class InvalidArgumentError(Exception):
- pass
-
-arvados_samtools.one_task_per_bam_file(if_sequence=0, and_end_task=True)
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-tmpdir = arvados.current_task().tmpdir
-arvados.util.clear_tmpdir()
-
-known_sites_files = arvados.getjobparam(
- 'known_sites',
- ['dbsnp_137.b37.vcf',
- 'Mills_and_1000G_gold_standard.indels.b37.vcf',
- ])
-bundle_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['gatk_bundle'],
- files = [
- 'human_g1k_v37.dict',
- 'human_g1k_v37.fasta',
- 'human_g1k_v37.fasta.fai'
- ] + known_sites_files + [v + '.idx' for v in known_sites_files],
- path = 'gatk_bundle')
-ref_fasta_files = [os.path.join(bundle_dir, f)
- for f in os.listdir(bundle_dir)
- if re.search(r'\.fasta(\.gz)?$', f)]
-regions_args = []
-if 'regions' in this_job['script_parameters']:
- regions_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['regions'],
- path = 'regions')
- region_padding = int(this_job['script_parameters']['region_padding'])
- for f in os.listdir(regions_dir):
- if re.search(r'\.bed$', f):
- regions_args += [
- '--intervals', os.path.join(regions_dir, f),
- '--interval_padding', str(region_padding)
- ]
-
-input_collection = this_task['parameters']['input']
-input_dir = arvados.util.collection_extract(
- collection = input_collection,
- path = os.path.join(this_task.tmpdir, 'input'))
-input_bam_files = []
-for f in arvados.util.listdir_recursive(input_dir):
- if re.search(r'\.bam$', f):
- input_stream_name, input_file_name = os.path.split(f)
- input_bam_files += [os.path.join(input_dir, f)]
-if len(input_bam_files) != 1:
- raise InvalidArgumentError("Expected exactly one bam file per task.")
-
-known_sites_args = []
-for f in known_sites_files:
- known_sites_args += ['-known', os.path.join(bundle_dir, f)]
-
-children = {}
-pipes = {}
-
-arvados_gatk2.run(
- args=[
- '-nt', arvados_gatk2.cpus_per_task(),
- '-T', 'RealignerTargetCreator',
- '-R', ref_fasta_files[0],
- '-I', input_bam_files[0],
- '-o', os.path.join(tmpdir, 'intervals.list')
- ] + known_sites_args + regions_args)
-
-pipe_setup(pipes, 'IndelRealigner')
-if 0 == named_fork(children, 'IndelRealigner'):
- pipe_closeallbut(pipes, ('IndelRealigner', 'w'))
- arvados_gatk2.run(
- args=[
- '-T', 'IndelRealigner',
- '-R', ref_fasta_files[0],
- '-targetIntervals', os.path.join(tmpdir, 'intervals.list'),
- '-I', input_bam_files[0],
- '-o', '/dev/fd/' + str(pipes['IndelRealigner','w']),
- '--disable_bam_indexing',
- ] + known_sites_args + regions_args,
- close_fds=False)
- os._exit(0)
-os.close(pipes.pop(('IndelRealigner','w'), None))
-
-pipe_setup(pipes, 'bammanifest')
-pipe_setup(pipes, 'bam')
-if 0==named_fork(children, 'bammanifest'):
- pipe_closeallbut(pipes,
- ('IndelRealigner', 'r'),
- ('bammanifest', 'w'),
- ('bam', 'w'))
- out = arvados.CollectionWriter()
- out.start_new_stream(input_stream_name)
- out.start_new_file(input_file_name)
- while True:
- buf = os.read(pipes['IndelRealigner','r'], 2**20)
- if len(buf) == 0:
- break
- os.write(pipes['bam','w'], buf)
- out.write(buf)
- os.write(pipes['bammanifest','w'], out.manifest_text())
- os.close(pipes['bammanifest','w'])
- os._exit(0)
-
-pipe_setup(pipes, 'index')
-if 0==named_fork(children, 'index'):
- pipe_closeallbut(pipes, ('bam', 'r'), ('index', 'w'))
- arvados_picard.run(
- 'BuildBamIndex',
- params={
- 'i': '/dev/fd/' + str(pipes['bam','r']),
- 'o': '/dev/fd/' + str(pipes['index','w']),
- 'quiet': 'true',
- 'validation_stringency': 'LENIENT'
- },
- close_fds=False)
- os._exit(0)
-
-pipe_setup(pipes, 'indexmanifest')
-if 0==named_fork(children, 'indexmanifest'):
- pipe_closeallbut(pipes, ('index', 'r'), ('indexmanifest', 'w'))
- out = arvados.CollectionWriter()
- out.start_new_stream(input_stream_name)
- out.start_new_file(re.sub('\.bam$', '.bai', input_file_name))
- while True:
- buf = os.read(pipes['index','r'], 2**20)
- if len(buf) == 0:
- break
- out.write(buf)
- os.write(pipes['indexmanifest','w'], out.manifest_text())
- os.close(pipes['indexmanifest','w'])
- os._exit(0)
-
-pipe_closeallbut(pipes, ('bammanifest', 'r'), ('indexmanifest', 'r'))
-outmanifest = ''
-for which in ['bammanifest', 'indexmanifest']:
- with os.fdopen(pipes[which,'r'], 'rb', 2**20) as f:
- while True:
- buf = f.read()
- if buf == '':
- break
- outmanifest += buf
-
-all_ok = True
-for (childname, pid) in children.items():
- all_ok = all_ok and waitpid_and_check_exit(pid, childname)
-
-if all_ok:
- this_task.set_output(outmanifest)
-else:
- sys.exit(1)
diff --git a/crunch_scripts/arvados-bcbio-nextgen.py b/crunch_scripts/arvados-bcbio-nextgen.py
deleted file mode 100755
index b7e19ecdd..000000000
--- a/crunch_scripts/arvados-bcbio-nextgen.py
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/usr/bin/python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import subprocess
-import crunchutil.subst as subst
-import shutil
-import os
-import sys
-import time
-
-if len(arvados.current_task()['parameters']) > 0:
- p = arvados.current_task()['parameters']
-else:
- p = arvados.current_job()['script_parameters']
-
-t = arvados.current_task().tmpdir
-
-os.unlink("/usr/local/share/bcbio-nextgen/galaxy")
-os.mkdir("/usr/local/share/bcbio-nextgen/galaxy")
-shutil.copy("/usr/local/share/bcbio-nextgen/config/bcbio_system.yaml", "/usr/local/share/bcbio-nextgen/galaxy")
-
-with open("/usr/local/share/bcbio-nextgen/galaxy/tool_data_table_conf.xml", "w") as f:
- f.write('''<tables>
- <!-- Locations of indexes in the BWA mapper format -->
- <table name="bwa_indexes" comment_char="#">
- <columns>value, dbkey, name, path</columns>
- <file path="tool-data/bwa_index.loc" />
- </table>
- <!-- Locations of indexes in the Bowtie2 mapper format -->
- <table name="bowtie2_indexes" comment_char="#">
- <columns>value, dbkey, name, path</columns>
- <file path="tool-data/bowtie2_indices.loc" />
- </table>
- <!-- Locations of indexes in the Bowtie2 mapper format for TopHat2 to use -->
- <table name="tophat2_indexes" comment_char="#">
- <columns>value, dbkey, name, path</columns>
- <file path="tool-data/bowtie2_indices.loc" />
- </table>
- <!-- Location of SAMTools indexes and other files -->
- <table name="sam_fa_indexes" comment_char="#">
- <columns>index, value, path</columns>
- <file path="tool-data/sam_fa_indices.loc" />
- </table>
- <!-- Location of Picard dict file and other files -->
- <table name="picard_indexes" comment_char="#">
- <columns>value, dbkey, name, path</columns>
- <file path="tool-data/picard_index.loc" />
- </table>
- <!-- Location of Picard dict files valid for GATK -->
- <table name="gatk_picard_indexes" comment_char="#">
- <columns>value, dbkey, name, path</columns>
- <file path="tool-data/gatk_sorted_picard_index.loc" />
- </table>
-</tables>
-''')
-
-os.mkdir("/usr/local/share/bcbio-nextgen/galaxy/tool-data")
-
-with open("/usr/local/share/bcbio-nextgen/galaxy/tool-data/bowtie2_indices.loc", "w") as f:
- f.write(subst.do_substitution(p, "GRCh37\tGRCh37\tHuman (GRCh37)\t$(dir $(bowtie2_indices))\n"))
-
-with open("/usr/local/share/bcbio-nextgen/galaxy/tool-data/bwa_index.loc", "w") as f:
- f.write(subst.do_substitution(p, "GRCh37\tGRCh37\tHuman (GRCh37)\t$(file $(bwa_index))\n"))
-
-with open("/usr/local/share/bcbio-nextgen/galaxy/tool-data/gatk_sorted_picard_index.loc", "w") as f:
- f.write(subst.do_substitution(p, "GRCh37\tGRCh37\tHuman (GRCh37)\t$(file $(gatk_sorted_picard_index))\n"))
-
-with open("/usr/local/share/bcbio-nextgen/galaxy/tool-data/picard_index.loc", "w") as f:
- f.write(subst.do_substitution(p, "GRCh37\tGRCh37\tHuman (GRCh37)\t$(file $(picard_index))\n"))
-
-with open("/usr/local/share/bcbio-nextgen/galaxy/tool-data/sam_fa_indices.loc", "w") as f:
- f.write(subst.do_substitution(p, "index\tGRCh37\t$(file $(sam_fa_indices))\n"))
-
-with open("/tmp/crunch-job/freebayes-variant.yaml", "w") as f:
- f.write('''
-# Template for whole genome Illumina variant calling with FreeBayes
-# This is a GATK-free pipeline without post-alignment BAM pre-processing
-# (recalibration and realignment)
----
-details:
- - analysis: variant2
- genome_build: GRCh37
- # to do multi-sample variant calling, assign samples the same metadata / batch
- # metadata:
- # batch: your-arbitrary-batch-name
- algorithm:
- aligner: bwa
- mark_duplicates: true
- recalibrate: false
- realign: false
- variantcaller: freebayes
- platform: illumina
- quality_format: Standard
- # for targeted projects, set the region
- # variant_regions: /path/to/your.bed
-''')
-
-os.unlink("/usr/local/share/bcbio-nextgen/gemini_data")
-os.symlink(arvados.get_job_param_mount("gemini_data"), "/usr/local/share/bcbio-nextgen/gemini_data")
-
-os.chdir(arvados.current_task().tmpdir)
-
-rcode = subprocess.call(["bcbio_nextgen.py", "--workflow", "template", "/tmp/crunch-job/freebayes-variant.yaml", "project1",
- subst.do_substitution(p, "$(file $(R1))"),
- subst.do_substitution(p, "$(file $(R2))")])
-
-os.chdir("project1/work")
-
-os.symlink("/usr/local/share/bcbio-nextgen/galaxy/tool-data", "tool-data")
-
-rcode = subprocess.call(["bcbio_nextgen.py", "../config/project1.yaml", "-n", os.environ['CRUNCH_NODE_SLOTS']])
-
-print("run-command: completed with exit code %i (%s)" % (rcode, "success" if rcode == 0 else "failed"))
-
-if rcode == 0:
- os.chdir("../final")
-
- print("arvados-bcbio-nextgen: the follow output files will be saved to keep:")
-
- subprocess.call(["find", ".", "-type", "f", "-printf", "arvados-bcbio-nextgen: %12.12s %h/%f\\n"])
-
- print("arvados-bcbio-nextgen: start writing output to keep")
-
- done = False
- api = arvados.api('v1')
- while not done:
- try:
- out = arvados.CollectionWriter()
- out.write_directory_tree(".", max_manifest_depth=0)
- outuuid = out.finish()
- api.job_tasks().update(uuid=arvados.current_task()['uuid'],
- body={
- 'output':outuuid,
- 'success': (rcode == 0),
- 'progress':1.0
- }).execute()
- done = True
- except Exception as e:
- print("arvados-bcbio-nextgen: caught exception: {}".format(e))
- time.sleep(5)
-
-sys.exit(rcode)
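
The upload loop at the end retries forever on any exception. A bounded variant with exponential backoff, sketched with the standard library only (attempt budget and delays are arbitrary, and write_output_collection is a hypothetical callable):

    import time

    def retry(fn, attempts=5, delay=5):
        """Call fn() until it succeeds, waiting longer after each failure."""
        for attempt in range(1, attempts + 1):
            try:
                return fn()
            except Exception as exc:
                if attempt == attempts:
                    raise
                print("attempt %d failed (%s); retrying in %ds" % (attempt, exc, delay))
                time.sleep(delay)
                delay *= 2

    # e.g. retry(lambda: write_output_collection())
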
diff --git a/crunch_scripts/arvados_bwa.py b/crunch_scripts/arvados_bwa.py
deleted file mode 100644
index aefc1f064..000000000
--- a/crunch_scripts/arvados_bwa.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import re
-import os
-import sys
-import fcntl
-import subprocess
-
-bwa_install_path = None
-
-def install_path():
- """
- Extract the bwa source tree, build the bwa binary, and return the
- path to the source tree.
- """
- global bwa_install_path
- if bwa_install_path:
- return bwa_install_path
-
- bwa_install_path = arvados.util.tarball_extract(
- tarball = arvados.current_job()['script_parameters']['bwa_tbz'],
- path = 'bwa')
-
- # build "bwa" binary
- lockfile = open(os.path.split(bwa_install_path)[0] + '.bwa-make.lock',
- 'w')
- fcntl.flock(lockfile, fcntl.LOCK_EX)
- arvados.util.run_command(['make', '-j16'], cwd=bwa_install_path)
- lockfile.close()
-
- return bwa_install_path
-
-def bwa_binary():
- """
- Return the path to the bwa executable.
- """
- return os.path.join(install_path(), 'bwa')
-
-def run(command, command_args, **kwargs):
- """
- Build and run the bwa binary.
-
- command is the bwa module, e.g., "index" or "aln".
-
- command_args is a list of additional command line arguments, e.g.,
- ['-a', 'bwtsw', 'ref.fasta']
-
- It is assumed that we are running in a Crunch job environment, and
- the job's "bwa_tbz" parameter is a collection containing the bwa
- source tree in a .tbz file.
- """
- execargs = [bwa_binary(),
- command]
- execargs += command_args
- sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
- arvados.util.run_command(
- execargs,
- cwd=arvados.current_task().tmpdir,
- stderr=sys.stderr,
- stdin=kwargs.get('stdin', subprocess.PIPE),
- stdout=kwargs.get('stdout', sys.stderr))
-
-def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
- """
- Queue one task for each pair of fastq files in this job's input
- collection.
-
- Each new task will have two parameters, named "input_1" and
- "input_2", each being a manifest containing a single fastq file.
-
- A matching pair of files in the input collection is assumed to
- have names "x_1.y" and "x_2.y".
-
- Files in the input collection that are not part of a matched pair
- are silently ignored.
-
- The if_sequence and and_end_task arguments have the same significance
- as in arvados.job_setup.one_task_per_input_file().
- """
- if if_sequence != arvados.current_task()['sequence']:
- return
- job_input = arvados.current_job()['script_parameters']['input']
- cr = arvados.CollectionReader(job_input)
- all_files = []
- for s in cr.all_streams():
- all_files += list(s.all_files())
- for s in cr.all_streams():
- for left_file in s.all_files():
- left_name = left_file.name()
- right_file = None
- right_name = re.sub(r'(.*_)1\.', '\g<1>2.', left_name)
- if right_name == left_name:
- continue
- for f2 in s.all_files():
- if right_name == f2.name():
- right_file = f2
- if right_file != None:
- new_task_attrs = {
- 'job_uuid': arvados.current_job()['uuid'],
- 'created_by_job_task_uuid': arvados.current_task()['uuid'],
- 'sequence': if_sequence + 1,
- 'parameters': {
- 'input_1':left_file.as_manifest(),
- 'input_2':right_file.as_manifest()
- }
- }
- arvados.api().job_tasks().create(body=new_task_attrs).execute()
- if and_end_task:
- arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
- body={'success':True}
- ).execute()
- exit(0)
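
The pairing in one_task_per_pair_input_file is purely name based: "x_1.y" pairs with "x_2.y" and everything else is ignored. The matching can be sketched on its own with the same regular expression:

    import re

    def find_pairs(names):
        """Yield (left, right) pairs where names look like x_1.y and x_2.y."""
        names = set(names)
        for left in sorted(names):
            right = re.sub(r'(.*_)1\.', r'\g<1>2.', left)
            if right != left and right in names:
                yield left, right

    # list(find_pairs(['sampleA_1.fastq.gz', 'sampleA_2.fastq.gz', 'README']))
    # -> [('sampleA_1.fastq.gz', 'sampleA_2.fastq.gz')]
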
diff --git a/crunch_scripts/arvados_gatk2.py b/crunch_scripts/arvados_gatk2.py
deleted file mode 100644
index fa00b44d8..000000000
--- a/crunch_scripts/arvados_gatk2.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import re
-import os
-import sys
-import fcntl
-import subprocess
-
-gatk2_install_path = None
-
-def install_path():
- global gatk2_install_path
- if gatk2_install_path:
- return gatk2_install_path
- gatk2_install_path = arvados.util.tarball_extract(
- tarball = arvados.current_job()['script_parameters']['gatk_tbz'],
- path = 'gatk2')
- return gatk2_install_path
-
-def memory_limit():
- taskspernode = int(os.environ.get('CRUNCH_NODE_SLOTS', '1'))
- with open('/proc/meminfo', 'r') as f:
- ram = int(re.search(r'MemTotal:\s*(\d+)', f.read()).group(1)) / 1024
- if taskspernode > 1:
- ram = ram / taskspernode
- return max(ram-700, 500)
-
-def cpus_on_this_node():
- with open('/proc/cpuinfo', 'r') as cpuinfo:
- return max(int(os.environ.get('SLURM_CPUS_ON_NODE', 1)),
- len(re.findall(r'^processor\s*:\s*\d',
- cpuinfo.read(),
- re.MULTILINE)))
-
-def cpus_per_task():
- return max(1, (cpus_on_this_node()
- / int(os.environ.get('CRUNCH_NODE_SLOTS', 1))))
-
-def run(**kwargs):
- kwargs.setdefault('cwd', arvados.current_task().tmpdir)
- kwargs.setdefault('stdout', sys.stderr)
- execargs = ['java',
- '-Xmx%dm' % memory_limit(),
- '-Djava.io.tmpdir=' + arvados.current_task().tmpdir,
- '-jar', os.path.join(install_path(), 'GenomeAnalysisTK.jar')]
- execargs += [str(arg) for arg in kwargs.pop('args', [])]
- sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
- return arvados.util.run_command(execargs, **kwargs)
-
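
memory_limit() and cpus_per_task() divide the node's resources across the task slots reported by CRUNCH_NODE_SLOTS, keeping a fixed headroom for the JVM. A worked example with hypothetical numbers:

    # Hypothetical node: MemTotal ~ 65536 MiB, 16 cores, CRUNCH_NODE_SLOTS=8
    ram_mib = 65536
    slots = 8
    per_task_ram = ram_mib // slots          # 8192 MiB available to this task
    heap_mib = max(per_task_ram - 700, 500)  # 7492 -> java -Xmx7492m
    cpus = max(1, 16 // slots)               # -nt 2 for RealignerTargetCreator
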
diff --git a/crunch_scripts/arvados_ipc.py b/crunch_scripts/arvados_ipc.py
deleted file mode 100644
index 97871627b..000000000
--- a/crunch_scripts/arvados_ipc.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import re
-import sys
-import subprocess
-
-def pipe_setup(pipes, name):
- pipes[name,'r'], pipes[name,'w'] = os.pipe()
-
-def pipe_closeallbut(pipes, *keepus):
- for n,m in pipes.keys():
- if (n,m) not in keepus:
- os.close(pipes.pop((n,m), None))
-
-def named_fork(children, name):
- children[name] = os.fork()
- return children[name]
-
-def waitpid_and_check_children(children):
- """
- Given a dict of childname->pid, wait for each child process to
- finish, and report non-zero exit status on stderr. Return True if
- all children exited 0.
- """
- all_ok = True
- for (childname, pid) in children.items():
- # all_ok must be on RHS here -- we need to call waitpid() on
- # every child, even if all_ok is already False.
- all_ok = waitpid_and_check_exit(pid, childname) and all_ok
- return all_ok
-
-def waitpid_and_check_exit(pid, childname=''):
- """
- Wait for a child process to finish. If it exits non-zero, report
- exit status on stderr (mentioning the given childname) and return
- False. If it exits zero, return True.
- """
- _, childstatus = os.waitpid(pid, 0)
- exitvalue = childstatus >> 8
- signal = childstatus & 127
- dumpedcore = childstatus & 128
- if childstatus != 0:
- sys.stderr.write("%s child %d failed: exit %d signal %d core %s\n"
- % (childname, pid, exitvalue, signal,
- ('y' if dumpedcore else 'n')))
- return False
- return True
-
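
These helpers are thin wrappers around os.pipe/os.fork, and the crunch scripts in this patch all follow the same shape with them: each child keeps only its write end, the parent keeps only the read ends and reaps the children at the end. A condensed sketch (child_work is a placeholder returning bytes; the helper functions are the ones defined above):

    import os
    import sys

    children = {}
    pipes = {}

    pipe_setup(pipes, 'worker')
    if 0 == named_fork(children, 'worker'):
        # child: close everything except our write end, produce output, exit
        pipe_closeallbut(pipes, ('worker', 'w'))
        os.write(pipes['worker', 'w'], child_work())
        os._exit(0)

    # parent: close the write end, drain the pipe, then reap the child
    pipe_closeallbut(pipes, ('worker', 'r'))
    data = b''
    while True:
        buf = os.read(pipes['worker', 'r'], 2**20)
        if not buf:
            break
        data += buf
    if not waitpid_and_check_children(children):
        sys.exit(1)
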
diff --git a/crunch_scripts/arvados_picard.py b/crunch_scripts/arvados_picard.py
deleted file mode 100644
index 3d830dbca..000000000
--- a/crunch_scripts/arvados_picard.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import re
-import os
-import sys
-import fcntl
-import subprocess
-
-picard_install_path = None
-
-def install_path():
- global picard_install_path
- if picard_install_path:
- return picard_install_path
- zipball = arvados.current_job()['script_parameters']['picard_zip']
- extracted = arvados.util.zipball_extract(
- zipball = zipball,
- path = 'picard')
- for f in os.listdir(extracted):
- if (re.search(r'^picard-tools-[\d\.]+$', f) and
- os.path.exists(os.path.join(extracted, f, '.'))):
- picard_install_path = os.path.join(extracted, f)
- break
- if not picard_install_path:
- raise Exception("picard-tools-{version} directory not found in %s" %
- zipball)
- return picard_install_path
-
-def run(module, **kwargs):
- kwargs.setdefault('cwd', arvados.current_task().tmpdir)
- execargs = ['java',
- '-Xmx1500m',
- '-Djava.io.tmpdir=' + arvados.current_task().tmpdir,
- '-jar', os.path.join(install_path(), module + '.jar')]
- execargs += [str(arg) for arg in kwargs.pop('args', [])]
- for key, value in kwargs.pop('params', {}).items():
- execargs += [key.upper() + '=' + str(value)]
- sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
- return arvados.util.run_command(execargs, **kwargs)
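
run() turns the params dict into Picard's KEY=value argument style by upper-casing the keys; the mapping in isolation:

    def picard_args(params):
        """Mirror arvados_picard.run's params handling: KEY=value pairs."""
        return ['%s=%s' % (key.upper(), value) for key, value in params.items()]

    print(picard_args({'i': 'in.bam', 'o': 'out.bam', 'so': 'coordinate'}))
    # -> ['I=in.bam', 'O=out.bam', 'SO=coordinate']
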
diff --git a/crunch_scripts/arvados_samtools.py b/crunch_scripts/arvados_samtools.py
deleted file mode 100644
index 09992f6f2..000000000
--- a/crunch_scripts/arvados_samtools.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import re
-import os
-import sys
-import fcntl
-import subprocess
-
-samtools_path = None
-
-def samtools_install_path():
- """
- Extract the samtools source tree, build the samtools binary, and
- return the path to the source tree.
- """
- global samtools_path
- if samtools_path:
- return samtools_path
- samtools_path = arvados.util.tarball_extract(
- tarball = arvados.current_job()['script_parameters']['samtools_tgz'],
- path = 'samtools')
-
- # build "samtools" binary
- lockfile = open(os.path.split(samtools_path)[0] + '.samtools-make.lock',
- 'w')
- fcntl.flock(lockfile, fcntl.LOCK_EX)
- arvados.util.run_command(['make', '-j16'], cwd=samtools_path)
- lockfile.close()
-
- return samtools_path
-
-def samtools_binary():
- """
- Return the path to the samtools executable.
- """
- return os.path.join(samtools_install_path(), 'samtools')
-
-def run(command, command_args, **kwargs):
- """
- Build and run the samtools binary.
-
- command is the samtools subcommand, e.g., "view" or "sort".
-
- command_args is a list of additional command line arguments, e.g.,
- ['-bt', 'ref_list.txt', '-o', 'aln.bam', 'aln.sam.gz']
-
- It is assumed that we are running in a Crunch job environment, and
- the job's "samtools_tgz" parameter is a collection containing the
- samtools source tree in a .tgz file.
- """
- execargs = [samtools_binary(),
- command]
- execargs += command_args
- sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
- arvados.util.run_command(
- execargs,
- cwd=arvados.current_task().tmpdir,
- stdin=kwargs.get('stdin', subprocess.PIPE),
- stderr=kwargs.get('stderr', sys.stderr),
- stdout=kwargs.get('stdout', sys.stderr))
-
-def one_task_per_bam_file(if_sequence=0, and_end_task=True):
- """
- Queue one task for each bam file in this job's input collection.
-
- Each new task will have an "input" parameter: a manifest
- containing one .bam file and (if available) the corresponding .bai
- index file.
-
- Files in the input collection that are not named *.bam or *.bai
- (as well as *.bai files that do not match any .bam file present)
- are silently ignored.
-
- The if_sequence and and_end_task arguments have the same significance
- as in arvados.job_setup.one_task_per_input_file().
- """
- if if_sequence != arvados.current_task()['sequence']:
- return
- job_input = arvados.current_job()['script_parameters']['input']
- cr = arvados.CollectionReader(job_input)
- bam = {}
- bai = {}
- for s in cr.all_streams():
- for f in s.all_files():
- if re.search(r'\.bam$', f.name()):
- bam[s.name(), f.name()] = f
- elif re.search(r'\.bai$', f.name()):
- bai[s.name(), f.name()] = f
- for ((s_name, f_name), bam_f) in bam.items():
- bai_f = bai.get((s_name, re.sub(r'bam$', 'bai', f_name)), None)
- task_input = bam_f.as_manifest()
- if bai_f:
- task_input += bai_f.as_manifest()
- new_task_attrs = {
- 'job_uuid': arvados.current_job()['uuid'],
- 'created_by_job_task_uuid': arvados.current_task()['uuid'],
- 'sequence': if_sequence + 1,
- 'parameters': {
- 'input': task_input
- }
- }
- arvados.api().job_tasks().create(body=new_task_attrs).execute()
- if and_end_task:
- arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
- body={'success':True}
- ).execute()
- exit(0)
diff --git a/crunch_scripts/bwa-aln b/crunch_scripts/bwa-aln
deleted file mode 100755
index e3d85a7c3..000000000
--- a/crunch_scripts/bwa-aln
+++ /dev/null
@@ -1,127 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import arvados_bwa
-import arvados_samtools
-import os
-import re
-import sys
-import subprocess
-
-arvados_bwa.one_task_per_pair_input_file(if_sequence=0, and_end_task=True)
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-ref_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['reference_index'],
- path = 'reference',
- decompress = False)
-
-ref_basename = None
-for f in os.listdir(ref_dir):
- basename = re.sub(r'\.bwt$', '', f)
- if basename != f:
- ref_basename = os.path.join(ref_dir, basename)
-if ref_basename == None:
- raise Exception("Could not find *.bwt in reference collection.")
-
-tmp_dir = arvados.current_task().tmpdir
-
-class Aligner:
- def input_filename(self):
- for s in arvados.CollectionReader(self.collection).all_streams():
- for f in s.all_files():
- return f.decompressed_name()
- def generate_input(self):
- for s in arvados.CollectionReader(self.collection).all_streams():
- for f in s.all_files():
- for s in f.readall_decompressed():
- yield s
- def aln(self, input_param):
- self.collection = this_task['parameters'][input_param]
- reads_filename = os.path.join(tmp_dir, self.input_filename())
- aln_filename = os.path.join(tmp_dir, self.input_filename() + '.sai')
- reads_pipe_r, reads_pipe_w = os.pipe()
- if os.fork() == 0:
- os.close(reads_pipe_r)
- reads_file = open(reads_filename, 'wb')
- for s in self.generate_input():
- if len(s) != os.write(reads_pipe_w, s):
- raise Exception("short write")
- reads_file.write(s)
- reads_file.close()
- os.close(reads_pipe_w)
- sys.exit(0)
- os.close(reads_pipe_w)
-
- aln_file = open(aln_filename, 'wb')
- bwa_proc = subprocess.Popen(
- [arvados_bwa.bwa_binary(),
- 'aln', '-t', '16',
- ref_basename,
- '-'],
- stdin=os.fdopen(reads_pipe_r, 'rb', 2**20),
- stdout=aln_file)
- aln_file.close()
- return reads_filename, aln_filename
-
-reads_1, alignments_1 = Aligner().aln('input_1')
-reads_2, alignments_2 = Aligner().aln('input_2')
-pid1, exit1 = os.wait()
-pid2, exit2 = os.wait()
-if exit1 != 0 or exit2 != 0:
- raise Exception("bwa aln exited non-zero (0x%x, 0x%x)" % (exit1, exit2))
-
-# output alignments in sam format to pipe
-sam_pipe_r, sam_pipe_w = os.pipe()
-sam_pid = os.fork()
-if sam_pid != 0:
- # parent
- os.close(sam_pipe_w)
-else:
- # child
- os.close(sam_pipe_r)
- arvados_bwa.run('sampe',
- [ref_basename,
- alignments_1, alignments_2,
- reads_1, reads_2],
- stdout=os.fdopen(sam_pipe_w, 'wb', 2**20))
- sys.exit(0)
-
-# convert sam (sam_pipe_r) to bam (bam_pipe_w)
-bam_pipe_r, bam_pipe_w = os.pipe()
-bam_pid = os.fork()
-if bam_pid != 0:
- # parent
- os.close(bam_pipe_w)
- os.close(sam_pipe_r)
-else:
- # child
- os.close(bam_pipe_r)
- arvados_samtools.run('view',
- ['-S', '-b',
- '-'],
- stdin=os.fdopen(sam_pipe_r, 'rb', 2**20),
- stdout=os.fdopen(bam_pipe_w, 'wb', 2**20))
- sys.exit(0)
-
-# copy bam (bam_pipe_r) to Keep
-out_bam_filename = os.path.split(reads_1)[-1] + '.bam'
-out = arvados.CollectionWriter()
-out.start_new_stream()
-out.start_new_file(out_bam_filename)
-out.write(os.fdopen(bam_pipe_r, 'rb', 2**20))
-
-# make sure everyone exited nicely
-pid3, exit3 = os.waitpid(sam_pid, 0)
-if exit3 != 0:
- raise Exception("bwa sampe exited non-zero (0x%x)" % exit3)
-pid4, exit4 = os.waitpid(bam_pid, 0)
-if exit4 != 0:
- raise Exception("samtools view exited non-zero (0x%x)" % exit4)
-
-# proclaim success
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/bwa-index b/crunch_scripts/bwa-index
deleted file mode 100755
index f5b7030c0..000000000
--- a/crunch_scripts/bwa-index
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import arvados_bwa
-import os
-import re
-import sys
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-ref_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['input'],
- path = 'reference',
- decompress = False)
-
-ref_fasta_files = (os.path.join(ref_dir, f)
- for f in os.listdir(ref_dir)
- if re.search(r'\.fasta(\.gz)?$', f))
-
-# build reference index
-arvados_bwa.run('index',
- ['-a', 'bwtsw'] + list(ref_fasta_files))
-
-# move output files to new empty directory
-out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
-arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
-os.mkdir(out_dir)
-for f in os.listdir(ref_dir):
- if re.search(r'\.(amb|ann|bwt|pac|rbwt|rpac|rsa|sa)$', f):
- sys.stderr.write("bwa output: %s (%d)\n" %
- (f, os.stat(os.path.join(ref_dir, f)).st_size))
- os.rename(os.path.join(ref_dir, f),
- os.path.join(out_dir, f))
-
-# store output
-out = arvados.CollectionWriter()
-out.write_directory_tree(out_dir, max_manifest_depth=0)
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/collection-merge b/crunch_scripts/collection-merge
deleted file mode 100755
index f3aa5ce9c..000000000
--- a/crunch_scripts/collection-merge
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-# collection-merge
-#
-# Merge two or more collections together. Can also be used to extract specific
-# files from a collection to produce a new collection.
-#
-# input:
-# An array of collections or collection/file paths in script_parameter["input"]
-#
-# output:
-# A manifest with the collections merged. Duplicate file names will
-# have their contents concatenated in the order that they appear in the input
-# array.
-
-import arvados
-import md5
-import crunchutil.subst as subst
-import subprocess
-import os
-import hashlib
-
-p = arvados.current_job()['script_parameters']
-
-merged = ""
-src = []
-for c in p["input"]:
- c = subst.do_substitution(p, c)
- i = c.find('/')
- if i == -1:
- src.append(c)
- merged += arvados.CollectionReader(c).manifest_text()
- else:
- src.append(c[0:i])
- cr = arvados.CollectionReader(c[0:i])
- j = c.rfind('/')
- stream = c[i+1:j]
- if stream == "":
- stream = "."
- fn = c[(j+1):]
- for s in cr.all_streams():
- if s.name() == stream:
- if fn in s.files():
- merged += s.files()[fn].as_manifest()
-
-arvados.current_task().set_output(merged)
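
The merge itself is just manifest concatenation, which is why duplicate file names end up with their contents concatenated in input order. Stripped of the path-selection branch, the core is:

    import arvados

    def merge_collections(locators):
        """Concatenate the manifests of several collections (sketch)."""
        merged = ""
        for loc in locators:
            merged += arvados.CollectionReader(loc).manifest_text()
        return merged

    # merged_manifest = merge_collections([locator_a, locator_b])  # hypothetical locators
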
diff --git a/crunch_scripts/crunchrunner b/crunch_scripts/crunchrunner
deleted file mode 100755
index 25d3ba524..000000000
--- a/crunch_scripts/crunchrunner
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-if test -n "$JOB_PARAMETER_CRUNCHRUNNER" ; then
- exec $TASK_KEEPMOUNT/$JOB_PARAMETER_CRUNCHRUNNER
-else
- exec /usr/local/bin/crunchrunner
-fi
diff --git a/crunch_scripts/crunchutil/__init__.py b/crunch_scripts/crunchutil/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/crunch_scripts/crunchutil/robust_put.py b/crunch_scripts/crunchutil/robust_put.py
deleted file mode 100644
index 27b0bf345..000000000
--- a/crunch_scripts/crunchutil/robust_put.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import arvados.commands.put as put
-import os
-import logging
-import time
-
-def machine_progress(bytes_written, bytes_expected):
- return "upload wrote {} total {}\n".format(
- bytes_written, -1 if (bytes_expected is None) else bytes_expected)
-
-class Args(object):
- def __init__(self, fn):
- self.filename = None
- self.paths = [fn]
- self.max_manifest_depth = 0
-
-# Upload to Keep with error recovery.
-# Return a uuid or raise an exception if there are too many failures.
-def upload(source_dir, logger=None):
- if logger is None:
- logger = logging.getLogger("arvados")
-
- source_dir = os.path.abspath(source_dir)
- done = False
- if 'TASK_WORK' in os.environ:
- resume_cache = put.ResumeCache(os.path.join(arvados.current_task().tmpdir, "upload-output-checkpoint"))
- else:
- resume_cache = put.ResumeCache(put.ResumeCache.make_path(Args(source_dir)))
- reporter = put.progress_writer(machine_progress)
- bytes_expected = put.expected_bytes_for([source_dir])
- backoff = 1
- outuuid = None
- while not done:
- try:
- out = put.ArvPutCollectionWriter.from_cache(resume_cache, reporter, bytes_expected)
- out.do_queued_work()
- out.write_directory_tree(source_dir, max_manifest_depth=0)
- outuuid = out.finish()
- done = True
- except KeyboardInterrupt as e:
- logger.critical("caught interrupt signal 2")
- raise e
- except Exception as e:
- logger.exception("caught exception:")
- backoff *= 2
- if backoff > 256:
- logger.critical("Too many upload failures, giving up")
- raise e
- else:
- logger.warning("Sleeping for %s seconds before trying again" % backoff)
- time.sleep(backoff)
- return outuuid
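
Callers treated upload() as a drop-in for arv-put with checkpointing and exponential backoff built in; typical use (directory path hypothetical) was simply:

    import crunchutil.robust_put as robust_put

    # Upload an output directory to Keep, resuming from the task checkpoint
    # when TASK_WORK is set; raises after the backoff passes 256 seconds.
    output_locator = robust_put.upload("/tmp/crunch-job/output")
    print(output_locator)
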
diff --git a/crunch_scripts/crunchutil/subst.py b/crunch_scripts/crunchutil/subst.py
deleted file mode 100644
index 53def97f9..000000000
--- a/crunch_scripts/crunchutil/subst.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import glob
-import os
-import re
-import stat
-
-BACKSLASH_ESCAPE_RE = re.compile(r'\\(.)')
-
-class SubstitutionError(Exception):
- pass
-
-def search(c):
- DEFAULT = 0
- DOLLAR = 1
-
- i = 0
- state = DEFAULT
- start = None
- depth = 0
- while i < len(c):
- if c[i] == '\\':
- i += 1
- elif state == DEFAULT:
- if c[i] == '$':
- state = DOLLAR
- if depth == 0:
- start = i
- elif c[i] == ')':
- if depth == 1:
- return [start, i]
- if depth > 0:
- depth -= 1
- elif state == DOLLAR:
- if c[i] == '(':
- depth += 1
- state = DEFAULT
- i += 1
- if depth != 0:
- raise SubstitutionError("Substitution error, mismatched parentheses {}".format(c))
- return None
-
-def sub_file(v):
- path = os.path.join(os.environ['TASK_KEEPMOUNT'], v)
- st = os.stat(path)
- if st and stat.S_ISREG(st.st_mode):
- return path
- else:
- raise SubstitutionError("$(file {}) is not accessible or is not a regular file".format(path))
-
-def sub_dir(v):
- d = os.path.dirname(v)
- if d == '':
- d = v
- path = os.path.join(os.environ['TASK_KEEPMOUNT'], d)
- st = os.stat(path)
- if st and stat.S_ISDIR(st.st_mode):
- return path
- else:
- raise SubstitutionError("$(dir {}) is not accessible or is not a directory".format(path))
-
-def sub_basename(v):
- return os.path.splitext(os.path.basename(v))[0]
-
-def sub_glob(v):
- l = glob.glob(v)
- if len(l) == 0:
- raise SubstitutionError("$(glob {}) no match found".format(v))
- else:
- return l[0]
-
-default_subs = {"file ": sub_file,
- "dir ": sub_dir,
- "basename ": sub_basename,
- "glob ": sub_glob}
-
-def do_substitution(p, c, subs=default_subs):
- while True:
- m = search(c)
- if m is None:
- return BACKSLASH_ESCAPE_RE.sub(r'\1', c)
-
- v = do_substitution(p, c[m[0]+2 : m[1]])
- var = True
- for sub in subs:
- if v.startswith(sub):
- r = subs[sub](v[len(sub):])
- var = False
- break
- if var:
- if v in p:
- r = p[v]
- else:
- raise SubstitutionError("Unknown variable or function '%s' while performing substitution on '%s'" % (v, c))
- if r is None:
- raise SubstitutionError("Substitution for '%s' is null while performing substitution on '%s'" % (v, c))
- if not isinstance(r, basestring):
- raise SubstitutionError("Substitution for '%s' must be a string while performing substitution on '%s'" % (v, c))
-
- c = c[:m[0]] + r + c[m[1]+1:]
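
do_substitution() expands $(...) expressions against the parameter dict, with the file/dir/basename/glob helpers resolving paths (file and dir under $TASK_KEEPMOUNT). A small usage sketch with hypothetical parameter values:

    import crunchutil.subst as subst

    p = {"sample": "NA12878", "bwa_index": "GRCh37/bwa/"}

    subst.do_substitution(p, "$(sample).bam")
    # -> "NA12878.bam"   (plain variables come from the dict)

    subst.do_substitution(p, "$(basename a/b/c.fastq.gz)")
    # -> "c.fastq"       (basename strips directory and last extension)

    # "$(file $(bwa_index))" and "$(dir $(bwa_index))" resolve under
    # $TASK_KEEPMOUNT and raise SubstitutionError if the path is missing,
    # which is how the .loc files earlier in this patch are filled in.
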
diff --git a/crunch_scripts/crunchutil/vwd.py b/crunch_scripts/crunchutil/vwd.py
deleted file mode 100644
index 3245da14b..000000000
--- a/crunch_scripts/crunchutil/vwd.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import os
-import stat
-import arvados.commands.run
-import logging
-
-# Implements "Virtual Working Directory"
-# Provides a way of emulating a shared writable directory in Keep based
-# on a "check out, edit, check in, merge" model.
-# At the moment, this only permits adding new files; applications
-# cannot modify or delete existing files.
-
-# Create a symlink tree rooted at target_dir mirroring arv-mounted
-# source_collection. target_dir must be empty, and will be created if it
-# doesn't exist.
-def checkout(source_collection, target_dir, keepmount=None):
- # create symlinks
- if keepmount is None:
- keepmount = os.environ['TASK_KEEPMOUNT']
-
- if not os.path.exists(target_dir):
- os.makedirs(target_dir)
-
- l = os.listdir(target_dir)
- if len(l) > 0:
- raise Exception("target_dir must be empty before checkout, contains %s" % l)
-
- stem = os.path.join(keepmount, source_collection)
- for root, dirs, files in os.walk(os.path.join(keepmount, source_collection), topdown=True):
- rel = root[len(stem)+1:]
- for d in dirs:
- os.mkdir(os.path.join(target_dir, rel, d))
- for f in files:
- os.symlink(os.path.join(root, f), os.path.join(target_dir, rel, f))
-
-def checkin(target_dir):
- """Write files in `target_dir` to Keep.
-
- Regular files or symlinks to files outside the keep mount are written to
- Keep as normal files (Keep does not support symlinks).
-
- Symlinks to files in the keep mount will result in files in the new
- collection that reference existing Keep blocks, so no data copying is necessary.
-
- Returns a new Collection object, with data flushed but the collection record
- not saved to the API.
-
- """
-
- outputcollection = arvados.collection.Collection(num_retries=5)
-
- if target_dir[-1:] != '/':
- target_dir += '/'
-
- collections = {}
-
- logger = logging.getLogger("arvados")
-
- last_error = None
- for root, dirs, files in os.walk(target_dir):
- for f in files:
- try:
- s = os.lstat(os.path.join(root, f))
-
- writeIt = False
-
- if stat.S_ISREG(s.st_mode):
- writeIt = True
- elif stat.S_ISLNK(s.st_mode):
- # 1. check if it is a link into a collection
- real = os.path.split(os.path.realpath(os.path.join(root, f)))
- (pdh, branch) = arvados.commands.run.is_in_collection(real[0], real[1])
- if pdh is not None:
- # 2. load collection
- if pdh not in collections:
- # 2.1 make sure it is flushed (see #5787 note 11)
- fd = os.open(real[0], os.O_RDONLY)
- os.fsync(fd)
- os.close(fd)
-
- # 2.2 get collection from API server
- collections[pdh] = arvados.collection.CollectionReader(pdh,
- api_client=outputcollection._my_api(),
- keep_client=outputcollection._my_keep(),
- num_retries=5)
- # 3. copy arvfile to new collection
- outputcollection.copy(branch, os.path.join(root[len(target_dir):], f), source_collection=collections[pdh])
- else:
- writeIt = True
-
- if writeIt:
- reldir = root[len(target_dir):]
- with outputcollection.open(os.path.join(reldir, f), "wb") as writer:
- with open(os.path.join(root, f), "rb") as reader:
- dat = reader.read(64*1024)
- while dat:
- writer.write(dat)
- dat = reader.read(64*1024)
- except (IOError, OSError) as e:
- logger.error(e)
- last_error = e
-
- return (outputcollection, last_error)
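
A task using the virtual working directory checks a collection out into an empty scratch directory, lets the tool add files, and checks the directory back in. A sketch of that flow (locator, paths and command are placeholders):

    import subprocess
    import crunchutil.vwd as vwd

    source_locator = "<collection uuid or portable data hash>"  # placeholder

    # Mirror the existing collection as a symlink tree in an empty directory.
    vwd.checkout(source_locator, "/tmp/crunch-job/work")

    # The tool may only add new files; existing entries are read-only symlinks.
    subprocess.check_call(["some-tool", "--outdir", "/tmp/crunch-job/work"])

    # Write the directory back: symlinks into the keep mount become block
    # references, everything else is uploaded as regular file data.
    collection, err = vwd.checkin("/tmp/crunch-job/work")
    if err is None:
        print(collection.manifest_text())
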
diff --git a/crunch_scripts/cwl-runner b/crunch_scripts/cwl-runner
deleted file mode 100755
index 0c79844d5..000000000
--- a/crunch_scripts/cwl-runner
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-# Crunch script integration for running arvados-cwl-runner inside a crunch job.
-
-import arvados_cwl
-import sys
-
-try:
- # Use the crunch script defined in the arvados_cwl package. This helps
- # prevent the crunch script from going out of sync with the rest of the
- # arvados_cwl package.
- import arvados_cwl.crunch_script
- arvados_cwl.crunch_script.run()
- sys.exit()
-except ImportError:
- pass
-
-# When running against an older arvados-cwl-runner package without
-# arvados_cwl.crunch_script, fall back to the old code.
-
-
-# This gets the job record, transforms the script parameters into a valid CWL
-# input object, then executes the CWL runner to run the underlying workflow or
-# tool. When the workflow completes, the output object is recorded in an
-# output collection for this runner job.
-
-import arvados
-import arvados.collection
-import arvados.util
-import cwltool.main
-import logging
-import os
-import json
-import argparse
-import re
-import functools
-
-from arvados.api import OrderedJsonModel
-from cwltool.process import shortname, adjustFileObjs, adjustDirObjs, getListing, normalizeFilesDirs
-from cwltool.load_tool import load_tool
-
-# Print package versions
-logging.info(cwltool.main.versionstring())
-
-api = arvados.api("v1")
-
-try:
- job_order_object = arvados.current_job()['script_parameters']
-
- pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')
-
- def keeppath(v):
- if pdh_path.match(v):
- return "keep:%s" % v
- else:
- return v
-
- def keeppathObj(v):
- v["location"] = keeppath(v["location"])
-
- job_order_object["cwl:tool"] = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object["cwl:tool"])
-
- for k,v in job_order_object.items():
- if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
- job_order_object[k] = {
- "class": "File",
- "location": "keep:%s" % v
- }
-
- adjustFileObjs(job_order_object, keeppathObj)
- adjustDirObjs(job_order_object, keeppathObj)
- normalizeFilesDirs(job_order_object)
- adjustDirObjs(job_order_object, functools.partial(getListing, arvados_cwl.fsaccess.CollectionFsAccess("", api_client=api)))
-
- output_name = None
- if "arv:output_name" in job_order_object:
- output_name = job_order_object["arv:output_name"]
- del job_order_object["arv:output_name"]
-
- runner = arvados_cwl.ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()),
- output_name=output_name)
-
- t = load_tool(job_order_object, runner.arv_make_tool)
-
- args = argparse.Namespace()
- args.project_uuid = arvados.current_job()["owner_uuid"]
- args.enable_reuse = True
- args.submit = False
- args.debug = True
- args.quiet = False
- args.ignore_docker_for_reuse = False
- args.basedir = os.getcwd()
- args.cwl_runner_job={"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
- outputObj = runner.arv_executor(t, job_order_object, **vars(args))
-
- if runner.final_output_collection:
- outputCollection = runner.final_output_collection.portable_data_hash()
- else:
- outputCollection = None
-
- api.job_tasks().update(uuid=arvados.current_task()['uuid'],
- body={
- 'output': outputCollection,
- 'success': True,
- 'progress':1.0
- }).execute()
-except Exception as e:
- logging.exception("Unhandled exception")
- api.job_tasks().update(uuid=arvados.current_task()['uuid'],
- body={
- 'output': None,
- 'success': False,
- 'progress':1.0
- }).execute()
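
In the fallback path, bare Keep locators in script_parameters are rewritten into CWL File objects with keep: URIs before the workflow is loaded. The core of that mapping, extracted as a sketch (the script additionally handles hints and directory listings):

    import re

    pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

    def to_cwl_file(value):
        """Turn a bare portable data hash (plus optional path) into a CWL File."""
        if isinstance(value, str) and pdh_path.match(value):
            return {"class": "File", "location": "keep:%s" % value}
        return value

    # to_cwl_file("d41d8cd98f00b204e9800998ecf8427e+0/reads.fastq")
    # -> {'class': 'File',
    #     'location': 'keep:d41d8cd98f00b204e9800998ecf8427e+0/reads.fastq'}
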
diff --git a/crunch_scripts/decompress-all.py b/crunch_scripts/decompress-all.py
deleted file mode 100755
index 100ea1223..000000000
--- a/crunch_scripts/decompress-all.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-#
-# decompress-all.py
-#
-# Decompress all compressed files in the collection using the "dtrx" tool and
-# produce a new collection with the contents. Uncompressed files
-# are passed through.
-#
-# input:
-# A collection at script_parameters["input"]
-#
-# output:
-# A manifest of the uncompressed contents of the input collection.
-
-import arvados
-import re
-import subprocess
-import os
-import sys
-import crunchutil.robust_put as robust_put
-
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
- input_as_path=True)
-
-task = arvados.current_task()
-
-input_file = task['parameters']['input']
-
-infile_parts = re.match(r"(^[a-f0-9]{32}\+\d+)(\+\S+)*(/.*)?(/[^/]+)$", input_file)
-
-outdir = os.path.join(task.tmpdir, "output")
-os.makedirs(outdir)
-os.chdir(outdir)
-
-if infile_parts is None:
- print >>sys.stderr, "Failed to parse input filename '%s' as a Keep file\n" % input_file
- sys.exit(1)
-
-cr = arvados.CollectionReader(infile_parts.group(1))
-streamname = infile_parts.group(3)[1:]
-filename = infile_parts.group(4)[1:]
-
-if streamname is not None:
- subprocess.call(["mkdir", "-p", streamname])
- os.chdir(streamname)
-else:
- streamname = '.'
-
-m = re.match(r'.*\.(gz|Z|bz2|tgz|tbz|zip|rar|7z|cab|deb|rpm|cpio|gem)$', arvados.get_task_param_mount('input'), re.IGNORECASE)
-
-if m is not None:
- rc = subprocess.call(["dtrx", "-r", "-n", "-q", arvados.get_task_param_mount('input')])
- if rc == 0:
- task.set_output(robust_put.upload(outdir))
- else:
- sys.exit(rc)
-else:
- streamreader = filter(lambda s: s.name() == streamname, cr.all_streams())[0]
- filereader = streamreader.files()[filename]
- task.set_output(streamname + filereader.as_manifest()[1:])
diff --git a/crunch_scripts/file-select b/crunch_scripts/file-select
deleted file mode 100755
index c4af05c82..000000000
--- a/crunch_scripts/file-select
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import os
-import re
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-this_job_input = this_job['script_parameters']['input']
-manifest_text = ""
-for f in arvados.CollectionReader(this_job_input).all_files():
- if f.name() in this_job['script_parameters']['names']:
- manifest_text += f.as_manifest()
-
-this_task.set_output(arvados.Keep.put(manifest_text))
diff --git a/crunch_scripts/grep b/crunch_scripts/grep
deleted file mode 100755
index a84c0f671..000000000
--- a/crunch_scripts/grep
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import re
-
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-this_task_input = this_task['parameters']['input']
-pattern = re.compile(this_job['script_parameters']['pattern'])
-
-input_file = list(arvados.CollectionReader(this_task_input).all_files())[0]
-out = arvados.CollectionWriter()
-out.set_current_file_name(input_file.decompressed_name())
-out.set_current_stream_name(input_file.stream_name())
-for line in input_file.readlines():
- if pattern.search(line):
- out.write(line)
-
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/hash b/crunch_scripts/hash
deleted file mode 100755
index 56eec7a5f..000000000
--- a/crunch_scripts/hash
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import hashlib
-import os
-
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, input_as_path=True)
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-
-if 'algorithm' in this_job['script_parameters']:
- alg = this_job['script_parameters']['algorithm']
-else:
- alg = 'md5'
-digestor = hashlib.new(alg)
-
-input_file = arvados.get_task_param_mount('input')
-
-with open(input_file) as f:
- while True:
- buf = f.read(2**20)
- if len(buf) == 0:
- break
- digestor.update(buf)
-
-hexdigest = digestor.hexdigest()
-
-file_name = '/'.join(this_task['parameters']['input'].split('/')[1:])
-
-out = arvados.CollectionWriter()
-out.set_current_file_name("md5sum.txt")
-out.write("%s %s\n" % (hexdigest, file_name))
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/pgp-survey-import b/crunch_scripts/pgp-survey-import
deleted file mode 100755
index f12e84b2d..000000000
--- a/crunch_scripts/pgp-survey-import
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import string
-import json
-import UserDict
-import sys
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-this_job_input = this_job['script_parameters']['input']
-
-out = arvados.CollectionWriter()
-out.set_current_file_name("arvados_objects.json")
-out.write("[\n")
-separator = ""
-
-traits = {}
-done_bytes = 0
-done_ratio = 0
-for input_file in arvados.CollectionReader(this_job_input).all_files():
- for line_number, line in enumerate(input_file.readlines()):
-
- done_bytes += len(line)
- new_done_ratio = 1.0 * done_bytes / input_file.size()
- if line_number == 2 or new_done_ratio - done_ratio > 0.05:
- sys.stderr.write("progress: %d%% after %d lines\n" % (int(done_ratio * 100), line_number+1))
- done_ratio = new_done_ratio
-
- words = string.split(string.strip(line), "\t")
- if line_number == 0:
- headings = words
- for t in arvados.api('v1').traits().list(
- where={'name':words},
- limit=1000
- ).execute()['items']:
- traits[t['name']] = t
- for i, trait_name in enumerate(words[3:], start=3):
- # find or create trait
- if trait_name not in traits:
- traits_match = arvados.api('v1').traits().list(
- where={'name':trait_name}
- ).execute()['items']
- if len(traits_match) > 0:
- traits[trait_name] = traits_match[0]
- else:
- traits[trait_name] = arvados.api('v1').traits().create(
- trait={'name':trait_name}).execute()
- out.write(separator)
- out.write(json.dumps(traits[trait_name]))
- separator = ",\n"
- else:
- huID_links_match = arvados.api('v1').links().list(
- where={'link_class':'identifier','name':words[0]}
- ).execute()['items']
- if len(huID_links_match) > 0:
- human_uuid = huID_links_match[0]['head_uuid']
- else:
- human = arvados.api('v1').humans().create(
- body={}
- ).execute()
- huID_link = arvados.api('v1').links().create(
- body={
- 'link_class':'identifier',
- 'name':words[0],
- 'head_kind':'arvados#human',
- 'head_uuid':human['uuid']
- }
- ).execute()
- human_uuid = human['uuid']
- human_trait = {}
- for t in arvados.api('v1').links().list(
- limit=10000,
- where={
- 'tail_uuid':human_uuid,
- 'tail_kind':'arvados#human',
- 'head_kind':'arvados#trait',
- 'link_class':'human_trait',
- 'name':'pgp-survey-response'
- }
- ).execute()['items']:
- human_trait[t['head_uuid']] = t
- for i, trait_value in enumerate(words[3:], start=3):
- trait_uuid = traits[headings[i]]['uuid']
- if trait_uuid in human_trait:
- trait_link = human_trait[trait_uuid]
- if trait_link['properties']['value'] != trait_value:
- # update database value to match survey response
- trait_link['properties']['value'] = trait_value
- arvados.api('v1').links().update(
- uuid=trait_link['uuid'],
- body={'properties':trait_link['properties']}
- ).execute()
- out.write(",\n")
- out.write(json.dumps(trait_link))
- elif trait_value == '':
- # nothing in database, nothing in input
- pass
- else:
- trait_link = {
- 'tail_uuid':human_uuid,
- 'tail_kind':'arvados#human',
- 'head_uuid':traits[headings[i]]['uuid'],
- 'head_kind':'arvados#trait',
- 'link_class':'human_trait',
- 'name':'pgp-survey-response',
- 'properties': { 'value': trait_value }
- }
- arvados.api('v1').links().create(
- body=trait_link
- ).execute()
- out.write(",\n")
- out.write(json.dumps(trait_link))
-
-out.write("\n]\n")
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/pgp-survey-parse b/crunch_scripts/pgp-survey-parse
deleted file mode 100755
index ee852f1d2..000000000
--- a/crunch_scripts/pgp-survey-parse
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-parser_path = arvados.util.git_checkout(
- url = this_job['script_parameters']['parser_url'],
- version = this_job['script_parameters']['parser_version'],
- path = 'parser')
-
-stdoutdata, stderrdata = arvados.util.run_command(
- ["python", "demo.py"],
- cwd=parser_path)
-
-out = arvados.CollectionWriter()
-out.write(stdoutdata)
-out.set_current_file_name('participant_traits.tsv')
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/picard-gatk2-prep b/crunch_scripts/picard-gatk2-prep
deleted file mode 100755
index 976060f01..000000000
--- a/crunch_scripts/picard-gatk2-prep
+++ /dev/null
@@ -1,211 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import os
-import re
-import sys
-import subprocess
-import arvados_picard
-from arvados_ipc import *
-
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-ref_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['reference'],
- path = 'reference',
- decompress = True)
-ref_fasta_files = [os.path.join(ref_dir, f)
- for f in os.listdir(ref_dir)
- if re.search(r'\.fasta(\.gz)?$', f)]
-input_collection = this_task['parameters']['input']
-
-for s in arvados.CollectionReader(input_collection).all_streams():
- for f in s.all_files():
- input_stream_name = s.name()
- input_file_name = f.name()
- break
-
-# Unfortunately, picard FixMateInformation cannot read from a pipe. We
-# must copy the input to a temporary file before running picard.
-input_bam_path = os.path.join(this_task.tmpdir, input_file_name)
-with open(input_bam_path, 'wb') as bam:
- for s in arvados.CollectionReader(input_collection).all_streams():
- for f in s.all_files():
- for s in f.readall():
- bam.write(s)
-
-children = {}
-pipes = {}
-
-pipe_setup(pipes, 'fixmate')
-if 0==named_fork(children, 'fixmate'):
- pipe_closeallbut(pipes, ('fixmate', 'w'))
- arvados_picard.run(
- 'FixMateInformation',
- params={
- 'i': input_bam_path,
- 'o': '/dev/stdout',
- 'quiet': 'true',
- 'so': 'coordinate',
- 'validation_stringency': 'LENIENT',
- 'compression_level': 0
- },
- stdout=os.fdopen(pipes['fixmate','w'], 'wb', 2**20))
- os._exit(0)
-os.close(pipes.pop(('fixmate','w'), None))
-
-pipe_setup(pipes, 'sortsam')
-if 0==named_fork(children, 'sortsam'):
- pipe_closeallbut(pipes, ('fixmate', 'r'), ('sortsam', 'w'))
- arvados_picard.run(
- 'SortSam',
- params={
- 'i': '/dev/stdin',
- 'o': '/dev/stdout',
- 'quiet': 'true',
- 'so': 'coordinate',
- 'validation_stringency': 'LENIENT',
- 'compression_level': 0
- },
- stdin=os.fdopen(pipes['fixmate','r'], 'rb', 2**20),
- stdout=os.fdopen(pipes['sortsam','w'], 'wb', 2**20))
- os._exit(0)
-
-pipe_setup(pipes, 'reordersam')
-if 0==named_fork(children, 'reordersam'):
- pipe_closeallbut(pipes, ('sortsam', 'r'), ('reordersam', 'w'))
- arvados_picard.run(
- 'ReorderSam',
- params={
- 'i': '/dev/stdin',
- 'o': '/dev/stdout',
- 'reference': ref_fasta_files[0],
- 'quiet': 'true',
- 'validation_stringency': 'LENIENT',
- 'compression_level': 0
- },
- stdin=os.fdopen(pipes['sortsam','r'], 'rb', 2**20),
- stdout=os.fdopen(pipes['reordersam','w'], 'wb', 2**20))
- os._exit(0)
-
-pipe_setup(pipes, 'addrg')
-if 0==named_fork(children, 'addrg'):
- pipe_closeallbut(pipes, ('reordersam', 'r'), ('addrg', 'w'))
- arvados_picard.run(
- 'AddOrReplaceReadGroups',
- params={
- 'i': '/dev/stdin',
- 'o': '/dev/stdout',
- 'quiet': 'true',
- 'rglb': this_job['script_parameters'].get('rglb', 0),
- 'rgpl': this_job['script_parameters'].get('rgpl', 'illumina'),
- 'rgpu': this_job['script_parameters'].get('rgpu', 0),
- 'rgsm': this_job['script_parameters'].get('rgsm', 0),
- 'validation_stringency': 'LENIENT'
- },
- stdin=os.fdopen(pipes['reordersam','r'], 'rb', 2**20),
- stdout=os.fdopen(pipes['addrg','w'], 'wb', 2**20))
- os._exit(0)
-
-pipe_setup(pipes, 'bammanifest')
-pipe_setup(pipes, 'bam')
-pipe_setup(pipes, 'casm_in')
-if 0==named_fork(children, 'bammanifest'):
- pipe_closeallbut(pipes,
- ('addrg', 'r'),
- ('bammanifest', 'w'),
- ('bam', 'w'),
- ('casm_in', 'w'))
- out = arvados.CollectionWriter()
- out.start_new_stream(input_stream_name)
- out.start_new_file(input_file_name)
- while True:
- buf = os.read(pipes['addrg','r'], 2**20)
- if len(buf) == 0:
- break
- os.write(pipes['bam','w'], buf)
- os.write(pipes['casm_in','w'], buf)
- out.write(buf)
- os.write(pipes['bammanifest','w'], out.manifest_text())
- os.close(pipes['bammanifest','w'])
- os._exit(0)
-
-pipe_setup(pipes, 'casm')
-if 0 == named_fork(children, 'casm'):
- pipe_closeallbut(pipes, ('casm_in', 'r'), ('casm', 'w'))
- arvados_picard.run(
- 'CollectAlignmentSummaryMetrics',
- params={
- 'input': '/dev/fd/' + str(pipes['casm_in','r']),
- 'output': '/dev/fd/' + str(pipes['casm','w']),
- 'reference_sequence': ref_fasta_files[0],
- 'validation_stringency': 'LENIENT',
- },
- close_fds=False)
- os._exit(0)
-
-pipe_setup(pipes, 'index')
-if 0==named_fork(children, 'index'):
- pipe_closeallbut(pipes, ('bam', 'r'), ('index', 'w'))
- arvados_picard.run(
- 'BuildBamIndex',
- params={
- 'i': '/dev/stdin',
- 'o': '/dev/stdout',
- 'quiet': 'true',
- 'validation_stringency': 'LENIENT'
- },
- stdin=os.fdopen(pipes['bam','r'], 'rb', 2**20),
- stdout=os.fdopen(pipes['index','w'], 'wb', 2**20))
- os._exit(0)
-
-pipe_setup(pipes, 'indexmanifest')
-if 0==named_fork(children, 'indexmanifest'):
- pipe_closeallbut(pipes, ('index', 'r'), ('indexmanifest', 'w'))
- out = arvados.CollectionWriter()
- out.start_new_stream(input_stream_name)
- out.start_new_file(re.sub('\.bam$', '.bai', input_file_name))
- while True:
- buf = os.read(pipes['index','r'], 2**20)
- if len(buf) == 0:
- break
- out.write(buf)
- os.write(pipes['indexmanifest','w'], out.manifest_text())
- os.close(pipes['indexmanifest','w'])
- os._exit(0)
-
-pipe_closeallbut(pipes,
- ('bammanifest', 'r'),
- ('indexmanifest', 'r'),
- ('casm', 'r'))
-
-outmanifest = ''
-
-for which in ['bammanifest', 'indexmanifest']:
- with os.fdopen(pipes[which,'r'], 'rb', 2**20) as f:
- while True:
- buf = f.read()
- if buf == '':
- break
- outmanifest += buf
-
-casm_out = arvados.CollectionWriter()
-casm_out.start_new_stream(input_stream_name)
-casm_out.start_new_file(input_file_name + '.casm.tsv')
-casm_out.write(os.fdopen(pipes.pop(('casm','r'))))
-
-outmanifest += casm_out.manifest_text()
-
-all_ok = True
-for (childname, pid) in children.items():
- all_ok = all_ok and waitpid_and_check_exit(pid, childname)
-
-if all_ok:
- this_task.set_output(outmanifest)
-else:
- sys.exit(1)
diff --git a/crunch_scripts/pyrtg.py b/crunch_scripts/pyrtg.py
deleted file mode 100644
index d733270f8..000000000
--- a/crunch_scripts/pyrtg.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import re
-import os
-import sys
-
-rtg_install_path = None
-
-def setup():
- global rtg_install_path
- if rtg_install_path:
- return rtg_install_path
- rtg_path = arvados.util.zipball_extract(
- zipball = arvados.current_job()['script_parameters']['rtg_binary_zip'],
- path = 'rtg')
- rtg_license_path = arvados.util.collection_extract(
- collection = arvados.current_job()['script_parameters']['rtg_license'],
- path = 'license',
- decompress = False)
-
- # symlink to rtg-license.txt
- license_txt_path = os.path.join(rtg_license_path, 'rtg-license.txt')
- try:
- os.symlink(license_txt_path, os.path.join(rtg_path,'rtg-license.txt'))
- except OSError:
- if not os.path.exists(os.path.join(rtg_path,'rtg-license.txt')):
- os.symlink(license_txt_path, os.path.join(rtg_path,'rtg-license.txt'))
-
- rtg_install_path = rtg_path
- return rtg_path
-
-def run_rtg(command, output_dir, command_args, **kwargs):
- global rtg_install_path
- execargs = [os.path.join(rtg_install_path, 'rtg'),
- command,
- '-o', output_dir]
- execargs += command_args
- sys.stderr.write("run_rtg: exec %s\n" % str(execargs))
- arvados.util.run_command(
- execargs,
- cwd=arvados.current_task().tmpdir,
- stderr=sys.stderr,
- stdout=sys.stderr)
-
- # Exit status cannot be trusted in rtg 1.1.1.
- assert_done(output_dir)
-
- # Copy log files to stderr and delete them to avoid storing them
- # in Keep with the output data.
- for dirent in arvados.util.listdir_recursive(output_dir):
- if is_log_file(dirent):
- log_file = os.path.join(output_dir, dirent)
- sys.stderr.write(' '.join(['==>', dirent, '<==\n']))
- with open(log_file, 'rb') as f:
- while True:
- buf = f.read(2**20)
- if len(buf) == 0:
- break
- sys.stderr.write(buf)
- sys.stderr.write('\n') # in case log does not end in newline
- os.unlink(log_file)
-
-def assert_done(output_dir):
- # Sanity-check exit code.
- done_file = os.path.join(output_dir, 'done')
- if not os.path.exists(done_file):
- raise Exception("rtg exited 0 but %s does not exist. abort.\n" % done_file)
-
-def is_log_file(filename):
- return re.search(r'^(.*/)?(progress|done|\S+.log)$', filename)
-
-setup()
diff --git a/crunch_scripts/rtg-fasta2sdf b/crunch_scripts/rtg-fasta2sdf
deleted file mode 100755
index f1ef617f6..000000000
--- a/crunch_scripts/rtg-fasta2sdf
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import os
-import re
-import sys
-import pyrtg
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-fasta_path = arvados.util.collection_extract(
- collection = this_job['script_parameters']['input'],
- path = 'fasta',
- decompress = False)
-fasta_files = filter(lambda f: f != '.locator', os.listdir(fasta_path))
-out_dir = os.path.join(arvados.current_task().tmpdir, 'ref-sdf')
-arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
-
-pyrtg.run_rtg('format', out_dir,
- map(lambda f: os.path.join(fasta_path, f), fasta_files))
-
-out = arvados.CollectionWriter()
-out.write_directory_tree(out_dir, max_manifest_depth=0)
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/rtg-fastq2sdf b/crunch_scripts/rtg-fastq2sdf
deleted file mode 100755
index e42697fc4..000000000
--- a/crunch_scripts/rtg-fastq2sdf
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import os
-import re
-import sys
-import pyrtg
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-fastq_path = arvados.util.collection_extract(
- collection = this_job['script_parameters']['input'],
- path = 'fastq')
-fastq_files = filter(lambda f: f != '.locator', os.listdir(fastq_path))
-tmp_dir_base = os.path.join(arvados.current_task().tmpdir, 'tmp')
-out_dir = os.path.join(arvados.current_task().tmpdir, 'reads')
-
-arvados.util.run_command(['rm', '-rf', tmp_dir_base], stderr=sys.stderr)
-arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
-os.mkdir(tmp_dir_base)
-
-# convert fastq to sdf
-tmp_dirs = []
-for leftarm in fastq_files:
- if re.search(r'_1.f(ast)?q(.gz)?$', leftarm):
- rightarm = re.sub(r'_1(.f(ast)?q(.gz)?)$', '_2\\1', leftarm)
- if rightarm in fastq_files:
- tmp_dirs += ['%s/%08d' % (tmp_dir_base, len(tmp_dirs))]
- pyrtg.run_rtg('format', tmp_dirs[-1],
- ['-f', 'fastq',
- '-q', 'sanger',
- '-l', os.path.join(fastq_path, leftarm),
- '-r', os.path.join(fastq_path, rightarm)])
-
-# split sdf
-pyrtg.run_rtg('sdfsplit', out_dir,
- ['-n', '1500000'] + tmp_dirs)
-
-# store output
-out = arvados.CollectionWriter()
-out.write_directory_tree(out_dir, max_manifest_depth=1)
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/rtg-map b/crunch_scripts/rtg-map
deleted file mode 100755
index f740888b9..000000000
--- a/crunch_scripts/rtg-map
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import os
-import re
-import sys
-import pyrtg
-
-arvados.job_setup.one_task_per_input_stream(if_sequence=0, and_end_task=True)
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-in_dir = os.path.join(this_task.tmpdir, 'input')
-arvados.util.run_command(['rm', '-rf', in_dir], stderr=sys.stderr)
-in_dir = arvados.util.stream_extract(
- stream = arvados.StreamReader(this_task['parameters']['input']),
- path = in_dir,
- decompress = False)
-ref_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['reference'],
- path = 'reference',
- decompress = False)
-
-out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
-arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
-
-# map reads
-pyrtg.run_rtg('map', out_dir,
- ['-i', in_dir,
- '-t', ref_dir,
- '-a', '2',
- '-b', '1',
- '--sam-rg', '@RG\\tID:NA\\tSM:NA\\tPL:ILLUMINA'])
-
-# store output
-out = arvados.CollectionWriter()
-out.write_directory_tree(out_dir, this_task['parameters']['input'][0], 0)
-this_task.set_output(out.finish())
diff --git a/crunch_scripts/rtg-snp b/crunch_scripts/rtg-snp
deleted file mode 100755
index 1d8a605b9..000000000
--- a/crunch_scripts/rtg-snp
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import os
-import re
-import sys
-import pyrtg
-
-this_job = arvados.current_job()
-this_task = arvados.current_task()
-ref_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['reference'],
- path = 'reference',
- decompress = False)
-input_dir = arvados.util.collection_extract(
- collection = this_job['script_parameters']['input'],
- path = 'input')
-bam_files = map(lambda f: os.path.join(input_dir, f),
- filter(lambda f: re.search(r'^(.*/)?alignments.bam$', f),
- arvados.util.listdir_recursive(input_dir)))
-out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
-arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
-
-# call sequence variants
-pyrtg.run_rtg('snp', out_dir,
- ['-t', ref_dir] + bam_files)
-
-# store output
-out = arvados.CollectionWriter()
-out.write_directory_tree(out_dir, max_manifest_depth=0)
-this_task.set_output(out.finish())
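rtg-snp selects every file named alignments.bam anywhere under the extracted input before handing them to the variant caller. The same selection using only the standard library, as a sketch:

    import os

    def find_alignment_bams(input_dir):
        # Collect every "alignments.bam" at any depth under input_dir.
        bams = []
        for root, _dirs, files in os.walk(input_dir):
            bams.extend(os.path.join(root, f)
                        for f in files if f == 'alignments.bam')
        return sorted(bams)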
diff --git a/crunch_scripts/run-command b/crunch_scripts/run-command
deleted file mode 100755
index 3fd08bf28..000000000
--- a/crunch_scripts/run-command
+++ /dev/null
@@ -1,458 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import logging
-
-logger = logging.getLogger('run-command')
-log_handler = logging.StreamHandler()
-log_handler.setFormatter(logging.Formatter("run-command: %(message)s"))
-logger.addHandler(log_handler)
-logger.setLevel(logging.INFO)
-
-import arvados
-import re
-import os
-import subprocess
-import sys
-import shutil
-import crunchutil.subst as subst
-import time
-import arvados.commands.put as put
-import signal
-import stat
-import copy
-import traceback
-import pprint
-import multiprocessing
-import crunchutil.robust_put as robust_put
-import crunchutil.vwd as vwd
-import argparse
-import json
-import tempfile
-import errno
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--dry-run', action='store_true')
-parser.add_argument('--script-parameters', type=str, default="{}")
-args = parser.parse_args()
-
-os.umask(0077)
-
-if not args.dry_run:
- api = arvados.api('v1')
- t = arvados.current_task().tmpdir
- os.chdir(arvados.current_task().tmpdir)
- os.mkdir("tmpdir")
- os.mkdir("output")
-
- os.chdir("output")
-
- outdir = os.getcwd()
-
- taskp = None
- jobp = arvados.current_job()['script_parameters']
- if len(arvados.current_task()['parameters']) > 0:
- taskp = arvados.current_task()['parameters']
-else:
- outdir = "/tmp"
- jobp = json.loads(args.script_parameters)
- os.environ['JOB_UUID'] = 'zzzzz-8i9sb-1234567890abcde'
- os.environ['TASK_UUID'] = 'zzzzz-ot0gb-1234567890abcde'
- os.environ['CRUNCH_SRC'] = '/tmp/crunch-src'
- if 'TASK_KEEPMOUNT' not in os.environ:
- os.environ['TASK_KEEPMOUNT'] = '/keep'
-
-def sub_tmpdir(v):
- return os.path.join(arvados.current_task().tmpdir, 'tmpdir')
-
-def sub_outdir(v):
- return outdir
-
-def sub_cores(v):
- return str(multiprocessing.cpu_count())
-
-def sub_jobid(v):
- return os.environ['JOB_UUID']
-
-def sub_taskid(v):
- return os.environ['TASK_UUID']
-
-def sub_jobsrc(v):
- return os.environ['CRUNCH_SRC']
-
-subst.default_subs["task.tmpdir"] = sub_tmpdir
-subst.default_subs["task.outdir"] = sub_outdir
-subst.default_subs["job.srcdir"] = sub_jobsrc
-subst.default_subs["node.cores"] = sub_cores
-subst.default_subs["job.uuid"] = sub_jobid
-subst.default_subs["task.uuid"] = sub_taskid
-
-class SigHandler(object):
- def __init__(self):
- self.sig = None
-
- def send_signal(self, subprocesses, signum):
- for sp in subprocesses:
- sp.send_signal(signum)
- self.sig = signum
-
-# http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html
-def flatten(l, ltypes=(list, tuple)):
- ltype = type(l)
- l = list(l)
- i = 0
- while i < len(l):
- while isinstance(l[i], ltypes):
- if not l[i]:
- l.pop(i)
- i -= 1
- break
- else:
- l[i:i + 1] = l[i]
- i += 1
- return ltype(l)
-
-def add_to_group(gr, match):
- m = match.groups()
- if m not in gr:
- gr[m] = []
- gr[m].append(match.group(0))
-
-class EvaluationError(Exception):
- pass
-
-# Return the name of variable ('var') that will take on each value in 'items'
-# when performing an inner substitution
-def var_items(p, c, key):
- if key not in c:
- raise EvaluationError("'%s' was expected in 'p' but is missing" % key)
-
- if "var" in c:
- if not isinstance(c["var"], basestring):
- raise EvaluationError("Value of 'var' must be a string")
- # Var specifies the variable name for inner parameter substitution
- return (c["var"], get_items(p, c[key]))
- else:
- # The component function ('key') value is a list, so return the list
- # directly with no parameter selected.
- if isinstance(c[key], list):
- return (None, get_items(p, c[key]))
- elif isinstance(c[key], basestring):
- # check if c[key] is a string that looks like a parameter
- m = re.match("^\$\((.*)\)$", c[key])
- if m and m.group(1) in p:
- return (m.group(1), get_items(p, c[key]))
- else:
- # backwards compatible, foreach specifies bare parameter name to use
- return (c[key], get_items(p, p[c[key]]))
- else:
- raise EvaluationError("Value of '%s' must be a string or list" % key)
-
-# "p" is the parameter scope, "c" is the item to be expanded.
-# If "c" is a dict, apply function expansion.
-# If "c" is a list, recursively expand each item and return a new list.
-# If "c" is a string, apply parameter substitution
-def expand_item(p, c):
- if isinstance(c, dict):
- if "foreach" in c and "command" in c:
- # Expand a command template for each item in the specified user
- # parameter
- var, items = var_items(p, c, "foreach")
- if var is None:
- raise EvaluationError("Must specify 'var' in foreach")
- r = []
- for i in items:
- params = copy.copy(p)
- params[var] = i
- r.append(expand_item(params, c["command"]))
- return r
- elif "list" in c and "index" in c and "command" in c:
- # extract a single item from a list
- var, items = var_items(p, c, "list")
- if var is None:
- raise EvaluationError("Must specify 'var' in list")
- params = copy.copy(p)
- params[var] = items[int(c["index"])]
- return expand_item(params, c["command"])
- elif "regex" in c:
- pattern = re.compile(c["regex"])
- if "filter" in c:
- # filter list so that it only includes items that match a
- # regular expression
- _, items = var_items(p, c, "filter")
- return [i for i in items if pattern.match(i)]
- elif "group" in c:
- # generate a list of lists, where items are grouped on common
- # subexpression match
- _, items = var_items(p, c, "group")
- groups = {}
- for i in items:
- match = pattern.match(i)
- if match:
- add_to_group(groups, match)
- return [groups[k] for k in groups]
- elif "extract" in c:
- # generate a list of lists, where items are split by
- # subexpression match
- _, items = var_items(p, c, "extract")
- r = []
- for i in items:
- match = pattern.match(i)
- if match:
- r.append(list(match.groups()))
- return r
- elif "batch" in c and "size" in c:
- # generate a list of lists, where items are split into a batch size
- _, items = var_items(p, c, "batch")
- sz = int(c["size"])
- r = []
- for j in xrange(0, len(items), sz):
- r.append(items[j:j+sz])
- return r
- raise EvaluationError("Missing valid list context function")
- elif isinstance(c, list):
- return [expand_item(p, arg) for arg in c]
- elif isinstance(c, basestring):
- m = re.match("^\$\((.*)\)$", c)
- if m and m.group(1) in p:
- return expand_item(p, p[m.group(1)])
- else:
- return subst.do_substitution(p, c)
- else:
- raise EvaluationError("expand_item() unexpected parameter type %s" % type(c))
-
-# Evaluate in a list context
-# "p" is the parameter scope, "value" will be evaluated
-# if "value" is a list after expansion, return that
-# if "value" is a path to a directory, return a list consisting of each entry in the directory
-# if "value" is a path to a file, return a list consisting of each line of the file
-def get_items(p, value):
- value = expand_item(p, value)
- if isinstance(value, list):
- return value
- elif isinstance(value, basestring):
- mode = os.stat(value).st_mode
- prefix = value[len(os.environ['TASK_KEEPMOUNT'])+1:]
- if mode is not None:
- if stat.S_ISDIR(mode):
- items = [os.path.join(value, l) for l in os.listdir(value)]
- elif stat.S_ISREG(mode):
- with open(value) as f:
- items = [line.rstrip("\r\n") for line in f]
- return items
- raise EvaluationError("get_items did not yield a list")
-
-stdoutname = None
-stdoutfile = None
-stdinname = None
-stdinfile = None
-
-# Construct the cross product of all values of each variable listed in fvars
-def recursive_foreach(params, fvars):
- var = fvars[0]
- fvars = fvars[1:]
- items = get_items(params, params[var])
- logger.info("parallelizing on %s with items %s" % (var, items))
- if items is not None:
- for i in items:
- params = copy.copy(params)
- params[var] = i
- if len(fvars) > 0:
- recursive_foreach(params, fvars)
- else:
- if not args.dry_run:
- arvados.api().job_tasks().create(body={
- 'job_uuid': arvados.current_job()['uuid'],
- 'created_by_job_task_uuid': arvados.current_task()['uuid'],
- 'sequence': 1,
- 'parameters': params
- }).execute()
- else:
- if isinstance(params["command"][0], list):
- for c in params["command"]:
- logger.info(flatten(expand_item(params, c)))
- else:
- logger.info(flatten(expand_item(params, params["command"])))
- else:
- logger.error("parameter %s with value %s in task.foreach yielded no items" % (var, params[var]))
- sys.exit(1)
-
-try:
- if "task.foreach" in jobp:
- if args.dry_run or arvados.current_task()['sequence'] == 0:
- # This is the first task to start the other tasks and exit
- fvars = jobp["task.foreach"]
- if isinstance(fvars, basestring):
- fvars = [fvars]
- if not isinstance(fvars, list) or len(fvars) == 0:
- logger.error("value of task.foreach must be a string or non-empty list")
- sys.exit(1)
- recursive_foreach(jobp, jobp["task.foreach"])
- if not args.dry_run:
- if "task.vwd" in jobp:
- # Set output of the first task to the base vwd collection so it
- # will be merged with output fragments from the other tasks by
- # crunch.
- arvados.current_task().set_output(subst.do_substitution(jobp, jobp["task.vwd"]))
- else:
- arvados.current_task().set_output(None)
- sys.exit(0)
- else:
- # This is the only task so taskp/jobp are the same
- taskp = jobp
-except Exception as e:
- logger.exception("caught exception")
- logger.error("job parameters were:")
- logger.error(pprint.pformat(jobp))
- sys.exit(1)
-
-try:
- if not args.dry_run:
- if "task.vwd" in taskp:
- # Populate output directory with symlinks to files in collection
- vwd.checkout(subst.do_substitution(taskp, taskp["task.vwd"]), outdir)
-
- if "task.cwd" in taskp:
- os.chdir(subst.do_substitution(taskp, taskp["task.cwd"]))
-
- cmd = []
- if isinstance(taskp["command"][0], list):
- for c in taskp["command"]:
- cmd.append(flatten(expand_item(taskp, c)))
- else:
- cmd.append(flatten(expand_item(taskp, taskp["command"])))
-
- if "task.stdin" in taskp:
- stdinname = subst.do_substitution(taskp, taskp["task.stdin"])
- if not args.dry_run:
- stdinfile = open(stdinname, "rb")
-
- if "task.stdout" in taskp:
- stdoutname = subst.do_substitution(taskp, taskp["task.stdout"])
- if not args.dry_run:
- stdoutfile = open(stdoutname, "wb")
-
- if "task.env" in taskp:
- env = copy.copy(os.environ)
- for k,v in taskp["task.env"].items():
- env[k] = subst.do_substitution(taskp, v)
- else:
- env = None
-
- logger.info("{}{}{}".format(' | '.join([' '.join(c) for c in cmd]), (" < " + stdinname) if stdinname is not None else "", (" > " + stdoutname) if stdoutname is not None else ""))
-
- if args.dry_run:
- sys.exit(0)
-except subst.SubstitutionError as e:
- logger.error(str(e))
- logger.error("task parameters were:")
- logger.error(pprint.pformat(taskp))
- sys.exit(1)
-except Exception as e:
- logger.exception("caught exception")
- logger.error("task parameters were:")
- logger.error(pprint.pformat(taskp))
- sys.exit(1)
-
-# rcode holds the return codes produced by each subprocess
-rcode = {}
-try:
- subprocesses = []
- close_streams = []
- if stdinfile:
- close_streams.append(stdinfile)
- next_stdin = stdinfile
-
- for i in xrange(len(cmd)):
- if i == len(cmd)-1:
- # this is the last command in the pipeline, so its stdout should go to stdoutfile
- next_stdout = stdoutfile
- else:
- # this is an intermediate command in the pipeline, so its stdout should go to a pipe
- next_stdout = subprocess.PIPE
-
- sp = subprocess.Popen(cmd[i], shell=False, stdin=next_stdin, stdout=next_stdout, env=env)
-
- # Need to close the FDs on our side so that subcommands will get SIGPIPE if the
- # consuming process ends prematurely.
- if sp.stdout:
- close_streams.append(sp.stdout)
-
- # Send this process's stdout to the next process's stdin
- next_stdin = sp.stdout
-
- subprocesses.append(sp)
-
- # File descriptors have been handed off to the subprocesses, so close them here.
- for s in close_streams:
- s.close()
-
- # Set up signal handling
- sig = SigHandler()
-
- # Forward terminate signals to the subprocesses.
- signal.signal(signal.SIGINT, lambda signum, frame: sig.send_signal(subprocesses, signum))
- signal.signal(signal.SIGTERM, lambda signum, frame: sig.send_signal(subprocesses, signum))
- signal.signal(signal.SIGQUIT, lambda signum, frame: sig.send_signal(subprocesses, signum))
-
- active = 1
- pids = set([s.pid for s in subprocesses])
- while len(pids) > 0:
- try:
- (pid, status) = os.wait()
- except OSError as e:
- if e.errno == errno.EINTR:
- pass
- else:
- raise
- else:
- pids.discard(pid)
- if not taskp.get("task.ignore_rcode"):
- rcode[pid] = (status >> 8)
- else:
- rcode[pid] = 0
-
- if sig.sig is not None:
- logger.critical("terminating on signal %s" % sig.sig)
- sys.exit(2)
- else:
- for i in xrange(len(cmd)):
- r = rcode[subprocesses[i].pid]
- logger.info("%s completed with exit code %i (%s)" % (cmd[i][0], r, "success" if r == 0 else "failed"))
-
-except Exception as e:
- logger.exception("caught exception")
-
-# restore default signal handlers.
-signal.signal(signal.SIGINT, signal.SIG_DFL)
-signal.signal(signal.SIGTERM, signal.SIG_DFL)
-signal.signal(signal.SIGQUIT, signal.SIG_DFL)
-
-logger.info("the following output files will be saved to keep:")
-
-subprocess.call(["find", "-L", ".", "-type", "f", "-printf", "run-command: %12.12s %h/%f\\n"], stdout=sys.stderr, cwd=outdir)
-
-logger.info("start writing output to keep")
-
-if "task.vwd" in taskp and "task.foreach" in jobp:
- for root, dirs, files in os.walk(outdir):
- for f in files:
- s = os.lstat(os.path.join(root, f))
- if stat.S_ISLNK(s.st_mode):
- os.unlink(os.path.join(root, f))
-
-(outcollection, checkin_error) = vwd.checkin(outdir)
-
-# Success if we ran any subprocess, and they all exited 0.
-success = rcode and all(status == 0 for status in rcode.itervalues()) and not checkin_error
-
-api.job_tasks().update(uuid=arvados.current_task()['uuid'],
- body={
- 'output': outcollection.manifest_text(),
- 'success': success,
- 'progress':1.0
- }).execute()
-
-sys.exit(0 if success else 1)
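The core of run-command is the pipeline loop above: each command's stdout feeds the next command's stdin, the parent closes its copies of the intermediate pipe ends so that SIGPIPE propagates when a consumer exits early, and only the final command writes to task.stdout. The same wiring, stripped of the Arvados plumbing, as a minimal Python 3 sketch:

    import subprocess

    def run_pipeline(commands, stdin=None, stdout=None):
        # Run a list of argv lists as a shell-style pipeline; return exit codes.
        procs = []
        next_stdin = stdin
        for i, argv in enumerate(commands):
            last = (i == len(commands) - 1)
            proc = subprocess.Popen(argv, stdin=next_stdin,
                                    stdout=stdout if last else subprocess.PIPE)
            procs.append(proc)
            next_stdin = proc.stdout  # None for the last command
        # Close our copies of the intermediate pipes so an early-exiting
        # consumer delivers SIGPIPE to its upstream producer.
        for proc in procs[:-1]:
            proc.stdout.close()
        return [proc.wait() for proc in procs]

    # Example: the equivalent of "zcat in.gz | grep foo | wc -l"
    # codes = run_pipeline([['zcat', 'in.gz'], ['grep', 'foo'], ['wc', '-l']])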
diff --git a/crunch_scripts/split-fastq.py b/crunch_scripts/split-fastq.py
deleted file mode 100755
index 61c384fbf..000000000
--- a/crunch_scripts/split-fastq.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import re
-import hashlib
-import string
-import sys
-
-api = arvados.api('v1')
-
-piece = 0
-manifest_text = ""
-
-# Look for paired reads
-
-inp = arvados.CollectionReader(arvados.getjobparam('reads'))
-
-manifest_list = []
-
-def nextline(reader, start):
- n = -1
- while True:
- r = reader.readfrom(start, 128)
- if r == '':
- break
- n = string.find(r, "\n")
- if n > -1:
- break
- else:
- start += 128
- return n
-
-prog = re.compile(r'(.*?)(_[12])?\.fastq(\.gz)?$')
-
-# Look for fastq files
-for s in inp.all_streams():
- for f in s.all_files():
- name_pieces = prog.match(f.name())
- if name_pieces is not None:
- if s.name() != ".":
- # The downstream tool (run-command) only iterates over the top
- # level of directories so if there are fastq files in
- # directories in the input, the choice is either to forget
- # there are directories (which might lead to name conflicts) or
- # just fail.
- print >>sys.stderr, "fastq must be at the root of the collection"
- sys.exit(1)
-
- p = None
- if name_pieces.group(2) is not None:
- if name_pieces.group(2) == "_1":
- p = [{}, {}]
- p[0]["reader"] = s.files()[name_pieces.group(0)]
- p[1]["reader"] = s.files()[name_pieces.group(1) + "_2.fastq" + (name_pieces.group(3) if name_pieces.group(3) else '')]
- else:
- p = [{}]
- p[0]["reader"] = s.files()[name_pieces.group(0)]
-
- if p is not None:
- for i in xrange(0, len(p)):
- m = p[i]["reader"].as_manifest().split()
- m[0] = "./_" + str(piece)
- manifest_list.append(m)
- piece += 1
-
-manifest_text = "\n".join(" ".join(m) for m in manifest_list) + "\n"
-
-arvados.current_task().set_output(manifest_text)
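split-fastq.py is also Python-2-only code: string.find() was removed from the string module in Python 3, and `print >>sys.stderr` is Python 2 syntax. A Python 3 rendering of the nextline() helper with the same behavior (reader.readfrom is the Arvados file-reader API used above):

    def nextline(reader, start, chunk=128):
        # Return the offset of the first newline within the chunk in which it
        # is found, or -1 if the reader runs out of data first.
        while True:
            r = reader.readfrom(start, chunk)
            if not r:
                return -1
            n = r.find("\n")
            if n > -1:
                return n
            start += chunk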
diff --git a/crunch_scripts/test/task_output_dir b/crunch_scripts/test/task_output_dir
deleted file mode 100755
index 8b2c7ced4..000000000
--- a/crunch_scripts/test/task_output_dir
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import arvados.crunch
-import hashlib
-import os
-
-out = arvados.crunch.TaskOutputDir()
-
-string = open(__file__).read()
-with open(os.path.join(out.path, 'example.out'), 'w') as f:
- f.write(string)
-with open(os.path.join(out.path, 'example.out.SHA1'), 'w') as f:
- f.write(hashlib.sha1(string).hexdigest() + "\n")
-
-arvados.current_task().set_output(out.manifest_text())
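One more Python 2 assumption in this test: hashlib.sha1() accepts a text str there, but on Python 3 it requires bytes, so the file would have to be read in binary mode (or the text encoded) first. For example:

    import hashlib

    with open(__file__, 'rb') as f:            # read bytes, not text
        data = f.read()
    digest = hashlib.sha1(data).hexdigest()    # sha1() takes bytes on Python 3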
diff --git a/services/api/app/controllers/arvados/v1/job_tasks_controller.rb b/services/api/app/controllers/arvados/v1/job_tasks_controller.rb
index 07bbc33ab..b960d2e9e 100644
--- a/services/api/app/controllers/arvados/v1/job_tasks_controller.rb
+++ b/services/api/app/controllers/arvados/v1/job_tasks_controller.rb
@@ -4,4 +4,9 @@
class Arvados::V1::JobTasksController < ApplicationController
accept_attribute_as_json :parameters, Hash
+
+ def create
+ return send_error("Unsupported legacy jobs API",
+ status: 400)
+ end
end
diff --git a/services/api/app/controllers/arvados/v1/jobs_controller.rb b/services/api/app/controllers/arvados/v1/jobs_controller.rb
index c3655272d..f6308c528 100644
--- a/services/api/app/controllers/arvados/v1/jobs_controller.rb
+++ b/services/api/app/controllers/arvados/v1/jobs_controller.rb
@@ -13,115 +13,28 @@ class Arvados::V1::JobsController < ApplicationController
include DbCurrentTime
def create
- [:repository, :script, :script_version, :script_parameters].each do |r|
- if !resource_attrs[r]
- return send_error("#{r} attribute must be specified",
- status: :unprocessable_entity)
- end
- end
-
- # We used to ask for the minimum_, exclude_, and no_reuse params
- # in the job resource. Now we advertise them as flags that alter
- # the behavior of the create action.
- [:minimum_script_version, :exclude_script_versions].each do |attr|
- if resource_attrs.has_key? attr
- params[attr] = resource_attrs.delete attr
- end
- end
- if resource_attrs.has_key? :no_reuse
- params[:find_or_create] = !resource_attrs.delete(:no_reuse)
- end
-
- return super if !params[:find_or_create]
- return if !load_filters_param
-
- begin
- @object = Job.find_reusable(resource_attrs, params, @filters, @read_users)
- rescue ArgumentError => error
- return send_error(error.message)
- end
-
- if @object
- show
- else
- super
- end
+ return send_error("Unsupported legacy jobs API",
+ status: 400)
end
def cancel
- reload_object_before_update
- @object.cancel cascade: params[:cascade]
- show
+ return send_error("Unsupported legacy jobs API",
+ status: 400)
end
def lock
- @object.lock current_user.uuid
- show
- end
-
- class LogStreamer
- Q_UPDATE_INTERVAL = 12
- def initialize(job, opts={})
- @job = job
- @opts = opts
- end
- def each
- if @job.finished_at
- yield "#{@job.uuid} finished at #{@job.finished_at}\n"
- return
- end
- while not @job.started_at
- # send a summary (job queue + available nodes) to the client
- # every few seconds while waiting for the job to start
- current_time = db_current_time
- last_ack_at ||= current_time - Q_UPDATE_INTERVAL - 1
- if current_time - last_ack_at >= Q_UPDATE_INTERVAL
- nodes_in_state = {idle: 0, alloc: 0}
- ActiveRecord::Base.uncached do
- Node.where('hostname is not ?', nil).collect do |n|
- if n.info[:slurm_state]
- nodes_in_state[n.info[:slurm_state]] ||= 0
- nodes_in_state[n.info[:slurm_state]] += 1
- end
- end
- end
- job_queue = Job.queue.select(:uuid)
- n_queued_before_me = 0
- job_queue.each do |j|
- break if j.uuid == @job.uuid
- n_queued_before_me += 1
- end
- yield "#{db_current_time}" \
- " job #{@job.uuid}" \
- " queue_position #{n_queued_before_me}" \
- " queue_size #{job_queue.count}" \
- " nodes_idle #{nodes_in_state[:idle]}" \
- " nodes_alloc #{nodes_in_state[:alloc]}\n"
- last_ack_at = db_current_time
- end
- sleep 3
- ActiveRecord::Base.uncached do
- @job.reload
- end
- end
- end
+ return send_error("Unsupported legacy jobs API",
+ status: 400)
end
def queue
- params[:order] ||= ['priority desc', 'created_at']
- load_limit_offset_order_params
- load_where_param
- @where.merge!({state: Job::Queued})
- return if !load_filters_param
- find_objects_for_index
- index
+ return send_error("Unsupported legacy jobs API",
+ status: 400)
end
def queue_size
- # Users may not be allowed to see all the jobs in the queue, so provide a
- # method to get just the queue size in order to get a gist of how busy the
- # cluster is.
- render :json => {:queue_size => Job.queue.size}
+ return send_error("Unsupported legacy jobs API",
+ status: 400)
end
def self._create_requires_parameters
diff --git a/services/api/app/controllers/arvados/v1/pipeline_instances_controller.rb b/services/api/app/controllers/arvados/v1/pipeline_instances_controller.rb
index baffda1c9..166f71049 100644
--- a/services/api/app/controllers/arvados/v1/pipeline_instances_controller.rb
+++ b/services/api/app/controllers/arvados/v1/pipeline_instances_controller.rb
@@ -7,9 +7,13 @@ class Arvados::V1::PipelineInstancesController < ApplicationController
accept_attribute_as_json :properties, Hash
accept_attribute_as_json :components_summary, Hash
+ def create
+ return send_error("Unsupported legacy jobs API",
+ status: 400)
+ end
+
def cancel
- reload_object_before_update
- @object.cancel cascade: params[:cascade]
- show
+ return send_error("Unsupported legacy jobs API",
+ status: 400)
end
end
diff --git a/services/api/app/controllers/arvados/v1/pipeline_templates_controller.rb b/services/api/app/controllers/arvados/v1/pipeline_templates_controller.rb
index a276948d5..4a5e724ee 100644
--- a/services/api/app/controllers/arvados/v1/pipeline_templates_controller.rb
+++ b/services/api/app/controllers/arvados/v1/pipeline_templates_controller.rb
@@ -4,4 +4,9 @@
class Arvados::V1::PipelineTemplatesController < ApplicationController
accept_attribute_as_json :components, Hash
+
+ def create
+ return send_error("Unsupported legacy jobs API",
+ status: 400)
+ end
end
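Taken together, these controller changes turn every create/cancel/lock/queue call on the legacy jobs, pipeline_instances, and pipeline_templates endpoints into an immediate HTTP 400 "Unsupported legacy jobs API" error. A client still using the old API would now see something like the following (sketch using the Arvados Python SDK; the job body shown is purely illustrative):

    import arvados
    import arvados.errors

    api = arvados.api('v1')
    try:
        api.jobs().create(body={'job': {
            'script': 'hash',                     # illustrative values only
            'script_version': 'master',
            'repository': 'example/example',
            'script_parameters': {},
        }}).execute()
    except arvados.errors.ApiError as err:
        print(err)   # expect a 400 "Unsupported legacy jobs API" response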
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
deleted file mode 100644
index 4e640186d..000000000
--- a/services/api/lib/crunch_dispatch.rb
+++ /dev/null
@@ -1,981 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-require 'open3'
-require 'shellwords'
-
-class CrunchDispatch
- extend DbCurrentTime
- include ApplicationHelper
- include Process
-
- EXIT_TEMPFAIL = 75
- EXIT_RETRY_UNLOCKED = 93
- RETRY_UNLOCKED_LIMIT = 3
-
- class LogTime < Time
- def to_s
- self.utc.strftime "%Y-%m-%d_%H:%M:%S"
- end
- end
-
- def initialize
- @crunch_job_bin = (ENV['CRUNCH_JOB_BIN'] || `which arv-crunch-job`.strip)
- if @crunch_job_bin.empty?
- raise "No CRUNCH_JOB_BIN env var, and crunch-job not in path."
- end
-
- @docker_bin = ENV['CRUNCH_JOB_DOCKER_BIN']
- @docker_run_args = ENV['CRUNCH_JOB_DOCKER_RUN_ARGS']
- @cgroup_root = ENV['CRUNCH_CGROUP_ROOT']
- @srun_sync_timeout = ENV['CRUNCH_SRUN_SYNC_TIMEOUT']
-
- @arvados_internal = Rails.configuration.Containers.JobsAPI.GitInternalDir
- if not File.exist? @arvados_internal
- $stderr.puts `mkdir -p #{@arvados_internal.shellescape} && git init --bare #{@arvados_internal.shellescape}`
- raise "No internal git repository available" unless ($? == 0)
- end
-
- @repo_root = Rails.configuration.Git.Repositories
- @arvados_repo_path = Repository.where(name: "arvados").first.server_path
- @authorizations = {}
- @did_recently = {}
- @fetched_commits = {}
- @git_tags = {}
- @node_state = {}
- @pipe_auth_tokens = {}
- @running = {}
- @todo = []
- @todo_job_retries = {}
- @job_retry_counts = Hash.new(0)
- @todo_pipelines = []
- end
-
- def sysuser
- return act_as_system_user
- end
-
- def refresh_todo
- if @runoptions[:jobs]
- @todo = @todo_job_retries.values + Job.queue.select(&:repository)
- end
- if @runoptions[:pipelines]
- @todo_pipelines = PipelineInstance.queue
- end
- end
-
- def each_slurm_line(cmd, outfmt, max_fields=nil)
- max_fields ||= outfmt.split(":").size
- max_fields += 1 # To accommodate the node field we add
- @@slurm_version ||= Gem::Version.new(`sinfo --version`.match(/\b[\d\.]+\b/)[0])
- if Gem::Version.new('2.3') <= @@slurm_version
- `#{cmd} --noheader -o '%n:#{outfmt}'`.each_line do |line|
- yield line.chomp.split(":", max_fields)
- end
- else
- # Expand rows with hostname ranges (like "foo[1-3,5,9-12]:idle")
- # into multiple rows with one hostname each.
- `#{cmd} --noheader -o '%N:#{outfmt}'`.each_line do |line|
- tokens = line.chomp.split(":", max_fields)
- if (re = tokens[0].match(/^(.*?)\[([-,\d]+)\]$/))
- tokens.shift
- re[2].split(",").each do |range|
- range = range.split("-").collect(&:to_i)
- (range[0]..range[-1]).each do |n|
- yield [re[1] + n.to_s] + tokens
- end
- end
- else
- yield tokens
- end
- end
- end
- end
-
- def slurm_status
- slurm_nodes = {}
- each_slurm_line("sinfo", "%t") do |hostname, state|
- # Treat nodes in idle* state as down, because the * means that slurm
- # hasn't been able to communicate with it recently.
- state.sub!(/^idle\*/, "down")
- state.sub!(/\W+$/, "")
- state = "down" unless %w(idle alloc comp mix drng down).include?(state)
- slurm_nodes[hostname] = {state: state, job: nil}
- end
- each_slurm_line("squeue", "%j") do |hostname, job_uuid|
- slurm_nodes[hostname][:job] = job_uuid if slurm_nodes[hostname]
- end
- slurm_nodes
- end
-
- def update_node_status
- return unless Rails.configuration.Containers.JobsAPI.CrunchJobWrapper.to_s.match(/^slurm/)
- slurm_status.each_pair do |hostname, slurmdata|
- next if @node_state[hostname] == slurmdata
- begin
- node = Node.where('hostname=?', hostname).order(:last_ping_at).last
- if node
- $stderr.puts "dispatch: update #{hostname} state to #{slurmdata}"
- node.info["slurm_state"] = slurmdata[:state]
- node.job_uuid = slurmdata[:job]
- if node.save
- @node_state[hostname] = slurmdata
- else
- $stderr.puts "dispatch: failed to update #{node.uuid}: #{node.errors.messages}"
- end
- elsif slurmdata[:state] != 'down'
- $stderr.puts "dispatch: SLURM reports '#{hostname}' is not down, but no node has that name"
- end
- rescue => error
- $stderr.puts "dispatch: error updating #{hostname} node status: #{error}"
- end
- end
- end
-
- def positive_int(raw_value, default=nil)
- value = begin raw_value.to_i rescue 0 end
- if value > 0
- value
- else
- default
- end
- end
-
- NODE_CONSTRAINT_MAP = {
- # Map Job runtime_constraints keys to the corresponding Node info key.
- 'min_ram_mb_per_node' => 'total_ram_mb',
- 'min_scratch_mb_per_node' => 'total_scratch_mb',
- 'min_cores_per_node' => 'total_cpu_cores',
- }
-
- def nodes_available_for_job_now(job)
- # Find Nodes that satisfy a Job's runtime constraints (by building
- # a list of Procs and using them to test each Node). If there are
- # enough to run the Job, return an array of their names.
- # Otherwise, return nil.
- need_procs = NODE_CONSTRAINT_MAP.each_pair.map do |job_key, node_key|
- Proc.new do |node|
- positive_int(node.properties[node_key], 0) >=
- positive_int(job.runtime_constraints[job_key], 0)
- end
- end
- min_node_count = positive_int(job.runtime_constraints['min_nodes'], 1)
- usable_nodes = []
- Node.all.select do |node|
- node.info['slurm_state'] == 'idle'
- end.sort_by do |node|
- # Prefer nodes with no price, then cheap nodes, then expensive nodes
- node.properties['cloud_node']['price'].to_f rescue 0
- end.each do |node|
- if need_procs.select { |need_proc| not need_proc.call(node) }.any?
- # At least one runtime constraint is not satisfied by this node
- next
- end
- usable_nodes << node
- if usable_nodes.count >= min_node_count
- hostnames = usable_nodes.map(&:hostname)
- log_nodes = usable_nodes.map do |n|
- "#{n.hostname} #{n.uuid} #{n.properties.to_json}"
- end
- log_job = "#{job.uuid} #{job.runtime_constraints}"
- log_text = "dispatching job #{log_job} to #{log_nodes.join(", ")}"
- $stderr.puts log_text
- begin
- act_as_system_user do
- Log.new(object_uuid: job.uuid,
- event_type: 'dispatch',
- owner_uuid: system_user_uuid,
- summary: "dispatching to #{hostnames.join(", ")}",
- properties: {'text' => log_text}).save!
- end
- rescue => e
- $stderr.puts "dispatch: log.create failed: #{e}"
- end
- return hostnames
- end
- end
- nil
- end
-
- def nodes_available_for_job(job)
- # Check if there are enough idle nodes with the Job's minimum
- # hardware requirements to run it. If so, return an array of
- # their names. If not, up to once per hour, signal start_jobs to
- # hold off launching Jobs. This delay is meant to give the Node
- # Manager an opportunity to make new resources available for new
- # Jobs.
- #
- # The exact timing parameters here might need to be adjusted for
- # the best balance between helping the longest-waiting Jobs run,
- # and making efficient use of immediately available resources.
- # These are all just first efforts until we have more data to work
- # with.
- nodelist = nodes_available_for_job_now(job)
- if nodelist.nil? and not did_recently(:wait_for_available_nodes, 3600)
- $stderr.puts "dispatch: waiting for nodes for #{job.uuid}"
- @node_wait_deadline = Time.now + 5.minutes
- end
- nodelist
- end
-
- def fail_job job, message, skip_lock: false
- $stderr.puts "dispatch: #{job.uuid}: #{message}"
- begin
- Log.new(object_uuid: job.uuid,
- event_type: 'dispatch',
- owner_uuid: job.owner_uuid,
- summary: message,
- properties: {"text" => message}).save!
- rescue => e
- $stderr.puts "dispatch: log.create failed: #{e}"
- end
-
- if not skip_lock and not have_job_lock?(job)
- begin
- job.lock @authorizations[job.uuid].user.uuid
- rescue ArvadosModel::AlreadyLockedError
- $stderr.puts "dispatch: tried to mark job #{job.uuid} as failed but it was already locked by someone else"
- return
- end
- end
-
- job.state = "Failed"
- if not job.save
- $stderr.puts "dispatch: save failed setting job #{job.uuid} to failed"
- end
- end
-
- def stdout_s(cmd_a, opts={})
- IO.popen(cmd_a, "r", opts) do |pipe|
- return pipe.read.chomp
- end
- end
-
- def git_cmd(*cmd_a)
- ["git", "--git-dir=#{@arvados_internal}"] + cmd_a
- end
-
- def get_authorization(job)
- if @authorizations[job.uuid] and
- @authorizations[job.uuid].user.uuid != job.modified_by_user_uuid
- # We already made a token for this job, but we need a new one
- # because modified_by_user_uuid has changed (the job will run
- # as a different user).
- @authorizations[job.uuid].update_attributes expires_at: Time.now
- @authorizations[job.uuid] = nil
- end
- if not @authorizations[job.uuid]
- auth = ApiClientAuthorization.
- new(user: User.where('uuid=?', job.modified_by_user_uuid).first,
- api_client_id: 0)
- if not auth.save
- $stderr.puts "dispatch: auth.save failed for #{job.uuid}"
- else
- @authorizations[job.uuid] = auth
- end
- end
- @authorizations[job.uuid]
- end
-
- def internal_repo_has_commit? sha1
- if (not @fetched_commits[sha1] and
- sha1 == stdout_s(git_cmd("rev-list", "-n1", sha1), err: "/dev/null") and
- $? == 0)
- @fetched_commits[sha1] = true
- end
- return @fetched_commits[sha1]
- end
-
- def get_commit src_repo, sha1
- return true if internal_repo_has_commit? sha1
-
- # commit does not exist in internal repository, so import the
- # source repository using git fetch-pack
- cmd = git_cmd("fetch-pack", "--no-progress", "--all", src_repo)
- $stderr.puts "dispatch: #{cmd}"
- $stderr.puts(stdout_s(cmd))
- @fetched_commits[sha1] = ($? == 0)
- end
-
- def tag_commit(job, commit_hash, tag_name)
- # @git_tags[T]==V if we know commit V has been tagged T in the
- # arvados_internal repository.
- if not @git_tags[tag_name]
- cmd = git_cmd("tag", tag_name, commit_hash)
- $stderr.puts "dispatch: #{cmd}"
- $stderr.puts(stdout_s(cmd, err: "/dev/null"))
- unless $? == 0
- # git tag failed. This may be because the tag already exists, so check for that.
- tag_rev = stdout_s(git_cmd("rev-list", "-n1", tag_name))
- if $? == 0
- # We got a revision back
- if tag_rev != commit_hash
- # Uh oh, the tag doesn't point to the revision we were expecting.
- # Someone has been monkeying with the job record and/or git.
- fail_job job, "Existing tag #{tag_name} points to commit #{tag_rev} but expected commit #{commit_hash}"
- return nil
- end
- # we're okay (fall through to setting @git_tags below)
- else
- # git rev-list failed for some reason.
- fail_job job, "'git tag' for #{tag_name} failed but did not find any existing tag using 'git rev-list'"
- return nil
- end
- end
- # 'git tag' was successful, or there is an existing tag that points to the same revision.
- @git_tags[tag_name] = commit_hash
- elsif @git_tags[tag_name] != commit_hash
- fail_job job, "Existing tag #{tag_name} points to commit #{@git_tags[tag_name]} but this job uses commit #{commit_hash}"
- return nil
- end
- @git_tags[tag_name]
- end
-
- def start_jobs
- @todo.each do |job|
- next if @running[job.uuid]
-
- cmd_args = nil
- case Rails.configuration.Containers.JobsAPI.CrunchJobWrapper
- when "none"
- if @running.size > 0
- # Don't run more than one at a time.
- return
- end
- cmd_args = []
- when "slurm_immediate"
- nodelist = nodes_available_for_job(job)
- if nodelist.nil?
- if Time.now < @node_wait_deadline
- break
- else
- next
- end
- end
- cmd_args = ["salloc",
- "--chdir=/",
- "--immediate",
- "--exclusive",
- "--no-kill",
- "--job-name=#{job.uuid}",
- "--nodelist=#{nodelist.join(',')}"]
- else
- raise "Unknown crunch_job_wrapper: #{Rails.configuration.Containers.JobsAPI.CrunchJobWrapper}"
- end
-
- cmd_args = sudo_preface + cmd_args
-
- next unless get_authorization job
-
- ready = internal_repo_has_commit? job.script_version
-
- if not ready
- # Import the commit from the specified repository into the
- # internal repository. This should have been done already when
- # the job was created/updated; this code is obsolete except to
- # avoid deployment races. Failing the job would be a
- # reasonable thing to do at this point.
- repo = Repository.where(name: job.repository).first
- if repo.nil? or repo.server_path.nil?
- fail_job job, "Repository #{job.repository} not found under #{@repo_root}"
- next
- end
- ready &&= get_commit repo.server_path, job.script_version
- ready &&= tag_commit job, job.script_version, job.uuid
- end
-
- # This should be unnecessary, because API server does it during
- # job create/update, but it's still not a bad idea to verify the
- # tag is correct before starting the job:
- ready &&= tag_commit job, job.script_version, job.uuid
-
- # The arvados_sdk_version doesn't support use of arbitrary
- # remote URLs, so the requested version isn't necessarily copied
- # into the internal repository yet.
- if job.arvados_sdk_version
- ready &&= get_commit @arvados_repo_path, job.arvados_sdk_version
- ready &&= tag_commit job, job.arvados_sdk_version, "#{job.uuid}-arvados-sdk"
- end
-
- if not ready
- fail_job job, "commit not present in internal repository"
- next
- end
-
- cmd_args += [@crunch_job_bin,
- '--job-api-token', @authorizations[job.uuid].api_token,
- '--job', job.uuid,
- '--git-dir', @arvados_internal]
-
- if @cgroup_root
- cmd_args += ['--cgroup-root', @cgroup_root]
- end
-
- if @docker_bin
- cmd_args += ['--docker-bin', @docker_bin]
- end
-
- if @docker_run_args
- cmd_args += ['--docker-run-args', @docker_run_args]
- end
-
- if @srun_sync_timeout
- cmd_args += ['--srun-sync-timeout', @srun_sync_timeout]
- end
-
- if have_job_lock?(job)
- cmd_args << "--force-unlock"
- end
-
- $stderr.puts "dispatch: #{cmd_args.join ' '}"
-
- begin
- i, o, e, t = Open3.popen3(*cmd_args)
- rescue
- $stderr.puts "dispatch: popen3: #{$!}"
- # This is a dispatch problem like "Too many open files";
- # retrying another job right away would be futile. Just return
- # and hope things are better next time, after (at least) a
- # did_recently() delay.
- return
- end
-
- $stderr.puts "dispatch: job #{job.uuid}"
- start_banner = "dispatch: child #{t.pid} start #{LogTime.now}"
- $stderr.puts start_banner
-
- @running[job.uuid] = {
- stdin: i,
- stdout: o,
- stderr: e,
- wait_thr: t,
- job: job,
- buf: {stderr: '', stdout: ''},
- started: false,
- sent_int: 0,
- job_auth: @authorizations[job.uuid],
- stderr_buf_to_flush: '',
- stderr_flushed_at: Time.new(0),
- bytes_logged: 0,
- events_logged: 0,
- log_throttle_is_open: true,
- log_throttle_reset_time: Time.now + Rails.configuration.Containers.Logging.LogThrottlePeriod,
- log_throttle_bytes_so_far: 0,
- log_throttle_lines_so_far: 0,
- log_throttle_bytes_skipped: 0,
- log_throttle_partial_line_last_at: Time.new(0),
- log_throttle_first_partial_line: true,
- }
- i.close
- @todo_job_retries.delete(job.uuid)
- update_node_status
- end
- end
-
- # Test for hard cap on total output and for log throttling. Returns whether
- # the log line should go to output or not. Modifies "line" in place to
- # replace it with an error if a logging limit is tripped.
- def rate_limit running_job, line
- message = false
- linesize = line.size
- if running_job[:log_throttle_is_open]
- partial_line = false
- skip_counts = false
- matches = line.match(/^\S+ \S+ \d+ \d+ stderr (.*)/)
- if matches and matches[1] and matches[1].start_with?('[...]') and matches[1].end_with?('[...]')
- partial_line = true
- if Time.now > running_job[:log_throttle_partial_line_last_at] + Rails.configuration.Containers.Logging.LogPartialLineThrottlePeriod
- running_job[:log_throttle_partial_line_last_at] = Time.now
- else
- skip_counts = true
- end
- end
-
- if !skip_counts
- running_job[:log_throttle_lines_so_far] += 1
- running_job[:log_throttle_bytes_so_far] += linesize
- running_job[:bytes_logged] += linesize
- end
-
- if (running_job[:bytes_logged] >
- Rails.configuration.Containers.Logging.LimitLogBytesPerJob)
- message = "Exceeded log limit #{Rails.configuration.Containers.Logging.LimitLogBytesPerJob} bytes (LimitLogBytesPerJob). Log will be truncated."
- running_job[:log_throttle_reset_time] = Time.now + 100.years
- running_job[:log_throttle_is_open] = false
-
- elsif (running_job[:log_throttle_bytes_so_far] >
- Rails.configuration.Containers.Logging.LogThrottleBytes)
- remaining_time = running_job[:log_throttle_reset_time] - Time.now
- message = "Exceeded rate #{Rails.configuration.Containers.Logging.LogThrottleBytes} bytes per #{Rails.configuration.Containers.Logging.LogThrottlePeriod} seconds (LogThrottleBytes). Logging will be silenced for the next #{remaining_time.round} seconds."
- running_job[:log_throttle_is_open] = false
-
- elsif (running_job[:log_throttle_lines_so_far] >
- Rails.configuration.Containers.Logging.LogThrottleLines)
- remaining_time = running_job[:log_throttle_reset_time] - Time.now
- message = "Exceeded rate #{Rails.configuration.Containers.Logging.LogThrottleLines} lines per #{Rails.configuration.Containers.Logging.LogThrottlePeriod} seconds (LogThrottleLines), logging will be silenced for the next #{remaining_time.round} seconds."
- running_job[:log_throttle_is_open] = false
-
- elsif partial_line and running_job[:log_throttle_first_partial_line]
- running_job[:log_throttle_first_partial_line] = false
- message = "Rate-limiting partial segments of long lines to one every #{Rails.configuration.Containers.Logging.LogPartialLineThrottlePeriod} seconds."
- end
- end
-
- if not running_job[:log_throttle_is_open]
- # Don't log anything if any limit has been exceeded. Just count lossage.
- running_job[:log_throttle_bytes_skipped] += linesize
- end
-
- if message
- # Yes, write to logs, but use our "rate exceeded" message
- # instead of the log message that exceeded the limit.
- message += " A complete log is still being written to Keep, and will be available when the job finishes.\n"
- line.replace message
- true
- elsif partial_line
- false
- else
- running_job[:log_throttle_is_open]
- end
- end
-
- def read_pipes
- @running.each do |job_uuid, j|
- now = Time.now
- if now > j[:log_throttle_reset_time]
- # It has been more than throttle_period seconds since the last
- # checkpoint so reset the throttle
- if j[:log_throttle_bytes_skipped] > 0
- message = "#{job_uuid} ! Skipped #{j[:log_throttle_bytes_skipped]} bytes of log"
- $stderr.puts message
- j[:stderr_buf_to_flush] << "#{LogTime.now} #{message}\n"
- end
-
- j[:log_throttle_reset_time] = now + Rails.configuration.Containers.Logging.LogThrottlePeriod
- j[:log_throttle_bytes_so_far] = 0
- j[:log_throttle_lines_so_far] = 0
- j[:log_throttle_bytes_skipped] = 0
- j[:log_throttle_is_open] = true
- j[:log_throttle_partial_line_last_at] = Time.new(0)
- j[:log_throttle_first_partial_line] = true
- end
-
- j[:buf].each do |stream, streambuf|
- # Read some data from the child stream
- buf = ''
- begin
- # It's important to use a big enough buffer here. When we're
- # being flooded with logs, we must read and discard many
- # bytes at once. Otherwise, we can easily peg a CPU with
- # time-checking and other loop overhead. (Quick tests show a
- # 1MiB buffer working 2.5x as fast as a 64 KiB buffer.)
- #
- # So don't reduce this buffer size!
- buf = j[stream].read_nonblock(2**20)
- rescue Errno::EAGAIN, EOFError
- end
-
- # Short circuit the counting code if we're just going to throw
- # away the data anyway.
- if not j[:log_throttle_is_open]
- j[:log_throttle_bytes_skipped] += streambuf.size + buf.size
- streambuf.replace ''
- next
- elsif buf == ''
- next
- end
-
- # Append to incomplete line from previous read, if any
- streambuf << buf
-
- bufend = ''
- streambuf.each_line do |line|
- if not line.end_with? $/
- if line.size > Rails.configuration.Containers.Logging.LogThrottleBytes
- # Without a limit here, we'll use 2x an arbitrary amount
- # of memory, and waste a lot of time copying strings
- # around, all without providing any feedback to anyone
- # about what's going on _or_ hitting any of our throttle
- # limits.
- #
- # Here we leave "line" alone, knowing it will never be
- # sent anywhere: rate_limit() will reach
- # crunch_log_throttle_bytes immediately. However, we'll
- # leave [...] in bufend: if the trailing end of the long
- # line does end up getting sent anywhere, it will have
- # some indication that it is incomplete.
- bufend = "[...]"
- else
- # If line length is sane, we'll wait for the rest of the
- # line to appear in the next read_pipes() call.
- bufend = line
- break
- end
- end
- # rate_limit returns true or false as to whether to actually log
- # the line or not. It also modifies "line" in place to replace
- # it with an error if a logging limit is tripped.
- if rate_limit j, line
- $stderr.print "#{job_uuid} ! " unless line.index(job_uuid)
- $stderr.puts line
- pub_msg = "#{LogTime.now} #{line.strip}\n"
- j[:stderr_buf_to_flush] << pub_msg
- end
- end
-
- # Leave the trailing incomplete line (if any) in streambuf for
- # next time.
- streambuf.replace bufend
- end
- # Flush buffered logs to the logs table, if appropriate. We have
- # to do this even if we didn't collect any new logs this time:
- # otherwise, buffered data older than seconds_between_events
- # won't get flushed until new data arrives.
- write_log j
- end
- end
-
- def reap_children
- return if 0 == @running.size
- pid_done = nil
- j_done = nil
-
- @running.each do |uuid, j|
- if !j[:wait_thr].status
- pid_done = j[:wait_thr].pid
- j_done = j
- break
- end
- end
-
- return if !pid_done
-
- job_done = j_done[:job]
-
- # Ensure every last drop of stdout and stderr is consumed.
- read_pipes
- # Reset flush timestamp to make sure log gets written.
- j_done[:stderr_flushed_at] = Time.new(0)
- # Write any remaining logs.
- write_log j_done
-
- j_done[:buf].each do |stream, streambuf|
- if streambuf != ''
- $stderr.puts streambuf + "\n"
- end
- end
-
- # Wait for the thread (returns a Process::Status)
- exit_status = j_done[:wait_thr].value.exitstatus
- exit_tempfail = exit_status == EXIT_TEMPFAIL
-
- $stderr.puts "dispatch: child #{pid_done} exit #{exit_status}"
- $stderr.puts "dispatch: job #{job_done.uuid} end"
-
- jobrecord = Job.find_by_uuid(job_done.uuid)
-
- if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid)
- $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure")
- # Only this crunch-dispatch process can retry the job:
- # it's already locked, and there's no way to put it back in the
- # Queued state. Put it in our internal todo list unless the job
- # has failed this way excessively.
- @job_retry_counts[jobrecord.uuid] += 1
- exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT
- do_what_next = "give up now"
- if exit_tempfail
- @todo_job_retries[jobrecord.uuid] = jobrecord
- do_what_next = "re-attempt"
- end
- $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " +
- "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}")
- end
-
- if !exit_tempfail
- @job_retry_counts.delete(jobrecord.uuid)
- if jobrecord.state == "Running"
- # Apparently there was an unhandled error. That could potentially
- # include "all allocated nodes failed" when we don't want to retry
- # because the job has already been retried RETRY_UNLOCKED_LIMIT
- # times. Fail the job.
- jobrecord.state = "Failed"
- if not jobrecord.save
- $stderr.puts "dispatch: jobrecord.save failed"
- end
- end
- else
- # If the job failed to run due to an infrastructure
- # issue with crunch-job or slurm, we want the job to stay in the
- # queue. If crunch-job exited after losing a race to another
- # crunch-job process, it exits 75 and we should leave the job
- # record alone so the winner of the race can do its thing.
- # If crunch-job exited after all of its allocated nodes failed,
- # it exits 93, and we want to retry it later (see the
- # EXIT_RETRY_UNLOCKED `if` block).
- #
- # There is still an unhandled race condition: If our crunch-job
- # process is about to lose a race with another crunch-job
- # process, but crashes before getting to its "exit 75" (for
- # example, "cannot fork" or "cannot reach API server") then we
- # will assume incorrectly that it's our process's fault that
- # jobrecord.started_at is non-nil, and mark the job as failed
- # even though the winner of the race is probably still doing
- # fine.
- end
-
- # Invalidate the per-job auth token, unless the job is still queued and we
- # might want to try it again.
- if jobrecord.state != "Queued" and !@todo_job_retries.include?(jobrecord.uuid)
- j_done[:job_auth].update_attributes expires_at: Time.now
- end
-
- @running.delete job_done.uuid
- end
-
- def update_pipelines
- expire_tokens = @pipe_auth_tokens.dup
- @todo_pipelines.each do |p|
- pipe_auth = (@pipe_auth_tokens[p.uuid] ||= ApiClientAuthorization.
- create(user: User.where('uuid=?', p.modified_by_user_uuid).first,
- api_client_id: 0))
- puts `export ARVADOS_API_TOKEN=#{pipe_auth.api_token} && arv-run-pipeline-instance --run-pipeline-here --no-wait --instance #{p.uuid}`
- expire_tokens.delete p.uuid
- end
-
- expire_tokens.each do |k, v|
- v.update_attributes expires_at: Time.now
- @pipe_auth_tokens.delete k
- end
- end
-
- def parse_argv argv
- @runoptions = {}
- (argv.any? ? argv : ['--jobs', '--pipelines']).each do |arg|
- case arg
- when '--jobs'
- @runoptions[:jobs] = true
- when '--pipelines'
- @runoptions[:pipelines] = true
- else
- abort "Unrecognized command line option '#{arg}'"
- end
- end
- if not (@runoptions[:jobs] or @runoptions[:pipelines])
- abort "Nothing to do. Please specify at least one of: --jobs, --pipelines."
- end
- end
-
- def run argv
- parse_argv argv
-
- # We want files written by crunch-dispatch to be writable by other
- # processes with the same GID, see bug #7228
- File.umask(0002)
-
- # This is how crunch-job child procs know where the "refresh"
- # trigger file is
- ENV["CRUNCH_REFRESH_TRIGGER"] = Rails.configuration.Containers.JobsAPI.CrunchRefreshTrigger
-
- # If salloc can't allocate resources immediately, make it use our
- # temporary failure exit code. This ensures crunch-dispatch won't
- # mark a job failed because of an issue with node allocation.
- # This often happens when another dispatcher wins the race to
- # allocate nodes.
- ENV["SLURM_EXIT_IMMEDIATE"] = CrunchDispatch::EXIT_TEMPFAIL.to_s
-
- if ENV["CRUNCH_DISPATCH_LOCKFILE"]
- lockfilename = ENV.delete "CRUNCH_DISPATCH_LOCKFILE"
- lockfile = File.open(lockfilename, File::RDWR|File::CREAT, 0644)
- unless lockfile.flock File::LOCK_EX|File::LOCK_NB
- abort "Lock unavailable on #{lockfilename} - exit"
- end
- end
-
- @signal = {}
- %w{TERM INT}.each do |sig|
- signame = sig
- Signal.trap(sig) do
- $stderr.puts "Received #{signame} signal"
- @signal[:term] = true
- end
- end
-
- act_as_system_user
- User.first.group_permissions
- $stderr.puts "dispatch: ready"
- while !@signal[:term] or @running.size > 0
- read_pipes
- if @signal[:term]
- @running.each do |uuid, j|
- if !j[:started] and j[:sent_int] < 2
- begin
- Process.kill 'INT', j[:wait_thr].pid
- rescue Errno::ESRCH
- # No such pid = race condition + desired result is
- # already achieved
- end
- j[:sent_int] += 1
- end
- end
- else
- refresh_todo unless did_recently(:refresh_todo, 1.0)
- update_node_status unless did_recently(:update_node_status, 1.0)
- unless @todo.empty? or did_recently(:start_jobs, 1.0) or @signal[:term]
- start_jobs
- end
- unless (@todo_pipelines.empty? and @pipe_auth_tokens.empty?) or did_recently(:update_pipelines, 5.0)
- update_pipelines
- end
- unless did_recently('check_orphaned_slurm_jobs', 60)
- check_orphaned_slurm_jobs
- end
- end
- reap_children
- select(@running.values.collect { |j| [j[:stdout], j[:stderr]] }.flatten,
- [], [], 1)
- end
- # If there are jobs we wanted to retry, we have to mark them as failed now.
- # Other dispatchers can't pick them up because we hold their lock.
- @todo_job_retries.each_key do |job_uuid|
- job = Job.find_by_uuid(job_uuid)
- if job.state == "Running"
- fail_job(job, "crunch-dispatch was stopped during job's tempfail retry loop")
- end
- end
- end
-
- def fail_jobs before: nil
- act_as_system_user do
- threshold = nil
- if before == 'reboot'
- boottime = nil
- open('/proc/stat').map(&:split).each do |stat, t|
- if stat == 'btime'
- boottime = t
- end
- end
- if not boottime
- raise "Could not find btime in /proc/stat"
- end
- threshold = Time.at(boottime.to_i)
- elsif before
- threshold = Time.parse(before, Time.now)
- else
- threshold = db_current_time
- end
- Rails.logger.info "fail_jobs: threshold is #{threshold}"
-
- squeue = squeue_jobs
- Job.where('state = ? and started_at < ?', Job::Running, threshold).
- each do |job|
- Rails.logger.debug "fail_jobs: #{job.uuid} started #{job.started_at}"
- squeue.each do |slurm_name|
- if slurm_name == job.uuid
- Rails.logger.info "fail_jobs: scancel #{job.uuid}"
- scancel slurm_name
- end
- end
- fail_job(job, "cleaned up stale job: started before #{threshold}",
- skip_lock: true)
- end
- end
- end
-
- def check_orphaned_slurm_jobs
- act_as_system_user do
- squeue_uuids = squeue_jobs.select{|uuid| uuid.match(/^[0-9a-z]{5}-8i9sb-[0-9a-z]{15}$/)}.
- select{|uuid| !@running.has_key?(uuid)}
-
- return if squeue_uuids.size == 0
-
- scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state in (?) or modified_at>?)',
- squeue_uuids,
- ['Running', 'Queued'],
- (Time.now - 60)).
- collect(&:uuid)
- scancel_uuids.each do |uuid|
- Rails.logger.info "orphaned job: scancel #{uuid}"
- scancel uuid
- end
- end
- end
-
- def sudo_preface
- return [] if not Rails.configuration.Containers.JobsAPI.CrunchJobUser
- ["sudo", "-E", "-u",
- Rails.configuration.Containers.JobsAPI.CrunchJobUser,
- "LD_LIBRARY_PATH=#{ENV['LD_LIBRARY_PATH']}",
- "PATH=#{ENV['PATH']}",
- "PERLLIB=#{ENV['PERLLIB']}",
- "PYTHONPATH=#{ENV['PYTHONPATH']}",
- "RUBYLIB=#{ENV['RUBYLIB']}",
- "GEM_PATH=#{ENV['GEM_PATH']}"]
- end
-
- protected
-
- def have_job_lock?(job)
- # Return true if the given job is locked by this crunch-dispatch, normally
- # because we've run crunch-job for it.
- @todo_job_retries.include?(job.uuid)
- end
-
- def did_recently(thing, min_interval)
- if !@did_recently[thing] or @did_recently[thing] < Time.now - min_interval
- @did_recently[thing] = Time.now
- false
- else
- true
- end
- end
-
- # send message to log table. we want these records to be transient
- def write_log running_job
- return if running_job[:stderr_buf_to_flush] == ''
-
- # Send out a log event if the buffer size exceeds the bytes per event or if
- # it has been at least crunch_log_seconds_between_events seconds since
- # the last flush.
- if running_job[:stderr_buf_to_flush].size > Rails.configuration.Containers.Logging.LogBytesPerEvent or
- (Time.now - running_job[:stderr_flushed_at]) >= Rails.configuration.Containers.Logging.LogSecondsBetweenEvents
- begin
- log = Log.new(object_uuid: running_job[:job].uuid,
- event_type: 'stderr',
- owner_uuid: running_job[:job].owner_uuid,
- properties: {"text" => running_job[:stderr_buf_to_flush]})
- log.save!
- running_job[:events_logged] += 1
- rescue => exception
- $stderr.puts "Failed to write logs"
- $stderr.puts exception.backtrace
- end
- running_job[:stderr_buf_to_flush] = ''
- running_job[:stderr_flushed_at] = Time.now
- end
- end
-
- # An array of job_uuids in squeue
- def squeue_jobs
- if Rails.configuration.Containers.JobsAPI.CrunchJobWrapper == "slurm_immediate"
- p = IO.popen(['squeue', '-a', '-h', '-o', '%j'])
- begin
- p.readlines.map {|line| line.strip}
- ensure
- p.close
- end
- else
- []
- end
- end
-
- def scancel slurm_name
- cmd = sudo_preface + ['scancel', '-n', slurm_name]
- IO.popen(cmd) do |scancel_pipe|
- puts scancel_pipe.read
- end
- if not $?.success?
- Rails.logger.error "scancel #{slurm_name.shellescape}: #{$?}"
- end
- end
-end
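One technique in the deleted CrunchDispatch worth noting is each_slurm_line's handling of old SLURM output: versions before 2.3 print hostname ranges such as "compute[1-3,5]:idle", which the dispatcher expands into one row per host. The same expansion as a standalone Python sketch (mirroring the Ruby logic above; zero-padded ranges are not handled, just as in the original):

    import re

    def expand_slurm_hostnames(token):
        # "compute[1-3,5]" -> ["compute1", "compute2", "compute3", "compute5"]
        m = re.match(r'^(.*?)\[([-,\d]+)\]$', token)
        if not m:
            return [token]
        prefix, ranges = m.groups()
        hosts = []
        for rng in ranges.split(','):
            bounds = [int(x) for x in rng.split('-')]
            hosts.extend(prefix + str(n)
                         for n in range(bounds[0], bounds[-1] + 1))
        return hosts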
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
deleted file mode 100755
index 38bd54b5c..000000000
--- a/services/api/script/crunch-dispatch.rb
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env ruby
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-dispatch_argv = []
-ARGV.reject! do |arg|
- dispatch_argv.push(arg) if /^--/ =~ arg
-end
-
-ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
-require File.dirname(__FILE__) + '/../config/boot'
-require File.dirname(__FILE__) + '/../config/environment'
-require './lib/crunch_dispatch.rb'
-
-CrunchDispatch.new.run dispatch_argv
diff --git a/services/api/script/crunch_failure_report.py b/services/api/script/crunch_failure_report.py
deleted file mode 100755
index 83217d851..000000000
--- a/services/api/script/crunch_failure_report.py
+++ /dev/null
@@ -1,222 +0,0 @@
-#! /usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-import argparse
-import datetime
-import json
-import re
-import sys
-
-import arvados
-
-# Useful configuration variables:
-
-# Number of log lines to use as context in diagnosing failure.
-LOG_CONTEXT_LINES = 10
-
-# Regex that signifies a failed task.
-FAILED_TASK_REGEX = re.compile(' \d+ failure (.*permanent)')
-
-# Regular expressions used to classify failure types.
-JOB_FAILURE_TYPES = {
- 'sys/docker': 'Cannot destroy container',
- 'crunch/node': 'User not found on host',
- 'slurm/comm': 'Communication connection failure'
-}
-
-def parse_arguments(arguments):
- arg_parser = argparse.ArgumentParser(
- description='Produce a report of Crunch failures within a specified time range')
-
- arg_parser.add_argument(
- '--start',
- help='Start date and time')
- arg_parser.add_argument(
- '--end',
- help='End date and time')
-
- args = arg_parser.parse_args(arguments)
-
- if args.start and not is_valid_timestamp(args.start):
- raise ValueError(args.start)
- if args.end and not is_valid_timestamp(args.end):
- raise ValueError(args.end)
-
- return args
-
-
-def api_timestamp(when=None):
- """Returns a string representing the timestamp 'when' in a format
- suitable for delivering to the API server. Defaults to the
- current time.
- """
- if when is None:
- when = datetime.datetime.utcnow()
- return when.strftime("%Y-%m-%dT%H:%M:%SZ")
-
-
-def is_valid_timestamp(ts):
- return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', ts)
-
-
-def jobs_created_between_dates(api, start, end):
- return arvados.util.list_all(
- api.jobs().list,
- filters=json.dumps([ ['created_at', '>=', start],
- ['created_at', '<=', end] ]))
-
-
-def job_logs(api, job):
- # Returns the contents of the log for this job (as an array of lines).
- if job['log']:
- log_collection = arvados.CollectionReader(job['log'], api)
- log_filename = "{}.log.txt".format(job['uuid'])
- return log_collection.open(log_filename).readlines()
- return []
-
-
-user_names = {}
-def job_user_name(api, user_uuid):
- def _lookup_user_name(api, user_uuid):
- try:
- return api.users().get(uuid=user_uuid).execute()['full_name']
- except arvados.errors.ApiError:
- return user_uuid
-
- if user_uuid not in user_names:
- user_names[user_uuid] = _lookup_user_name(api, user_uuid)
- return user_names[user_uuid]
-
-
-job_pipeline_names = {}
-def job_pipeline_name(api, job_uuid):
- def _lookup_pipeline_name(api, job_uuid):
- try:
- pipelines = api.pipeline_instances().list(
- filters='[["components", "like", "%{}%"]]'.format(job_uuid)).execute()
- pi = pipelines['items'][0]
- if pi['name']:
- return pi['name']
- else:
- # Use the pipeline template name
- pt = api.pipeline_templates().get(uuid=pi['pipeline_template_uuid']).execute()
- return pt['name']
- except (TypeError, ValueError, IndexError):
- return ""
-
- if job_uuid not in job_pipeline_names:
- job_pipeline_names[job_uuid] = _lookup_pipeline_name(api, job_uuid)
- return job_pipeline_names[job_uuid]
-
-
-def is_failed_task(logline):
- return FAILED_TASK_REGEX.search(logline) != None
-
-
-def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
- args = parse_arguments(arguments)
-
- api = arvados.api('v1')
-
- now = datetime.datetime.utcnow()
- start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
- end_time = args.end or api_timestamp(now)
-
- # Find all jobs created within the specified window,
- # and their corresponding job logs.
- jobs_created = jobs_created_between_dates(api, start_time, end_time)
- jobs_by_state = {}
- for job in jobs_created:
- jobs_by_state.setdefault(job['state'], [])
- jobs_by_state[job['state']].append(job)
-
- # Find failed jobs and record the job failure text.
-
- # failure_stats maps failure types (e.g. "sys/docker") to
- # a set of job UUIDs that failed for that reason.
- failure_stats = {}
- for job in jobs_by_state.get('Failed', []):
- job_uuid = job['uuid']
- logs = job_logs(api, job)
- # Find the first permanent task failure, and collect the
- # preceding log lines.
- failure_type = None
- for i, lg in enumerate(logs):
- if is_failed_task(lg):
- # Get preceding log record to provide context.
- log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0
- log_end = i + 1
- lastlogs = ''.join(logs[log_start:log_end])
- # try to identify the type of failure.
- for key, rgx in JOB_FAILURE_TYPES.iteritems():
- if re.search(rgx, lastlogs):
- failure_type = key
- break
- if failure_type is not None:
- break
- if failure_type is None:
- failure_type = 'unknown'
- failure_stats.setdefault(failure_type, set())
- failure_stats[failure_type].add(job_uuid)
-
- # Report percentages of successful, failed and unfinished jobs.
- print "Start: {:20s}".format(start_time)
- print "End: {:20s}".format(end_time)
- print ""
-
- print "Overview"
- print ""
-
- job_start_count = len(jobs_created)
- print " {: <25s} {:4d}".format('Started', job_start_count)
- for state in ['Complete', 'Failed', 'Queued', 'Cancelled', 'Running']:
- if state in jobs_by_state:
- job_count = len(jobs_by_state[state])
- job_percentage = job_count / float(job_start_count)
- print " {: <25s} {:4d} ({: >4.0%})".format(state,
- job_count,
- job_percentage)
- print ""
-
- # Report failure types.
- failure_summary = ""
- failure_detail = ""
-
- # Generate a mapping from failed job uuids to job records, to assist
- # in generating detailed statistics for job failures.
- jobs_failed_map = { job['uuid']: job for job in jobs_by_state.get('Failed', []) }
-
- # sort the failure stats in descending order by occurrence.
- sorted_failures = sorted(failure_stats,
- reverse=True,
- key=lambda failure_type: len(failure_stats[failure_type]))
- for failtype in sorted_failures:
- job_uuids = failure_stats[failtype]
- failstat = " {: <25s} {:4d} ({: >4.0%})\n".format(
- failtype,
- len(job_uuids),
- len(job_uuids) / float(len(jobs_by_state['Failed'])))
- failure_summary = failure_summary + failstat
- failure_detail = failure_detail + failstat
- for j in job_uuids:
- job_info = jobs_failed_map[j]
- job_owner = job_user_name(api, job_info['modified_by_user_uuid'])
- job_name = job_pipeline_name(api, job_info['uuid'])
- failure_detail = failure_detail + " {} {: <15.15s} {:29.29s}\n".format(j, job_owner, job_name)
- failure_detail = failure_detail + "\n"
-
- print "Failures by class"
- print ""
- print failure_summary
-
- print "Failures by class (detail)"
- print ""
- print failure_detail
-
- return 0
-
-
-if __name__ == "__main__":
- sys.exit(main())
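
The heart of the report above is the classification loop in main(): scan each
failed job's log for the first permanent task failure, take up to
LOG_CONTEXT_LINES preceding lines as context, and match that context against the
JOB_FAILURE_TYPES patterns, falling back to "unknown". The same logic,
re-expressed as a standalone Ruby sketch (the sample log lines are invented for
illustration):

    LOG_CONTEXT_LINES = 10
    FAILED_TASK_REGEX = / \d+ failure (.*permanent)/
    JOB_FAILURE_TYPES = {
      'sys/docker'  => /Cannot destroy container/,
      'crunch/node' => /User not found on host/,
      'slurm/comm'  => /Communication connection failure/,
    }

    def classify_failure(log_lines)
      log_lines.each_with_index do |line, i|
        next unless line =~ FAILED_TASK_REGEX
        # Look back over the context window preceding the failed task.
        log_start = i >= LOG_CONTEXT_LINES ? i - LOG_CONTEXT_LINES : 0
        context = log_lines[log_start..i].join
        JOB_FAILURE_TYPES.each do |key, rgx|
          return key if context =~ rgx
        end
        return 'unknown'
      end
      nil  # no permanent task failure found in this log
    end

    logs = ["Cannot destroy container abc123\n",
            "stderr task 0 failure (#3 permanent) after 3 attempts\n"]
    classify_failure(logs)  # => "sys/docker"
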
diff --git a/services/api/script/fail-jobs.rb b/services/api/script/fail-jobs.rb
deleted file mode 100755
index e52bfc075..000000000
--- a/services/api/script/fail-jobs.rb
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env ruby
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-require 'optimist'
-
-opts = Optimist::options do
- banner 'Fail jobs that have state=="Running".'
- banner 'Options:'
- opt(:before,
- 'fail only jobs that started before the given time (or "reboot")',
- type: :string)
-end
-
-ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
-require File.dirname(__FILE__) + '/../config/boot'
-require File.dirname(__FILE__) + '/../config/environment'
-require Rails.root.join('lib/crunch_dispatch.rb')
-
-CrunchDispatch.new.fail_jobs before: opts[:before]
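
fail-jobs.rb is a thin wrapper: Optimist parses the single --before option, the
Rails environment is booted the same way as in crunch-dispatch.rb, and
CrunchDispatch#fail_jobs does the actual work. A sketch of the option-handling
half alone, assuming the optimist gem is installed (the Time.parse cutoff is
illustrative; the real cutoff logic, including the special "reboot" value, lived
in lib/crunch_dispatch.rb):

    require 'optimist'
    require 'time'

    opts = Optimist::options do
      banner 'Fail jobs that have state=="Running".'
      opt(:before,
          'fail only jobs that started before the given time (or "reboot")',
          type: :string)
    end

    if opts[:before] && opts[:before] != 'reboot'
      cutoff = Time.parse(opts[:before])
      puts "would fail jobs that started before #{cutoff}"
    end
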
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list