[ARVADOS] created: f5ba3202d1b0065ab8c28f553972f180fbb56238
git at public.curoverse.com
git at public.curoverse.com
Sat Jun 6 18:59:09 EDT 2015
at f5ba3202d1b0065ab8c28f553972f180fbb56238 (commit)
commit f5ba3202d1b0065ab8c28f553972f180fbb56238
Author: radhika <radhika at curoverse.com>
Date: Sat Jun 6 18:43:41 2015 -0400
6203: Improved the performance of the files_count method by using split plus a file regexp instead of each_file_spec. With this update, the
time spent executing the files_count and files_size methods for qr1hi-4zz18-tcnxylwkxg0nfhi is reduced from 12s to 4s.
Benchmarking showed that "scan /\S+/" is 2.5 times slower than split, so a couple of those usages were updated accordingly.
diff --git a/sdk/ruby/lib/arvados/keep.rb b/sdk/ruby/lib/arvados/keep.rb
index 422dab5..13492a9 100644
--- a/sdk/ruby/lib/arvados/keep.rb
+++ b/sdk/ruby/lib/arvados/keep.rb
@@ -101,6 +101,8 @@ module Keep
def initialize(manifest_text)
@text = manifest_text
@files = nil
+ @files_count = nil
+ @files_size = nil
end
def each_line
@@ -109,7 +111,8 @@ module Keep
stream_name = nil
block_tokens = []
file_tokens = []
- line.scan /\S+/ do |token|
+ splits = line.split
+ splits.each do |token|
if stream_name.nil?
stream_name = unescape token
elsif file_tokens.empty? and Locator.valid? token
@@ -149,7 +152,8 @@ module Keep
@text.each_line do |line|
stream_name = nil
in_file_tokens = false
- line.scan /\S+/ do |token|
+ splits = line.split
+ splits.each do |token|
if stream_name.nil?
stream_name = unescape token
elsif in_file_tokens or not Locator.valid? token
@@ -183,26 +187,55 @@ module Keep
@files
end
+ FILE_REGEXP = /^(\d+)(:)(\d+)(:)(.*)$/
def files_count(stop_after=nil)
# Return the number of files represented in this manifest.
# If stop_after is provided, files_count will read the manifest
# incrementally, and return immediately when it counts that number of
# files. This can help you avoid parsing the entire manifest if you
# just want to check if a small number of files are specified.
- if stop_after.nil? or not @files.nil?
- return files.size
+
+ if not @files.nil?
+ return @files.size
+ elsif @files_count
+ return @files_count
end
- seen_files = {}
- each_file_spec do |streamname, _, _, filename|
- seen_files[[streamname, filename]] = true
- return stop_after if (seen_files.size >= stop_after)
+
+ files = {}
+ total_size = 0
+ count_so_far = 0
+ @text.split("\n").each do |line|
+ words = line.split
+ stream = words[0]
+ files[stream] = {}
+ words.each do |word|
+ match = FILE_REGEXP.match word
+ if match
+ if !files[stream][match[5]]
+ files[stream][match[5]] = true
+ count_so_far += 1
+ end
+
+ if stop_after and (count_so_far >= stop_after)
+ return count_so_far
+ end
+
+ total_size += match[3].to_i
+ end
+ end
end
- seen_files.size
+
+ @files_size = total_size
+ @files_count = count_so_far
end
def files_size
# Return the total size of all files in this manifest.
- files.reduce(0) { |total, (_, _, size)| total + size }
+ if @files_size
+ @files_size
+ else
+ files.reduce(0) { |total, (_, _, size)| total + size }
+ end
end
def exact_file_count?(want_count)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list