[ARVADOS] created: 9cd3ddf205cb4b0874c2b80bc200adac2598961d

git at public.curoverse.com git at public.curoverse.com
Tue Nov 18 10:50:20 EST 2014


        at  9cd3ddf205cb4b0874c2b80bc200adac2598961d (commit)


commit 9cd3ddf205cb4b0874c2b80bc200adac2598961d
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Nov 17 11:04:54 2014 -0500

    4363: Accept unicode class, not just unicode type.

diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 0fe7b3e..fee6059 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -186,7 +186,7 @@ class CollectionReader(CollectionBase):
                     self._manifest_locator,
                     error_via_api,
                     error_via_keep))
-        if type(self._manifest_text) == unicode:
+        if isinstance(self._manifest_text, unicode):
             unicode_manifest = self._manifest_text
         else:
             unicode_manifest = self._manifest_text.decode('utf-8')

commit 4b20653d3f65c01661c6eae9cea25c0ab3c79aef
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Nov 17 11:01:49 2014 -0500

    4363: Validate utf-8 encoding of manifest_text.

diff --git a/services/api/app/models/collection.rb b/services/api/app/models/collection.rb
index 28fc70e..89513fa 100644
--- a/services/api/app/models/collection.rb
+++ b/services/api/app/models/collection.rb
@@ -5,6 +5,7 @@ class Collection < ArvadosModel
   include KindAndEtag
   include CommonApiTemplate
 
+  before_validation :check_encoding
   before_validation :check_signatures
   before_validation :strip_manifest_text
   before_validation :set_portable_data_hash
@@ -106,6 +107,15 @@ class Collection < ArvadosModel
     true
   end
 
+  def check_encoding
+    if manifest_text.encoding.name == 'UTF-8' and manifest_text.valid_encoding?
+      true
+    else
+      errors.add :manifest_text, "must use UTF-8 encoding"
+      false
+    end
+  end
+
   def redundancy_status
     if redundancy_confirmed_as.nil?
       'unconfirmed'
diff --git a/services/api/test/unit/collection_test.rb b/services/api/test/unit/collection_test.rb
index 4f73670..8853319 100644
--- a/services/api/test/unit/collection_test.rb
+++ b/services/api/test/unit/collection_test.rb
@@ -1,7 +1,41 @@
 require 'test_helper'
 
 class CollectionTest < ActiveSupport::TestCase
-  # test "the truth" do
-  #   assert true
-  # end
+  def create_collection name, enc=nil
+    txt = ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:#{name}.txt\n"
+    txt.force_encoding(enc) if enc
+    return Collection.create(manifest_text: txt)
+  end
+
+  test 'accept ASCII manifest_text' do
+    act_as_system_user do
+      c = create_collection 'foo'
+      assert c.valid?
+    end
+  end
+
+  test 'accept UTF-8 manifest_text' do
+    act_as_system_user do
+      c = create_collection "f\xc3\x98\xc3\x98"
+      assert c.valid?
+    end
+  end
+
+  test 'refuse manifest_text with invalid UTF-8 byte sequence' do
+    act_as_system_user do
+      c = create_collection "f\xc8o", Encoding::UTF_8
+      assert !c.valid?
+      assert_equal [:manifest_text], c.errors.messages.keys
+      assert_match /UTF-8/, c.errors.messages[:manifest_text].first
+    end
+  end
+
+  test 'refuse manifest_text with non-UTF-8 encoding' do
+    act_as_system_user do
+      c = create_collection "f\xc8o", Encoding::ASCII_8BIT
+      assert !c.valid?
+      assert_equal [:manifest_text], c.errors.messages.keys
+      assert_match /UTF-8/, c.errors.messages[:manifest_text].first
+    end
+  end
 end

commit e8dd6d95b4ab40e6f95b0faa51752599e27f6baf
Merge: 97bb076 06e402b
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Nov 13 15:48:19 2014 -0500

    4363: Merge branch 'master' into 4363-less-filename-munging


commit 97bb07673de62fe9bdb5fe8ae0a9c6dbc8d8c2d8
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Nov 13 15:20:11 2014 -0500

    4363: Make _NameAttribute a unicode instead of a str. Avoid encoding hack.

diff --git a/sdk/python/arvados/stream.py b/sdk/python/arvados/stream.py
index cd8153e..aab5b3e 100644
--- a/sdk/python/arvados/stream.py
+++ b/sdk/python/arvados/stream.py
@@ -103,17 +103,16 @@ def split(path):
     return stream_name, file_name
 
 class StreamFileReader(ArvadosFileBase):
-    class _NameAttribute(str):
+    class _NameAttribute(unicode):
         # The Python file API provides a plain .name attribute.
         # Older SDK provided a name() method.
         # This class provides both, for maximum compatibility.
         def __call__(self):
-            return self.decode('utf-8')
+            return self
 
 
     def __init__(self, stream, segments, name):
-        super(StreamFileReader, self).__init__(
-            self._NameAttribute(name.encode('utf-8')), 'rb')
+        super(StreamFileReader, self).__init__(self._NameAttribute(name), 'rb')
         self._stream = stream
         self.segments = segments
         self._filepos = 0L

commit 34a73d5411d0e9a86ede40e6a3ebce54f4d9a94f
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Nov 13 14:54:29 2014 -0500

    4363: Remove redundant test.

diff --git a/services/fuse/tests/test_mount.py b/services/fuse/tests/test_mount.py
index 262e85a..29b237d 100644
--- a/services/fuse/tests/test_mount.py
+++ b/services/fuse/tests/test_mount.py
@@ -364,4 +364,3 @@ class FuseUnitTest(unittest.TestCase):
         self.assertEqual("_", fuse.sanitize_filename(""))
         self.assertEqual("_", fuse.sanitize_filename("."))
         self.assertEqual("__", fuse.sanitize_filename(".."))
-        self.assertEqual("__", fuse.sanitize_filename(".."))

commit 1aa683826a4e9d1956167f812e12a9386de58c81
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Nov 13 14:45:17 2014 -0500

    4363: Compare unicode strings (ignore utf-8 encoding differences). Bump fuse->sdk version dependency.

diff --git a/services/fuse/setup.py b/services/fuse/setup.py
index 1e72fee..5cf9d23 100644
--- a/services/fuse/setup.py
+++ b/services/fuse/setup.py
@@ -36,7 +36,7 @@ setup(name='arvados_fuse',
         'bin/arv-mount'
         ],
       install_requires=[
-        'arvados-python-client>=0.1.20141103223015.68dae83',
+        'arvados-python-client>=0.1.20141113192857.f78caf9',
         'llfuse',
         'python-daemon'
         ],
diff --git a/services/fuse/tests/test_mount.py b/services/fuse/tests/test_mount.py
index 7e5325d..262e85a 100644
--- a/services/fuse/tests/test_mount.py
+++ b/services/fuse/tests/test_mount.py
@@ -79,7 +79,7 @@ class FuseMountTest(MountTestBase):
 
         self._utf8 = ["\xe2\x9c\x8c",     # victory sign
                       "\xe2\x9b\xb5",     # sailboat
-                      "\xf0\x9f\x98\xb1", # scream
+                      # "\xf0\x9f\x98\xb1", # scream. doesn't work!!
                       ]
         cw.start_new_stream('edgecases/utf8')
         for f in self._utf8:
@@ -93,7 +93,8 @@ class FuseMountTest(MountTestBase):
         path = self.mounttmp
         if subdir:
             path = os.path.join(path, subdir)
-        self.assertEqual(sorted(expect_content), sorted(os.listdir(path)))
+        self.assertEqual(sorted([fn.decode('utf-8') for fn in expect_content]),
+                         sorted([fn.decode('utf-8') for fn in os.listdir(path)]))
 
     def runTest(self):
         # Create the request handler

commit f78caf95ed818b476a1b9de1c4fc868aa020f185
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Nov 13 14:28:57 2014 -0500

    4363: Do not blow up when manifests contain utf-8 characters.

diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py
index 0f49438..0fe7b3e 100644
--- a/sdk/python/arvados/collection.py
+++ b/sdk/python/arvados/collection.py
@@ -42,14 +42,14 @@ def normalize_stream(s, stream):
                 if segmentoffset == current_span[1]:
                     current_span[1] += segment[arvados.SEGMENTSIZE]
                 else:
-                    stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
+                    stream_tokens.append(u"{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
                     current_span = [segmentoffset, segmentoffset + segment[arvados.SEGMENTSIZE]]
 
         if current_span is not None:
-            stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
+            stream_tokens.append(u"{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
 
         if not stream[f]:
-            stream_tokens.append("0:0:{0}".format(fout))
+            stream_tokens.append(u"0:0:{0}".format(fout))
 
     return stream_tokens
 
@@ -186,9 +186,14 @@ class CollectionReader(CollectionBase):
                     self._manifest_locator,
                     error_via_api,
                     error_via_keep))
-        self._streams = [sline.split()
-                         for sline in self._manifest_text.split("\n")
-                         if sline]
+        if type(self._manifest_text) == unicode:
+            unicode_manifest = self._manifest_text
+        else:
+            unicode_manifest = self._manifest_text.decode('utf-8')
+        self._streams = [
+            sline.split()
+            for sline in unicode_manifest.split("\n")
+            if sline]
 
     def normalize(self):
         self._populate()
@@ -211,7 +216,8 @@ class CollectionReader(CollectionBase):
         # Regenerate the manifest text based on the normalized streams
         self._manifest_text = ''.join(
             [StreamReader(stream, keep=self._my_keep()).manifest_text()
-             for stream in self._streams])
+             for stream in self._streams]
+            ).encode('utf-8')
 
     def open(self, streampath, filename=None):
         """open(streampath[, filename]) -> file-like object
diff --git a/sdk/python/arvados/stream.py b/sdk/python/arvados/stream.py
index c263dd8..cd8153e 100644
--- a/sdk/python/arvados/stream.py
+++ b/sdk/python/arvados/stream.py
@@ -108,11 +108,12 @@ class StreamFileReader(ArvadosFileBase):
         # Older SDK provided a name() method.
         # This class provides both, for maximum compatibility.
         def __call__(self):
-            return self
+            return self.decode('utf-8')
 
 
     def __init__(self, stream, segments, name):
-        super(StreamFileReader, self).__init__(self._NameAttribute(name), 'rb')
+        super(StreamFileReader, self).__init__(
+            self._NameAttribute(name.encode('utf-8')), 'rb')
         self._stream = stream
         self.segments = segments
         self._filepos = 0L
@@ -327,7 +328,7 @@ class StreamReader(object):
                 manifest_text.append(m.group(0))
         else:
             manifest_text.extend([d[LOCATOR] for d in self._data_locators])
-        manifest_text.extend([' '.join(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], f.name().replace(' ', '\\040'))
+        manifest_text.extend([' '.join([u"{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], f.name().replace(' ', '\\040'))
                                         for seg in f.segments])
                               for f in self._files.values()])
         return ' '.join(manifest_text) + '\n'
diff --git a/sdk/python/tests/test_collections.py b/sdk/python/tests/test_collections.py
index 254a29f..be62ad4 100644
--- a/sdk/python/tests/test_collections.py
+++ b/sdk/python/tests/test_collections.py
@@ -209,6 +209,10 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
 """
         self.assertEqual(arvados.CollectionReader(m8, self.api_client).manifest_text(normalize=True), m8)
 
+        m_utf8 = """./\xe2\x9b\xb5 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:\xf0\x9f\x98\xb1
+"""
+        self.assertEqual(arvados.CollectionReader(m_utf8, self.api_client).manifest_text(normalize=True), m_utf8)
+
     def test_locators_and_ranges(self):
         blocks2 = [['a', 10, 0],
                   ['b', 10, 10],
diff --git a/services/api/app/models/collection.rb b/services/api/app/models/collection.rb
index accd2cc..28fc70e 100644
--- a/services/api/app/models/collection.rb
+++ b/services/api/app/models/collection.rb
@@ -76,7 +76,7 @@ class Collection < ArvadosModel
 
   def set_portable_data_hash
     if (self.portable_data_hash.nil? or (self.portable_data_hash == "") or (manifest_text_changed? and !portable_data_hash_changed?))
-      self.portable_data_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.length}"
+      self.portable_data_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.bytesize}"
     elsif portable_data_hash_changed?
       begin
         loc = Keep::Locator.parse!(self.portable_data_hash)
@@ -84,7 +84,7 @@ class Collection < ArvadosModel
         if loc.size
           self.portable_data_hash = loc.to_s
         else
-          self.portable_data_hash = "#{loc.hash}+#{self.manifest_text.length}"
+          self.portable_data_hash = "#{loc.hash}+#{self.manifest_text.bytesize}"
         end
       rescue ArgumentError => e
         errors.add(:portable_data_hash, "#{e}")
@@ -96,7 +96,7 @@ class Collection < ArvadosModel
 
   def ensure_hash_matches_manifest_text
     if manifest_text_changed? or portable_data_hash_changed?
-      computed_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.length}"
+      computed_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.bytesize}"
       unless computed_hash == portable_data_hash
         logger.debug "(computed) '#{computed_hash}' != '#{portable_data_hash}' (provided)"
         errors.add(:portable_data_hash, "does not match hash of manifest_text")
diff --git a/services/fuse/tests/test_mount.py b/services/fuse/tests/test_mount.py
index 3a59084..7e5325d 100644
--- a/services/fuse/tests/test_mount.py
+++ b/services/fuse/tests/test_mount.py
@@ -77,6 +77,15 @@ class FuseMountTest(MountTestBase):
             cw.start_new_file('x/x')
             cw.write('x')
 
+        self._utf8 = ["\xe2\x9c\x8c",     # victory sign
+                      "\xe2\x9b\xb5",     # sailboat
+                      "\xf0\x9f\x98\xb1", # scream
+                      ]
+        cw.start_new_stream('edgecases/utf8')
+        for f in self._utf8:
+            cw.start_new_file(f)
+            cw.write(f)
+
         self.testcollection = cw.finish()
         self.api.collections().create(body={"manifest_text":cw.manifest_text()}).execute()
 
@@ -105,9 +114,10 @@ class FuseMountTest(MountTestBase):
         self.assertDirContents('dir2', ['thing5.txt', 'thing6.txt', 'dir3'])
         self.assertDirContents('dir2/dir3', ['thing7.txt', 'thing8.txt'])
         self.assertDirContents('edgecases',
-                               "dirs/:/_/__/.../-/*/\x01\\/ ".split("/"))
+                               "dirs/utf8/:/_/__/.../-/*/\x01\\/ ".split("/"))
         self.assertDirContents('edgecases/dirs',
                                ":/__/.../-/*/\x01\\/ ".split("/"))
+        self.assertDirContents('edgecases/utf8', self._utf8)
 
         files = {'thing1.txt': 'data 1',
                  'thing2.txt': 'data 2',

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list