Yet another problem with Unicode - some HTML pages cannot be decoded because they contain undecodable characters.

ziima · ziima · commit abd15eefb08f · 2011-07-18T09:28:25.000+02:00
This causes a UnicodeDecodeError to be raised deep inside Python. It only happens if the XRDS location is not found
before some Unicode character is encountered.

 - Catch UnicodeDecodeError when searching for the Yadis location
 - Update the check of whether Yadis was used - if the XRDS location is None, it was not
 - Add tests and update the previous unicode test with a comment
diff --git a/openid/test/data/test_discover/unicode.html b/openid/test/data/test_discover/unicode.html
@@ -1,9 +1,9 @@
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html>
   <head>
-    <title param="ěščřžýáíé &raquo;">Identity Page for Smoker</title>
+    <title param="ěščřžýáíé &raquo;">Title with param that needs decoding</title>
   </head>
   <body>
-    <p>foo</p>
+    <p>This page can be properly decoded and everything will will be fine</p>
   </body>
 </html>
diff --git a/openid/test/data/test_discover/unicode2.html b/openid/test/data/test_discover/unicode2.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+  <head>
+      <title param="��������� &raquo;">Title with param which raises UnicodeError</title>
+      <meta http-equiv="X-XRDS-Location" content="http://someuser.unittest/xrds" />
+  </head>
+  <body>
+      <p>
+         weird sign � to prevent successful decoding
+      </p>
+      <p>
+          This page can not be properly decoded so its content will be passed to HTML parser
+          encoded but title raises UnicodeError because x-xrds-location is not found on time
+      </p>
+  </body>
+</html>
+
diff --git a/openid/test/data/test_discover/unicode3.html b/openid/test/data/test_discover/unicode3.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+  <head>
+      <meta http-equiv="X-XRDS-Location" content="http://someuser.unittest/xrds" />
+      <title param="��������� &raquo;">Title with param which raises UnicodeError</title>
+  </head>
+  <body>
+      <p>
+         weird sign � to prevent successful decoding
+      </p>
+      <p>
+          This page can not be properly decoded so its content will be passed to HTML parser
+          encoded but service will be found because x-xrds-location is found on time
+      </p>
+  </body>
+</html>
+
diff --git a/openid/test/test_discover.py b/openid/test/test_discover.py
@@ -250,11 +250,36 @@ def test_404(self):
                               discover.discover, self.id_url + '/404')
 
     def test_unicode(self):
+        """
+        Check page with unicode and HTML entities
+        """
         self._discover(
             content_type='text/html;charset=utf-8',
             data=readDataFile('unicode.html'),
             expected_services=0)
 
+    def test_unicode_undecodable_html(self):
+        """
+        Check page with unicode and HTML entities that can not be decoded
+        """
+        data = readDataFile('unicode2.html')
+        self.failUnlessRaises(UnicodeDecodeError, data.decode, 'utf-8')
+        self._discover(content_type='text/html;charset=utf-8',
+            data=data, expected_services=0)
+
+    def test_unicode_undecodable_html2(self):
+        """
+        Check page with unicode and HTML entities that can not be decoded
+        but xrds document is found before it matters
+        """
+        self.documents[self.id_url + 'xrds'] = (
+            'application/xrds+xml', readDataFile('yadis_idp.xml'))
+
+        data = readDataFile('unicode3.html')
+        self.failUnlessRaises(UnicodeDecodeError, data.decode, 'utf-8')
+        self._discover(content_type='text/html;charset=utf-8',
+            data=data, expected_services=1)
+
     def test_noOpenID(self):
         services = self._discover(content_type='text/plain',
                                   data="junk",
diff --git a/openid/yadis/discover.py b/openid/yadis/discover.py
@@ -45,6 +45,8 @@ def __init__(self, request_uri):
 
     def usedYadisLocation(self):
         """Was the Yadis protocol's indirection used?"""
+        if self.xrds_uri is None:
+            return False
         return self.normalized_uri != self.xrds_uri
 
     def isXRDS(self):
@@ -131,18 +133,22 @@ def whereIsYadis(resp):
             content_type = content_type or ''
             encoding = content_type.rsplit(';', 1)
             if len(encoding) == 2 and encoding[1].strip().startswith('charset='):
-                encoding = encoding[1].split('=', 1)[1]
+                encoding = encoding[1].split('=', 1)[1].strip()
             else:
                 encoding = 'UTF-8'
 
             try:
                 content = resp.body.decode(encoding)
             except UnicodeError:
+                # Keep encoded version in case yadis location can be found before encoding shut this up.
+                # Possible errors will be caught lower.
                 content = resp.body
 
             try:
                 yadis_loc = findHTMLMeta(StringIO(content))
-            except MetaNotFound:
+            except (MetaNotFound, UnicodeError):
+                # UnicodeError: Response body could not be encoded and xrds location
+                # could not be found before troubles occurs.
                 pass
 
         return yadis_loc

-Original file line number
+Diff line change
@@ @@ -0,0 +1,17 @@ @@
 +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 +<html>
 +  <head>
 +      <title param="¿¿¿¿¿ýáíé &raquo;">Title with param which raises UnicodeError</title>
 +      <meta http-equiv="X-XRDS-Location" content="http://someuser.unittest/xrds" />
 +  </head>
 +  <body>
 +      <p>
 +         weird sign Å to prevent successful decoding
 +      </p>
 +      <p>
 +          This page can not be properly decoded so its content will be passed to HTML parser
 +          encoded but title raises UnicodeError because x-xrds-location is not found on time
 +      </p>
 +  </body>
 +</html>
++