[Checkins] SVN: z3c.etestbrowser/trunk/src/z3c/etestbrowser/ Added workaround for libxml2 HTML fallback behaviour that guesses the wrong encoding.

Christian Theune ct at gocept.com
Tue Sep 11 09:56:10 EDT 2007


Log message for revision 79569:
  Added workaround for libxml2 HTML fallback behaviour that guesses the wrong
  encoding.
  

Changed:
  U   z3c.etestbrowser/trunk/src/z3c/etestbrowser/README.txt
  U   z3c.etestbrowser/trunk/src/z3c/etestbrowser/ftesting.zcml
  A   z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt
  U   z3c.etestbrowser/trunk/src/z3c/etestbrowser/testing.py

-=-
Modified: z3c.etestbrowser/trunk/src/z3c/etestbrowser/README.txt
===================================================================
--- z3c.etestbrowser/trunk/src/z3c/etestbrowser/README.txt	2007-09-11 12:59:52 UTC (rev 79568)
+++ z3c.etestbrowser/trunk/src/z3c/etestbrowser/README.txt	2007-09-11 13:56:09 UTC (rev 79569)
@@ -51,3 +51,15 @@
   >>> browser.etree.xpath(
   ...     '//html:body', {'html': 'http://www.w3.org/1999/xhtml'})
   [<Element {http://www.w3.org/1999/xhtml}body at ...>]
+
+LXML unicode support
+====================
+
+Some versions of libxml2 guess the wrong encoding for UTF-8 encoded HTML
+documents. We have a workaround for that. Let's have a look at a view that
+contains a German umlaut:
+
+  >>> browser.xml_strict = False
+  >>> browser.open('http://localhost/lxml.html')
+  >>> browser.etree.xpath("//span")[0].text
+  u'K\xfcgelblitz.'

Modified: z3c.etestbrowser/trunk/src/z3c/etestbrowser/ftesting.zcml
===================================================================
--- z3c.etestbrowser/trunk/src/z3c/etestbrowser/ftesting.zcml	2007-09-11 12:59:52 UTC (rev 79568)
+++ z3c.etestbrowser/trunk/src/z3c/etestbrowser/ftesting.zcml	2007-09-11 13:56:09 UTC (rev 79569)
@@ -1,4 +1,6 @@
-<configure xmlns="http://namespaces.zope.org/zope">
+<configure xmlns="http://namespaces.zope.org/zope"
+  xmlns:browser="http://namespaces.zope.org/browser"
+  i18n_domain="zope">
 
   <include package="zope.app.zcmlfiles" file="meta.zcml"/>
   <include package="zope.app.zcmlfiles" />
@@ -19,5 +21,13 @@
                  description="All users have this role implicitly" />
 
   <grant permission="zope.View"
-                  role="zope.Anonymous" />
+    role="zope.Anonymous" />
+
+  <browser:page
+    name="lxml.html"
+    for="*"
+    template="lxml.pt"
+    permission="zope.View"
+    />
+
 </configure>

Added: z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt
===================================================================
--- z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt	                        (rev 0)
+++ z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt	2007-09-11 13:56:09 UTC (rev 79569)
@@ -0,0 +1,8 @@
+<html>
+  <head>
+    <meta http-equiv="content-type" content="text/html; charset=utf-8"/>
+  </head>
+  <body>
+    <span>Kügelblitz.</span> &nbsp;
+  </body>
+</html>


Property changes on: z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt
___________________________________________________________________
Name: svn:eol-style
   + native

Modified: z3c.etestbrowser/trunk/src/z3c/etestbrowser/testing.py
===================================================================
--- z3c.etestbrowser/trunk/src/z3c/etestbrowser/testing.py	2007-09-11 12:59:52 UTC (rev 79568)
+++ z3c.etestbrowser/trunk/src/z3c/etestbrowser/testing.py	2007-09-11 13:56:09 UTC (rev 79569)
@@ -16,13 +16,14 @@
 $Id$
 """
 
+import re
 import StringIO
 import lxml.etree
 
 import zope.testbrowser.testing
 
 
-html_parser = lxml.etree.HTMLParser()
+RE_CHARSET = re.compile('.*;charset=(.*)')
 
 
 class ExtendedTestBrowser(zope.testbrowser.testing.Browser):
@@ -45,10 +46,20 @@
         # I'm not using any internal knowledge about testbrowser
         # here, to avoid breakage. Memory usage won't be a problem.
         if self.xml_strict:
-            parser = None
+            self._etree = lxml.etree.XML(self.contents)
         else:
-            parser = html_parser
-        self._etree = lxml.etree.XML(self.contents, parser)
+            # Work around libxml2's broken fallback for encoding detection.
+            # Zope may state the charset in the content-type response
+            # header; when it does, decode the body ourselves so the parser
+            # receives unicode and does not have to guess the encoding.
+            content = self.contents
+            content_type = self.headers['content-type']
+            match = RE_CHARSET.match(content_type)
+            if match is not None:
+                charset = match.groups()[0]
+                content = content.decode(charset)
+            self._etree = lxml.etree.HTML(content)
+
         return self._etree
 
     def _changed(self):



More information about the Checkins mailing list